Re: data analysis

new topic     » goto parent     » topic index » view thread      » older message » newer message

Here's an updated version of the pattern matcher. The output is changed a
bit (hopefully it's more clear), and I've added some more features. It now
checks for swapped letters, and prefers removal over resyncing.

Given the test data:

   "this", "that"
   "actually", "actaully"
   "Pittsborough", "Pittsburg"
   "Pittsberg", "Pittsburg"
   "shello", "hellos"
   "fhfellos", "shello"
   "Sweigsdunka", "Zweigsdanka"


It produces:

   th[is,at]
   act[ua,au]lly
   Pittsb[o,u]r[ou,]g[h,]
   Pittsb[e,u]rg
   [s,]hello[,s]
   [f,s]h[f,]ello[s,]
   [S,Z]weigsd[u,a]nka

-- David Cuny

constant MaxGap = 6

global function diff( sequence s1, sequence s2 )

    integer at1, at2, sync1, sync2, diff1, diff2, best
    sequence result,left,right

    result = ""
    left = ""
    right = ""

    at1 = 0
    at2 = 0

    -- process until the end of one string
    while 1 do

        -- move ahead
        at1 += 1
        at2 += 1

        -- past end of one string?
        if at1 > length( s1 )
        or at2 > length( s2 ) then
            exit
        end if

        -- same?
        if s1[at1] = s2[at2] then
            if length( left ) or length( right ) then
                result &= sprintf( "[%s,%s]", {left,right})
                left = ""
                right = ""
            end if
            result &= s1[at1]
        else

            -- attempt to resync
            while 1 do

                -- swapped letters?
                if at1 < length( s1 )
                and at2 < length( s2 )
                and s1[at1] = s2[at2+1]
                and s1[at1+1] = s2[at2] then
                    left &= s1[at1..at1+1]
                    right &= s2[at2..at2+1]
                    at1 += 1
                    at2 += 1
                    exit
                end if

                -- find closest sync
                diff1 = find( s2[at2], s1[at1..length(s1)] )

                -- too far?
                if diff1 > 0 and diff1 < MaxGap then
                    sync1 = at1 + diff1 - 1
                else
                    sync1 = 9999
                    diff1 = 9999
                end if

                -- find closest sync point
                diff2 = find( s1[at1], s2[at2..length(s2)] )

                -- too far?
                if diff2 > 0 and diff2 < MaxGap then
                    sync2 = at2 + diff2 - 1
                else
                    sync2 = 9999
                    diff2 = 9999
                end if

                -- better to remove chars?
                if sync1 != 9999
                or sync2 != 9999 then

                    if diff1 < diff2 then
                        best = diff1
                    else
                        best = diff2
                    end if

                    for i = 1 to best do
                        if at1+i > length( s1 )
                        or at2+i > length( s2 ) then
                            exit
                        end if

                        if s1[at1+i-1] = s2[at2+i-1] then
                            -- better match
                            diff1 = i
                            diff2 = i
                            sync1 = at1+i-1
                            sync2 = at2+i-1
                            exit
                        end if
                    end for

                end if


                -- evaluate sync
                if sync1 = 9999
                and sync2 = 9999 then
                    -- no sync
                    left &= s1[at1]
                    right &= s2[at2]

                    -- at end?
                    if at1 = length( s1 )
                    or at2 = length( s2 ) then
                        exit
                    end if

                    -- skip
                    at1 += 1
                    at2 += 1

                    -- on a match?
                    if s1[at1] = s2[at2] then
                        if length( left ) or length( right ) then
                            result &= sprintf( "[%s,%s]", {left,right})
                            left = ""
                            right = ""
                        end if
                        result &= s1[at1]
                        exit
                    end if

                elsif diff1 = diff2
                and s1[sync1] = s2[sync2] then
                    -- match to both
                    for i = 0 to diff1-2 do
                        left &= s1[at1+i]
                        right &= s2[at2+i]
                    end for

                    -- sync
                    at1 = sync1
                    at2 = sync2

                    if length( left ) or length( right ) then
                        result &= sprintf( "[%s,%s]", {left,right})
                        left = ""
                        right = ""
                    end if
                    result &= s1[at1]

                    -- leave loop
                    exit


                elsif diff1 <= diff2 then
                    -- match on sync1
                    for i = at1 to sync1-1 do
                        left &= s1[i]
                    end for

                    -- sync
                    at1 = sync1
                    if length( left ) or length( right ) then
                        result &= sprintf( "[%s,%s]", {left,right})
                        left = ""
                        right = ""
                    end if
                    result &= s1[at1]

                    -- leave loop
                    exit


                else
                    -- match on sync2
                    for i = at2 to sync2-1 do
                        right &= s2[i]
                    end for

                    -- sync
                    at2 = sync2
                    if length( left ) or length( right ) then
                        result &= sprintf( "[%s,%s]", {left,right})
                        left = ""
                        right = ""
                    end if
                    result &= s2[at2]

                    -- leave loop
                    exit


                end if

            end while

        end if

    end while

    -- remainder?
    if at1 <= length( s1 ) then
        for i = at1 to length(s1) do
            left &= s1[i]
        end for
    elsif at2 <= length( s2 ) then
        for i = at2 to length(s2) do
            right &= s2[i]
        end for

    end if

    if length( left ) or length( right ) then
        result &= sprintf( "[%s,%s]", {left,right})
    end if

    return result

end function

new topic     » goto parent     » topic index » view thread      » older message » newer message

Search



Quick Links

User menu

Not signed in.

Misc Menu