Re: data analysis

new topic     » goto parent     » topic index » view thread      » older message » newer message

Graeme wrote:

> results as follows

Thanks, I've made a couple of corrections:

1. Replaced the buggy code ('at1+1' should have read 'at+1') with find().
This corrected a bug, and made my code only about half as slow as Graeme's.
Thanks for the hint!

2. Added a MaxGap value that rejects matches past a certain length. This
allows the code to match text like that in the final example. There are
better ways to decide this, but it was quick and dirty.

Thanks!

-- David Cuny


-- Resync window: a candidate sync point is accepted only when find()
-- locates it at a position strictly less than MaxGap, counting the
-- current element as position 1 (so at most MaxGap-2 elements are skipped).
constant MaxGap = 6

-- Look for x in s starting at index start.
-- Returns the number of elements that must be skipped to reach it
-- (0 means s[start] itself is x), or -1 when x does not occur within
-- the MaxGap window.
function sync_offset( object x, sequence s, integer start )
    integer pos

    pos = find( x, s[start..length( s )] )
    if pos > 0 and pos < MaxGap then
        return pos - 1
    end if
    return -1
end function

-- Compare s1 and s2 element by element and return a marked-up sequence:
--   x       element x is common to both inputs
--   [x,]    element x appears only in s1
--   [,y]    element y appears only in s2
--   [x,y]   x (from s1) and y (from s2) differ and no resync point was
--           found within MaxGap
-- Elements are compared with '=' and formatted with %s, so both inputs
-- are expected to be flat strings (sequences of character atoms).
global function diff( sequence s1, sequence s2 )

    integer at1, at2, gap1, gap2
    sequence result

    result = ""

    at1 = 0
    at2 = 0

    -- process until the end of one string
    while 1 do

        -- move ahead
        at1 += 1
        at2 += 1

        -- past end of one string?
        if at1 > length( s1 )
        or at2 > length( s2 ) then
            exit
        end if

        -- same?
        if s1[at1] = s2[at2] then
            result &= s1[at1]
        else
            -- attempt to resync
            while 1 do

                -- how far ahead in s1 is the current s2 element?
                gap1 = sync_offset( s2[at2], s1, at1 )

                -- how far ahead in s2 is the current s1 element?
                gap2 = sync_offset( s1[at1], s2, at2 )

                -- evaluate sync
                if gap1 < 0 and gap2 < 0 then
                    -- no sync
                    result &= sprintf( "[%s,%s]", {s1[at1],s2[at2]} )

                    -- at end?
                    if at1 = length( s1 )
                    or at2 = length( s2 ) then
                        exit
                    end if

                    -- skip
                    at1 += 1
                    at2 += 1

                elsif gap1 >= 0 and ( gap2 < 0 or gap1 < gap2 ) then
                    -- the sync point in s1 is strictly closer (distances
                    -- are compared, not absolute indices, because at1 and
                    -- at2 index different strings); skipped s1 elements
                    -- are reported as s1-only
                    for i = at1 to at1 + gap1 - 1 do
                        result &= sprintf( "[%s,]", {s1[i]} )
                    end for

                    -- sync
                    at1 += gap1
                    result &= s1[at1]

                    -- leave loop
                    exit

                else
                    -- the sync point in s2 is closer (ties go to s2);
                    -- skipped s2 elements are reported as s2-only
                    for i = at2 to at2 + gap2 - 1 do
                        result &= sprintf( "[,%s]", {s2[i]} )
                    end for

                    -- sync
                    at2 += gap2
                    result &= s2[at2]

                    -- leave loop
                    exit

                end if

            end while

        end if

    end while

    -- remainder: at most one of the inputs still has unread elements,
    -- so at most one of these loops runs
    for i = at1 to length( s1 ) do
        result &= sprintf( "[%s,]", {s1[i]} )
    end for
    for i = at2 to length( s2 ) do
        result &= sprintf( "[,%s]", {s2[i]} )
    end for

    return result

end function

new topic     » goto parent     » topic index » view thread      » older message » newer message

Search



Quick Links

User menu

Not signed in.

Misc Menu