Re: data analysis

new topic     » goto parent     » topic index » view thread      » older message » newer message

Kat wrote:

> Graeme, since you did that amazing
> pattern finding code, i was wondering
> what your thoughts would be on this problem...

I'm obviously not Graeme, but I couldn't resist. This routine will compare
two strings, returning the differences between them. For example:

   diff( "abczz", "dbnz" )

will return:

   "[a,d]b[c,n]z[z,]"

The notation is a bit funky, but you can adjust it as you see fit:

1. 'x' means that 'x' was found in both strings.

2. '[x,]' means that 'x' was found in the first string, but not the second.

3. '[,x]' means that 'x' was found in the second string, but not the first.

4. '[x,y]' means that 'x' was found in the first string, but not the second,
and 'y' was found in the second string, but not the first.

-- David Cuny

-- Compare two strings and return a string describing their differences.
--
-- Parameters:
--   s1, s2 : the sequences (strings) to compare; elements are expected to
--            be character codes, since they are compared with '=' and
--            formatted with "%s".
--
-- Returns a sequence built from these pieces, in order:
--   x      : character x appears in both strings at the current position
--   [x,]   : x appears only in the first string
--   [,x]   : x appears only in the second string
--   [x,y]  : x (first string) and y (second string) differ and neither
--            could be re-aligned further along the other string
--
-- Example: diff( "abczz", "dbnz" ) returns "[a,d]b[c,n]z[z,]"

function diff( sequence s1, sequence s2 )

    integer at1, at2, sync1, sync2
    sequence result

    result = ""

    at1 = 0
    at2 = 0

    -- process until the end of one string
    while 1 do

        -- move ahead
        at1 += 1
        at2 += 1

        -- past end of one string?
        if at1 > length( s1 )
        or at2 > length( s2 ) then
            exit
        end if

        -- same?
        if s1[at1] = s2[at2] then
            result &= s1[at1]
        else
            -- attempt to resync: look ahead in each string for the
            -- character currently under the other string's cursor.
            -- 0 means "not found"; it can never be a real index because
            -- sequences are 1-based, so no input length can collide with it.
            sync1 = 0
            for i = at1+1 to length(s1) do
                if s2[at2] = s1[i] then
                    sync1 = i
                    exit
                end if
            end for

            sync2 = 0
            for i = at2+1 to length(s2) do
                if s1[at1] = s2[i] then
                    sync2 = i
                    exit
                end if
            end for

            -- result of sync
            if sync1 = 0
            and sync2 = 0 then
                -- no sync: report the mismatched pair and fall back to the
                -- main loop, which advances both cursors and re-tests for
                -- equality. Staying in a resync loop here would report a
                -- following matching pair as a difference.
                result &= sprintf( "[%s,%s]", {s1[at1],s2[at2]} )

            elsif sync2 = 0
            or (sync1 != 0 and sync1 < sync2) then
                -- match on sync1: everything skipped in s1 is s1-only
                for i = at1 to sync1-1 do
                    result &= sprintf( "[%s,]", {s1[i]} )
                end for

                -- sync
                at1 = sync1
                result &= s1[at1]

            else
                -- match on sync2 (also taken when both indexes are equal):
                -- everything skipped in s2 is s2-only
                for i = at2 to sync2-1 do
                    result &= sprintf( "[,%s]", {s2[i]} )
                end for

                -- sync
                at2 = sync2
                result &= s2[at2]

            end if

        end if

    end while

    -- remainder? the loop stops as soon as either cursor runs off its
    -- string, so at most one of the strings still has characters left
    if at1 <= length( s1 ) then
        for i = at1 to length(s1) do
            result &= sprintf( "[%s,]", {s1[i]} )
        end for
    elsif at2 <= length( s2 ) then
        for i = at2 to length(s2) do
            result &= sprintf( "[,%s]", {s2[i]} )
        end for

    end if

    return result

end function

new topic     » goto parent     » topic index » view thread      » older message » newer message

Search



Quick Links

User menu

Not signed in.

Misc Menu