Re: data analysis
Here's an updated version of the pattern matcher. The output is changed a
bit (hopefully it's more clear), and I've added some more features. It now
checks for swapped letters, and prefers removal over resyncing.
Given the test data:
"this", "that"
"actually", "actaully"
"Pittsborough", "Pittsburg"
"Pittsberg", "Pittsburg"
"shello", "hellos"
"fhfellos", "shello"
"Sweigsdunka", "Zweigsdanka"
It produces:
th[is,at]
act[ua,au]lly
Pittsb[o,u]r[ou,]g[h,]
Pittsb[e,u]rg
[s,]hello[,s]
[f,s]h[f,]ello[s,]
[S,Z]weigsd[u,a]nka
-- David Cuny
constant MaxGap = 6
global function diff( sequence s1, sequence s2 )
integer at1, at2, sync1, sync2, diff1, diff2, best
sequence result,left,right
result = ""
left = ""
right = ""
at1 = 0
at2 = 0
-- process until the end of one string
while 1 do
-- move ahead
at1 += 1
at2 += 1
-- past end of one string?
if at1 > length( s1 )
or at2 > length( s2 ) then
exit
end if
-- same?
if s1[at1] = s2[at2] then
if length( left ) or length( right ) then
result &= sprintf( "[%s,%s]", {left,right})
left = ""
right = ""
end if
result &= s1[at1]
else
-- attempt to resync
while 1 do
-- swapped letters?
if at1 < length( s1 )
and at2 < length( s2 )
and s1[at1] = s2[at2+1]
and s1[at1+1] = s2[at2] then
left &= s1[at1..at1+1]
right &= s2[at2..at2+1]
at1 += 1
at2 += 1
exit
end if
-- find closest sync
diff1 = find( s2[at2], s1[at1..length(s1)] )
-- too far?
if diff1 > 0 and diff1 < MaxGap then
sync1 = at1 + diff1 - 1
else
sync1 = 9999
diff1 = 9999
end if
-- find closest sync point
diff2 = find( s1[at1], s2[at2..length(s2)] )
-- too far?
if diff2 > 0 and diff2 < MaxGap then
sync2 = at2 + diff2 - 1
else
sync2 = 9999
diff2 = 9999
end if
-- better to remove chars?
if sync1 != 9999
or sync2 != 9999 then
if diff1 < diff2 then
best = diff1
else
best = diff2
end if
for i = 1 to best do
if at1+i > length( s1 )
or at2+i > length( s2 ) then
exit
end if
if s1[at1+i-1] = s2[at2+i-1] then
-- better match
diff1 = i
diff2 = i
sync1 = at1+i-1
sync2 = at2+i-1
exit
end if
end for
end if
-- evaluate sync
if sync1 = 9999
and sync2 = 9999 then
-- no sync
left &= s1[at1]
right &= s2[at2]
-- at end?
if at1 = length( s1 )
or at2 = length( s2 ) then
exit
end if
-- skip
at1 += 1
at2 += 1
-- on a match?
if s1[at1] = s2[at2] then
if length( left ) or length( right ) then
result &= sprintf( "[%s,%s]", {left,right})
left = ""
right = ""
end if
result &= s1[at1]
exit
end if
elsif diff1 = diff2
and s1[sync1] = s2[sync2] then
-- match to both
for i = 0 to diff1-2 do
left &= s1[at1+i]
right &= s2[at2+i]
end for
-- sync
at1 = sync1
at2 = sync2
if length( left ) or length( right ) then
result &= sprintf( "[%s,%s]", {left,right})
left = ""
right = ""
end if
result &= s1[at1]
-- leave loop
exit
elsif diff1 <= diff2 then
-- match on sync1
for i = at1 to sync1-1 do
left &= s1[i]
end for
-- sync
at1 = sync1
if length( left ) or length( right ) then
result &= sprintf( "[%s,%s]", {left,right})
left = ""
right = ""
end if
result &= s1[at1]
-- leave loop
exit
else
-- match on sync2
for i = at2 to sync2-1 do
right &= s2[i]
end for
-- sync
at2 = sync2
if length( left ) or length( right ) then
result &= sprintf( "[%s,%s]", {left,right})
left = ""
right = ""
end if
result &= s2[at2]
-- leave loop
exit
end if
end while
end if
end while
-- remainder?
if at1 <= length( s1 ) then
for i = at1 to length(s1) do
left &= s1[i]
end for
elsif at2 <= length( s2 ) then
for i = at2 to length(s2) do
right &= s2[i]
end for
end if
if length( left ) or length( right ) then
result &= sprintf( "[%s,%s]", {left,right})
end if
return result
end function
|
Not Categorized, Please Help
|
|