OpenEuphoria: Forum: Similarity routines

Similarity routines

new topic » topic index » view thread » older message » newer message
Posted by rforno at tutopia.com Sep 02, 2001
434 views
This is a multi-part message in MIME format.

------=_NextPart_000_0008_01C133B5.2E14D960
	charset="iso-8859-1"

I have sent these routines to the User Contributions. The first time I
failed to attach the routines themselves when I sent them to the forum, so I
am correcting this now.

------=_NextPart_000_0008_01C133B5.2E14D960
	name="Examples.ex"
	filename="Examples.ex"

--Examples for the similarity routines

include similar.e

initdif() --Execute it only once at the beginning of your program

procedure testsim(sequence a, sequence b)
    atom x, y, z
    puts(1, a & " <-> " & b & "\n")
    x =3D simil(a, b)
    printf(1, "%s %f\n", {"Similarity:", x})
    puts(1, b & " <-> " & a & "\n")
    y =3D simil(b, a)
    printf(1, "%s %f\n", {"Similarity:", y})
    if x < y then
	z =3D y
    else
	z =3D x
    end if
    printf(1, "%s %f %s %f\n", {"Average:", (x + y) / 2, "Greater:", z})
end procedure

testsim("aaaaaa", "kkkkkkkkkkkk")
testsim("Napole=A2n", "Napole=A2n")
testsim("Napole=A2n", "Napoleon")
testsim("=A0=8A=93=81=C6=8D", "AEOUa=8C")
testsim("Kennedy John Fitzgerald", "John Fitzgerald Kennedy")
testsim("Beethoven", "Ludwig van Beethoven")

------=_NextPart_000_0008_01C133B5.2E14D960
Content-Type: application/octet-stream;
	name="similar.e"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment;
	filename="similar.e"

--Routine to evaluate similarity coefficient between 2 strings
--Author: R. M. Forno - Version 1.0 - 2001/08/26
--Based on an algorithm developed by the author in 1973
--Published in Spanish during 1979

--Given two strings (usually ASCII) s and t, the algorithm determines a
-- similarity coefficient between 0 (minimum) and 1 (maximum).
--It should be noted that the result is non-commutative, that is, it is =
not
-- the same similarity(a, b) than similarity(b, a).
--Unless there is a reason to the contrary, it is advised to compute
-- the two alternatives and take the greater or the average.
--Some experimentation with these routines will lead to better results.

--As this algorithm is rather slow, it is advised to use previously =
another one
-- in order to reject rapidly cases of low similarity.
--A simple algorithm to that end is also included here.
--This simple algorith is called simil(sequence s, sequence t)
--The main algorithm is called similarity(sequence s, sequence t)
--Previously, it is necessary to execute algorithm initdif() only once.

--A useful routine to extend upper() to national characters is included

--There are some examples in an attached file

constant MAXDIF =3D 1000, --Used to avoid computation with floating =
numbers
	 MAXINT =3D 1073741823,  --Greatest integer allowed
	 ALPHABET =3D"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVXYZ" &
	   =
"=A0=82=A1=A2=A3=B5=90=D6=E0=E9=81=9A=84=89=8B=94=85=8A=8D=95=97=83=88=8C=
=93=96=8E=D3=D8=99=B7=D4=DE=E3=EB=B6=D2=D7=E2=EA" &
	   "=87=80=A4=A5=86=8F=9B=9D=C6=C7=E4=E5=EC=ED",
	--These constants apply mainly to the English, Spanish, French,
	-- Portuguese, Italian and some other languages.
	--It is possible to modify them to adapt to other languages as well.
	 UPPER =3D "ABCDEFGHIJKLMNOPQRSTUVWXYZ=ED=A5=80",
	 LOWER =3D "abcdefghijklmnopqrstuvwxyz=EC=A4=87",
	 UPPERA =3D "A=B5=B7=B6=8E=8F=C7",
	 LOWERA =3D "a=A0=85=83=84=86=C6",
	 UPPERE =3D "E=90=D4=D2=D3",
	 LOWERE =3D "e=82=8A=88=89",
	 UPPERI =3D "I=D6=DE=D7=D8",
	 LOWERI =3D "i=A1=8D=8C=8B",
	 UPPERO =3D "O=E0=E3=E2=99=E5=9D",
	 LOWERO =3D "o=A2=95=93=94=E4=9B",
	 UPPERU =3D "U=E9=EB=EA=9A",
	 LOWERU =3D "u=A3=97=96=81",
	 VOWELS =3D =
"AEIOUaeiou=B5=90=D6=E0=E9=B7=D4=DE=E3=EB=B6=D2=D7=E2=EA=8E=D3=D8=99=9A=8F=
=9D=C7=E5=A0=82=A1=A2=A3=85=8A=8D=95=97=83=88=8C=93=96=84=89=8B=94=81=86=9B=
=C6=E4",
	 CONSON =3D =
"bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ=A4=A5=87=80=EC=ED",
	 RESULT =3D UPPER &
	   repeat('A', length(UPPERA)) & repeat('E', length(UPPERE)) &
	   repeat('I', length(UPPERI)) & repeat('O', length(UPPERO)) &
	   repeat('U', length(UPPERU)) &
	   repeat('A', length(UPPERA)) & repeat('E', length(UPPERE)) &
	   repeat('I', length(UPPERI)) & repeat('O', length(UPPERO)) &
	   repeat('U', length(UPPERU)),
	 BASE =3D LOWER & LOWERA & LOWERE & LOWERI & LOWERO & LOWERU &
	   UPPERA & UPPERE & UPPERI & UPPERO & UPPERU,
	 LOWERTOUPPER =3D 0.1, --Lower to upper and viceversa coefficient
	 ACCENTFACTOR =3D 0.1 --Accented - non-accented coefficient
sequence difmat, --Matrix of differences
--You can fill this matrix by specifying its coefficients one by one,
-- but this is a tedious process.
	 shiftvect --Vector for the extended_upper routine

include wildcard.e

function round(object o)
    return floor(o + 0.5)
end function

procedure rel(integer a, integer b, atom c)
--Initializes difference matrix for selected pairs of letters
    integer x, y
    atom d
    d =3D c * (1 - LOWERTOUPPER)
    x =3D upper(a) + 1
    y =3D upper(b) + 1
    a +=3D 1
    b +=3D 1
    difmat[a][b] =3D d
    difmat[b][a] =3D d
    difmat[x][y] =3D d
    difmat[y][x] =3D d
    difmat[a][y] =3D c
    difmat[y][a] =3D c
    difmat[x][b] =3D c
    difmat[b][x] =3D c
end procedure

procedure uplow(sequence up, sequence lo, atom c)
--Process upper/lower accented vowels
    atom d
    integer len
    d =3D c * (1 - LOWERTOUPPER)
    len =3D length(up)
    for i =3D 1 to len do
	for j =3D 1 to len do
	    difmat[up[i] + 1][up[j] + 1] =3D d
	    difmat[lo[i] + 1][lo[j] + 1] =3D d
	    difmat[up[i] + 1][lo[j] + 1] =3D c
	    difmat[lo[i] + 1][up[j] + 1] =3D c
	end for
    end for
end procedure

global procedure initdif()
--Initializes difference matrix. It should be run only once.
--Same letter cell =3D 0, very different =3D 1, others in between.
    sequence s
    integer len
    difmat =3D repeat(repeat(1, 256), 256)
    rel('b', 'v', 0.6) --Fill cells for selected letter pairs
    rel('c', 'z', 0.6)
    rel('c', 's', 0.6)
    rel('s', 'z', 0.5)
    rel('x', 'c', 0.6)
    rel('x', 's', 0.6)
    rel('x', 'z', 0.6)
    rel('c', 'k', 0.7)
    rel('c', 'q', 0.7)
    rel('k', 'q', 0.6)
    rel('=87', 'z', 0.7)
    rel('=80', 'z', 0.7)
    rel('=87', 's', 0.8)
    rel('=80', 's', 0.8)
    rel('=87', 'x', 0.8)
    rel('=80', 'x', 0.8)
    rel('=87', 'c', 0.5)
    rel('=80', 'c', 0.5)
    for i =3D 1 to length(LOWERI) do --Consider upper() does not work
	rel('y', LOWERI[i], 0.7) -- for national characters=20
	rel('=EC', LOWERI[i], 0.7)
	rel('=ED', LOWERI[i], 0.7)
    end for
    for i =3D 1 to length(LOWERI) do
	rel('j', LOWERI[i], 0.8)
    end for
    for i =3D 1 to length(LOWERU) do
	rel('w', LOWERU[i], 0.7)
    end for
    rel('w', 'v', 0.7)
    rel('m', 'n', 0.8)
    rel('g', 'j', 0.7)
    rel('d', 't', 0.9)
    rel('h', 'j', 0.95)
    rel('h', 'g', 0.95)
    rel('p', 'b', 0.96)
    rel('p', 'v', 0.97)
    rel('l', 'r', 0.97)
    rel('f', 'v', 0.97)
    rel('f', 'b', 0.98)
    rel('f', 's', 0.98)
    uplow(UPPERA, LOWERA, ACCENTFACTOR)
    uplow(UPPERE, LOWERE, ACCENTFACTOR)
    uplow(UPPERI, LOWERI, ACCENTFACTOR)
    uplow(UPPERO, LOWERO, ACCENTFACTOR)
    uplow(UPPERU, LOWERU, ACCENTFACTOR)
    for i =3D 1 to length(UPPER) do
	difmat[UPPER[i] + 1][LOWER[i] + 1] =3D LOWERTOUPPER
	difmat[LOWER[i] + 1][UPPER[i] + 1] =3D LOWERTOUPPER
    end for
    len =3D length(VOWELS)
    for i =3D 1 to len do
	for j =3D 1 to len do
	    difmat[VOWELS[i] + 1][VOWELS[j] + 1] *=3D 0.75
	end for
    end for
    len =3D length(CONSON)
    for i =3D 1 to len do
	for j =3D 1 to len do
	    difmat[CONSON[i] + 1][CONSON[j] + 1] *=3D 0.7
	end for
    end for
    s =3D repeat(0, 256)
    for i =3D 1 to length(ALPHABET) do --Flag alpha chars
	s[ALPHABET[i] + 1] =3D 1
    end for
    for i =3D 1 to 256 do --Non-alpha chars nearly equal among them
	for j =3D 1 to 256 do
	    if s[i] =3D 0 and s[j] =3D 0 then
		difmat[i][j] =3D 0.05
	    end if
	end for
    end for
    for i =3D 1 to 256 do --Same char has 0 difference
	difmat[i][i] =3D 0
    end for
    difmat =3D round(difmat * MAXDIF)
	--Generate vector for the extended_upper routine
    shiftvect =3D repeat(0, 256)
    for i =3D 1 to 256 do
	shiftvect[i] =3D i - 1
    end for
    for i =3D 1 to length(BASE) do
	shiftvect[BASE[i] + 1] =3D RESULT[i]
    end for
end procedure

function func(sequence s) --Computes factor for distance between =
characters
--You may modify it as desired
    s *=3D s
    return floor(s / (1 + s) * MAXDIF)
end function

global function similarity(sequence s, sequence t) --Main algorithm
--This algorithm is somewhat difficult to explain.
--It is based on the technique of dynamic programming (Bellman / =
Dreyfus).
--It obtains successive minimums from differences between characters and
-- the distance between them.
    integer lens, lent, a
    sequence bmat, wvec, rmat, aux, aux1
    lens =3D length(s)
    lent =3D length(t)
    if lens =3D 0 or lent =3D 0 then
	return 0
    end if
    bmat =3D repeat(repeat(0, lent), lens)
    for i =3D 1 to lens do
	aux =3D difmat[s[i] + 1]
	for j =3D 1 to lent do
	    bmat[i][j] =3D aux[t[j] + 1]
	end for
    end for
    aux =3D repeat(0, lent)
    for i =3D 1 to lent do
	aux[i] =3D lent - i
    end for
    bmat[lens] +=3D func(aux)
    for i =3D 1 to lent do
	aux[i] =3D i - 1
    end for
    wvec =3D bmat[1] + func(aux)
    aux =3D repeat(repeat(0, lent), lent)
    for i =3D 1 to lent do
	for j =3D 1 to lent do
	    aux[i][j] =3D j - i + 1
	end for
    end for
    rmat =3D func(aux)
    for i =3D 2 to lens do
	aux =3D wvec
	for j =3D 1 to lent do
	    aux1 =3D aux + rmat[j]
	    a =3D MAXINT
	    for k =3D 1 to lent do
		if a > aux1[k] then
		    a =3D aux1[k]
		end if
	    end for
	    wvec[j] =3D a
	end for
	wvec +=3D bmat[i]
    end for
    a =3D MAXINT
    for i =3D 1 to lent do
	if a > wvec[i] then
	    a =3D wvec[i]
	end if
    end for
    return 1.0 - a / MAXDIF / (lens + 1)
end function

function extended_upper(sequence s) --Extends upper() to national =
characters
    for i =3D 1 to length(s) do
	s[i] =3D shiftvect[s[i] + 1]
    end for
    return s
end function

global function simil(sequence s, sequence t) --Screens low similarity =
data
    sequence x, y, w, z
    integer c, a, lens, lent
    atom r
    lens =3D length(s)
    lent =3D length(t)
    x =3D extended_upper(s)
    y =3D extended_upper(t)
    w =3D repeat(0, 256)
    z =3D w
    for i =3D 1 to lens do --Count characters
	w[x[i] + 1] +=3D 1
    end for
    for i =3D 1 to lent do
	z[y[i] + 1] +=3D 1
    end for
    c =3D 0
    for i =3D 1 to 256 do
	a =3D w[i] - z[i]
	if a < 0 then --Use absolute value
	    c -=3D a
	else
	    c +=3D a
	end if
    end for
    r =3D 1.0 - c / (lens + lent)
    if r < 0.5 then --Reject low similarity data
	return r
    end if
    return similarity(s, t)
end function

--END OF FILE--


------=_NextPart_000_0008_01C133B5.2E14D960--
OpenEuphoria

Similarity routines

Search

Include:

Quick Links

User menu

Misc Menu