Re: 20gigabyte string searching
Kat, my Euphoria is a bit rusty, I have not touch it for almost two years, but
you might be interested in following code.
Let me know if you are, in principle, because a number of substantial
optimizations is still possible.
jiri
-- kat.ex : version 1.00
-- jiri babor
-- jbabor at paradise.net.nz
-- 04-10-04
include file.e -- dir
function search(sequence s, object key)
-- binary search of sorted sequence
-- return {flag, index}, where
-- if flag = 1 ... key found at index
-- if flag = 0 ... key not found, index is insertion point
integer c,hi,lo,mid
lo = 1
hi = length(s)
while lo <= hi do
mid = floor((lo + hi) / 2)
c = compare(key, s[mid])
if c < 0 then -- key < s[mid]
hi = mid - 1
elsif c > 0 then -- key > s[mid]
lo = mid + 1
else -- key = s[mid]
return {1,mid}
end if
end while
return {0,lo}
end function
function pword(integer n) -- pseudo word: random ASCII string
sequence w
w = {}
for j = 1 to rand(n) do
w &= rand(26)+96
end for
return w
end function
procedure hoard(sequence path, integer rid, sequence params, integer maxn)
-- path: data directory
-- rid: routine_id of function returning next address
-- params: parameter sequence of rid
-- maximum number of addresses held in single file
object a,x
sequence data,last,lasts,name,names,newname,r
integer count,f,f1,f2,hits,m,m1,n,r2
count = 0 -- address counter
hits = 0 -- address collision counter
if path[length(path)] != '\\' then
path &= '\\'
end if
x = dir(path)
if atom(x) then
puts(1,"Error: data directory does not exist. Aborting...\n")
abort(1)
else
names = {}
lasts = {}
f1 = open(path & "names","r")
f2 = open(path & "lasts","r")
if f1 != -1 then -- initialized database
while 1 do
x = gets(f1)
if atom(x) then exit end if
x = x[1..length(x)-1] -- strip '\n'
names = append(names, x)
end while
close(f1)
lasts = {}
while 1 do
x = gets(f2)
if atom(x) then exit end if
x = x[1..length(x)-1] -- strip '\n'
lasts = append(lasts, x)
end while
close(f2)
end if
n = length(names)
end if
-- main loop
while get_key() != 27 do -- esc
a = call_func(rid,params)
count += 1
r = search(lasts, a)
r2 = r[2]
if not r[1] then -- not found among last addresses
if r2 > n then -- address > last address of last data
file
if n = 0 then -- no data files yet
n = 1
name = "1" -- new file name
f = open(path & name, "w")
puts(f, a & '\n')
close(f)
names = append(names, name)
lasts = append(lasts, a)
else -- append the last data file
name = names[n]
f = open(path & name, "a")
puts(f, a & '\n')
close(f)
lasts[n] = a
end if
else
name = names[r2]
data = {}
f = open(path & name, "r")
while 1 do
x = gets(f)
if atom(x) then exit end if
x = x[1..length(x)-1] -- strip '\n'
data = append(data, x)
end while
close(f)
r = search(data, a)
if r[1] = 0 then -- not found: insert
data = data[1..r[2]-1] & {a} & data[r[2]..length(data)]
-- write it out
m = length(data)
if m > maxn then -- subdivide
m1 = floor(m/2)
newname = sprint(n+1)
names = names[1..r2-1] & {newname} & names[r2..n]
last = data[m1]
lasts = lasts[1..r2-1] & {last} & lasts[r2..n]
n += 1
f = open(path & newname, "w")
for i = 1 to m1 do
puts(f, data[i] & '\n')
end for
close(f)
data = data[m1+1..m]
end if
f = open(path & name, "w")
for i = 1 to length(data) do
puts(f, data[i] & '\n')
end for
close(f)
else
hits += 1
end if
end if
else
hits += 1
end if
end while
-- save control sequences
f1 = open(path & "names", "w")
for i = 1 to length(names) do
puts(f1, names[i] & '\n')
end for
close(f1)
f2 = open(path & "lasts", "w")
for i = 1 to length(lasts) do
puts(f2, lasts[i] & '\n')
end for
close(f2)
printf(1, "%d addresses\n", count)
printf(1, "%d collisions\n", hits)
end procedure
hoard("c:\\data", routine_id("pword"), {4}, 100)
|
Not Categorized, Please Help
|
|