Re: 20gigabyte string searching

new topic     » goto parent     » topic index » view thread      » older message » newer message

Kat, my Euphoria is a bit rusty, I have not touch it for almost two years, but
you might be interested in following code.
Let me know if you are, in principle, because a number of substantial
optimizations is still possible.

jiri

--  kat.ex : version 1.00
--  jiri babor
--  jbabor at paradise.net.nz
--  04-10-04

include file.e                          -- dir    

function search(sequence s, object key)
    --  binary search of sorted sequence
    --  return {flag, index}, where
    --      if flag = 1 ... key found at index 
    --      if flag = 0 ... key not found, index is insertion point

    integer c,hi,lo,mid

    lo = 1
    hi = length(s)

    while lo <= hi do
        mid = floor((lo + hi) / 2)
        c = compare(key, s[mid])
        if    c < 0 then                -- key < s[mid]
            hi = mid - 1
        elsif c > 0 then                -- key > s[mid]
            lo = mid + 1
        else                            -- key = s[mid]
            return {1,mid}
        end if
    end while

    return {0,lo}
end function

function pword(integer n)               -- pseudo word: random ASCII string 
    sequence w

    w = {}
    for j = 1 to rand(n) do
        w &= rand(26)+96
    end for
    return w
end function

procedure hoard(sequence path, integer rid, sequence params, integer maxn)
    --  path: data directory
    --  rid: routine_id of function returning next address
    --  params: parameter sequence of rid
    --  maximum number of addresses held in single file

    object a,x
    sequence data,last,lasts,name,names,newname,r
    integer count,f,f1,f2,hits,m,m1,n,r2

    count = 0                           -- address counter
    hits = 0                            -- address collision counter

    if path[length(path)] != '\\' then
        path &= '\\'
    end if

    x = dir(path)
    if atom(x) then
        puts(1,"Error: data directory does not exist. Aborting...\n")
        abort(1)
    else
        names = {}
        lasts = {}

        f1 = open(path & "names","r")
        f2 = open(path & "lasts","r")

        if f1 != -1 then                -- initialized database
            while 1 do
                x = gets(f1)
                if atom(x) then exit end if
                x = x[1..length(x)-1]   -- strip '\n'
                names = append(names, x)
            end while
            close(f1)

            lasts = {}
            while 1 do
                x = gets(f2)
                if atom(x) then exit end if
                x = x[1..length(x)-1]   -- strip '\n'
                lasts = append(lasts, x)
            end while
            close(f2)
        end if
        n = length(names)
    end if

    -- main loop
    while get_key() != 27 do            -- esc
        a = call_func(rid,params)
        count += 1
        r = search(lasts, a)
        r2 = r[2]
        if not r[1] then                -- not found among last addresses
if r2 > n then              -- address > last address of last data
            file
                if n = 0 then           -- no data files yet
                    n = 1
                    name = "1"          -- new file name
                    f = open(path & name, "w")
                    puts(f, a & '\n')
                    close(f)
                    names = append(names, name)
                    lasts = append(lasts, a)
                else                    -- append the last data file
                    name = names[n]
                    f = open(path & name, "a")
                    puts(f, a & '\n')
                    close(f)
                    lasts[n] = a
                end if
            else
                name = names[r2]
                data = {}
                f = open(path & name, "r")
                while 1 do
                    x = gets(f)
                    if atom(x) then exit end if
                    x = x[1..length(x)-1]   -- strip '\n'
                    data = append(data, x)
                end while
                close(f)
                r = search(data, a)
                if r[1] = 0 then        -- not found: insert
                    data = data[1..r[2]-1] & {a} & data[r[2]..length(data)]
                    -- write it out
                    m = length(data)
                    if m > maxn then    -- subdivide
                        m1 = floor(m/2)
                        newname = sprint(n+1)
                        names = names[1..r2-1] & {newname} & names[r2..n]
                        last = data[m1]
                        lasts = lasts[1..r2-1] & {last} & lasts[r2..n]
                        n += 1
                        f = open(path & newname, "w")
                        for i = 1 to m1 do
                            puts(f, data[i] & '\n')
                        end for
                        close(f)
                        data = data[m1+1..m]
                    end if
                    f = open(path & name, "w")
                    for i = 1 to length(data) do
                        puts(f, data[i] & '\n')
                    end for
                    close(f)
                else
                    hits += 1
                end if
            end if
        else
            hits += 1
        end if
    end while

    -- save control sequences
    f1 = open(path & "names", "w")
    for i = 1 to length(names) do
        puts(f1, names[i] & '\n')
    end for
    close(f1)
    f2 = open(path & "lasts", "w")
    for i = 1 to length(lasts) do
        puts(f2, lasts[i] & '\n')
    end for
    close(f2)

    printf(1, "%d addresses\n", count)
    printf(1, "%d collisions\n", hits)
end procedure

hoard("c:\\data", routine_id("pword"), {4}, 100)


new topic     » goto parent     » topic index » view thread      » older message » newer message

Search



Quick Links

User menu

Not signed in.

Misc Menu