Re: Compressed Linetables

new topic     » goto parent     » topic index » view thread      » older message » newer message

Here's a first attempt:

 
-- use bit precision that fit euphoria's integer type 
constant 
    bit_precision = 30, 
    limit = power(2, bit_precision) - 1, 
    bit1 = power(2, bit_precision - 1) 
 
-- sum a sequence 
function sum(sequence s) 
    atom result 
    result = 0 
    for i = 1 to length(s) do 
        result += s[i] 
    end for 
    return result 
end function 
 
-- generate look-up-table of weights, normalized to bit precision 
function generate_lut(sequence s) 
    sequence lut 
    integer x 
    atom sum_s 
    -- scale weights from 0 to bit limit 
    sum_s = sum(s) 
    lut = repeat(0, length(s)) 
    -- change independent weights to increasing ranges 
    x = 0 
    for i = 1 to length(lut) do 
        lut[i] = x 
        x += floor(s[i] * limit / sum_s) 
    end for 
    return lut 
end function 
 
-- add an integer x to a binary sequence bits 
function badd(sequence bits, atom x) 
    integer c, s 
    atom j 
    j = 1 
    c = 0 
    for i = length(bits) to 1 by -1 do 
        s = bits[i] + (and_bits(x, j) != 0) + c 
        c = s > 1 
        bits[i] = and_bits(s, 1) 
        j += j 
    end for 
    if c then 
        printf(1, "badd overflow! x=%d bits=", {x}) ? bits 
    end if 
    return bits 
end function 
 
-- return an integer representing the binary difference of x and y 
function bdif(sequence x, sequence y) 
    integer c, s 
    atom j, result 
    j = 1 
    c = 0 
    result = 0 
    for i = length(x) to 1 by -1 do 
        if i <= length(y) then 
            s = y[i] 
        else 
            s = 0 
        end if 
        s -= x[i] + c 
        c = s < 0 
        result += j * and_bits(s, 1) 
        j += j 
    end for 
    return result * (1 - 2 * c) 
end function 
 
-- binary search between l range d for a symbol in lut that matches code 
function bsearch(sequence l, integer d, sequence code, sequence lut) 
    integer a, b, mid, x 
    sequence tmp 
 
    a = 1 
    b = length(lut) - 1 
 
    while a != b do 
        mid = floor((a + b + 1) / 2) 
        x = lut[mid] 
        tmp = badd(l, floor(x * d / (limit+1))) 
        if bdif(tmp, code) < 0 then 
            b = mid - 1 
        else 
            a = mid 
        end if 
    end while 
    return a 
end function 
 
-- arithmetic encode a list of symbols, given a weighted symbol table 
function arith_encode(sequence list, sequence syms, sequence weights) 
    sequence lut, result, u, l 
    integer sym, a, b, d 
 
    lut = generate_lut(weights) & limit 
    l = repeat(0, bit_precision) 
    u = repeat(1, bit_precision) 
    result = {} 
 
    for i = 1 to length(list) do 
        -- get the symbol index from the list 
        sym = find(list[i], syms) 
        if sym = 0 then 
            puts(1, "symbol not found: ") ? list[i] 
            return 0 
        end if 
 
        -- find the difference of the upper and lower range 
        d = bdif(l, u) 
 
        -- scale up d so that we have more precision for bounds 
        while d < bit1 and length(l) < bit_precision do 
            d += d 
            l &= 0 
        end while 
        --printf(1, "sym=%d d=%d\n", {sym, d}) 
 
        -- get the symbol bounds from look-up-table 
        a = lut[sym] 
        b = lut[sym+1] 
 
        -- scale the symbol bounds by the difference 
        a = floor(a * d / (limit+1)) 
        b = floor(b * d / (limit+1)) 
 
        -- create the new range from the symbol bounds 
        u = badd(l, b) 
        l = badd(l, a) 
 
        -- last item? 
        if i = length(list) then 
            -- figure out how many bits we need to flush 
            u = badd(l, floor((b - a) / 2)) 
            for j = 1 to length(l) do 
                if l[j] != u[j] then 
                    l[j] = u[j] 
                    if j < length(l) then 
                        l[j+1] = 1 - u[j+1] 
                    end if 
                    exit 
                end if 
            end for 
        end if 
 
        -- leading bits that are the same won't change, 
        -- and can be moved to the result. 
        while length(l) and l[1] = u[1] do 
            result &= l[1] 
            l = l[2..$] 
            u = u[2..$] 
        end while 
        -- TODO: this will output bytes instead of bits 
        while 0 and length(l) > 8 and equal(l[1..8], u[1..8]) do 
            result &= l[1] + l[2]*2 + l[3]*4 + l[4]*8 + 
                l[5]*16 + l[6]*32 + l[7]*64 + l[8]*128 
            l = l[9..$] 
            u = u[9..$] 
        end while 
 
    end for 
 
    return result 
end function 
 
-- arithmetic decode a coded sequence, given a weighted symbol table 
function arith_decode(sequence code, sequence syms, sequence weights) 
    sequence lut, result, u, l 
    integer sym, a, b, len, d 
 
    lut = generate_lut(weights) & limit 
    l = repeat(0, bit_precision) 
    u = repeat(1, bit_precision) 
if 0 then 
    -- convert bytes to bits 
    result = repeat(0, 8 * length(code)) 
    for i = 1 to length(code) do 
        d = 1 
        for j = 7 to 0 by -1 do 
            result[i*8-j] = and_bits(code[i], d) != 0 
            d += d 
        end for 
    end for 
    code = result 
    ? code 
end if 
    result = {} 
 
    len = length(code) - 1 
 
    while len do 
 
        -- find the difference of the upper and lower range 
        d = bdif(l, u) 
 
        -- scale up d so that we have more precision for bounds 
        while d < bit1 and length(l) < bit_precision do 
            d += d 
            l &= 0 
        end while 
 
        -- do a binary search of the lut to find the next symbol index 
        sym = bsearch(l, d, code, lut) 
 
        -- save the symbol in the results 
        result = append(result, syms[sym]) 
        --printf(1, "sym=%d d=%d\n", {sym, d}) 
 
        -- get the symbol bounds from look-up-table 
        a = lut[sym] 
        b = lut[sym+1] 
 
        -- scale the symbol bounds by the difference 
        a = floor(a * d / (limit+1)) 
        b = floor(b * d / (limit+1)) 
 
        -- create the new range from the symbol bounds 
        u = badd(l, b) 
        l = badd(l, a) 
 
        -- leading bits that are the same won't change, 
        -- and can be removed. 
        while l[1] = u[1] do 
            code = code[2..$] 
            l = l[2..$] 
            u = u[2..$] 
            len -= 1 
        end while 
 
    end while 
 
    return result 
end function 
 
 
sequence code, table, symbols, weights 
 
 
-- symbol mapping: 
-- 0..61 maps to integers 0..61 
-- 62 means add 62 to the next symbol value 
-- 63 means to make the next symbol negative 
symbols = repeat(0, 64) 
for i = 1 to length(symbols) do 
    symbols[i] = i-1 
end for 
 
-- weights are loosely based on the histogram of the table below 
weights = {10,2000,2000,2000,1000,1000,1000,500,50,50,50,50,50,50,50,50,50,50,10, 
10,10,10,10,10,10,5,5,5,5,5,5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,100,2000} 
 
 
function compress_linetable(sequence table) 
    sequence s 
    integer x, y 
    s = {} 
    y = 0 
     
    -- convert the table to a list of symbols 0..63 
    for i = 1 to length(table) do 
        x = table[i] 
        if x < 0 then 
            s &= 63 -- 63 means negative 
            x = -x 
        else 
            -- calculate delta for offsets 
            x -= y 
            y = table[i] 
        end if 
        while x >= 62 do 
            s &= 62 
            x -= 62 
        end while 
        s &= x 
    end for 
 
    return arith_encode(s, symbols, weights) 
end function 
 
function decompress_linetable(sequence code) 
    sequence table, s 
    integer x, y, neg 
 
    x = 0 
    y = 0 
    neg = 0 
    table = {} 
 
    s = arith_decode(code, symbols, weights) 
 
    for i = 1 to length(s) do 
        if s[i] = 63 then 
            neg = 1 
        elsif s[i] = 62 then 
            x += 62 
        else 
            x += s[i] 
            if neg then 
                table &= -x 
                neg = 0 
            else 
                y += x 
                table &= y 
            end if 
            x = 0 
        end if 
    end for 
 
    return table 
end function 
 
 
 
table= 
    {-637,0,-23,5,6,-2,12,13,14,-71,20,-78,21,27,29,-2,31,35,37,39,44,46,-1,48,53,58,60,66,68,71,74,75,81,91,93,98, 
     100,-1,110,116,117,-3,123,-1,124,-42,132,-11,133,136,142,146,152,158,161,-1,163,169,172,175,178,180,183,185,  
     192,203,209,220,226,227,-1,228,-3,234,238,242,245,248,-1,252,-55,258,-4,263,264,266,269,275,-1,281,283,285,287,  
     293,298,302,306,-1,308,310,-2,312,314,-1,318,321,323,326,332,338,340,-33,342,-2,344,-5,347,350,353,357,359,361,  
     365,367,369,-1,372,376,-1,378,381,383,385,-1,388,392,395,397,401,405,-33,407,-4,412,413,416,420,423,426,430,437,  
     443,447,450,452,454,458,460,-1,463,-21,467,-14,470,473,477,481,-3,485,486,487,488,493,497,-23,499,501,-6,502,504,  
     509,-1,514,521,523,525,532,533,534,535,-7,536,541,542,543,544,-1,545,550,552,-1,559,562,564,567,569,-41,573,-9,  
     574,575,-10,576,-1,577,-4,583,584,590,-26,592,-1,594,-1,599,607,-11,613,619,622,628,635,637,644,646,647,-1,648,  
     -24,654,-1,659,661,-2,667,671,675,679,683,-1,687,-17,693,-3,695,699,701,703,710,716,-8,720,-3,722,723,-1,724,-1,  
     725,731,737,740,745,-1,750,752,754,756,759,-1,761,762,-23,763,767,769,775,782,784,790,792,795,798,-4,800,-1,806,  
     810,812,814,819,821,828,829,-1,830,-3,836,841,-8,843,-2,848,-45,854,-3,856,862,866,-7,869,-3,874,-1,880,881,882,  
     889,900,906,913,919,920,-1,921,-20,927,931,-4,933,-2,935,938,942,-7,946,-9,952,953,954,956,-1,958,961,964,970,  
     976,979,981,987,993,999,1002,1008,-19,1009,-3,1014,1016,1019,-6,1022,-3,1028,1032,-1,1035,1038,1040,1045,-19,  
     1050,1056,1057,1058,1061,1068,1079,1085,1092,1098,1099,1100,-1,1101,-22,1107,1111,1116,1118,-1,1120,1124,1127,-1,  
     1131,-2,1135,-20,1141,1142,1143,1145,1147,1150,-13,1151,-7,1157,-1,1158,1164,-2,1167,1171,1174,1177,-1,1180,-13,  
     1183,-3,1188,1189,1190,1191,1192,1199,1206,1212,1219,1225,1232,1238,1239,-1,1240,-32,1246,1253,1259,1266,1273,-8,  
     1277,-12,1283,1287,1289,-19,1295,-1,1300,1301,1302,1303,-12,1304,-5,1309,1312,1315,1317,1319,1323,1325,1327,-1,  
     1330,1337,1343,1347,-1,1349,1352,1354,1356,-32,1359,1363,-3,1366,1369,1371,1373,1376,1379,1385,1389,1392,1395,  
     1399,1405,1409,1413,-41,1417,-3,1419,1425,1428,1431,1434,1440,-12,1446,-7,1447,-1,1450,1456,1462,1468,1474,1477,  
     1482,-1,1487,-1,1489,1491,1493,1496,-2,1498,1504,1507,-24,1513,1515,1516,-4,1517,-11,1523,1524,1527,-4,1530,-2,  
     1532,1533,1534,1535,1536,1537,1538,1543,-10,1546,-3,1547,1554,1560,1567,1574,1581,1587,1594,1600,1601,-1,1602,-22,  
     1608,1612,1615,1618,1621,1624,1627,1630,-10,1633,-10,1639,-1,1642,-7,1643,-1,1645,1651,1652,1655,1660,1662,1668,  
     1672,1674,1677,1680,1683,1690,1694,1701,-32,1702,-7,1703,-1,1705,1711,1712,1716,1721,1723,1729,1731,1735,1738,  
     1741,1744,1747,1754,1758,1765,-35,1766,-17,1767,-1,1768,1771,1774,1777,1780,1782,1785,1788,1791,1793,-1,1795,1796,  
     1798,1800,1802,1804,1806,1808,1814,1816,1821,1823,-1,1828,-38,1829,-3,1830,1836,1838,1842,1847,1849,1852,1854,  
     1857,1858,-1,1859,1862,1864,1869,-1,1871,-1,1875,1880,1885,1887,1893,1896,1899,1905,1908,1915,1917,1920,-38,1921,  
     -15,1922,1925,1932,1934,-1,1935,-1,1942,-1,1945,-4,1948,1949,-23,1952,1956,-2,1958,1962,-1,1968,1970,1975,1980,  
     -13,1981,-3,1983,1987,-1,1993,1996,1999,-10,2004,-4,2005,2009,-1,2015,-1,2022,-1,2025,-1,2027,-14,2030,-1,2032,  
     2035,2038,2041,2044,2047,2048,2053,2054,2056,2058,2060,2062,-23,2064,2067,2069,2075,2077,2082,2085,2087,-12,2094,  
     -1,2096,2097,2104,-1,2107,-2,2110,-2,2112,2113,-16,2120,-3,2124,-3,2126,2130,-1,2136,2137,2140,2145,2150,-17,2151,  
     -2,2153,-3,2156,-3,2158,2162,-1,2168,-2,2169,2172,2175,2180,-19,2181,-4,2186,2190,-1,2196,2199,2202,-9,2204,-143,  
     2209,2214}  
 
 
code = compress_linetable(table) 
 
table = decompress_linetable(code) 
? table 
 
printf(1, "%d entries -> %d bits\n", {length(table), length(code)}) 
printf(1, "%g bits per entry\n", {length(code) / length(table)}) 
 

Outputs:

821 entries -> 3549 bits 
4.32278 bits per entry 

There's still a problem with the decoder - it doesn't know where to stop, so there ends up being an extra entry in the decoded table. I haven't decided whether to encode the length of the output into the coded bits, or to use a special symbol to indicate when to stop decoding.

new topic     » goto parent     » topic index » view thread      » older message » newer message

Search



Quick Links

User menu

Not signed in.

Misc Menu