Re: Compressed Linetables
- Posted by PeteE Aug 31, 2014
- 1380 views
Here's a first attempt:
-- use bit precision that fit euphoria's integer type constant bit_precision = 30, limit = power(2, bit_precision) - 1, bit1 = power(2, bit_precision - 1) -- sum a sequence function sum(sequence s) atom result result = 0 for i = 1 to length(s) do result += s[i] end for return result end function -- generate look-up-table of weights, normalized to bit precision function generate_lut(sequence s) sequence lut integer x atom sum_s -- scale weights from 0 to bit limit sum_s = sum(s) lut = repeat(0, length(s)) -- change independent weights to increasing ranges x = 0 for i = 1 to length(lut) do lut[i] = x x += floor(s[i] * limit / sum_s) end for return lut end function -- add an integer x to a binary sequence bits function badd(sequence bits, atom x) integer c, s atom j j = 1 c = 0 for i = length(bits) to 1 by -1 do s = bits[i] + (and_bits(x, j) != 0) + c c = s > 1 bits[i] = and_bits(s, 1) j += j end for if c then printf(1, "badd overflow! x=%d bits=", {x}) ? bits end if return bits end function -- return an integer representing the binary difference of x and y function bdif(sequence x, sequence y) integer c, s atom j, result j = 1 c = 0 result = 0 for i = length(x) to 1 by -1 do if i <= length(y) then s = y[i] else s = 0 end if s -= x[i] + c c = s < 0 result += j * and_bits(s, 1) j += j end for return result * (1 - 2 * c) end function -- binary search between l range d for a symbol in lut that matches code function bsearch(sequence l, integer d, sequence code, sequence lut) integer a, b, mid, x sequence tmp a = 1 b = length(lut) - 1 while a != b do mid = floor((a + b + 1) / 2) x = lut[mid] tmp = badd(l, floor(x * d / (limit+1))) if bdif(tmp, code) < 0 then b = mid - 1 else a = mid end if end while return a end function -- arithmetic encode a list of symbols, given a weighted symbol table function arith_encode(sequence list, sequence syms, sequence weights) sequence lut, result, u, l integer sym, a, b, d lut = generate_lut(weights) & limit l = repeat(0, bit_precision) u = repeat(1, bit_precision) result = {} for i = 1 to length(list) do -- get the symbol index from the list sym = find(list[i], syms) if sym = 0 then puts(1, "symbol not found: ") ? list[i] return 0 end if -- find the difference of the upper and lower range d = bdif(l, u) -- scale up d so that we have more precision for bounds while d < bit1 and length(l) < bit_precision do d += d l &= 0 end while --printf(1, "sym=%d d=%d\n", {sym, d}) -- get the symbol bounds from look-up-table a = lut[sym] b = lut[sym+1] -- scale the symbol bounds by the difference a = floor(a * d / (limit+1)) b = floor(b * d / (limit+1)) -- create the new range from the symbol bounds u = badd(l, b) l = badd(l, a) -- last item? if i = length(list) then -- figure out how many bits we need to flush u = badd(l, floor((b - a) / 2)) for j = 1 to length(l) do if l[j] != u[j] then l[j] = u[j] if j < length(l) then l[j+1] = 1 - u[j+1] end if exit end if end for end if -- leading bits that are the same won't change, -- and can be moved to the result. while length(l) and l[1] = u[1] do result &= l[1] l = l[2..$] u = u[2..$] end while -- TODO: this will output bytes instead of bits while 0 and length(l) > 8 and equal(l[1..8], u[1..8]) do result &= l[1] + l[2]*2 + l[3]*4 + l[4]*8 + l[5]*16 + l[6]*32 + l[7]*64 + l[8]*128 l = l[9..$] u = u[9..$] end while end for return result end function -- arithmetic decode a coded sequence, given a weighted symbol table function arith_decode(sequence code, sequence syms, sequence weights) sequence lut, result, u, l integer sym, a, b, len, d lut = generate_lut(weights) & limit l = repeat(0, bit_precision) u = repeat(1, bit_precision) if 0 then -- convert bytes to bits result = repeat(0, 8 * length(code)) for i = 1 to length(code) do d = 1 for j = 7 to 0 by -1 do result[i*8-j] = and_bits(code[i], d) != 0 d += d end for end for code = result ? code end if result = {} len = length(code) - 1 while len do -- find the difference of the upper and lower range d = bdif(l, u) -- scale up d so that we have more precision for bounds while d < bit1 and length(l) < bit_precision do d += d l &= 0 end while -- do a binary search of the lut to find the next symbol index sym = bsearch(l, d, code, lut) -- save the symbol in the results result = append(result, syms[sym]) --printf(1, "sym=%d d=%d\n", {sym, d}) -- get the symbol bounds from look-up-table a = lut[sym] b = lut[sym+1] -- scale the symbol bounds by the difference a = floor(a * d / (limit+1)) b = floor(b * d / (limit+1)) -- create the new range from the symbol bounds u = badd(l, b) l = badd(l, a) -- leading bits that are the same won't change, -- and can be removed. while l[1] = u[1] do code = code[2..$] l = l[2..$] u = u[2..$] len -= 1 end while end while return result end function sequence code, table, symbols, weights -- symbol mapping: -- 0..61 maps to integers 0..61 -- 62 means add 62 to the next symbol value -- 63 means to make the next symbol negative symbols = repeat(0, 64) for i = 1 to length(symbols) do symbols[i] = i-1 end for -- weights are loosely based on the histogram of the table below weights = {10,2000,2000,2000,1000,1000,1000,500,50,50,50,50,50,50,50,50,50,50,10, 10,10,10,10,10,10,5,5,5,5,5,5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,100,2000} function compress_linetable(sequence table) sequence s integer x, y s = {} y = 0 -- convert the table to a list of symbols 0..63 for i = 1 to length(table) do x = table[i] if x < 0 then s &= 63 -- 63 means negative x = -x else -- calculate delta for offsets x -= y y = table[i] end if while x >= 62 do s &= 62 x -= 62 end while s &= x end for return arith_encode(s, symbols, weights) end function function decompress_linetable(sequence code) sequence table, s integer x, y, neg x = 0 y = 0 neg = 0 table = {} s = arith_decode(code, symbols, weights) for i = 1 to length(s) do if s[i] = 63 then neg = 1 elsif s[i] = 62 then x += 62 else x += s[i] if neg then table &= -x neg = 0 else y += x table &= y end if x = 0 end if end for return table end function table= {-637,0,-23,5,6,-2,12,13,14,-71,20,-78,21,27,29,-2,31,35,37,39,44,46,-1,48,53,58,60,66,68,71,74,75,81,91,93,98, 100,-1,110,116,117,-3,123,-1,124,-42,132,-11,133,136,142,146,152,158,161,-1,163,169,172,175,178,180,183,185, 192,203,209,220,226,227,-1,228,-3,234,238,242,245,248,-1,252,-55,258,-4,263,264,266,269,275,-1,281,283,285,287, 293,298,302,306,-1,308,310,-2,312,314,-1,318,321,323,326,332,338,340,-33,342,-2,344,-5,347,350,353,357,359,361, 365,367,369,-1,372,376,-1,378,381,383,385,-1,388,392,395,397,401,405,-33,407,-4,412,413,416,420,423,426,430,437, 443,447,450,452,454,458,460,-1,463,-21,467,-14,470,473,477,481,-3,485,486,487,488,493,497,-23,499,501,-6,502,504, 509,-1,514,521,523,525,532,533,534,535,-7,536,541,542,543,544,-1,545,550,552,-1,559,562,564,567,569,-41,573,-9, 574,575,-10,576,-1,577,-4,583,584,590,-26,592,-1,594,-1,599,607,-11,613,619,622,628,635,637,644,646,647,-1,648, -24,654,-1,659,661,-2,667,671,675,679,683,-1,687,-17,693,-3,695,699,701,703,710,716,-8,720,-3,722,723,-1,724,-1, 725,731,737,740,745,-1,750,752,754,756,759,-1,761,762,-23,763,767,769,775,782,784,790,792,795,798,-4,800,-1,806, 810,812,814,819,821,828,829,-1,830,-3,836,841,-8,843,-2,848,-45,854,-3,856,862,866,-7,869,-3,874,-1,880,881,882, 889,900,906,913,919,920,-1,921,-20,927,931,-4,933,-2,935,938,942,-7,946,-9,952,953,954,956,-1,958,961,964,970, 976,979,981,987,993,999,1002,1008,-19,1009,-3,1014,1016,1019,-6,1022,-3,1028,1032,-1,1035,1038,1040,1045,-19, 1050,1056,1057,1058,1061,1068,1079,1085,1092,1098,1099,1100,-1,1101,-22,1107,1111,1116,1118,-1,1120,1124,1127,-1, 1131,-2,1135,-20,1141,1142,1143,1145,1147,1150,-13,1151,-7,1157,-1,1158,1164,-2,1167,1171,1174,1177,-1,1180,-13, 1183,-3,1188,1189,1190,1191,1192,1199,1206,1212,1219,1225,1232,1238,1239,-1,1240,-32,1246,1253,1259,1266,1273,-8, 1277,-12,1283,1287,1289,-19,1295,-1,1300,1301,1302,1303,-12,1304,-5,1309,1312,1315,1317,1319,1323,1325,1327,-1, 1330,1337,1343,1347,-1,1349,1352,1354,1356,-32,1359,1363,-3,1366,1369,1371,1373,1376,1379,1385,1389,1392,1395, 1399,1405,1409,1413,-41,1417,-3,1419,1425,1428,1431,1434,1440,-12,1446,-7,1447,-1,1450,1456,1462,1468,1474,1477, 1482,-1,1487,-1,1489,1491,1493,1496,-2,1498,1504,1507,-24,1513,1515,1516,-4,1517,-11,1523,1524,1527,-4,1530,-2, 1532,1533,1534,1535,1536,1537,1538,1543,-10,1546,-3,1547,1554,1560,1567,1574,1581,1587,1594,1600,1601,-1,1602,-22, 1608,1612,1615,1618,1621,1624,1627,1630,-10,1633,-10,1639,-1,1642,-7,1643,-1,1645,1651,1652,1655,1660,1662,1668, 1672,1674,1677,1680,1683,1690,1694,1701,-32,1702,-7,1703,-1,1705,1711,1712,1716,1721,1723,1729,1731,1735,1738, 1741,1744,1747,1754,1758,1765,-35,1766,-17,1767,-1,1768,1771,1774,1777,1780,1782,1785,1788,1791,1793,-1,1795,1796, 1798,1800,1802,1804,1806,1808,1814,1816,1821,1823,-1,1828,-38,1829,-3,1830,1836,1838,1842,1847,1849,1852,1854, 1857,1858,-1,1859,1862,1864,1869,-1,1871,-1,1875,1880,1885,1887,1893,1896,1899,1905,1908,1915,1917,1920,-38,1921, -15,1922,1925,1932,1934,-1,1935,-1,1942,-1,1945,-4,1948,1949,-23,1952,1956,-2,1958,1962,-1,1968,1970,1975,1980, -13,1981,-3,1983,1987,-1,1993,1996,1999,-10,2004,-4,2005,2009,-1,2015,-1,2022,-1,2025,-1,2027,-14,2030,-1,2032, 2035,2038,2041,2044,2047,2048,2053,2054,2056,2058,2060,2062,-23,2064,2067,2069,2075,2077,2082,2085,2087,-12,2094, -1,2096,2097,2104,-1,2107,-2,2110,-2,2112,2113,-16,2120,-3,2124,-3,2126,2130,-1,2136,2137,2140,2145,2150,-17,2151, -2,2153,-3,2156,-3,2158,2162,-1,2168,-2,2169,2172,2175,2180,-19,2181,-4,2186,2190,-1,2196,2199,2202,-9,2204,-143, 2209,2214} code = compress_linetable(table) table = decompress_linetable(code) ? table printf(1, "%d entries -> %d bits\n", {length(table), length(code)}) printf(1, "%g bits per entry\n", {length(code) / length(table)})
Outputs:
821 entries -> 3549 bits 4.32278 bits per entry
There's still a problem with the decoder - it doesn't know where to stop, so there ends up being an extra entry in the decoded table. I haven't decided whether to encode the length of the output into the coded bits, or to use a special symbol to indicate when to stop decoding.