### forum-msg-id-124974-edit

Original date:2014-08-31 03:22:51 Edited by: PeteE Subject: Re: Compressed Linetables

Here's a first attempt:

```
-- use bit precision that fit euphoria's integer type
constant
bit_precision = 30,
limit = power(2, bit_precision) - 1,
bit1 = power(2, bit_precision - 1),
bit2 = power(2, bit_precision - 2)

-- sum a sequence
function sum(sequence s)
atom result
result = 0
for i = 1 to length(s) do
result += s[i]
end for
return result
end function

-- generate look-up-table of weights, normalized to bit precision
function generate_lut(sequence s)
sequence lut
integer x, y
-- scale weights from 0 to bit limit
lut = floor(s * limit / sum(s))
-- change independent weights to increasing ranges
x = 0
for i = 1 to length(lut) do
y = lut[i]
lut[i] = x
x += y
end for
return lut
end function

-- add an integer x to a binary sequence bits
function badd(sequence bits, atom x)
integer c, s, j
j = 1
c = 0
for i = length(bits) to 1 by -1 do
s = bits[i] + (and_bits(x, j) != 0) + c
c = s > 1
bits[i] = and_bits(s, 1)
j += j
end for
if c then
puts(1, "badd overflow!\n")
end if
return bits
end function

-- return an integer representing the binary difference of x and y
function bdif(sequence x, sequence y)
integer c, s, j, result
j = 1
c = 0
result = 0
for i = length(x) to 1 by -1 do
if i <= length(y) then
s = y[i]
else
s = 0
end if
s -= x[i] + c
c = s < 0
result += j * and_bits(s, 1)
j += j
end for
return result * (1 - 2 * c)
end function

-- binary search between l range d for a symbol in lut that matches code
function bsearch(sequence l, integer d, sequence code, sequence lut)
integer a, b, mid, x
sequence tmp

a = 1
b = length(lut)

while a != b do
mid = floor((a + b + 1) / 2)
if mid <= length(lut) then
x = lut[mid]
else
x = limit
end if
tmp = badd(l, floor(x * d / (limit+1)))
if bdif(tmp, code) < 0 then
b = mid - 1
else
a = mid
end if
end while
return a
end function

-- arithmetic encode a list of symbols, given a weighted symbol table
function arith_encode(sequence list, sequence syms, sequence weights)
sequence lut, result, u, l
integer sym, a, b, d

lut = generate_lut(weights)
l = repeat(0, bit_precision)
u = repeat(1, bit_precision)
result = {}

for i = 1 to length(list) do
-- get the symbol index from the list
sym = find(list[i], syms)
if sym = 0 then
puts(1, "symbol not found: ") ? list[i]
return 0
end if

-- find the difference of the upper and lower range
d = bdif(l, u)

-- scale up d so that we have more precision for bounds
while d < bit1 do
d += d
l &= 0
end while
--printf(1, "sym=%d d=%d\n", {sym, d})

-- get the symbol bounds from look-up-table
a = lut[sym]
if sym < length(lut) then
b = lut[sym+1]
else
b = limit
end if

-- scale the symbol bounds by the difference
a = floor(a * d / (limit+1))
b = floor(b * d / (limit+1))

-- create the new range from the symbol bounds
u = badd(l, b)
l = badd(l, a)

-- last item?
if i = length(list) then
-- figure out how many bits we need to flush
u = badd(l, floor((b - a) / 2))
for j = 1 to length(l) do
if l[j] != u[j] then
l[j] = u[j]
if j < length(l) then
l[j+1] = 1 - u[j+1]
end if
exit
end if
end for
end if

-- leading bits that are the same won't change,
-- and can be moved to the result.
while length(l) and l[1] = u[1] do
result &= l[1]
l = l[2..\$]
u = u[2..\$]
end while
-- TODO: this will output bytes instead of bits
while 0 and length(l) > 8 and equal(l[1..8], u[1..8]) do
result &= l[1] + l[2]*2 + l[3]*4 + l[4]*8 +
l[5]*16 + l[6]*32 + l[7]*64 + l[8]*128
l = l[9..\$]
u = u[9..\$]
end while

end for

return result
end function

-- arithmetic decode a coded sequence, given a weighted symbol table
function arith_decode(sequence code, sequence syms, sequence weights)
sequence lut, result, u, l
integer sym, a, b, len, d

lut = generate_lut(weights)
l = repeat(0, bit_precision)
u = repeat(1, bit_precision)
if 0 then
-- convert bytes to bits
result = repeat(0, 8 * length(code))
for i = 1 to length(code) do
d = 1
for j = 7 to 0 by -1 do
result[i*8-j] = and_bits(code[i], d) != 0
d += d
end for
end for
code = result
? code
end if
result = {}

len = length(code) - 1

while len do

-- find the difference of the upper and lower range
d = bdif(l, u)

-- scale up d so that we have more precision for bounds
while d < bit1 do
d += d
l &= 0
end while

-- do a binary search of the lut to find the next symbol index
sym = bsearch(l, d, code, lut)

-- save the symbol in the results
result = append(result, syms[sym])
--printf(1, "sym=%d d=%d\n", {sym, d})

-- get the symbol bounds from look-up-table
a = lut[sym]
if sym < length(lut) then
b = lut[sym+1]
else
b = limit
end if

-- scale the symbol bounds by the difference
a = floor(a * d / (limit+1))
b = floor(b * d / (limit+1))

-- create the new range from the symbol bounds
u = badd(l, b)
l = badd(l, a)

-- leading bits that are the same won't change,
-- and can be removed.
while l[1] = u[1] do
code = code[2..\$]
l = l[2..\$]
u = u[2..\$]
len -= 1
end while

end while

return result
end function

sequence code, table, symbols, weights

-- symbol mapping:
-- 0..61 maps to integers 0..61
-- 62 means add 62 to the next symbol value
-- 63 means to make the next symbol negative
symbols = repeat(0, 64)
for i = 1 to length(symbols) do
symbols[i] = i-1
end for

-- weights are loosely based on the histogram of the table below
weights = {10,2000,2000,2000,1000,1000,1000,500,50,50,50,50,50,50,50,50,50,50,10,
10,10,10,10,10,10,5,5,5,5,5,5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,100,2000}

function compress_linetable(sequence table)
sequence s
integer x, y
s = {}
y = 0

-- convert the table to a list of symbols 0..63
for i = 1 to length(table) do
x = table[i]
if x < 0 then
s &= 63 -- 63 means negative
x = -x
else
-- calculate delta for offsets
x -= y
y = table[i]
end if
while x >= 62 do
s &= 62
x -= 62
end while
s &= x
end for

return arith_encode(s, symbols, weights)
end function

function decompress_linetable(sequence code)
sequence table, s
integer x, y, neg

x = 0
y = 0
neg = 0
table = {}

s = arith_decode(code, symbols, weights)

for i = 1 to length(s) do
if s[i] = 63 then
neg = 1
elsif s[i] = 62 then
x += 62
else
x += s[i]
if neg then
table &= -x
neg = 0
else
y += x
table &= y
end if
x = 0
end if
end for

return table
end function

table=
{-637,0,-23,5,6,-2,12,13,14,-71,20,-78,21,27,29,-2,31,35,37,39,44,46,-1,48,53,58,60,66,68,71,74,75,81,91,93,98,
100,-1,110,116,117,-3,123,-1,124,-42,132,-11,133,136,142,146,152,158,161,-1,163,169,172,175,178,180,183,185,
192,203,209,220,226,227,-1,228,-3,234,238,242,245,248,-1,252,-55,258,-4,263,264,266,269,275,-1,281,283,285,287,
293,298,302,306,-1,308,310,-2,312,314,-1,318,321,323,326,332,338,340,-33,342,-2,344,-5,347,350,353,357,359,361,
365,367,369,-1,372,376,-1,378,381,383,385,-1,388,392,395,397,401,405,-33,407,-4,412,413,416,420,423,426,430,437,
443,447,450,452,454,458,460,-1,463,-21,467,-14,470,473,477,481,-3,485,486,487,488,493,497,-23,499,501,-6,502,504,
509,-1,514,521,523,525,532,533,534,535,-7,536,541,542,543,544,-1,545,550,552,-1,559,562,564,567,569,-41,573,-9,
574,575,-10,576,-1,577,-4,583,584,590,-26,592,-1,594,-1,599,607,-11,613,619,622,628,635,637,644,646,647,-1,648,
-24,654,-1,659,661,-2,667,671,675,679,683,-1,687,-17,693,-3,695,699,701,703,710,716,-8,720,-3,722,723,-1,724,-1,
725,731,737,740,745,-1,750,752,754,756,759,-1,761,762,-23,763,767,769,775,782,784,790,792,795,798,-4,800,-1,806,
810,812,814,819,821,828,829,-1,830,-3,836,841,-8,843,-2,848,-45,854,-3,856,862,866,-7,869,-3,874,-1,880,881,882,
889,900,906,913,919,920,-1,921,-20,927,931,-4,933,-2,935,938,942,-7,946,-9,952,953,954,956,-1,958,961,964,970,
976,979,981,987,993,999,1002,1008,-19,1009,-3,1014,1016,1019,-6,1022,-3,1028,1032,-1,1035,1038,1040,1045,-19,
1050,1056,1057,1058,1061,1068,1079,1085,1092,1098,1099,1100,-1,1101,-22,1107,1111,1116,1118,-1,1120,1124,1127,-1,
1131,-2,1135,-20,1141,1142,1143,1145,1147,1150,-13,1151,-7,1157,-1,1158,1164,-2,1167,1171,1174,1177,-1,1180,-13,
1183,-3,1188,1189,1190,1191,1192,1199,1206,1212,1219,1225,1232,1238,1239,-1,1240,-32,1246,1253,1259,1266,1273,-8,
1277,-12,1283,1287,1289,-19,1295,-1,1300,1301,1302,1303,-12,1304,-5,1309,1312,1315,1317,1319,1323,1325,1327,-1,
1330,1337,1343,1347,-1,1349,1352,1354,1356,-32,1359,1363,-3,1366,1369,1371,1373,1376,1379,1385,1389,1392,1395,
1399,1405,1409,1413,-41,1417,-3,1419,1425,1428,1431,1434,1440,-12,1446,-7,1447,-1,1450,1456,1462,1468,1474,1477,
1482,-1,1487,-1,1489,1491,1493,1496,-2,1498,1504,1507,-24,1513,1515,1516,-4,1517,-11,1523,1524,1527,-4,1530,-2,
1532,1533,1534,1535,1536,1537,1538,1543,-10,1546,-3,1547,1554,1560,1567,1574,1581,1587,1594,1600,1601,-1,1602,-22,
1608,1612,1615,1618,1621,1624,1627,1630,-10,1633,-10,1639,-1,1642,-7,1643,-1,1645,1651,1652,1655,1660,1662,1668,
1672,1674,1677,1680,1683,1690,1694,1701,-32,1702,-7,1703,-1,1705,1711,1712,1716,1721,1723,1729,1731,1735,1738,
1741,1744,1747,1754,1758,1765,-35,1766,-17,1767,-1,1768,1771,1774,1777,1780,1782,1785,1788,1791,1793,-1,1795,1796,
1798,1800,1802,1804,1806,1808,1814,1816,1821,1823,-1,1828,-38,1829,-3,1830,1836,1838,1842,1847,1849,1852,1854,
1857,1858,-1,1859,1862,1864,1869,-1,1871,-1,1875,1880,1885,1887,1893,1896,1899,1905,1908,1915,1917,1920,-38,1921,
-15,1922,1925,1932,1934,-1,1935,-1,1942,-1,1945,-4,1948,1949,-23,1952,1956,-2,1958,1962,-1,1968,1970,1975,1980,
-13,1981,-3,1983,1987,-1,1993,1996,1999,-10,2004,-4,2005,2009,-1,2015,-1,2022,-1,2025,-1,2027,-14,2030,-1,2032,
2035,2038,2041,2044,2047,2048,2053,2054,2056,2058,2060,2062,-23,2064,2067,2069,2075,2077,2082,2085,2087,-12,2094,
-1,2096,2097,2104,-1,2107,-2,2110,-2,2112,2113,-16,2120,-3,2124,-3,2126,2130,-1,2136,2137,2140,2145,2150,-17,2151,
-2,2153,-3,2156,-3,2158,2162,-1,2168,-2,2169,2172,2175,2180,-19,2181,-4,2186,2190,-1,2196,2199,2202,-9,2204,-143,
2209,2214}

code = compress_linetable(table)

table = decompress_linetable(code)
? table

printf(1, "%d entries -> %d bits\n", {length(table), length(code)})
printf(1, "%g bits per entry\n", {length(code) / length(table)})

```

Outputs:

```821 entries -> 3549 bits
4.32278 bits per entry
```

There's still a problem with the decoder - it doesn't know where to stop, so there ends up being an extra entry in the decoded table. I haven't decided whether to encode the length of the output into the coded bits, or to use a special symbol to indicate when to stop decoding.

 Not Categorized, Please Help

Not signed in.