### Re: Primes | A Software Drag Race - Dave's Garage

Results so far collected on this machine, slowest first:

```   Python: 14 (version 1, in 10 seconds so 7 really)
Wren: 68
Julia: 110 (version1)
Julia: 111 (version2)
Phix: 675 (hll init and main loop, aka "if true then")
FreeBASIC: 1054 (prime_boolean, the 8of30 one gave incorrect results)
Python: 2270 (version 2)
Go: 3064 (solution 1)
Python: 3524 (solution 3, unfaithful)
Phix: 4081 (hll init and asm main loop)
Go: 4233 (solution 2)
Phix: 5419 (asm init and main loop)
```

Still not sure about the Docker thing, let's see if anyone else can improve things here first anyway.

Actually, I'm pretty sure allocate()+rep movsd/q would be faster, and of course a single peek() afterwards both gets the easier-to-handle (binary) string and is not timed. update: tried that, no gain, in fact it was slower[?].

```--
-- demo/prime_drag_race.exw
-- ========================
--
?"begin"
--constant limit = 100_000_000
--constant limit = 10_000_000
constant limit = 1_000_000
--constant limit = 100_000
--constant limit = 10_000
--constant limit = 1_000
--constant limit = 100
--constant limit = 10

constant prime_counts = new_dict({{10, 4},
{100, 25},
{1000, 168},
{10000, 1229},
{100000, 9592},
{1000000, 78498},
{10000000, 664579},
{100000000, 5761455}})

function sieve(integer n)
integer bytes = floor((n+15)/16)    -- (round up to whole no of odd-bit bytes)
-- (eg a limit of 10 ==> 1..15, in 1 byte)
string res = repeat('\0',bytes+iff(machine_bits()=32?11:23))
-- initialise with every 3rd bit set (strike 3/9/15 etc)
--  done in blocks of 12/24 bytes, hence padding above
--  if true then                        -- use slow hll version
if false then                       -- use fast asm version
for i=1 to length(res) by 3 do
res[i]   = 0b1001_0010
res[i+1] = 0b0010_0100
res[i+2] = 0b0100_1001
end for
else
-- For the asm version of the above I used a "block size" of 12/24,
-- simply because that fits comfortably in the available registers.
#ilASM{

mov edi,[res]
mov ecx,[bytes]
shl edi,2
mov eax,0b1001_0010_0100_1001_0010_0100_1001_0010
mov edx,0b0010_0100_1001_0010_0100_1001_0010_0100
mov esi,0b0100_1001_0010_0100_1001_0010_0100_1001
@@:
mov [edi],eax
mov [edi+4],edx
mov [edi+8],esi
lea edi,[edi+12]
sub ecx,12
jg @b

-- Here we use a bit of bit-fiddling in preference to lots
-- of massive literal constants, because it is easy enough.
-- Of course you could do much the same in the 32-bit code.
-- Note we should avoid setting the low-dword of rax to a
-- signed value, since that fills the high-dword with 1s.
mov rax,0b0100_1001_0010_0100_1001_0010_0100_1001
mov rdi,[res]
mov rdx,rax
shl rax,1
mov rcx,[bytes]
shl rdx,34
shl rdi,2
lea rdx,[rbx+rax*4+1]
lea rsi,[rbx+rax*2]
@@:
mov [rdi],rax
mov [rdi+8],rdx
mov [rdi+16],rsi
lea rdi,[rdi+24]
sub rcx,24
jg @b
}
end if
-- then clear 3rd and strike 1st bit too.
res = 0b1001_0001
--
-- now strike 25,35,45...       5^2, {+5*2}
--       then 49,63,77...       7^2, {+7*2}
--       then 121,143,165...    11^2, {+11*2}
--       then 169,195,221...    13^2, {+13*2}
--       then 289,323,357...    17^2, {+17*2}
--
integer p = 5,  -- (first prime to test)
p2 = p*p, -- (squared)
pb = 2,   -- (bit index for 5)
mask = 0b0100  -- (mask for 5)
while p2<=n do
--      if true then                        -- use slow hll version
if false then                       -- use fast asm version
integer bdx = floor(pb/8)+1,
byte = res[bdx]
for k2=p2 to n by p*2 do
bdx = floor(k2/16)+1
byte = res[bdx]
integer kl = remainder(k2,16)
integer mask2 = power(2,floor(kl/2))
res[bdx] = byte
end for
end if
else
-- Obviously this would be much harder to write without the above
-- to refer to and check against, likewise having a printable kl
-- etc in the above loop helps (vs. doing the calculation inline).
-- Most users bother not with this sort of thing, for good reason.
-- In truth, #ilASM{} is for self-hosting, not really performance,
-- and there never was any remote intention for it be easy to use.
#ilASM{

mov ecx,[pb]
mov esi,[res]
shr ecx,3                   -- bdx := floor(bp/8)
mov eax,[p2]                -- k2
--                      test byte[esi*4+ecx],dl     -- byte,mask -- (needs 1.0.1)
test [esi*4+ecx],edx        -- byte,mask -- (works too)
jne :nextp
::by2p
mov ecx,eax             -- kl := k2
mov edi,eax             -- bdx := k2
and ecx,15              -- kl := remainder(k2,16)
mov edx,1               -- mask2 := 1
shr ecx,1               -- kl := kl/2
shr edi,4               -- bdx := floor(k2/16)
shl edx,cl              -- mask2 := power(2,floor(kl/2))
mov ecx,[p]
or byte[esi*4+edi],dl
lea eax,[eax+ecx*2]     -- k2 += p*2
cmp eax,[n]
jle :by2p

mov rcx,[pb]
mov rsi,[res]
shr rcx,3                   -- bdx := floor(bp/8)
mov rax,[p2]                -- k2
--                      test byte[rsi*4+rcx],dl     -- byte,mask -- (needs 1.0.1)
test [rsi*4+rcx],rdx        -- byte,mask -- (works too)
jne :nextp
::by2p
mov rcx,rax             -- kl := k2
mov rdi,rax             -- bdx := k2
and rcx,15              -- kl := remainder(k2,16)
mov dl,1                -- mask := 1
shr rcx,1               -- kl := kl/2
shr rdi,4               -- bdx := floor(k2/16)
shl dl,cl               -- mask := power(2,floor(kl/2))
mov rcx,[p]
or byte[rsi*4+rdi],dl
lea rax,[rax+rcx*2]     -- k2 += p*2
cmp rax,[n]
jle :by2p
[]
::nextp
}
end if
p += 2
p2 = p*p
pb += 1
if mask>#FF then mask /= #100 end if
end while
res = res[1..bytes]
return res -- (or return {res,bytes,n} should that be needed/desired)
end function

atom t5 = time()+5
--atom t5 = time()-.05  -- (useful for verifying different sizes)
integer passes = 0
string res
while true do
res = sieve(limit)
passes += 1
if time()>t5 then exit end if
end while
atom actual_time = time()-t5+5

integer prime_count = 1 -- (no bit for 2 so count it automatically)
--sequence primes = repeat(0,limit) -- (neither helps nor hinders)
--primes = 1                                 -- ""
integer p = 1
for i=1 to length(res) do
integer ri = res[i]
for j=1 to 8 do
if even(ri) then
if p>limit then exit end if
prime_count += 1
--          primes[p] = 1                       -- ""
end if
ri = floor(ri/2)
p += 2
end for
end for
puts(1,"\n")
integer verify = getd(limit,prime_counts)
if prime_count==verify then
--  printf(1,"%d primes < %d (verified)\n",{prime_count,limit})
printf(1,"phix;%d;%f;1;algorithm=base,faithful=yes,bits=1\n", {passes, actual_time})
--  phix;5419;5.015000;1;algorithm=base,faithful=yes,bits=1
else
printf(1,"**ERROR** %d primes(<%d) should be %d\n",{prime_count,limit,verify})
end if
{} = wait_key()
```

recap: the above is creating an odd-bit-only sieve bitmap, so res contains the bits for {15,13,11,9,7,5,3,1}, res the bits for {31,29,27,25,23,21,18,17}, etc. (8 bits per byte)

aside: If you don't like the #ilASM{} in the above (who would) you can just delete it and still join in, albeit trying to improve the 675 instead of the 5419.