forum-msg-id-136271-edit

Original date:2021-07-16 11:07:32 Edited by: petelomax Subject: Re: Primes | A Software Drag Race - Dave's Garage

Results so far collected on this machine, slowest first:

   Python: 14 (version 1, in 10 seconds so 7 really) 
     Wren: 68 
    Julia: 110 (version1) 
    Julia: 111 (version2) 
     Phix: 675 (hll init and main loop, aka "if true then") 
FreeBASIC: 1054 (prime_boolean, the 8of30 one gave incorrect results) 
   Python: 2270 (version 2) 
       Go: 3064 (solution 1) 
   Python: 3524 (solution 3, unfaithful) 
     Phix: 4081 (hll init and asm main loop) 
       Go: 4233 (solution 2) 
     Phix: 5419 (asm init and main loop) 

Still not sure about the Docker thing, let's see if anyone else can improve things here first anyway.

Actually, I'm pretty sure allocate()+rep movsd/q would be faster, and of course a single peek() afterwards both gets the easier-to-handle (binary) string and is not timed. update: tried that, no gain, in fact it was slower[?].

-- 
-- demo/prime_drag_race.exw 
-- ======================== 
-- 
?"begin" 
--constant limit = 100_000_000 
--constant limit = 10_000_000 
constant limit = 1_000_000 
--constant limit = 100_000 
--constant limit = 10_000 
--constant limit = 1_000 
--constant limit = 100 
--constant limit = 10 
 
constant prime_counts = new_dict({{10, 4}, 
                                  {100, 25}, 
                                  {1000, 168}, 
                                  {10000, 1229}, 
                                  {100000, 9592}, 
                                  {1000000, 78498}, 
                                  {10000000, 664579}, 
                                  {100000000, 5761455}}) 
 
function sieve(integer n) 
    integer bytes = floor((n+15)/16)    -- (round up to whole no of odd-bit bytes) 
                                        -- (eg a limit of 10 ==> 1..15, in 1 byte) 
    string res = repeat('\0',bytes+iff(machine_bits()=32?11:23)) 
    -- initialise with every 3rd bit set (strike 3/9/15 etc) 
    --  done in blocks of 12/24 bytes, hence padding above 
--  if true then                        -- use slow hll version 
    if false then                       -- use fast asm version 
        for i=1 to length(res) by 3 do 
            res[i]   = 0b1001_0010 
            res[i+1] = 0b0010_0100 
            res[i+2] = 0b0100_1001 
        end for 
    else 
        -- For the asm version of the above I used a "block size" of 12/24, 
        -- simply because that fits comfortably in the available registers. 
        #ilASM{ 
            [32] 
                mov edi,[res] 
                mov ecx,[bytes] 
                shl edi,2 
                mov eax,0b1001_0010_0100_1001_0010_0100_1001_0010 
                mov edx,0b0010_0100_1001_0010_0100_1001_0010_0100 
                mov esi,0b0100_1001_0010_0100_1001_0010_0100_1001 
              @@: 
                mov [edi],eax 
                mov [edi+4],edx 
                mov [edi+8],esi 
                lea edi,[edi+12] 
                sub ecx,12 
                jg @b 
            [64] 
                -- Here we use a bit of bit-fiddling in preference to lots 
                -- of massive literal constants, because it is easy enough. 
                -- Of course you could do much the same in the 32-bit code. 
                -- Note we should avoid setting the low-dword of rax to a 
                -- signed value, since that fills the high-dword with 1s. 
                mov rax,0b0100_1001_0010_0100_1001_0010_0100_1001 
                mov rdi,[res] 
                mov rdx,rax 
                shl rax,1 
                mov rcx,[bytes] 
                shl rdx,34 
                shl rdi,2 
                add rax,rdx 
                lea rdx,[rbx+rax*4+1] 
                lea rsi,[rbx+rax*2] 
              @@: 
                mov [rdi],rax 
                mov [rdi+8],rdx 
                mov [rdi+16],rsi 
                lea rdi,[rdi+24] 
                sub rcx,24 
                jg @b 
              } 
    end if 
    -- then clear 3rd and strike 1st bit too. 
    res[1] = 0b1001_0001 
    -- 
    -- now strike 25,35,45...       5^2, {+5*2} 
    --       then 49,63,77...       7^2, {+7*2} 
    --       then 121,143,165...    11^2, {+11*2} 
    --       then 169,195,221...    13^2, {+13*2} 
    --       then 289,323,357...    17^2, {+17*2} 
    -- 
    integer p = 5,  -- (first prime to test) 
            p2 = p*p, -- (squared) 
            pb = 2,   -- (bit index for 5) 
            mask = 0b0100  -- (mask for 5) 
    while p2<=n do 
--      if true then                        -- use slow hll version 
        if false then                       -- use fast asm version 
            integer bdx = floor(pb/8)+1, 
                    byte = res[bdx] 
            if and_bits(byte,mask)=0 then 
                for k2=p2 to n by p*2 do 
                    bdx = floor(k2/16)+1 
                    byte = res[bdx] 
                    integer kl = remainder(k2,16) 
                    integer mask2 = power(2,floor(kl/2)) 
                    byte = or_bits(byte,mask2) 
                    res[bdx] = byte 
                end for 
            end if 
        else 
            -- Obviously this would be much harder to write without the above 
            -- to refer to and check against, likewise having a printable kl 
            -- etc in the above loop helps (vs. doing the calculation inline). 
            -- Most users bother not with this sort of thing, for good reason. 
            -- In truth, #ilASM{} is for self-hosting, not really performance, 
            -- and there never was any remote intention for it be easy to use. 
            #ilASM{ 
                    [32] 
                        mov ecx,[pb] 
                        mov esi,[res] 
                        shr ecx,3                   -- bdx := floor(bp/8) 
                        mov edx,[mask] 
                        mov eax,[p2]                -- k2 
--                      test byte[esi*4+ecx],dl     -- byte,mask -- (needs 1.0.1) 
                        test [esi*4+ecx],edx        -- byte,mask -- (works too) 
                        jne :nextp 
                          ::by2p 
                            mov ecx,eax             -- kl := k2 
                            mov edi,eax             -- bdx := k2 
                            and ecx,15              -- kl := remainder(k2,16) 
                            mov edx,1               -- mask2 := 1 
                            shr ecx,1               -- kl := kl/2 
                            shr edi,4               -- bdx := floor(k2/16) 
                            shl edx,cl              -- mask2 := power(2,floor(kl/2)) 
                            mov ecx,[p] 
                            or byte[esi*4+edi],dl 
                            lea eax,[eax+ecx*2]     -- k2 += p*2 
                            cmp eax,[n] 
                            jle :by2p 
                    [64] 
                        mov rcx,[pb] 
                        mov rsi,[res] 
                        shr rcx,3                   -- bdx := floor(bp/8) 
                        mov rdx,[mask] 
                        mov rax,[p2]                -- k2 
--                      test byte[rsi*4+rcx],dl     -- byte,mask -- (needs 1.0.1) 
                        test [rsi*4+rcx],rdx        -- byte,mask -- (works too) 
                        jne :nextp 
                          ::by2p 
                            mov rcx,rax             -- kl := k2 
                            mov rdi,rax             -- bdx := k2 
                            and rcx,15              -- kl := remainder(k2,16) 
                            mov dl,1                -- mask := 1 
                            shr rcx,1               -- kl := kl/2 
                            shr rdi,4               -- bdx := floor(k2/16) 
                            shl dl,cl               -- mask := power(2,floor(kl/2)) 
                            mov rcx,[p] 
                            or byte[rsi*4+rdi],dl 
                            lea rax,[rax+rcx*2]     -- k2 += p*2 
                            cmp rax,[n] 
                            jle :by2p 
                    [] 
                      ::nextp 
                  } 
        end if 
        p += 2 
        p2 = p*p 
        pb += 1 
        mask *= 2 
        if mask>#FF then mask /= #100 end if 
    end while 
    res = res[1..bytes] 
    return res -- (or return {res,bytes,n} should that be needed/desired) 
end function 
 
atom t5 = time()+5 
--atom t5 = time()-.05  -- (useful for verifying different sizes) 
integer passes = 0 
string res 
while true do 
    res = sieve(limit) 
    passes += 1 
    if time()>t5 then exit end if 
end while 
atom actual_time = time()-t5+5 
 
integer prime_count = 1 -- (no bit for 2 so count it automatically) 
--sequence primes = repeat(0,limit) -- (neither helps nor hinders) 
--primes[2] = 1                                 -- "" 
integer p = 1 
for i=1 to length(res) do 
    integer ri = res[i] 
    for j=1 to 8 do 
        if even(ri) then 
            if p>limit then exit end if 
            prime_count += 1 
--          primes[p] = 0                       -- "" 
        end if 
        ri = floor(ri/2) 
        p += 2 
    end for 
end for 
puts(1,"\n") 
integer verify = getd(limit,prime_counts) 
if prime_count==verify then 
--  printf(1,"%d primes < %d (verified)\n",{prime_count,limit}) 
    printf(1,"phix;%d;%f;1;algorithm=base,faithful=yes,bits=1\n", {passes, actual_time}) 
--  phix;5419;5.015000;1;algorithm=base,faithful=yes,bits=1 
else 
    printf(1,"**ERROR** %d primes(<%d) should be %d\n",{prime_count,limit,verify}) 
end if 
{} = wait_key() 

recap: the above is creating an odd-bit-only sieve bitmap, so res[1] contains the bits for {15,13,11,9,7,5,3,1}, res[2] the bits for {31,29,27,25,23,21,18,17}, etc. (8 bits per byte)

aside: If you don't like the #ilASM{} in the above (who would) you can just delete it and still join in, albeit trying to improve the 675 instead of the 5419.

Not Categorized, Please Help

Search



Quick Links

User menu

Not signed in.

Misc Menu