Re: which is faster?

new topic     » goto parent     » topic index » view thread      » older message » newer message

Robert Craig wrote:
> 
> Matt Lewis wrote:
> > CChris wrote:
> > > I thought that precomputing s[i] would speed things up, but this didn't
> > > occur.
> > > Not sure why.
> > 
> > I think you didn't use s[i] enough to make the difference.  Also, the 
> > interpreter is smart enough to reuse the temp variable that it creates.
> > Here's the IL that's generated when you don't try to cache s[i]:
> > 
> > 40: 058 119          # STARTLINE: string.exw(119)<<sum += power(s[i], 
> >                      #     2)>>
> > 42: 025 171 178 179  # RHS_SUBS: [s:171] sub [i:178] => [_temp_:179]
> > 46: 013 179 179 179  # MULTIPLY: [_temp_:179], [_temp_:179] => 
> >                      #     [_temp_:179]
> > 50: 011 174 179 174  # PLUS: [sum:174], [_temp_:179] => [sum:174]
> > 
> > Here's the IL when you *do* cache s[i]:
> > 52: 058 138          # STARTLINE: string.exw(138)<<sd = s[i]>>
> > 54: 025 181 189 185  # RHS_SUBS: [s:181] sub [i:189] => [sd:185]
> > 58: 101 185          # ATOM_CHECK: [sd:185]
> > 60: 087 185          # DISPLAY_VAR: [sd:185] 
> > 62: 058 139          # STARTLINE: string.exw(139)<<sum += sd*sd>>
> > 64: 013 185 185 190  # MULTIPLY: [sd:185], [sd:185] => [_temp_:190]
> > 68: 011 184 190 184  # PLUS: [sum:184], [_temp_:190] => [sum:184]
> > 
> > The STARTLINE and DISPLAY_VAR codes are due to with trace (which I put in
> > so we could see the actual lines for clarity) and don't really affect
> > the algorithm (assuming that a production release would make sure that
> > without trace was in effect).
> > 
> > But the bottom line is that you've added an extra ATOM_CHECK op that 
> > doesn't happen when you use it straight.  I think the interpreter is able
> > to do this because it's on the same line (though I'll admit that the
> > logic of when to use temps and when to save them is still opaque to
> > me, so Rob is probably the only one who could explain this without pouring
> > over the code).
> 
> When type_check is in force (by default) then
> the interpreter has to do the ATOM_CHECK when you
> assign to a variable declared as atom. If you say
> "without type_check", that check should be eliminated.
> 
> As Michael Sabal noted, power(x, 2) is optimized
> to x * x (see emit.e), so you get:
> 
>    temp = s[i]
>    temp = temp * temp
> 
> It's smart enough to re-use the same temp.
> 
> Regards,
>    Rob Craig
>    Rapid Deployment Software
>    <a href="http://www.RapidEuphoria.com">http://www.RapidEuphoria.com</a>

Indeed, I had forgotten about that type_check one...
A modified benchmark, and a pretty much modified output:
sequence s,s1  -- data
constant num_runs=10000, len_data=10000
s=repeat(0,num_runs)
s1=repeat(0,len_data)

without type_check
-- generate random data
for k=1 to num_runs do
for i=1 to len_data do
s1[i]=rand(12345+k)
end for
s[k]=s1
end for
puts(1,"Data initialised, starting benchmark...\n")

function mean(sequence s)
-- adds verything up in s, returns average
atom result
result=s[1]
for i=2 to length(s) do result+=s[i] end for
return result/length(s)
end function

global function st_dev0(sequence s, atom X)
    atom m, sum, sd
    integer l
    sum = 0
    l = length(s)
    m = mean(s)
    s -= m
    for i = 1 to length(s) do
	sum += power(s[i], 2)
    end for
    sd = sqrt(sum/(l-1))
    if X = 0 then
	return sd
    else -- return the z-score
	return (X - m)/sd
    end if
end function

global function st_dev1(sequence s, atom X)
    atom m, sum, sd
    integer l
    sum = 0
    l = length(s)
    m = mean(s)
    for i = 1 to length(s) do
	sum += power(s[i], 2)
    end for
    sum -= m*m*l
    sd = sqrt(sum/(l-1))
    if X = 0 then
	return sd
    else -- return the z-score
	return (X - m)/sd
    end if
end function

global function st_dev2(sequence s, atom X)
    atom m, sum, sd
    integer l
    sum = 0
    l = length(s)
    m = mean(s)
    for i = 1 to length(s) do
	    sd = s[i]
        sum += sd*sd
    end for
    sum -= m*m*l
    sd = sqrt(sum/(l-1))
    if X = 0 then
	return sd
    else -- return the z-score
	return (X - m)/sd
    end if
end function

global function st_dev3(sequence s, atom X)
    atom m, sum, sd , m2
    integer l
    sum = 0
    l = length(s)
    m = mean(s)
    m2 = m*m
    for i = 1 to length(s) do
	    sd = s[i]
        sum += sd*sd
    end for
    sum -= m2
    sd = sqrt((sum/(l-1)) - m2)
    if X = 0 then
	return sd
    else -- return the z-score
	return (X - m)/sd
    end if
end function

global function st_dev4(sequence s, atom X)
    atom m, sum, sum_sq, sd , m2
    integer l
    sum_sq = 0.0
    sum = 0.0
    l = length(s)
    for i = 1 to length(s) do
	    sd = s[i]
	    sum += sd
        sum_sq += sd*sd
    end for
    m = sum/l
    m2 = m*m
    sum_sq -= m2
    sd = sqrt((sum_sq/(l-1)) - m2)
    if X = 0 then
	return sd
    else -- return the z-score
	return (X - m)/sd
    end if
end function

global function st_dev5(sequence s, atom X)
    atom m, sum, sum_sq, sd , m2
    integer l
    object x

    sum_sq = 0.0
    sum = 0.0
    l = length(s)
    for i = 1 to length(s) do
	    x = s[i]
        sum += x
        sum_sq += x*x
    end for
    m = sum/l
    m2 = m*m
    sum_sq -= m2
    sd = sqrt((sum_sq/(l-1)) - m2)
    if X = 0 then
	return sd
    else -- return the z-score
	return (X - m)/sd
    end if
end function

atom t,junk
t=time()
for i=1 to length(s) do
   junk=st_dev0(s[i],0.0)
end for
printf(1,"Variant form #0 took %3.6gs.\n",time()-t)
t=time()
for i=1 to length(s) do
   junk=st_dev1(s[i],0.0)
end for
printf(1,"Variant form #1 took %3.6gs.\n",time()-t)
t=time()
for i=1 to length(s) do
   junk=st_dev2(s[i],0.0)
end for
printf(1,"Variant form #2 took %3.6gs.\n",time()-t)
t=time()
for i=1 to length(s) do
   junk=st_dev3(s[i],0.0)
end for
printf(1,"Variant form #3 took %3.6gs.\n",time()-t)
t=time()
for i=1 to length(s) do
   junk=st_dev4(s[i],0.0)
end for
printf(1,"Variant form #4 took %3.6gs.\n",time()-t)
t=time()
for i=1 to length(s) do
   junk=st_dev5(s[i],0.0)
end for
printf(1,"Variant form #5 took %3.6gs.\n",time()-t)

?machine_func(26,0)

Note that variant #5 was modified.
This now results in:
Data initialised, starting benchmark...
Variant form #0 took 70.96s.
Variant form #1 took 13.35s.
Variant form #2 took 13.51s.
Variant form #3 took 13.62s.
Variant form #4 took 12.42s.
Variant form #5 took 12.41s.

You will notice how the call to a helper function, while it makes the code
clearer and easier to maintain, induces a 7-8% speed penalty.
I vote for macros in Eu. Of course, they have to be carefully managed: the C way
of doing it has syntactical problems which have been talked about at length. The
mixin() construct from D would probably be a better idea.

CChris

new topic     » goto parent     » topic index » view thread      » older message » newer message

Search



Quick Links

User menu

Not signed in.

Misc Menu