
On Wed, Jan 28, 2009 at 04:42:49PM -0800, drblanco wrote:
Here's my attempt, which takes about 75s for r=10^8.
circ2 r = (1+4*r) + 4 * (circ2' (rs+1) r 1 0) where rs = r^2 circ2' rad x y sum | x
For comparison, the C code below runs in <1 second.
typedef unsigned long long bigint;
bigint gausscount(bigint r) { bigint sum=0; bigint x, y; bigint rs=r*r; bigint rad=rs+1; x=r; y=1;
while (y
rs) { rad-=2*x; rad++; x--; } sum+=1+2*(x-y); rad+=2*y+1; y++; } sum*=4; sum++;
return sum; }
for the curious, here is the C output of jhc when fed circ2 annotated with the obvious bang-patterns (the strictness analyzer still needs some help): other than giving every intermediate value a name, and only performing one operation a line (which doesn't bother gcc at all), it is pretty darn close to the hand optimized C code... variable guide: x -> v86 y -> v88 rs -> v4146 rad -> v84 and sum is accumulated in v90
static uint64_t circ2(uint64_t v12) { uint64_t v212; uint64_t v4146 = (v12 * v12); uint64_t v4758 = (4 * v12); uint64_t v4786 = (1 + v4758); uint64_t v4160 = (1 + v4146); uint64_t v84 = v4160; uint64_t v86 = v12; uint64_t v88 = 1; uint64_t v90 = 0; fW_a__fMain__3_ucirc2_t_u2:; { uint16_t v100000 = (((int64_t)v86) < ((int64_t)v88)); if (0 == v100000) { uint16_t v100002 = (((int64_t)v84) <= ((int64_t)v4146)); if (0 == v100002) { uint64_t v4338 = (1 + v84); uint64_t v4352 = (2 * v86); uint64_t v4366 = (v4338 - v4352); uint64_t v4380 = (v86 - 1); v84 = v4366; v86 = v4380; v88 = v88; v90 = v90; goto fW_a__fMain__3_ucirc2_t_u2; } else { /* 1 */ uint64_t v4226 = (1 + v90); uint64_t v4772 = (v86 - v88); uint64_t v4282 = (2 * v4772); uint64_t v4296 = (v4226 + v4282); uint64_t v4310 = (1 + v88); uint64_t v4240 = (1 + v84); uint64_t v4254 = (2 * v88); uint64_t v4324 = (v4240 + v4254); v84 = v4324; v86 = v86; v88 = v4310; v90 = v4296; goto fW_a__fMain__3_ucirc2_t_u2; } } else { /* 1 */ v212 = v90; } } uint64_t v4174 = (4 * v212); return v4786 + v4174; }
John -- John Meacham - ⑆repetae.net⑆john⑈