Execution time bottleneck: How to speed up execution?

Dan Bernstein brnstnd at kramden.acf.nyu.edu
Thu Feb 14 06:36:12 AEST 1991


In article <4763 at goanna.cs.rmit.oz.au> ok at goanna.cs.rmit.oz.au (Richard A. O'Keefe) writes:
> 	register int i, j;
> 	register double t;
> 	for (j = 1; j <= n; j++) a[j] = 1.0;
> 	for (j = 1; j <= n; j++)
> 	    for (i = j+1; i <= n; i++) {
> 		t = x[i] - x[j];
> 		t = exp(t*t*con);	/* c**((x[i]-x[j])**2) */
> 		a[i] += t;
> 		a[j] += t;		/* symmetric! */
> 	    }
> 	for (j = 1; j <= n; j++) a[j] *= k;

Some random observations, starting from this point: It's wasteful to
repeatedly multiply a homogeneous formula by a constant. First multiply
all the x's by the square root of the constant. Assuming there's space
available (or you can trash the original array, or you can divide
afterwards and not worry about roundoff error):

	register int i;
	register int j;
	register double t;
	register double sqrtc;

	sqrtc = sqrt(c);
	for (j = 1;j <= n;++j) y[j] = x[j] * sqrtc;
	for (j = 1;j <= n;++j) a[j] = 1.0;
	for (j = 1;j <= n;++j)
	  for (i = j + 1;i <= n;++i) {
	    t = y[i] - y[j];
	    t = exp(t * t);
	    a[i] += t;
	    a[j] += t;
	  }
 	for (j = 1;j <= n;j++) a[j] *= k;

Next, the loops will run faster backwards on most machines.

	register int i;
	register int j;
	register double t;
	register double sqrtc;

	sqrtc = sqrt(c);
	j = n; do { y[j] = x[j] * sqrtc; } while(--j);
	j = n; do { a[j] = 1.0; } while(--j);
	j = n; do {
	  i = n; do {
	    t = y[i] - y[j];
	    t = exp(t * t);
	    a[i] += t;
	    a[j] += t;
	  } while(--i != j);
	} while(--j);
	j = n; do { a[j] *= k; } while(--j);

Next, the accumulated a[j] sum might as well be kept in a register, as
well as y[j]:

	register int i;
	register int j;
	register double t;
	register double sqrtc;
	register double yj;
	register double aj;

	sqrtc = sqrt(c);
	j = n; do { y[j] = x[j] * sqrtc; } while(--j);
	j = n; do { a[j] = 1.0; } while(--j);
	j = n; do {
	  aj = a[j]; yj = y[j];
	  i = n; do {
	    t = y[i] - yj;
	    t = exp(t * t);
	    aj += t;
	    a[i] += t;
	  } while(--i != j);
	  a[j] = aj;
	} while(--j);
	j = n; do { a[j] *= k; } while(--j);

Under some compilers on some machines it will produce a slight speedup
to keep a + i and y + i in registers:

	register int i;
	register int j;
	register double t;
	register double sqrtc;
	register double yj;
	register double aj;
	register double *ypi;
	register double *api;

	sqrtc = sqrt(c);
	j = n; do { y[j] = x[j] * sqrtc; a[j] = 1.0; } while(--j);
	j = n; do {
	  aj = a[j]; yj = y[j];
	  i = n; api = a + i; ypi = y + i; do {
	    t = *ypi - yj;
	    t = exp(t * t);
	    aj += t;
	    *api += t;
	  } while(--api, --ypi, (--i != j));
	  a[j] = aj;
	} while(--j);
	j = n; do { a[j] *= k; } while(--j);

On the other hand, the loop should be written somewhat differently for
vector machines:

	register int i;
	register int j;
	register double t;
	register double sqrtc;
	register double yj;
	register double aj;

	sqrtc = sqrt(c);
	j = n; do { y[j] = x[j] * sqrtc; a[j] = 1.0; } while(--j);
	j = n; do {
	  yj = y[j];
	  i = n; do {
	    t = y[i] - yj;
	    t = exp(t * t);
	    a[i] += t;
	    z[i] = t;
	  } while(--i != j);
	  aj = a[j];
	  i = n; do {
	    aj += z[i];
	  } while(--i != j);
	  a[j] = aj;
	} while(--j);
	j = n; do { a[j] *= k; } while(--j);

If the machine can't vectorize exp then the loop should be split still
differently.

Another strategy to compute (yi - yj)^2 is to keep yi^2 + yj^2 and 2yiyj
in registers, and precompute tables of y[i+1]^2 - yi^2 and y[i+1]/yi. To
update the registers requires an addition and a multiplication; to get
the final value requires a subtraction and an exp. On some wacko machine
where multiplications are faster than additions, you could even keep
exp(yi^2 + yj^2) and 2yiyj around, with a table of exp(y[i+1]^2 - yi^2)
and a table of y[i+1]/yi. Then updating needs two multiplications, and
the final value requires an exp and a multiplication. These won't be
faster on any computer I've used.

Caveat: These are all untested.

---Dan



More information about the Comp.lang.c mailing list