int ilog2(int n) {

    int nn, lg;
    if (n == 1) {
	return 0;
    }
    lg = 1;
    nn = 2;
    while (nn < n) {
	nn = nn << 1;
	lg++;
    }

    return lg;
}

void fftz2 (int is, int l, int m, int n, int ny, int ny1,
	    dcomplex u[NX], dcomplex x[NX][FFTBLOCKPAD],
	    dcomplex y[NX][FFTBLOCKPAD]) {

/*--------------------------------------------------------------------
c   Performs the L-th iteration of the second variant of the Stockham FFT.
c-------------------------------------------------------------------*/

    int k,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22;
    dcomplex u1; // x11,x21;

/*--------------------------------------------------------------------
c   Set initial parameters.
c-------------------------------------------------------------------*/

    n1 = n / 2;
    if (l-1 == 0) {
	lk = 1;
    } else {
	lk = 2 << ((l - 1)-1);
    }
    if (m-l == 0) {
	li = 1;
    } else {
	li = 2 << ((m - l)-1);
    }
    lj = 2 * lk;
    ku = li;

    for (i = 0; i < li; i++) {
      
        i11 = i * lk;
        i12 = i11 + n1;
        i21 = i * lj;
        i22 = i21 + lk;
        if (is >= 1) {
          u1.real = u[ku+i].real;
          u1.imag = u[ku+i].imag;
        } else {
          u1.real = u[ku+i].real;
          u1.imag = -u[ku+i].imag;
        }

/*--------------------------------------------------------------------
c   This loop is vectorizable.
c-------------------------------------------------------------------*/
        for (k = 0; k < lk; k++) {
	    for (j = 0; j < ny; j++) {
		double x11real, x11imag;
		double x21real, x21imag;
		x11real = x[i11+k][j].real;
		x11imag = x[i11+k][j].imag;
		x21real = x[i12+k][j].real;
		x21imag = x[i12+k][j].imag;
		y[i21+k][j].real = x11real + x21real;
		y[i21+k][j].imag = x11imag + x21imag;
		y[i22+k][j].real = u1.real * (x11real - x21real)
		    - u1.imag * (x11imag - x21imag);
		y[i22+k][j].imag = u1.real * (x11imag - x21imag)
		    + u1.imag * (x11real - x21real);
	    }
	}
    }

}

void cfftz (int is, int m, int n, int fftblock2, int fftblockpad2,
	    dcomplex u[NX], dcomplex x[NX][FFTBLOCKPAD], dcomplex y[NX][FFTBLOCKPAD], int pe) {

/*--------------------------------------------------------------------
c   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
c   to Swarztrauber.  X is both the input and the output array, while Y is a 
c   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to 
c   perform FFTs, the array U must be initialized by calling CFFTZ with IS 
c   set to 0 and M set to MX, where MX is the maximum value of M for any 
c   subsequent call.
c-------------------------------------------------------------------*/

    int i,j,l,mx;

    //int fftblock = FFTBLOCK_DEFAULT;
    //int fftblockpad = FFTBLOCKPAD_DEFAULT;
    int fftblock = fftblock2;
    int fftblockpad = fftblockpad2;
    CLOCK_BEGIN(T_FFTLOW, pe);

/*--------------------------------------------------------------------
   Check if input parameters are invalid.
 -------------------------------------------------------------------*/
    mx = (int)(u[0].real);
    if ((is != 1 && is != -1) || m < 1 || m > mx) {
	printf("CFFTZ: Either U has not been initialized, or else\n"
	       "one of the input parameters is invalid%5d%5d%5d\n",
	       is, m, mx);
	exit(1);
    }

/*--------------------------------------------------------------------
c   Perform one variant of the Stockham FFT.
c-------------------------------------------------------------------*/
    for (l = 1; l <= m; l+=2) {
      fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y);
      if (l == m) break;
      fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x);
    }

/*--------------------------------------------------------------------
c   Copy Y to X.
c-------------------------------------------------------------------*/
    if (m % 2 == 1) {
	for (j = 0; j < n; j++) {
	    for (i = 0; i < fftblock; i++) {
		x[j][i].real = y[j][i].real;
		x[j][i].imag = y[j][i].imag;
	    }
	}
    }
    CLOCK_END(T_FFTLOW, pe);
}

#define LOOP_BLOCK(A,B)				\
  for (i=0; i+7 < sp.fftblock;) {		\
    A.real = B.real; A.imag = B.imag; i++; A.real = B.real; A.imag = B.imag; i++; \
    A.real = B.real; A.imag = B.imag; i++; A.real = B.real; A.imag = B.imag; i++; \
    A.real = B.real; A.imag = B.imag; i++; A.real = B.real; A.imag = B.imag; i++; \
    A.real = B.real; A.imag = B.imag; i++; A.real = B.real; A.imag = B.imag; i++; \
  }						\
  for (; i < sp.fftblock; i++) { A.real = B.real; A.imag = B.imag; }						\

#define LOOP_DOM(A,B)						\
  for (i = dom.s3, i2 = 0; i+7 <= dom.e3;) {			\
    A.real = B.real; A.imag = B.imag; i++; i2++; A.real = B.real; A.imag = B.imag; i++; i2++; \
    A.real = B.real; A.imag = B.imag; i++; i2++; A.real = B.real; A.imag = B.imag; i++; i2++; \
    A.real = B.real; A.imag = B.imag; i++; i2++; A.real = B.real; A.imag = B.imag; i++; i2++; \
    A.real = B.real; A.imag = B.imag; i++; i2++; A.real = B.real; A.imag = B.imag; i++; i2++; \
  }								\
  for (; i <= dom.e3; i++, i2++) { A.real = B.real; A.imag = B.imag; }

void cffts1 (Setup &sp, Int &dir, Vector3Dcplx &A, Range3D &dom 
             /* out A */)
{
  int lenx = dom.l3(), loglx = ilog2(lenx), pe = getPE();
  dcomplex (*yy0)[FFTBLOCKPAD] = sp.y0; dcomplex (*yy1)[FFTBLOCKPAD] = sp.y1;
  int i, i2, j, k, jj;

  // full X, block on Y  
  for (k = dom.s1; k <= dom.e1; k++) {
    for (jj = dom.s2; jj <= dom.e2 + 1 - sp.fftblock; jj+=sp.fftblock) {
      CLOCK_BEGIN(T_FFTCOPY,pe);
      for (j = 0; j < sp.fftblock; j++) {
	LOOP_DOM (yy0[i2][j], A(k,j+jj,i));
      }
      CLOCK_END(T_FFTCOPY,pe);
      cfftz (dir, loglx, lenx, sp.fftblock, sp.fftblockpad, sp.u, yy0, yy1, pe);
      CLOCK_BEGIN(T_FFTCOPY,pe);
      for (j = 0; j < sp.fftblock; j++) {
	LOOP_DOM (A(k,j+jj,i), yy0[i2][j]);
      }
      CLOCK_END(T_FFTCOPY,pe);
    }
  }
}

void cffts2 (Setup &sp, Int &dir, Vector3Dcplx &A, Range3D &dom 
             /* out A */)
{
  int leny = dom.l2(), logly = ilog2(leny), pe = getPE();
  dcomplex (*yy0)[FFTBLOCKPAD] = sp.y0; dcomplex (*yy1)[FFTBLOCKPAD] = sp.y1;
  int i, j, j2, k, ii;
  // int sz = sp.fftblock*sizeof(dcomplex);

  // full Y, block on X
  for (k = dom.s1; k <= dom.e1; k++) {
    for (ii = dom.s3; ii <= dom.e3 + 1 - sp.fftblock; ii+=sp.fftblock) {
      CLOCK_BEGIN(T_FFTCOPY,pe);
      for (j = dom.s2, j2 = 0; j <= dom.e2; j++, j2++) {
	LOOP_BLOCK (yy0[j2][i], A(k,j,i+ii));
	// MEMCPY (&yy0[j2][0], &A(k,j,ii), sz);
      }
      CLOCK_END(T_FFTCOPY,pe);
      cfftz (dir, logly, leny, sp.fftblock, sp.fftblockpad, sp.u, yy0, yy1, pe);
      CLOCK_BEGIN(T_FFTCOPY,pe);
      for (j = dom.s2, j2 = 0; j <= dom.e2; j++, j2++) {
	LOOP_BLOCK (A(k,j,i+ii), yy0[j2][i]);
	// MEMCPY (&A(k,j,ii), &yy0[j2][0], sz);
      }
      CLOCK_END(T_FFTCOPY,pe);
    }
  }
}

void cffts3 (Setup &sp, Int &dir, Vector3Dcplx &A, Range3D &dom 
             /* out A */)
{
  int lenz = dom.l1(), loglz = ilog2(lenz), pe = getPE();
  dcomplex (*yy0)[FFTBLOCKPAD] = sp.y0; dcomplex (*yy1)[FFTBLOCKPAD] = sp.y1;
  int i, j, k, k2, ii;
  // int sz = sp.fftblock*sizeof(dcomplex);

  // full Z, block on X
  for (j = dom.s2; j <= dom.e2; j++) {
    for (ii = dom.s3; ii <= dom.e3 + 1 - sp.fftblock; ii+=sp.fftblock) {
      CLOCK_BEGIN(T_FFTCOPY,pe);
      for (k = dom.s1, k2 = 0; k <= dom.e1; k++, k2++) {
	LOOP_BLOCK (yy0[k2][i], A(k,j,i+ii));
	// MEMCPY (&yy0[k2][0], &A(k,j,ii), sz);
      }
      CLOCK_END(T_FFTCOPY,pe);
      cfftz (dir, loglz, lenz, sp.fftblock, sp.fftblockpad, sp.u, yy0, yy1, pe);
      CLOCK_BEGIN(T_FFTCOPY,pe);
      for (k = dom.s1, k2 = 0; k <= dom.e1; k++, k2++) {
	LOOP_BLOCK (A(k,j,i+ii), yy0[k2][i]);
	// MEMCPY (&A(k,j,ii), &yy0[k2][0], sz);
      }
      CLOCK_END(T_FFTCOPY,pe);
    }
  }
}

// Aux

void fft_init (int n, dcomplex ua[NX]) {

/*--------------------------------------------------------------------
c compute the roots-of-unity array that will be used for subsequent FFTs. 
c-------------------------------------------------------------------*/

    int m,nu,ku,i,j,ln;
    double t, ti;

/*--------------------------------------------------------------------
c   Initialize the U array with sines and cosines in a manner that permits
c   stride one access at each FFT iteration.
c-------------------------------------------------------------------*/
    nu = n;
    m = ilog2(n);
    ua[0].real = (double)m;
    ua[0].imag = 0.0;
    ku = ln = 1;

    for (j = 1; j <= m; j++) {
	t = PI / ln;
         
	for (i = 0; i <= ln - 1; i++) {
            ti = i * t;
            ua[i+ku].real = cos(ti);
	    ua[i+ku].imag = sin(ti);
	}
         
	ku = ku + ln;
	ln = 2 * ln;
    }
}

