void compute_layout (int pe, int np1, int np2, Env &env)
{
  int zs0, ze0, ys0, ye0, xs1, xe1, ys2, ye2;
  int me1 = pe / np2, me2 = pe % np2;

  env.dom0.setNo(pe, 1); env.dom1.setNo(pe, 1); env.dom2.setNo(pe, 1);

  splitInt (0, NY-1, me1, np1, ys0, ye0);
  splitInt (0, NZ-1, me2, np2, zs0, ze0);
  env.dom0[pe].set(zs0, ze0, ys0, ye0, 0, NX-1);
  
  splitInt (0, NX-1, me1, np1, xs1, xe1);
  env.dom1[pe].set(zs0, ze0, 0, NY-1, xs1, xe1);
  
  splitInt (0, NY-1, me2, np2, ys2, ye2);
  env.dom2[pe].set(0, NZ-1, ys2, ye2, xs1, xe1);
}

void compute_decompositions (Env &env)
{
  Alias (int, pe, env.pre.pe); Alias (int, block, env.sp.fftblock);

  int np1 = 1, np2 = 1;
  if (noPE <= NZ) {
    env.lay1d = 1;
    np2 = noPE;
  } else {
    env.lay1d = 0;
    while (np2*np2 <= noPE) np2=np2<<1; np2=np2>>1; 
    np1 = noPE / np2;
  }
  // ASSERT (np1*np2==noPE,"no proc %d not power of 2",noPE);
  // ASSERT (NX%np1==0 && NY%np1==0,"NX or NY not power of 2");
  
  block = MIN(block,NX/np1); block = MIN(block,NY/np1);
  // ASSERT (NX%block==0 && NY%block==0,"block %d doesn't divide NX or NY",block);
  PRINTP0("Layout %s np1: %d np2: %d; Blocking %d\n",env.lay1d?"1D":"2D",np1,np2,block);
  
  // partitions
  Alias (RangeNP, ppen, env.pre.PPEn[pe]); Alias (RangeNP, alln, env.pre.ALLn[pe]);
  Alloc (env.dom0, alln); Alloc (env.dom1, alln); Alloc (env.dom2, alln);
  Alloc (env.sw01, ppen); Alloc (env.sw12, ppen); 
  Alloc (env.sw21, ppen); Alloc (env.sw10, ppen);
  Alloc (env.sw02, ppen); Alloc (env.sw20, ppen);

  for (int i=0; i<noPE; i++) compute_layout (i, np1, np2, env);

  // swaps
  computeSwap (env.dom0, env.dom1, env.sw01);
  computeSwap (env.dom1, env.dom2, env.sw12);
  computeSwap (env.dom2, env.dom1, env.sw21);
  computeSwap (env.dom1, env.dom0, env.sw10);
  computeSwap (env.dom0, env.dom2, env.sw02);
  computeSwap (env.dom2, env.dom0, env.sw20);
}

