NPB FT

The FT NASA parallel benchmark solves a 3D PDE using spectral methods. The most computation intensive sections compute X,Y,Z-wise FFTs on local domains, while the most communication intensive steps are the X-Y and Y-Z transpositions for 2D domain decompositions. The CGD implementation is similar to the original MPI Fortran version supporting both 1D and 2D decompositions. The CGD sequential computations are based on C code derived by Omni from Fortran NPB 2.3 (see files).

Types

// Domain specific types
type range Range3D;
type partition <Range3D> PartRange3D [PartNP];
type swap <Range3D> SwapRange3D [PartNP];

// Data types
type data Setup, Checksums, Int, Bool, Cplx;
type data Vector3Dcplx [PartRange3D];
type data <Cplx> ArrayCplx [PartNP];

Init Data

const Setup sp[ALL1], Int niter[ALL1], fdir[ALL1], finv[ALL1], lay1d[ALL1];
const PartRange3D dom0[ALLn], dom1[ALLn], dom2[ALLn];
const SwapRange3D sw01[PPEn], sw12[PPEn], sw21[PPEn], sw10[PPEn],
 sw02[PPEn], sw20[PPEn];

Decomposition Rules

// 3D domain redistribution rules
part sw01 : dom0 -> dom1;
part sw12 : dom1 -> dom2;
part sw21 : dom2 -> dom1;
part sw10 : dom1 -> dom0;
part sw02 : dom0 -> dom2;
part sw20 : dom2 -> dom0;

Parallel Computations

// Top level loop

function mainloop ( -> success) [gen]
Bool success[ALL1]
{
init_checksums (sp -> cksum);
compute_init_cond (sp -> u0[dom0]);

fft (u0[dom0] -> u1[dom2]);

loop (iter, cond; cksum[ONE1] -> cksum2[ONE1])
{
evolve (sp, iter, u1[dom2] -> u2[dom2]);

ifft (u2[dom2] -> u3[dom0]);

checksum (iter, u3[dom0], cksum -> cksum2);
econd (niter, iter -> cond);
}

verify <ONE1> (sp, cksum2[ONE1] -> success[ONE1]);
}

// Direct 3D FFT

function fft (A -> D)
Vector3Dcplx A[dom0], D[dom2]
{
cffts1 (sp, fdir, A[dom0] -> B[dom0]);
if (lay1d, B[dom0] -> C[dom2]) {
// 1D
cffts2 (sp, fdir, B[dom0] -> C[dom0]);
} else {
// 2D
cffts2 (sp, fdir, B[dom1] -> C[dom1]);
}
cffts3 (sp, fdir, C[dom2] -> D[dom2]);
}

// Inverse 3D FFT

function ifft (A -> D)
Vector3Dcplx A[dom2], D[dom0]
{
cffts3 (sp, finv, A[dom2] -> B[dom2]);
if (lay1d, B[dom2] -> C[dom0]) {
// 1D
cffts2 (sp, finv, B[dom0] -> C[dom0]);
} else {
// 2D
cffts2 (sp, finv, B[dom1] -> C[dom1]);
}
cffts1 (sp, finv, C[dom0] -> D[dom0]);
}

// Checksum

function checksum (iter, x, cksum, dom -> cksum2)
Int iter[ALL1], PartRange3D dom[ALLn],
Vector3Dcplx x[dom], Checksums cksum[ONE1], cksum2[ONE1]
{
checksum_dom <ALL1> (x[dom] -> c1[PPEn]);
sumproc <ONE1> (c1[ONEn] -> c2[ONE1]);
checksum_set <ONE1> (sp, c2[ONE1], iter, cksum[ONE1] -> cksum2[ONE1]);
}

Declarations

// Main

function evolve (sp, idx, u0, dom -> u1)
Setup sp, Int idx, Range3D dom,
Vector3Dcplx u0[dom], u1[dom];

function cffts1 (sp, dir, A, dom -> A*)
Setup sp, Int dir, Range3D dom, Vector3Dcplx A[dom], A*[dom];

function cffts2 (sp, dir, A, dom -> A*)
Setup sp, Int dir, Range3D dom, Vector3Dcplx A[dom], A*[dom];

function cffts3 (sp, dir, A, dom -> A*)
Setup sp, Int dir, Range3D dom, Vector3Dcplx A[dom], A*[dom];

// Checksum

function sumproc (arr, ra -> res)
ArrayCplx arr[ra], RangeNP ra, Cplx res;

function checksum_dom (x, dom -> res)
Vector3Dcplx x[dom], Range3D dom, Cplx res;

function checksum_set (sp, res, iter, cksum -> cksum*)
Setup sp, Cplx res, Int iter, Checksums cksum, cksum*;

// Helper

function init_checksums (sp -> cks)
Setup sp, Checksums cks;

function compute_init_cond (sp, dom -> u0)
Setup sp, Vector3Dcplx u0[dom], Range3D dom;

function econd (max, idx -> cond)
Int max, Int idx, cond;

function verify (sp, cks -> success)
Setup sp, Checksums cks, Bool success;

CGD Library

The compiler generates the following C++ code corresponding to the top-level section:

  swapSetup (sw02, u2_dom0, u0_dom2, 0, 0, pe);
swapSetup (sw01, u2_dom0, u2_dom1, 0, 1, pe);
swapSetup (sw12, u2_dom1, u0_dom2, 0, 2, pe);
swapSetup (sw20, u2_dom2, u2_dom0, 0, 3, pe);
swapSetup (sw21, u2_dom2, u2_dom1, 0, 4, pe);
swapSetup (sw10, u2_dom1, u2_dom0, 0, 5, pe);
swapSetup (SwPPE2ONEn, c1e_PPEn, c1e_ONEn, 0, 6, pe);

init_checksums (sp, cksum);
compute_init_cond (sp, dom0[pe], u2_dom0);

cffts1 (sp, fdir, u2_dom0 /* mod */, dom0[pe]);

if (lay1d)
{
cffts2 (sp, fdir, u2_dom0 /* mod */, dom0[pe]);

swapBegin (sw02, u2_dom0, u0_dom2, 0, 0, pe);
swapEnd (sw02, u2_dom0, u0_dom2, 0, 0, pe);
}
else
{
swapBegin (sw01, u2_dom0, u2_dom1, 0, 1, pe);
swapEnd (sw01, u2_dom0, u2_dom1, 0, 1, pe);

cffts2 (sp, fdir, u2_dom1 /* mod */, dom1[pe]);

swapBegin (sw12, u2_dom1, u0_dom2, 0, 2, pe);
swapEnd (sw12, u2_dom1, u0_dom2, 0, 2, pe);
}

cffts3 (sp, fdir, u0_dom2 /* mod */, dom2[pe]);

for (iter=0, cond=1; cond; iter++)
{
evolve (sp, iter, u0_dom2, dom2[pe], u2_dom2);

cffts3 (sp, finv, u2_dom2 /* mod */, dom2[pe]);

if (lay1d)
{
swapBegin (sw20, u2_dom2, u2_dom0, 0, 3, pe);
swapEnd (sw20, u2_dom2, u2_dom0, 0, 3, pe);

cffts2 (sp, finv, u2_dom0 /* mod */, dom0[pe]);
}
else
{
swapBegin (sw21, u2_dom2, u2_dom1, 0, 4, pe);
swapEnd (sw21, u2_dom2, u2_dom1, 0, 4, pe);

cffts2 (sp, finv, u2_dom1 /* mod */, dom1[pe]);

swapBegin (sw10, u2_dom1, u2_dom0, 0, 5, pe);
swapEnd (sw10, u2_dom1, u2_dom0, 0, 5, pe);
}

cffts1 (sp, finv, u2_dom0 /* mod */, dom0[pe]);

checksum_dom (u2_dom0, dom0[pe], c1e_PPEn[pe]);

swapBegin (SwPPE2ONEn, c1e_PPEn, c1e_ONEn, 0, 6, pe);
econd (niter, iter, cond);

swapEnd (SwPPE2ONEn, c1e_PPEn, c1e_ONEn, 0, 6, pe);

if (ONE1.getNo(pe))
{
sumproc (c1e_ONEn, ONEn[pe], c2e);
}

if (ONE1.getNo(pe))
{
checksum_set (sp, c2e, iter, cksum /* mod */);
}
}

if (ONE1.getNo(pe))
{
verify (sp, cksum, success);
}

swapBegin (SwONE2ALL1, success, success, 0, 7, pe);
swapEnd (SwONE2ALL1, success, success, 0, 7, pe);

Slabs Optimization

Communication overlap reduces communication cost by executing communication asynchronously. Overlap can be exposed for the FT algorithm by splitting each FFT computation into smaller grain computations executed on smaller "slab" domains. By doing this data needed to compute FFTs on slabs will be needed later, and communication can be overlapped with execution of FFTs on other slabs. This high level optimization is easily implemented in CGD by changing domain distributions. All lines modified or inserted by the "slabs" optimization are marked with ±.

Types

type range Range3D;
type partition <Range3D> PartRange3D [PartNP];
type mpartition <Range3D> MPartRange3D [PartNP]; ±
type mswap <Range3D> MSwapRange3D [PartNP]; ±

type data Setup, Checksums, Int, Bool, Cplx;
type data Vector3Dcplx [PartRange3D, MPartRange3D];
type data <Cplx> ArrayCplx [PartNP];

Init Data

const Setup sp[ALL1], Int niter[ALL1], fdir[ALL1], finv[ALL1], lay1d[ALL1];

const MPartRange3D mdom0[ALLn], mdom1[ALLn], mdom2[ALLn]; ±
const PartRange3D dom0[ALLn], dom1[ALLn], dom2[ALLn];
const MSwapRange3D sw01[PPEn], sw12[PPEn], sw21[PPEn], sw10[PPEn], ±
   sw02[PPEn], sw20[PPEn];

Decomposition Rules

part sw01 : dom0 -> mdom1; ±
part sw12 : dom1 -> mdom2; ±
part sw21 : dom2 -> mdom1; ±
part sw10 : dom1 -> mdom0; ±
part sw02 : dom0 -> mdom2; ±
part sw20 : dom2 -> mdom0; ±

part dom0 = mdom0; ±
part dom1 = mdom1; ±
part dom2 = mdom2; ±

Parallel Computations

function fft (A -> D)
Vector3Dcplx A[dom0], D[dom2]
{
cffts1 <ALL1> (sp, fdir, A[dom0] -> B[dom0]);
if (lay1d, B[dom0] -> D[dom2]) {
// 1D
cffts2 <ALL1> (sp, fdir, B[dom0] -> C[dom0]);
cffts3 <mdom2> (sp, fdir, C[mdom2] -> D[mdom2]); ±
 } else {
// 2D
cffts2 <mdom1> (sp, fdir, B[mdom1] -> C[mdom1]); ±
 cffts3 <mdom2> (sp, fdir, C[mdom2] -> D[mdom2]); ±
 }
}

function ifft (A -> D)
Vector3Dcplx A[dom2], D[dom0]
{
cffts3 <ALL1> (sp, finv, A[dom2] -> B[dom2]);
if (lay1d, B[dom2] -> D[dom0]) {
// 1D
cffts2 <mdom0> (sp, finv, B[mdom0] -> C[mdom0]); ±
 cffts1 <ALL1> (sp, finv, C[dom0] -> D[dom0]);
} else {
// 2D
cffts2 <mdom1> (sp, finv, B[mdom1] -> C[mdom1]); ±
 cffts1 <mdom0> (sp, finv, C[mdom0] -> D[mdom0]); ±
 }
}

CGD Library

swapEndAM first ends then starts a new communication step (when needed). This overlaps the cfftx computation executed during an interation with communication bringing data for the next iteration.

  swapSetupAM (sw02, u2_dom0, u0_dom2, 0, 0, pe);
swapSetupAM (sw01, u2_dom0, u2_dom1, 0, 1, pe);
swapSetupAM (sw12, u2_dom1, u0_dom2, 0, 2, pe);
swapSetupAM (sw20, u2_dom2, u2_dom0, 0, 3, pe);
swapSetupAM (sw21, u2_dom2, u2_dom1, 0, 4, pe);
swapSetupAM (sw10, u2_dom1, u2_dom0, 0, 5, pe);
swapSetup (SwPPE2ONEn, c1e_PPEn, c1e_ONEn, 0, 6, pe);

init_checksums (sp, cksum);
compute_init_cond (sp, dom0[pe], u2_dom0);

cffts1 (sp, fdir, u2_dom0 /* mod */, dom0[pe]);

if (lay1d)
{
cffts2 (sp, fdir, u2_dom0 /* mod */, dom0[pe]);

swapBeginAM (sw02, u2_dom0, u0_dom2, 0, 0, pe);
for (int _mi=0; _mi<mdom2.getNo(pe); _mi++)
{
swapEndAM (sw02, u2_dom0, u0_dom2, _mi, 0, 0, pe);

cffts3 (sp, fdir, u0_dom2 /* mod */, mdom2.idx(pe,_mi));
}
}
else
{
swapBeginAM (sw01, u2_dom0, u2_dom1, 0, 1, pe);
for (int _mi=0; _mi<mdom1.getNo(pe); _mi++)
{
swapEndAM (sw01, u2_dom0, u2_dom1, _mi, 0, 1, pe);

cffts2 (sp, fdir, u2_dom1 /* mod */, mdom1.idx(pe,_mi));
}

swapBeginAM (sw12, u2_dom1, u0_dom2, 0, 2, pe);
for (int _mi=0; _mi<mdom2.getNo(pe); _mi++)
{
swapEndAM (sw12, u2_dom1, u0_dom2, _mi, 0, 2, pe);

cffts3 (sp, fdir, u0_dom2 /* mod */, mdom2.idx(pe,_mi));
}
}

for (iter=0, cond=1; cond; iter++)
{
evolve (sp, iter, u0_dom2, dom2[pe], u2_dom2);

cffts3 (sp, finv, u2_dom2 /* mod */, dom2[pe]);

if (lay1d)
{
swapBeginAM (sw20, u2_dom2, u2_dom0, 0, 3, pe);
for (int _mi=0; _mi<mdom0.getNo(pe); _mi++)
{
swapEndAM (sw20, u2_dom2, u2_dom0, _mi, 0, 3, pe);

cffts2 (sp, finv, u2_dom0 /* mod */, mdom0.idx(pe,_mi));
}

cffts1 (sp, finv, u2_dom0 /* mod */, dom0[pe]);
}
else
{
swapBeginAM (sw21, u2_dom2, u2_dom1, 0, 4, pe);
for (int _mi=0; _mi<mdom1.getNo(pe); _mi++)
{
swapEndAM (sw21, u2_dom2, u2_dom1, _mi, 0, 4, pe);

cffts2 (sp, finv, u2_dom1 /* mod */, mdom1.idx(pe,_mi));
}

swapBeginAM (sw10, u2_dom1, u2_dom0, 0, 5, pe);
for (int _mi=0; _mi<mdom0.getNo(pe); _mi++)
{
swapEndAM (sw10, u2_dom1, u2_dom0, _mi, 0, 5, pe);

cffts1 (sp, finv, u2_dom0 /* mod */, mdom0.idx(pe,_mi));
}
}

checksum_dom (u2_dom0, dom0[pe], c1e_PPEn[pe]);

swapBegin (SwPPE2ONEn, c1e_PPEn, c1e_ONEn, 0, 6, pe);
econd (niter, iter, cond);

swapEnd (SwPPE2ONEn, c1e_PPEn, c1e_ONEn, 0, 6, pe);

if (ONE1.getNo(pe))
{
sumproc (c1e_ONEn, ONEn[pe], c2e);
}

if (ONE1.getNo(pe))
{
checksum_set (sp, c2e, iter, cksum /* mod */);
}
}

if (ONE1.getNo(pe))
{
verify (sp, cksum, success);
}

swapBegin (SwONE2ALL1, success, success, 0, 7, pe);
swapEnd (SwONE2ALL1, success, success, 0, 7, pe);

Files


ProgrammerCompiler generated
CGD sourceType DeclarationsSequential ComputationsComputation DeclarationsParallel Functions
FT 1D,2Dft.pdft.hft.c
init.c
ft_decl.hft_auto.cc
FT slabsft_sl.pd
ft_sl.c
init_sl.c
ft_sl_decl.hft_sl_auto.cc



fft.c
other.c