2D FFT

The 2D Fast Fourier Transform (FFT) application is particularly demanding in terms of global communication bandwidth. Each processor starts holding a set of columns, and computes column-wise FFTs. After a transpose each processor holds a set of rows and computes row-wise FFTs. Similarly, after another transpose the inverse FFT is computed.

Types

type range Range1D, Range2D;
type partition <Range1D> PartRange1D [PartNP];
type partition <Range2D> PartRange2D [PartNP];
type swap <Range2D> SwapRange2D [PartNP];

type data Int, Config, File;
type data Vector1Dcplx [PartRange1D];
type data Vector2Dcplx [PartRange2D];

Declarations

// FFTs

function fft_col (A, W, col, r1 -> B)
Vector2Dcplx A[col], B[col],
Vector1Dcplx W[r1],
Range2D col, Range1D r1;

function fft_row (A, W, row, r1 -> B)
Vector2Dcplx A[row], B[row],
Vector1Dcplx W[r1],
Range2D row, Range1D r1;

function negate_im (A, ra -> B)
Vector2Dcplx A[ra], B[ra],
Range2D ra;

function pre_fft (r1 -> W)
Vector1Dcplx W[r1],
Range1D r1;

// Auxiliary computations

function matrix_read (cfg, ra -> A, R)
Vector2Dcplx A[ra], R[ra],
Range2D ra,
Config cfg;

function check_equal(A, B, ra)
Vector2Dcplx A[ra], B[ra],
Range2D ra;

function econd (cfg, idx -> cond)
Int idx, cond,
Config cfg;

function init (cfg -> row, col, r1, f)
PartRange2D row[ALLn], col[ALLn],
PartRange1D r1[ALLn],
File f, Config cfg;

Computations

// Parallel functions

function test (cfg) [gen]
Config cfg[ALL1]
{
computeSwap (col, row -> sw_cr[PPEn]);
computeSwap (row, col -> sw_rc[PPEn]);

init (cfg -> row, col, r1, fout);
pre_fft (r1 -> W);
matrix_read (cfg, col -> A, R);

loop (i, cond ; A [col] -> B [col]) [clock:MAIN]
{
fft2d (A[col], W[r1], sw_cr, sw_rc, row -> X[col]);
ifft2d (X[col], W[r1], sw_cr, sw_rc, row -> B[col]);

econd (cfg, i -> cond);
}

check_equal (B, R, col);
print (fout, B, col);
}

function fft2d (A, W, sw_cr, sw_rc, row, col, r1 -> B)
Vector2Dcplx A[col], B[col],
Vector1Dcplx W[r1],
PartRange2D row[ALLn], col[ALLn],
PartRange1D r1[ALLn],
SwapRange2D sw_cr[PPEn], sw_rc[PPEn]
{
// DECOMPOSITION RULES
sw_cr : col -> row;
sw_rc : row -> col;

fft_col (A[col], W[r1] -> X[col]);
fft_row (X[row], W[r1] -> B[row]);
}

// inverse FFT transform


function ifft2d (A, W, sw_cr, sw_rc, row, col, r1 -> B)
Vector2Dcplx A[col], B[col],
Vector1Dcplx W[r1],
PartRange2D row[ALLn], col[ALLn],
PartRange1D r1[ALLn],
SwapRange2D sw_cr[PPEn], sw_rc[PPEn]
{
// DECOMPOSITION RULES
sw_cr : col -> row;
sw_rc : row -> col;

negate_im (A[col] -> Am[col]);

fft_col (Am[col], W[r1] -> X[col]);
fft_row (X[row], W[r1] -> Bm[row]);

negate_im (Bm[row] -> B[row]);
}

CGD Library

The following code generated by the CGD compiler shows how all functions are merged together and datastructure naming allows memory reuse. This code covers the main loop.

  for (i=0, cond=1; cond; i++)
{
CLK_MAIN_BEGIN(i);

fft_col (A_col, W_r1, col[pe], r1[pe], Xe2_col);

swapBegin (sw_cr, Xe2_col, Bme_row, 0, pe);

econd (cfg, i, cond);

swapEnd (sw_cr, Xe2_col, Bme_row, 0, pe);

fft_row (Bme_row, W_r1, row[pe], r1[pe], A_row);

swapBegin (sw_rc, A_row, Xe2_col, 0, pe);

swapEnd (sw_rc, A_row, Xe2_col, 0, pe);

negate_im (Xe2_col, col[pe], Ame_col);

fft_col (Ame_col, W_r1, col[pe], r1[pe], Xe2_col);

swapBegin (sw_cr, Xe2_col, A_row, 0, pe);

swapEnd (sw_cr, Xe2_col, A_row, 0, pe);

fft_row (A_row, W_r1, row[pe], r1[pe], Bme_row);

negate_im (Bme_row, row[pe], A_row);

swapBegin (sw_rc, A_row, A_col, 0, pe);

swapEnd (sw_rc, A_row, A_col, 0, pe);

CLK_MAIN_END(i);
}

Files


ProgrammerCompiler generated
CGD sourceType DeclarationsSequential ComputationsComputation DeclarationsParallel Functions
FFT 2Dfft2d.pdfft2d.hfft2d.ccfft2d_decl.hfft2d_auto.cc


fft1d.hfft1d.cc