Open CL - Example: Computing The FFT

Example: Computing The FFT

This example will load a fast Fourier transform (FFT) implementation and execute it. The implementation is shown below.

// create a compute context with GPU device context = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL); // create a command queue clGetDeviceIDs( NULL, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL ); queue = clCreateCommandQueue(context, device_id, 0, NULL); // allocate the buffer memory objects memobjs = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*2*num_entries, srcA, NULL); memobjs = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*2*num_entries, NULL, NULL); // create the compute program program = clCreateProgramWithSource(context, 1, &fft1D_1024_kernel_src, NULL, NULL); // build the compute program executable clBuildProgram(program, 0, NULL, NULL, NULL, NULL); // create the compute kernel kernel = clCreateKernel(program, "fft1D_1024", NULL); // set the args values clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjs); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjs); clSetKernelArg(kernel, 2, sizeof(float)*(local_work_size+1)*16, NULL); clSetKernelArg(kernel, 3, sizeof(float)*(local_work_size+1)*16, NULL); // create N-D range object with work-item dimensions and execute kernel global_work_size = num_entries; local_work_size = 64; //Nvidia: 192 or 256 clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);

The actual calculation (based on Fitting FFT onto the G80 Architecture):

// This kernel computes FFT of length 1024. The 1024 length FFT is decomposed into // calls to a radix 16 function, another radix 16 function and then a radix 4 function __kernel void fft1D_1024 (__global float2 *in, __global float2 *out, __local float *sMemx, __local float *sMemy) { int tid = get_local_id(0); int blockIdx = get_group_id(0) * 1024 + tid; float2 data; // starting index of data to/from global memory in = in + blockIdx; out = out + blockIdx; globalLoads(data, in, 64); // coalesced global reads fftRadix16Pass(data); // in-place radix-16 pass twiddleFactorMul(data, tid, 1024, 0); // local shuffle using local memory localShuffle(data, sMemx, sMemy, tid, (((tid & 15) * 65) + (tid >> 4))); fftRadix16Pass(data); // in-place radix-16 pass twiddleFactorMul(data, tid, 64, 4); // twiddle factor multiplication localShuffle(data, sMemx, sMemy, tid, (((tid >> 4) * 64) + (tid & 15))); // four radix-4 function calls fftRadix4Pass(data); // radix-4 function number 1 fftRadix4Pass(data + 4); // radix-4 function number 2 fftRadix4Pass(data + 8); // radix-4 function number 3 fftRadix4Pass(data + 12); // radix-4 function number 4 // coalesced global writes globalStores(data, out, 64); }

A full, open source implementation of an OpenCL FFT can be found on Apple's website.