2 GPU multi-rate FIR filter bank example software
4 Oxford e-Research Centre, Oxford University
6 Centre for Digital Music, Queen Mary, University of London.
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License,
11 or (at your option) any later version.
12 See the file COPYING included with this distribution for more information.
16 #include <helper_cuda.h>
19 ///////////////////////////////////////////////////////////////////////////////
21 ///////////////////////////////////////////////////////////////////////////////
23 __global__ void multiFilterGpuDevice_2dgrid( float *in, float *out, float *filter, int osize, int isize, int bsize)
/*
 * Multi-filter FIR kernel: one filter per grid row (y direction), one
 * output sample per thread.  Filter taps are staged into a per-thread
 * local array; the input signal (plus a B_SIZE-sample history halo) is
 * staged into dynamically allocated shared memory.
 *
 * NOTE(review): this view is an elided extraction -- declarations of
 * `tx`, `by` and `outdata`, the blockIdx.x == 0 branch header, any
 * __syncthreads() barriers, and the closing braces are in lines not
 * visible here.  Comments below hedge accordingly.
 */
26  * This GPU kernel applies multiple filters, one per y direction.
27  * It is assumed that input data arrays are contiguous in memory.
// Flat global index of this thread's output/input sample along x.
32     int tidx = threadIdx.x + blockIdx.x*blockDim.x;
34     // dynamically allocated shared memory, passed at kernel call
// Shared staging buffer: blockDim.x samples plus a B_SIZE halo
// (sized by the third kernel-launch argument).
35     extern __shared__ float indata[];
// Per-thread copy of this row's B_SIZE filter coefficients.
36     float tempdata[B_SIZE];
// Load the taps of filter row `by` (presumably blockIdx.y -- declared
// in an elided line; TODO confirm).  Parameter `bsize` is unused in
// the visible lines; B_SIZE appears to be a compile-time constant.
44     for (int i=0; i< B_SIZE; i++)
45       tempdata[i] = filter[by * B_SIZE + i];
// First block: samples beyond the first B_SIZE (the leading-halo case
// for blockIdx.x == 0 is handled in elided lines).
55 	indata[tx + B_SIZE ] = in[tx + B_SIZE];
60     // copy rest of blocks in overlapped fashion
// Non-first blocks re-read the previous block's last B_SIZE samples
// as the halo, so the convolution can look back across the boundary.
61     if (blockIdx.x > 0 ) {
62       if (tx < B_SIZE && tidx < isize)
63 	indata[tx] = in[tidx];
66 	indata[tx + B_SIZE] = in[tidx + B_SIZE];
69     //finished loading signal
// NOTE(review): a __syncthreads() must separate the shared-memory
// writes above from the reads below; it is not visible in this
// extraction -- confirm it exists in the elided lines.
// FIR dot product: out[n] = sum_i h[i] * x[n - i]; the +B_SIZE offset
// indexes past the halo.  `outdata` is declared/zeroed in elided lines.
73     for (int i=0; i<B_SIZE; i++){
74       outdata += tempdata[i] * indata[tx + B_SIZE - i];
// One output sample into this filter's row of the 2D output array.
// NOTE(review): a guard such as `tidx < osize` is not visible here --
// verify the elided lines bound this store.
78     out[by * osize + tidx] = outdata;
81 __global__ void multiFilterGpuDevice_2dgrid_shmem( float *in, float *out, float *filter, int osize, int isize, int bsize)
/*
 * Variant of multiFilterGpuDevice_2dgrid that keeps the filter taps in
 * a statically sized __shared__ array (one copy per block) instead of
 * per-thread local storage; otherwise the same one-filter-per-y-row,
 * one-sample-per-thread FIR scheme.
 *
 * NOTE(review): elided extraction -- `tx`, `by`, `outdata`
 * declarations, __syncthreads() barriers and closing braces are in
 * lines not visible here.
 */
84 This kernel applies multiple filters, one per y direction
85 It is assumed that input data arrays are contiguous in memory
// Flat global sample index along x.
90     int tidx = threadIdx.x + blockIdx.x*blockDim.x;
// Dynamic shared staging buffer for the signal (+ B_SIZE halo).
92     extern __shared__ float indata[];
// Block-shared copy of this row's filter taps.
93     __shared__ float tempdata[B_SIZE];
// Every thread writes the same values into tempdata (same `by`, same
// loop) -- redundant but value-consistent.  A __syncthreads() before
// the taps are consumed is required; not visible here, TODO confirm.
101     for (int i=0; i< B_SIZE; i++)
102       tempdata[i] = filter[by * B_SIZE + i];
// First block along x: no predecessor, leading halo handled in the
// elided lines of this branch.
107     if (blockIdx.x == 0){
113 	indata[tx + B_SIZE ] = in[tx + B_SIZE];
118     // copy rest of blocks in overlapped fashion
// Later blocks reload the previous B_SIZE input samples as the halo.
119     if (blockIdx.x > 0 ) {
120     if (tx < B_SIZE && tidx < isize)
121 	indata[tx] = in[tidx];
124 	indata[tx + B_SIZE] = in[tidx + B_SIZE];
127     //finished loading signal
// FIR accumulation; `outdata` is declared/zeroed in elided lines.
// NOTE(review): confirm a __syncthreads() precedes this loop.
131     for (int i=0; i<B_SIZE; i++){
132       outdata += tempdata[i] * indata[tx + B_SIZE - i];
// Store one sample into this filter's output row.  NOTE(review): an
// `tidx < osize` guard is not visible in this extraction -- verify.
136     out[by * osize + tidx] = outdata;
139 void cudaMultiFilterFirInit(float **in, float **out, float *filters,
140 			    params *params, gpu_arrays *arrays)
/*
 * One-time setup for the multi-rate filter bank: selects a CUDA
 * device, optionally creates one stream per sampling rate, copies all
 * filter coefficient sets to the GPU, and allocates the per-rate
 * input/output buffers (device memory always; pinned host staging
 * buffers only when streams are enabled).
 *
 * NOTE(review): elided extraction -- braces and some statements
 * (including the visible-variable initialisations flagged below) are
 * in lines not shown here.
 */
143    Perform all memory allocations and transfers required
144    for a GPU multiple filter operation.
145    This also allows to keep allocated memory and reusable
146    data on the GPU, in order to repeat CUDA calls
147    along the whole input signal.
151    @param in: the input buffers (2D array), for resampled inputs
152    @param out: the output buffers (2D array), different sampling
153    @param filters: a two dimensional array containing the coefficients
155    @param params: structure for the filters parameters, which
156            contains the input and output sizes for each filter
157            as well as the number of filters and sampling rates
159    @param arrays: a structure of pointers to the GPU arrays for reuse
// Length (taps) of each filter; all filters share this size.
162   int fsize = params->fsize;
163   int rnumfs; // number of filters for a given sampling rate
164   int nrates = params->nrates;
166   int pos; //starting position of filters for a given rate
// NOTE(review): `pos` must start at 0 before the loop below; its
// initialisation is in an elided line -- confirm.
168   // initialise card (cuda_helper.h routine)
169   findCudaDevice(0, 0);
// One stream per sampling rate so copies/kernels for different rates
// can overlap (see cudaMultiFilterFirStreams).
172   if (params->streams){
173     for (int i = 0; i < nrates; ++i)
174       checkCudaErrors(cudaStreamCreate(&arrays->stream[i]));
177   //allocate memory in separate locations for the different sampling rates
181   for (int i=0; i<nrates; i++){
182     rnumfs = params->rnumf[i]; // number of filters for this rate
184     //allocate memory for these filters
185     checkCudaErrors(cudaMalloc((void**)&arrays->d_filters[i], fsize * rnumfs * sizeof(float)));
187     //copy this filter array to GPU memory
// `filters` is row-major [filter][tap]; pos*fsize skips the filters
// belonging to the previous rates.
188     checkCudaErrors(cudaMemcpy(arrays->d_filters[i], filters + pos*fsize,
189 			       fsize * rnumfs * sizeof(float), cudaMemcpyHostToDevice));
// Advance past this rate's filters for the next iteration.
191     pos += params->rnumf[i];
195   // Inputs and outputs
197   for (int i=0; i<nrates; i++){
198     // allocate host page locked memory for staging
199     // to allow CUDA asynchronous transfers if we are working with steams
200     // if we are not using streams the input and output pointers
201     // from the host are passed.
202     if (params->streams){
// Pinned (page-locked) host buffers: required for true async
// cudaMemcpyAsync overlap; output holds osize samples per filter.
203       checkCudaErrors(cudaMallocHost((void**)&arrays->h_in[i],
204 				     arrays->isize[i] * sizeof(float)));
205       checkCudaErrors(cudaMallocHost((void**)&arrays->h_out[i],
206 				     arrays->osize[i] * params->rnumf[i] * sizeof(float)));
// Non-stream path: reuse the caller's buffers directly (the `else`
// keyword for this branch is in an elided line).
208       arrays->h_in[i] = in[i];
209       arrays->h_out[i] = out[i];
211     // allocate device arrays
212     checkCudaErrors(cudaMalloc((void**)&arrays->d_in[i],
213 			       arrays->isize[i] * sizeof(float)));
214     checkCudaErrors(cudaMalloc((void**)&arrays->d_out[i],
215 			       arrays->osize[i] * params->rnumf[i] * sizeof(float)));
220 void cudaMultiFilterFirClose(params *params, gpu_arrays *arrays)
/*
 * Release every per-rate resource created by cudaMultiFilterFirInit:
 * device filter/input/output buffers always; pinned host buffers and
 * streams only when streaming was enabled (mirrors the allocation
 * logic, so caller-owned buffers assigned in the non-stream path are
 * never freed here).
 */
223  * Clean up CUDA resources
225    @param params: structure for the filters parameters, which
226            contains the input and output sizes for each filter
227            as well as the number of filters and sampling rates
229    @param arrays: a structure of pointers to the GPU arrays for reuse
231   int nrates = params->nrates;
233   for (int i=0; i<nrates; i++){
234     checkCudaErrors(cudaFree(arrays->d_filters[i]));
235     checkCudaErrors(cudaFree(arrays->d_in[i]));
236     checkCudaErrors(cudaFree(arrays->d_out[i]));
// Only stream mode owns pinned host staging memory and streams.
238     if (params->streams){
239       checkCudaErrors(cudaFreeHost(arrays->h_in[i]));
240       checkCudaErrors(cudaFreeHost(arrays->h_out[i]));
// NOTE(review): unlike the calls above, this return status is not
// routed through checkCudaErrors -- a destroy failure is silent.
242       cudaStreamDestroy(arrays->stream[i]);
248 void cudaMultiFilterFirStreams(gpu_arrays *arrays, params *params)
/*
 * Per-call driver: for each sampling rate, asynchronously copy the
 * input to the GPU, launch the 2D-grid filter-bank kernel(s)
 * (grid.y = one row per filter of that rate), and copy the results
 * back -- all on that rate's dedicated stream so the rates overlap.
 *
 * NOTE(review): elided extraction -- the declarations of the
 * `threads` and `blocks` arrays, loop braces, and the function's tail
 * (including any final stream synchronisation) are in lines not
 * visible here, and the function is truncated at the bottom of this
 * view.
 */
251    This function performs multiple filters with multiple input
252    sampling rates on a GPU.
253    The required memory is pre-allocated on the GPU
254    and filters also copied in advance
255    The different rates are executed asynchronously on the GPU
258    @param arrays: all the GPU and host pinned memory input
259            and output arrays we need for asynchronous operation
260    @param params: structure for the filters parameters, which
261            contains the input and output sizes for each filter
262            as well as the number of filters and sampling rates
266   int nrates = params->nrates;
268   //setup execution configuration
// Per-rate dynamic shared-memory size, passed as the 3rd launch arg.
271   size_t shmem[nrates];
273   for (int i = 0; i < nrates; ++i){
// Tiny outputs: one thread per output sample in a single block.
274     if ( arrays->osize[i] < 64)
275       threads[i] = arrays->osize[i];
277       // filter size must be less than number of threads
278       //nearest multiple of 32
// B_SIZE + (64 - B_SIZE % 32) is always a multiple of 32 and strictly
// greater than B_SIZE, satisfying the constraint above (an `else` for
// this assignment is presumably in an elided line -- TODO confirm).
279       threads[i] = B_SIZE + (64 - B_SIZE % 32);
// NOTE(review): osize/threads + 1 launches one extra block when osize
// divides evenly -- the kernels must tolerate the overshoot; verify.
282     blocks[i].x = ( arrays->osize[i]) / threads[i] + 1;
// One grid row per filter at this sampling rate.
283     blocks[i].y = params->rnumf[i];
// Signal tile plus fsize-sample halo staged in shared memory.
284     shmem[i] = (threads[i] + params->fsize) * sizeof(float);
288   // copy inputs with different sampling rates asynchronously to the GPU
// h_in is pinned (stream mode), so these H2D copies truly overlap
// across the per-rate streams.
289   for (int i = 0; i < nrates; ++i){
290     cudaMemcpyAsync(arrays->d_in[i], arrays->h_in[i],
291 		    arrays->isize[i] * sizeof(float),
292 		    cudaMemcpyHostToDevice, arrays->stream[i]);
294     //run kernels for different rates concurrently
// NOTE(review): both kernel variants appear here back-to-back with
// identical arguments; the elided lines around them (295, 299) may
// comment one out or select between them -- confirm before assuming
// both execute (running both would redundantly overwrite d_out).
296     multiFilterGpuDevice_2dgrid <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
297       (arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
298        arrays->osize[i], arrays->isize[i], params->fsize);
300     multiFilterGpuDevice_2dgrid_shmem <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
301       (arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
302        arrays->osize[i], arrays->isize[i], params->fsize);
304     /* // transfer data back */
// D2H copy of all filter rows for this rate, stream-ordered after the
// kernel(s).  NOTE(review): no cudaGetLastError()/synchronisation is
// visible in this truncated view -- the caller must sync before
// reading h_out; confirm in the elided tail.
305     cudaMemcpyAsync(arrays->h_out[i], arrays->d_out[i],
306 		    arrays->osize[i] * params->rnumf[i] * sizeof(float),
307 		    cudaMemcpyDeviceToHost, arrays->stream[i]);