filters_cuda.cu
Go to the documentation of this file.
1 /*
2  GPU multi-rate FIR filter bank example software
3 
4  Oxford e-Research Centre, Oxford University
5 
6  Centre for Digital Music, Queen Mary, University of London.
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License,
11  or (at your option) any later version.
12  See the file COPYING included with this distribution for more information.
13 */
14 
15 #include "filters.h"
16 #include <helper_cuda.h>
17 
18 
19 ///////////////////////////////////////////////////////////////////////////////
20 // GPU kernels
21 ///////////////////////////////////////////////////////////////////////////////
22 
__global__ void multiFilterGpuDevice_2dgrid( float *in, float *out, float *filter, int osize, int isize, int bsize)
{
 /**
  * Apply multiple FIR filters to one input signal, one filter per grid
  * row (blockIdx.y).  Each thread produces one output sample:
  *   out[by*osize + tidx] = sum_{i<B_SIZE} filter[by*B_SIZE + i] * in[tidx + B_SIZE - i]
  * Input samples are staged in dynamically allocated shared memory
  * (blockDim.x + B_SIZE floats, passed as the 3rd launch argument);
  * filter taps are kept in a private per-thread register array.
  * It is assumed that input data arrays are contiguous in memory.
  *
  * NOTE(review): the `bsize` parameter is unused -- the tap count is the
  * compile-time constant B_SIZE; presumably params->fsize == B_SIZE, confirm.
  */

 int tx = threadIdx.x;                            // lane within the block
 int by = blockIdx.y;                             // which filter of the bank
 int tidx = threadIdx.x + blockIdx.x*blockDim.x;  // global output sample index

// dynamically allocated shared memory, passed at kernel call
 extern __shared__ float indata[];
 float tempdata[B_SIZE];   // private copy of this row's filter taps
 float outdata;            // running convolution sum

 //initialize output
 outdata = 0.0f;

 //load filter
#pragma unroll 20
 for (int i=0; i< B_SIZE; i++)
 tempdata[i] = filter[by * B_SIZE + i];

 //load signal
 //--first block
 // The first B_SIZE shared slots are the "history" window needed by the
 // convolution; for block 0 they are filled with the signal start
 // in[0..B_SIZE-1] (overlap-save style: isize appears to exceed osize
 // by B_SIZE -- TODO confirm against the host-side buffer setup).
 // blockIdx.x is uniform across the block, so the barriers inside these
 // branches are reached by all threads of the block and are safe.
 if (blockIdx.x == 0){

 if (tx < B_SIZE)
 indata[tx] = in[tx];
 __syncthreads();

 // NOTE(review): unguarded -- if blockDim.x + B_SIZE can exceed isize,
 // this read of in[tx + B_SIZE] goes out of bounds; verify padding.
 indata[tx + B_SIZE ] = in[tx + B_SIZE];
 }
 __syncthreads();

//--
 // copy rest of blocks in overlapped fashion:
 // after this, indata[s] == in[blockIdx.x*blockDim.x + s]
 if (blockIdx.x > 0 ) {
 if (tx < B_SIZE && tidx < isize)
 indata[tx] = in[tidx];
 __syncthreads();
 // NOTE(review): the guard is tidx < isize but the access is
 // in[tidx + B_SIZE], so the last block reads up to isize+B_SIZE-1;
 // this relies on the input buffer being padded by B_SIZE -- confirm.
 if (tidx < isize)
 indata[tx + B_SIZE] = in[tidx + B_SIZE];
 }
 __syncthreads();
 //finished loading signal

 //convolution loop (B_SIZE multiply-accumulates per output sample)
#pragma unroll 20
 for (int i=0; i<B_SIZE; i++){
 outdata += tempdata[i] * indata[tx + B_SIZE - i];
}

 // only threads that map to a valid output sample write back
 if (tidx < osize)
 out[by * osize + tidx] = outdata;
}
80 
__global__ void multiFilterGpuDevice_2dgrid_shmem( float *in, float *out, float *filter, int osize, int isize, int bsize)
{
 /**
  * Variant of multiFilterGpuDevice_2dgrid that keeps the filter taps in
  * static shared memory instead of per-thread registers.
  * One filter per grid row (blockIdx.y); each thread produces one
  * output sample out[by*osize + tidx].
  * It is assumed that input data arrays are contiguous in memory.
  *
  * NOTE(review): `bsize` is unused -- the tap count is the compile-time
  * constant B_SIZE; presumably params->fsize == B_SIZE, confirm.
  */

 int tx = threadIdx.x;                            // lane within the block
 int by = blockIdx.y;                             // which filter of the bank
 int tidx = threadIdx.x + blockIdx.x*blockDim.x;  // global output sample index

 extern __shared__ float indata[];     // staged input window (dynamic)
 __shared__ float tempdata[B_SIZE];    // this row's filter taps (static)
 float outdata;                        // running convolution sum

 //initialize output
 outdata = 0.0f;

 //load filter
 // Every thread writes the full tap array: all threads store identical
 // values to the same shared locations, so the race is benign.
#pragma unroll
 for (int i=0; i< B_SIZE; i++)
 tempdata[i] = filter[by * B_SIZE + i];
 __syncthreads();

 //load signal
 //--first block
 // First B_SIZE shared slots hold the convolution "history"; block 0
 // fills them from the signal start (overlap-save style -- TODO confirm
 // that isize = osize + B_SIZE on the host side).
 // blockIdx.x is uniform per block, so these in-branch barriers are safe.
 if (blockIdx.x == 0){

 if (tx < B_SIZE)
 indata[tx] = in[tx];
 __syncthreads();

 // NOTE(review): unguarded -- out of bounds if blockDim.x + B_SIZE
 // can exceed isize; verify the input padding.
 indata[tx + B_SIZE ] = in[tx + B_SIZE];
 }
 __syncthreads();

//--
 // copy rest of blocks in overlapped fashion:
 // after this, indata[s] == in[blockIdx.x*blockDim.x + s]
 if (blockIdx.x > 0 ) {
 if (tx < B_SIZE && tidx < isize)
 indata[tx] = in[tidx];
 __syncthreads();
 // NOTE(review): guard checks tidx < isize but reads in[tidx + B_SIZE],
 // so the last block touches up to isize+B_SIZE-1 -- relies on a
 // B_SIZE-padded input buffer; confirm.
 if (tidx < isize)
 indata[tx + B_SIZE] = in[tidx + B_SIZE];
 }
 __syncthreads();
 //finished loading signal

 //convolution loop (taps broadcast from shared memory)
#pragma unroll
 for (int i=0; i<B_SIZE; i++){
 outdata += tempdata[i] * indata[tx + B_SIZE - i];
}

 // only threads that map to a valid output sample write back
 if (tidx < osize)
 out[by * osize + tidx] = outdata;
}
138 
void cudaMultiFilterFirInit(float **in, float **out, float *filters,
 params *params, gpu_arrays *arrays)
{
 /**
  Allocate, and where possible pre-load, every GPU (and pinned host)
  buffer needed for a multi-rate GPU filter-bank run.  Keeping the
  allocations and the filter coefficients resident on the device lets
  the caller repeat CUDA calls along the whole input signal without
  re-allocating or re-uploading anything.

  @param in: the input buffers (2D array), for resampled inputs
  @param out: the output buffers (2D array), different sampling
  @param filters: a two dimensional array containing the coefficients
                  for multiple filters
  @param params: structure for the filters parameters (input/output
                 sizes per filter, number of filters, sampling rates,
                 streaming flag)
  @param arrays: a structure of pointers to the GPU arrays for reuse
  */

 const int fsize = params->fsize;
 const int nrates = params->nrates;

 // initialise card (cuda_helper.h routine)
 findCudaDevice(0, 0);

 // one CUDA stream per sampling rate when asynchronous mode is requested
 if (params->streams){
 for (int r = 0; r < nrates; ++r)
 checkCudaErrors(cudaStreamCreate(&arrays->stream[r]));
 }

 // --- filters: a separate device array per sampling rate, uploaded now ---
 int offset = 0; // index of the first filter belonging to the current rate
 for (int r = 0; r < nrates; ++r){
 const int nf = params->rnumf[r]; // number of filters at this rate

 // device storage for this rate's coefficient block
 checkCudaErrors(cudaMalloc((void**)&arrays->d_filters[r], fsize * nf * sizeof(float)));

 // upload the coefficients for this rate
 checkCudaErrors(cudaMemcpy(arrays->d_filters[r], filters + offset*fsize,
 fsize * nf * sizeof(float), cudaMemcpyHostToDevice));

 offset += nf;
 }

 // --- inputs and outputs, one pair of device arrays per rate -------------
 for (int r = 0; r < nrates; ++r){
 if (params->streams){
 // streaming mode: pinned (page-locked) host staging buffers so the
 // later cudaMemcpyAsync calls can actually run asynchronously
 checkCudaErrors(cudaMallocHost((void**)&arrays->h_in[r],
 arrays->isize[r] * sizeof(float)));
 checkCudaErrors(cudaMallocHost((void**)&arrays->h_out[r],
 arrays->osize[r] * params->rnumf[r] * sizeof(float)));
 } else {
 // synchronous mode: use the caller's buffers directly
 arrays->h_in[r] = in[r];
 arrays->h_out[r] = out[r];
 }
 // device-side input and output arrays for this rate
 checkCudaErrors(cudaMalloc((void**)&arrays->d_in[r],
 arrays->isize[r] * sizeof(float)));
 checkCudaErrors(cudaMalloc((void**)&arrays->d_out[r],
 arrays->osize[r] * params->rnumf[r] * sizeof(float)));
 }

}
219 
void cudaMultiFilterFirClose(params *params, gpu_arrays *arrays)
{
 /**
  * Release every CUDA resource allocated by cudaMultiFilterFirInit:
  * device filter/input/output arrays for each sampling rate, plus the
  * pinned host buffers and streams when streaming mode was used.

  @param params: structure for the filters parameters, which
                 contains the input and output sizes for each filter
                 as well as the number of filters and sampling rates
                 to be processed.
  @param arrays: a structure of pointers to the GPU arrays for reuse
  */
 int nrates = params->nrates;

 for (int i=0; i<nrates; i++){
 checkCudaErrors(cudaFree(arrays->d_filters[i]));
 checkCudaErrors(cudaFree(arrays->d_in[i]));
 checkCudaErrors(cudaFree(arrays->d_out[i]));

 if (params->streams){
 // pinned staging buffers only exist in streaming mode
 checkCudaErrors(cudaFreeHost(arrays->h_in[i]));
 checkCudaErrors(cudaFreeHost(arrays->h_out[i]));
 // destroy streams; wrapped in checkCudaErrors for consistency with
 // every other CUDA call in this file (was previously unchecked)
 checkCudaErrors(cudaStreamDestroy(arrays->stream[i]));

 }
 }
}
247 
void cudaMultiFilterFirStreams(gpu_arrays *arrays, params *params)
{
 /**
  Run the multi-filter FIR bank on the GPU for every sampling rate,
  overlapping H2D copy, kernel execution and D2H copy across rates by
  issuing each rate's work into its own CUDA stream.  Memory was
  pre-allocated and filters pre-copied by cudaMultiFilterFirInit.

  All operations are asynchronous: the caller must synchronize the
  streams (or the device) before reading arrays->h_out.

  @param arrays: all the GPU and host pinned memory input
                 and output arrays we need for asynchronous operation
  @param params: structure for the filters parameters, which
                 contains the input and output sizes for each filter
                 as well as the number of filters and sampling rates
                 to be processed.
  */

 int nrates = params->nrates;

 //setup execution configuration, one entry per sampling rate
 int threads[nrates];
 dim3 blocks[nrates];
 size_t shmem[nrates];

 for (int i = 0; i < nrates; ++i){
 if ( arrays->osize[i] < 64)
 threads[i] = arrays->osize[i];
 else {
 // filter size must be less than number of threads:
 // round B_SIZE up to the next multiple of 32 (warp size) above 64
 threads[i] = B_SIZE + (64 - B_SIZE % 32);
 }

 // ceil-division: enough blocks to cover osize outputs without
 // launching a wasted all-idle extra block when osize divides evenly
 // (the old osize/threads + 1 always added one)
 blocks[i].x = (arrays->osize[i] + threads[i] - 1) / threads[i];
 blocks[i].y = params->rnumf[i];
 // dynamic shared memory: blockDim.x samples plus fsize of history.
 // NOTE(review): the kernels index shared memory with the compile-time
 // constant B_SIZE, so this sizing assumes params->fsize == B_SIZE.
 shmem[i] = (threads[i] + params->fsize) * sizeof(float);

 }

 // copy inputs with different sampling rates asynchronously to the GPU;
 // all CUDA calls are checked, consistent with the rest of this file
 for (int i = 0; i < nrates; ++i){
 checkCudaErrors(cudaMemcpyAsync(arrays->d_in[i], arrays->h_in[i],
 arrays->isize[i] * sizeof(float),
 cudaMemcpyHostToDevice, arrays->stream[i]));

 //run kernels for different rates concurrently
 if(!CUDA_SHM)
 multiFilterGpuDevice_2dgrid <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
 (arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
 arrays->osize[i], arrays->isize[i], params->fsize);
 else
 multiFilterGpuDevice_2dgrid_shmem <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
 (arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
 arrays->osize[i], arrays->isize[i], params->fsize);

 // kernel launches do not return errors directly; pick up any
 // launch-configuration error (bad grid/block/shmem) here
 checkCudaErrors(cudaGetLastError());

 // transfer results back (completes asynchronously in this stream)
 checkCudaErrors(cudaMemcpyAsync(arrays->h_out[i], arrays->d_out[i],
 arrays->osize[i] * params->rnumf[i] * sizeof(float),
 cudaMemcpyDeviceToHost, arrays->stream[i]));
 }

}