sofia@0
|
1 /*
|
sofia@0
|
2 GPU multi-rate FIR filter bank example software
|
sofia@0
|
3
|
sofia@0
|
4 Oxford e-Research Centre, Oxford University
|
sofia@0
|
5
|
sofia@0
|
6 Centre for Digital Music, Queen Mary, University of London.
|
sofia@0
|
7
|
sofia@0
|
8 This program is free software: you can redistribute it and/or modify
|
sofia@0
|
9 it under the terms of the GNU General Public License as published by
|
sofia@0
|
10 the Free Software Foundation, either version 3 of the License,
|
sofia@0
|
11 or (at your option) any later version.
|
sofia@0
|
12 See the file COPYING included with this distribution for more information.
|
sofia@0
|
13 */
|
sofia@0
|
14
|
sofia@0
|
15 #include "filters.h"
|
sofia@0
|
16 #include <helper_cuda.h>
|
sofia@0
|
17
|
sofia@0
|
18
|
sofia@0
|
19 ///////////////////////////////////////////////////////////////////////////////
|
sofia@0
|
20 // GPU kernels
|
sofia@0
|
21 ///////////////////////////////////////////////////////////////////////////////
|
sofia@0
|
22
|
sofia@0
|
23 __global__ void multiFilterGpuDevice_2dgrid( float *in, float *out, float *filter, int osize, int isize, int bsize)
|
sofia@0
|
24 {
|
sofia@0
|
25 /**
|
sofia@0
|
26 * This GPU kernel applies multiple filters, one per y direction.
|
sofia@0
|
27 * It is assumed that input data arrays are contiguous in memory.
|
sofia@0
|
28 */
|
sofia@0
|
29
|
sofia@0
|
30 int tx = threadIdx.x;
|
sofia@0
|
31 int by = blockIdx.y;
|
sofia@0
|
32 int tidx = threadIdx.x + blockIdx.x*blockDim.x;
|
sofia@0
|
33
|
sofia@0
|
34 // dynamically allocated shared memory, passed at kernel call
|
sofia@0
|
35 extern __shared__ float indata[];
|
sofia@0
|
36 float tempdata[B_SIZE];
|
sofia@0
|
37 float outdata;
|
sofia@0
|
38
|
sofia@0
|
39 //initialize output
|
sofia@0
|
40 outdata = 0.0f;
|
sofia@0
|
41
|
sofia@0
|
42 //load filter
|
sofia@0
|
43 #pragma unroll 20
|
sofia@0
|
44 for (int i=0; i< B_SIZE; i++)
|
sofia@0
|
45 tempdata[i] = filter[by * B_SIZE + i];
|
sofia@0
|
46
|
sofia@0
|
47 //load signal
|
sofia@0
|
48 //--first block
|
sofia@0
|
49 if (blockIdx.x == 0){
|
sofia@0
|
50
|
sofia@0
|
51 if (tx < B_SIZE)
|
sofia@0
|
52 indata[tx] = in[tx];
|
sofia@0
|
53 __syncthreads();
|
sofia@0
|
54
|
sofia@0
|
55 indata[tx + B_SIZE ] = in[tx + B_SIZE];
|
sofia@0
|
56 }
|
sofia@0
|
57 __syncthreads();
|
sofia@0
|
58
|
sofia@0
|
59 //--
|
sofia@0
|
60 // copy rest of blocks in overlapped fashion
|
sofia@0
|
61 if (blockIdx.x > 0 ) {
|
sofia@0
|
62 if (tx < B_SIZE && tidx < isize)
|
sofia@0
|
63 indata[tx] = in[tidx];
|
sofia@0
|
64 __syncthreads();
|
sofia@0
|
65 if (tidx < isize)
|
sofia@0
|
66 indata[tx + B_SIZE] = in[tidx + B_SIZE];
|
sofia@0
|
67 }
|
sofia@0
|
68 __syncthreads();
|
sofia@0
|
69 //finished loading signal
|
sofia@0
|
70
|
sofia@0
|
71 //convolution loop
|
sofia@0
|
72 #pragma unroll 20
|
sofia@0
|
73 for (int i=0; i<B_SIZE; i++){
|
sofia@0
|
74 outdata += tempdata[i] * indata[tx + B_SIZE - i];
|
sofia@0
|
75 }
|
sofia@0
|
76
|
sofia@0
|
77 if (tidx < osize)
|
sofia@0
|
78 out[by * osize + tidx] = outdata;
|
sofia@0
|
79 }
|
sofia@0
|
80
|
sofia@0
|
81 __global__ void multiFilterGpuDevice_2dgrid_shmem( float *in, float *out, float *filter, int osize, int isize, int bsize)
|
sofia@0
|
82 {
|
sofia@0
|
83 /**
|
sofia@0
|
84 This kernel applies multiple filters, one per y direction
|
sofia@0
|
85 It is assumed that input data arrays are contiguous in memory
|
sofia@0
|
86 */
|
sofia@0
|
87
|
sofia@0
|
88 int tx = threadIdx.x;
|
sofia@0
|
89 int by = blockIdx.y;
|
sofia@0
|
90 int tidx = threadIdx.x + blockIdx.x*blockDim.x;
|
sofia@0
|
91
|
sofia@0
|
92 extern __shared__ float indata[];
|
sofia@0
|
93 __shared__ float tempdata[B_SIZE];
|
sofia@0
|
94 float outdata;
|
sofia@0
|
95
|
sofia@0
|
96 //initialize output
|
sofia@0
|
97 outdata = 0.0f;
|
sofia@0
|
98
|
sofia@0
|
99 //load filter
|
sofia@0
|
100 #pragma unroll
|
sofia@0
|
101 for (int i=0; i< B_SIZE; i++)
|
sofia@0
|
102 tempdata[i] = filter[by * B_SIZE + i];
|
sofia@0
|
103 __syncthreads();
|
sofia@0
|
104
|
sofia@0
|
105 //load signal
|
sofia@0
|
106 //--first block
|
sofia@0
|
107 if (blockIdx.x == 0){
|
sofia@0
|
108
|
sofia@0
|
109 if (tx < B_SIZE)
|
sofia@0
|
110 indata[tx] = in[tx];
|
sofia@0
|
111 __syncthreads();
|
sofia@0
|
112
|
sofia@0
|
113 indata[tx + B_SIZE ] = in[tx + B_SIZE];
|
sofia@0
|
114 }
|
sofia@0
|
115 __syncthreads();
|
sofia@0
|
116
|
sofia@0
|
117 //--
|
sofia@0
|
118 // copy rest of blocks in overlapped fashion
|
sofia@0
|
119 if (blockIdx.x > 0 ) {
|
sofia@0
|
120 if (tx < B_SIZE && tidx < isize)
|
sofia@0
|
121 indata[tx] = in[tidx];
|
sofia@0
|
122 __syncthreads();
|
sofia@0
|
123 if (tidx < isize)
|
sofia@0
|
124 indata[tx + B_SIZE] = in[tidx + B_SIZE];
|
sofia@0
|
125 }
|
sofia@0
|
126 __syncthreads();
|
sofia@0
|
127 //finished loading signal
|
sofia@0
|
128
|
sofia@0
|
129 //convolution loop
|
sofia@0
|
130 #pragma unroll
|
sofia@0
|
131 for (int i=0; i<B_SIZE; i++){
|
sofia@0
|
132 outdata += tempdata[i] * indata[tx + B_SIZE - i];
|
sofia@0
|
133 }
|
sofia@0
|
134
|
sofia@0
|
135 if (tidx < osize)
|
sofia@0
|
136 out[by * osize + tidx] = outdata;
|
sofia@0
|
137 }
|
sofia@0
|
138
|
sofia@0
|
139 void cudaMultiFilterFirInit(float **in, float **out, float *filters,
|
sofia@0
|
140 params *params, gpu_arrays *arrays)
|
sofia@0
|
141 {
|
sofia@0
|
142 /**
|
sofia@0
|
143 Perform all memory allocations and transfers required
|
sofia@0
|
144 for a GPU multiple filter operation.
|
sofia@0
|
145 This also allows to keep allocated memory and reusable
|
sofia@0
|
146 data on the GPU, in order to repeat CUDA calls
|
sofia@0
|
147 along the whole input signal.
|
sofia@0
|
148 */
|
sofia@0
|
149
|
sofia@0
|
150 /**
|
sofia@0
|
151 @param in: the input buffers (2D array), for resampled inputs
|
sofia@0
|
152 @param out: the output buffers (2D array), different sampling
|
sofia@0
|
153 @param filters: a two dimensional array containing the coefficients
|
sofia@0
|
154 for multiple filters
|
sofia@0
|
155 @param params: structure for the filters parameters, which
|
sofia@0
|
156 contains the input and output sizes for each filter
|
sofia@0
|
157 as well as the number of filters and sampling rates
|
sofia@0
|
158 to be processed.
|
sofia@0
|
159 @param arrays: a structure of pointers to the GPU arrays for reuse
|
sofia@0
|
160 */
|
sofia@0
|
161
|
sofia@0
|
162 int fsize = params->fsize;
|
sofia@0
|
163 int rnumfs; // number of filters for a given sampling rate
|
sofia@0
|
164 int nrates = params->nrates;
|
sofia@0
|
165
|
sofia@0
|
166 int pos; //starting position of filters for a given rate
|
sofia@0
|
167
|
sofia@0
|
168 // initialise card (cuda_helper.h routine)
|
sofia@0
|
169 findCudaDevice(0, 0);
|
sofia@0
|
170
|
sofia@0
|
171 // setup streams
|
sofia@0
|
172 if (params->streams){
|
sofia@0
|
173 for (int i = 0; i < nrates; ++i)
|
sofia@0
|
174 checkCudaErrors(cudaStreamCreate(&arrays->stream[i]));
|
sofia@0
|
175 }
|
sofia@0
|
176
|
sofia@0
|
177 //allocate memory in separate locations for the different sampling rates
|
sofia@0
|
178
|
sofia@0
|
179 // filters
|
sofia@0
|
180 pos = 0;
|
sofia@0
|
181 for (int i=0; i<nrates; i++){
|
sofia@0
|
182 rnumfs = params->rnumf[i]; // number of filters for this rate
|
sofia@0
|
183
|
sofia@0
|
184 //allocate memory for these filters
|
sofia@0
|
185 checkCudaErrors(cudaMalloc((void**)&arrays->d_filters[i], fsize * rnumfs * sizeof(float)));
|
sofia@0
|
186
|
sofia@0
|
187 //copy this filter array to GPU memory
|
sofia@0
|
188 checkCudaErrors(cudaMemcpy(arrays->d_filters[i], filters + pos*fsize,
|
sofia@0
|
189 fsize * rnumfs * sizeof(float), cudaMemcpyHostToDevice));
|
sofia@0
|
190
|
sofia@0
|
191 pos += params->rnumf[i];
|
sofia@0
|
192
|
sofia@0
|
193 }
|
sofia@0
|
194
|
sofia@0
|
195 // Inputs and outputs
|
sofia@0
|
196
|
sofia@0
|
197 for (int i=0; i<nrates; i++){
|
sofia@0
|
198 // allocate host page locked memory for staging
|
sofia@0
|
199 // to allow CUDA asynchronous transfers if we are working with steams
|
sofia@0
|
200 // if we are not using streams the input and output pointers
|
sofia@0
|
201 // from the host are passed.
|
sofia@0
|
202 if (params->streams){
|
sofia@0
|
203 checkCudaErrors(cudaMallocHost((void**)&arrays->h_in[i],
|
sofia@0
|
204 arrays->isize[i] * sizeof(float)));
|
sofia@0
|
205 checkCudaErrors(cudaMallocHost((void**)&arrays->h_out[i],
|
sofia@0
|
206 arrays->osize[i] * params->rnumf[i] * sizeof(float)));
|
sofia@0
|
207 } else {
|
sofia@0
|
208 arrays->h_in[i] = in[i];
|
sofia@0
|
209 arrays->h_out[i] = out[i];
|
sofia@0
|
210 }
|
sofia@0
|
211 // allocate device arrays
|
sofia@0
|
212 checkCudaErrors(cudaMalloc((void**)&arrays->d_in[i],
|
sofia@0
|
213 arrays->isize[i] * sizeof(float)));
|
sofia@0
|
214 checkCudaErrors(cudaMalloc((void**)&arrays->d_out[i],
|
sofia@0
|
215 arrays->osize[i] * params->rnumf[i] * sizeof(float)));
|
sofia@0
|
216 }
|
sofia@0
|
217
|
sofia@0
|
218 }
|
sofia@0
|
219
|
sofia@0
|
220 void cudaMultiFilterFirClose(params *params, gpu_arrays *arrays)
|
sofia@0
|
221 {
|
sofia@0
|
222 /**
|
sofia@0
|
223 * Clean up CUDA resources
|
sofia@0
|
224
|
sofia@0
|
225 @param params: structure for the filters parameters, which
|
sofia@0
|
226 contains the input and output sizes for each filter
|
sofia@0
|
227 as well as the number of filters and sampling rates
|
sofia@0
|
228 to be processed.
|
sofia@0
|
229 @param arrays: a structure of pointers to the GPU arrays for reuse
|
sofia@0
|
230 */
|
sofia@0
|
231 int nrates = params->nrates;
|
sofia@0
|
232
|
sofia@0
|
233 for (int i=0; i<nrates; i++){
|
sofia@0
|
234 checkCudaErrors(cudaFree(arrays->d_filters[i]));
|
sofia@0
|
235 checkCudaErrors(cudaFree(arrays->d_in[i]));
|
sofia@0
|
236 checkCudaErrors(cudaFree(arrays->d_out[i]));
|
sofia@0
|
237
|
sofia@0
|
238 if (params->streams){
|
sofia@0
|
239 checkCudaErrors(cudaFreeHost(arrays->h_in[i]));
|
sofia@0
|
240 checkCudaErrors(cudaFreeHost(arrays->h_out[i]));
|
sofia@0
|
241 //destroy streams
|
sofia@0
|
242 cudaStreamDestroy(arrays->stream[i]);
|
sofia@0
|
243
|
sofia@0
|
244 }
|
sofia@0
|
245 }
|
sofia@0
|
246 }
|
sofia@0
|
247
|
sofia@0
|
248 void cudaMultiFilterFirStreams(gpu_arrays *arrays, params *params)
|
sofia@0
|
249 {
|
sofia@0
|
250 /**
|
sofia@0
|
251 This function performs multiple filters with multiple input
|
sofia@0
|
252 sampling rates on a GPU.
|
sofia@0
|
253 The required memory is pre-allocated on the GPU
|
sofia@0
|
254 and filters also copied in advance
|
sofia@0
|
255 The different rates are executed asynchronously on the GPU
|
sofia@0
|
256 using CUDA streams.
|
sofia@0
|
257
|
sofia@0
|
258 @param arrays: all the GPU and host pinned memory input
|
sofia@0
|
259 and output arrays we need for asynchronous operation
|
sofia@0
|
260 @param params: structure for the filters parameters, which
|
sofia@0
|
261 contains the input and output sizes for each filter
|
sofia@0
|
262 as well as the number of filters and sampling rates
|
sofia@0
|
263 to be processed.
|
sofia@0
|
264 */
|
sofia@0
|
265
|
sofia@0
|
266 int nrates = params->nrates;
|
sofia@0
|
267
|
sofia@0
|
268 //setup execution configuration
|
sofia@0
|
269 int threads[nrates];
|
sofia@0
|
270 dim3 blocks[nrates];
|
sofia@0
|
271 size_t shmem[nrates];
|
sofia@0
|
272
|
sofia@0
|
273 for (int i = 0; i < nrates; ++i){
|
sofia@0
|
274 if ( arrays->osize[i] < 64)
|
sofia@0
|
275 threads[i] = arrays->osize[i];
|
sofia@0
|
276 else {
|
sofia@0
|
277 // filter size must be less than number of threads
|
sofia@0
|
278 //nearest multiple of 32
|
sofia@0
|
279 threads[i] = B_SIZE + (64 - B_SIZE % 32);
|
sofia@0
|
280 }
|
sofia@0
|
281
|
sofia@0
|
282 blocks[i].x = ( arrays->osize[i]) / threads[i] + 1;
|
sofia@0
|
283 blocks[i].y = params->rnumf[i];
|
sofia@0
|
284 shmem[i] = (threads[i] + params->fsize) * sizeof(float);
|
sofia@0
|
285
|
sofia@0
|
286 }
|
sofia@0
|
287
|
sofia@0
|
288 // copy inputs with different sampling rates asynchronously to the GPU
|
sofia@0
|
289 for (int i = 0; i < nrates; ++i){
|
sofia@0
|
290 cudaMemcpyAsync(arrays->d_in[i], arrays->h_in[i],
|
sofia@0
|
291 arrays->isize[i] * sizeof(float),
|
sofia@0
|
292 cudaMemcpyHostToDevice, arrays->stream[i]);
|
sofia@0
|
293
|
sofia@0
|
294 //run kernels for different rates concurrently
|
sofia@0
|
295 if(!CUDA_SHM)
|
sofia@0
|
296 multiFilterGpuDevice_2dgrid <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
|
sofia@0
|
297 (arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
|
sofia@0
|
298 arrays->osize[i], arrays->isize[i], params->fsize);
|
sofia@0
|
299 else
|
sofia@0
|
300 multiFilterGpuDevice_2dgrid_shmem <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
|
sofia@0
|
301 (arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
|
sofia@0
|
302 arrays->osize[i], arrays->isize[i], params->fsize);
|
sofia@0
|
303
|
sofia@0
|
304 /* // transfer data back */
|
sofia@0
|
305 cudaMemcpyAsync(arrays->h_out[i], arrays->d_out[i],
|
sofia@0
|
306 arrays->osize[i] * params->rnumf[i] * sizeof(float),
|
sofia@0
|
307 cudaMemcpyDeviceToHost, arrays->stream[i]);
|
sofia@0
|
308 }
|
sofia@0
|
309
|
sofia@0
|
310 }
|