filters_cuda.cu @ 1:ae0773634e0b tip
author Sofia Dimoudi <sofia.dimoudi@gmail.com>
date Fri, 16 Sep 2016 15:49:19 +0100
/*
GPU multi-rate FIR filter bank example software

Oxford e-Research Centre, Oxford University

Centre for Digital Music, Queen Mary, University of London.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License,
or (at your option) any later version.
See the file COPYING included with this distribution for more information.
*/

#include "filters.h"
#include <helper_cuda.h>


///////////////////////////////////////////////////////////////////////////////
// GPU kernels
///////////////////////////////////////////////////////////////////////////////

__global__ void multiFilterGpuDevice_2dgrid( float *in, float *out, float *filter, int osize, int isize, int bsize)
{
    /**
     * This GPU kernel applies multiple filters, one filter per row of the grid (y direction).
     * The input data arrays are assumed to be contiguous in memory.
     */
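
    /*
     * Shared-memory tile: indata[] (allocated dynamically at kernel launch)
     * holds blockDim.x + B_SIZE consecutive input samples starting at
     * in[blockIdx.x * blockDim.x]; each thread produces one output sample,
     * so neighbouring blocks load input segments that overlap by B_SIZE
     * samples to cover the filter length. The bsize argument is not used
     * here; the compile-time constant B_SIZE is used instead.
     */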

    int tx = threadIdx.x;
    int by = blockIdx.y;
    int tidx = threadIdx.x + blockIdx.x*blockDim.x;

    // dynamically allocated shared memory, passed at kernel call
    extern __shared__ float indata[];
    float tempdata[B_SIZE];
    float outdata;

    // initialize output
    outdata = 0.0f;

    // load filter
    #pragma unroll 20
    for (int i=0; i< B_SIZE; i++)
        tempdata[i] = filter[by * B_SIZE + i];

    // load signal
    // -- first block
    if (blockIdx.x == 0){

        if (tx < B_SIZE)
            indata[tx] = in[tx];
        __syncthreads();

        indata[tx + B_SIZE] = in[tx + B_SIZE];
    }
    __syncthreads();

    // --
    // copy rest of blocks in overlapped fashion
    if (blockIdx.x > 0 ) {
        if (tx < B_SIZE && tidx < isize)
            indata[tx] = in[tidx];
        __syncthreads();
        if (tidx < isize)
            indata[tx + B_SIZE] = in[tidx + B_SIZE];
    }
    __syncthreads();
    // finished loading signal

    // convolution loop
    #pragma unroll 20
    for (int i=0; i<B_SIZE; i++){
        outdata += tempdata[i] * indata[tx + B_SIZE - i];
    }

    if (tidx < osize)
        out[by * osize + tidx] = outdata;
}

__global__ void multiFilterGpuDevice_2dgrid_shmem( float *in, float *out, float *filter, int osize, int isize, int bsize)
{
    /**
     * This kernel applies multiple filters, one filter per row of the grid (y direction).
     * The input data arrays are assumed to be contiguous in memory.
     */
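
    /*
     * This variant differs from multiFilterGpuDevice_2dgrid only in where the
     * filter coefficients are cached: here tempdata[] lives in shared memory
     * and is visible to the whole block, whereas the first kernel keeps a
     * private copy per thread. Every thread writes the same coefficient
     * values, so the redundant stores are effectively harmless, and the
     * __syncthreads() after the load makes the coefficients visible to all
     * threads before the convolution.
     */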

    int tx = threadIdx.x;
    int by = blockIdx.y;
    int tidx = threadIdx.x + blockIdx.x*blockDim.x;

    extern __shared__ float indata[];
    __shared__ float tempdata[B_SIZE];
    float outdata;

    // initialize output
    outdata = 0.0f;

    // load filter
    #pragma unroll
    for (int i=0; i< B_SIZE; i++)
        tempdata[i] = filter[by * B_SIZE + i];
    __syncthreads();

    // load signal
    // -- first block
    if (blockIdx.x == 0){

        if (tx < B_SIZE)
            indata[tx] = in[tx];
        __syncthreads();

        indata[tx + B_SIZE] = in[tx + B_SIZE];
    }
    __syncthreads();

    // --
    // copy rest of blocks in overlapped fashion
    if (blockIdx.x > 0 ) {
        if (tx < B_SIZE && tidx < isize)
            indata[tx] = in[tidx];
        __syncthreads();
        if (tidx < isize)
            indata[tx + B_SIZE] = in[tidx + B_SIZE];
    }
    __syncthreads();
    // finished loading signal

    // convolution loop
    #pragma unroll
    for (int i=0; i<B_SIZE; i++){
        outdata += tempdata[i] * indata[tx + B_SIZE - i];
    }

    if (tidx < osize)
        out[by * osize + tidx] = outdata;
}

void cudaMultiFilterFirInit(float **in, float **out, float *filters,
                            params *params, gpu_arrays *arrays)
{
    /**
     Perform all memory allocations and transfers required
     for a GPU multiple filter operation.
     This also allows allocated memory and reusable data to be
     kept on the GPU, so that the CUDA calls can be repeated
     over the whole input signal.
     */

    /**
     @param in: the input buffers (2D array), holding the resampled inputs
     @param out: the output buffers (2D array), one per sampling rate
     @param filters: a two-dimensional array containing the coefficients
                     for the multiple filters
     @param params: structure of filter parameters, which contains the
                    input and output sizes for each filter as well as the
                    number of filters and sampling rates to be processed
     @param arrays: a structure of pointers to the GPU arrays for reuse
     */
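
    /*
     * Typical call sequence (a sketch; the actual driver code lives outside
     * this file): cudaMultiFilterFirInit() once to set up the device buffers,
     * then cudaMultiFilterFirStreams() for each block of input samples,
     * and finally cudaMultiFilterFirClose() to release the resources.
     */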

    int fsize = params->fsize;
    int rnumfs; // number of filters for a given sampling rate
    int nrates = params->nrates;

    int pos; // starting position of the filters for a given rate

    // initialise the card (helper_cuda.h routine)
    findCudaDevice(0, 0);

    // set up streams
    if (params->streams){
        for (int i = 0; i < nrates; ++i)
            checkCudaErrors(cudaStreamCreate(&arrays->stream[i]));
    }
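
    // (The streams created here are non-default CUDA streams; transfers and
    // kernels issued to different streams in cudaMultiFilterFirStreams may
    // then overlap on the device.)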

    // allocate memory in separate locations for the different sampling rates

    // filters
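    // The host-side filters array is laid out as consecutive filters of
    // fsize coefficients each, grouped by sampling rate, so filters + pos*fsize
    // points at the first filter belonging to rate i.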
    pos = 0;
    for (int i=0; i<nrates; i++){
        rnumfs = params->rnumf[i]; // number of filters for this rate

        // allocate memory for these filters
        checkCudaErrors(cudaMalloc((void**)&arrays->d_filters[i], fsize * rnumfs * sizeof(float)));

        // copy this filter array to GPU memory
        checkCudaErrors(cudaMemcpy(arrays->d_filters[i], filters + pos*fsize,
                                   fsize * rnumfs * sizeof(float), cudaMemcpyHostToDevice));

        pos += params->rnumf[i];

    }

    // Inputs and outputs

    for (int i=0; i<nrates; i++){
        // allocate host page-locked memory for staging, to allow CUDA
        // asynchronous transfers if we are working with streams;
        // if we are not using streams, the input and output pointers
        // from the host are used directly.
        if (params->streams){
            checkCudaErrors(cudaMallocHost((void**)&arrays->h_in[i],
                                           arrays->isize[i] * sizeof(float)));
            checkCudaErrors(cudaMallocHost((void**)&arrays->h_out[i],
                                           arrays->osize[i] * params->rnumf[i] * sizeof(float)));
        } else {
            arrays->h_in[i] = in[i];
            arrays->h_out[i] = out[i];
        }
        // allocate device arrays
        checkCudaErrors(cudaMalloc((void**)&arrays->d_in[i],
                                   arrays->isize[i] * sizeof(float)));
        checkCudaErrors(cudaMalloc((void**)&arrays->d_out[i],
                                   arrays->osize[i] * params->rnumf[i] * sizeof(float)));
    }

}

void cudaMultiFilterFirClose(params *params, gpu_arrays *arrays)
{
    /**
     Clean up CUDA resources.

     @param params: structure of filter parameters, which contains the
                    input and output sizes for each filter as well as the
                    number of filters and sampling rates to be processed
     @param arrays: a structure of pointers to the GPU arrays for reuse
     */
    int nrates = params->nrates;

    for (int i=0; i<nrates; i++){
        checkCudaErrors(cudaFree(arrays->d_filters[i]));
        checkCudaErrors(cudaFree(arrays->d_in[i]));
        checkCudaErrors(cudaFree(arrays->d_out[i]));

        if (params->streams){
            checkCudaErrors(cudaFreeHost(arrays->h_in[i]));
            checkCudaErrors(cudaFreeHost(arrays->h_out[i]));
            // destroy streams
            checkCudaErrors(cudaStreamDestroy(arrays->stream[i]));
        }
    }
}

void cudaMultiFilterFirStreams(gpu_arrays *arrays, params *params)
{
    /**
     This function applies multiple filters, for multiple input
     sampling rates, on the GPU.
     The required memory is pre-allocated on the GPU and the
     filters are copied in advance.
     The different rates are executed asynchronously on the GPU
     using CUDA streams.

     @param arrays: all the GPU and host pinned-memory input
                    and output arrays needed for asynchronous operation
     @param params: structure of filter parameters, which contains the
                    input and output sizes for each filter as well as the
                    number of filters and sampling rates to be processed
     */

    int nrates = params->nrates;

    // set up the execution configuration
    int threads[nrates];
    dim3 blocks[nrates];
    size_t shmem[nrates];

    for (int i = 0; i < nrates; ++i){
        if ( arrays->osize[i] < 64)
            threads[i] = arrays->osize[i];
        else {
            // the filter length must be smaller than the number of threads:
            // round up to a multiple of 32, with at least a warp of headroom
            threads[i] = B_SIZE + (64 - B_SIZE % 32);
        }

        blocks[i].x = ( arrays->osize[i]) / threads[i] + 1;
        blocks[i].y = params->rnumf[i];
        shmem[i] = (threads[i] + params->fsize) * sizeof(float);

    }
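
    /*
     * Worked example (assuming, for illustration only, B_SIZE = 20 and
     * osize[i] = 1000): threads[i] = 20 + (64 - 20 % 32) = 64,
     * blocks[i].x = 1000 / 64 + 1 = 16 (the +1 covers the remainder and is
     * trimmed by the tidx < osize guard in the kernels), blocks[i].y is one
     * grid row per filter at this rate, and shmem[i] = (64 + fsize) floats
     * for the overlapped input tile.
     */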

    // copy inputs with different sampling rates asynchronously to the GPU
    for (int i = 0; i < nrates; ++i){
        cudaMemcpyAsync(arrays->d_in[i], arrays->h_in[i],
                        arrays->isize[i] * sizeof(float),
                        cudaMemcpyHostToDevice, arrays->stream[i]);

        // run the kernels for the different rates concurrently
        if (!CUDA_SHM)
            multiFilterGpuDevice_2dgrid <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
                (arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
                 arrays->osize[i], arrays->isize[i], params->fsize);
        else
            multiFilterGpuDevice_2dgrid_shmem <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
                (arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
                 arrays->osize[i], arrays->isize[i], params->fsize);

        // transfer data back
        cudaMemcpyAsync(arrays->h_out[i], arrays->d_out[i],
                        arrays->osize[i] * params->rnumf[i] * sizeof(float),
                        cudaMemcpyDeviceToHost, arrays->stream[i]);
    }
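
    /*
     * Note: the copies and kernel launches above are all asynchronous with
     * respect to the host; the contents of h_out[i] are only valid once the
     * corresponding stream has been synchronised (e.g. with
     * cudaStreamSynchronize()), which is left to the caller.
     */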

}