filters_cuda.cu @ 1:ae0773634e0b (tip)

changeset description: project report.
author:  Sofia Dimoudi <sofia.dimoudi@gmail.com>
date:    Fri, 16 Sep 2016 15:49:19 +0100
parents: 2b63f74a3010
/*
  GPU multi-rate FIR filter bank example software

  Oxford e-Research Centre, Oxford University
  Centre for Digital Music, Queen Mary, University of London.

  This program is free software: you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation, either version 3 of the
  License, or (at your option) any later version.
  See the file COPYING included with this distribution for more information.
*/

#include "filters.h"
#include <helper_cuda.h>

///////////////////////////////////////////////////////////////////////////////
// GPU kernels
///////////////////////////////////////////////////////////////////////////////

__global__ void multiFilterGpuDevice_2dgrid(float *in, float *out,
                                            float *filter, int osize,
                                            int isize, int bsize)
{
    /**
     * This GPU kernel applies multiple filters, one per y direction.
     * It is assumed that input data arrays are contiguous in memory.
     */
    int tx = threadIdx.x;
    int by = blockIdx.y;
    int tidx = threadIdx.x + blockIdx.x * blockDim.x;

    // dynamically allocated shared memory, passed at kernel call
    extern __shared__ float indata[];

    // per-thread copy of the filter coefficients for this row of blocks
    float tempdata[B_SIZE];
    float outdata;

    // initialize output
    outdata = 0.0f;

    // load filter
#pragma unroll 20
    for (int i = 0; i < B_SIZE; i++)
        tempdata[i] = filter[by * B_SIZE + i];

    // load signal
    // -- first block
    if (blockIdx.x == 0) {
        if (tx < B_SIZE)
            indata[tx] = in[tx];
        __syncthreads();
        indata[tx + B_SIZE] = in[tx + B_SIZE];
    }
    __syncthreads();

    // -- copy rest of blocks in overlapped fashion
    if (blockIdx.x > 0) {
        if (tx < B_SIZE && tidx < isize)
            indata[tx] = in[tidx];
        __syncthreads();
        if (tidx < isize)
            indata[tx + B_SIZE] = in[tidx + B_SIZE];
    }
    __syncthreads();
    // finished loading signal

    // convolution loop
#pragma unroll 20
    for (int i = 0; i < B_SIZE; i++) {
        outdata += tempdata[i] * indata[tx + B_SIZE - i];
    }

    if (tidx < osize)
        out[by * osize + tidx] = outdata;
}

__global__ void multiFilterGpuDevice_2dgrid_shmem(float *in, float *out,
                                                  float *filter, int osize,
                                                  int isize, int bsize)
{
    /**
     * This kernel applies multiple filters, one per y direction.
     * It is assumed that input data arrays are contiguous in memory.
     */
    int tx = threadIdx.x;
    int by = blockIdx.y;
    int tidx = threadIdx.x + blockIdx.x * blockDim.x;

    // dynamically allocated shared memory, passed at kernel call
    extern __shared__ float indata[];

    // filter coefficients for this row of blocks, held in shared memory
    __shared__ float tempdata[B_SIZE];
    float outdata;

    // initialize output
    outdata = 0.0f;

    // load filter
#pragma unroll
    for (int i = 0; i < B_SIZE; i++)
        tempdata[i] = filter[by * B_SIZE + i];
    __syncthreads();

    // load signal
    // -- first block
    if (blockIdx.x == 0) {
        if (tx < B_SIZE)
            indata[tx] = in[tx];
        __syncthreads();
        indata[tx + B_SIZE] = in[tx + B_SIZE];
    }
    __syncthreads();

    // -- copy rest of blocks in overlapped fashion
    if (blockIdx.x > 0) {
        if (tx < B_SIZE && tidx < isize)
            indata[tx] = in[tidx];
        __syncthreads();
        if (tidx < isize)
            indata[tx + B_SIZE] = in[tidx + B_SIZE];
    }
    __syncthreads();
    // finished loading signal

    // convolution loop
#pragma unroll
    for (int i = 0; i < B_SIZE; i++) {
        outdata += tempdata[i] * indata[tx + B_SIZE - i];
    }

    if (tidx < osize)
        out[by * osize + tidx] = outdata;
}
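/*
  Illustrative sketch (not part of the original file): how a single filter
  bank at one sampling rate might be launched with the kernels above,
  mirroring the execution configuration used in cudaMultiFilterFirStreams()
  further down.  The grid is two dimensional: x covers the output samples in
  blocks of `threads`, y selects the filter.  The dynamic shared memory must
  hold one block of input plus B_SIZE samples of overlap.  The function name
  and its arguments are hypothetical.
*/
void exampleSingleRateLaunch(float *d_in, float *d_out, float *d_filters,
                             int osize, int isize, int nfilters)
{
    // small outputs get one thread per sample; otherwise round up above
    // B_SIZE to a multiple of 32, as in cudaMultiFilterFirStreams()
    int threads = (osize < 64) ? osize : B_SIZE + (64 - B_SIZE % 32);
    dim3 blocks(osize / threads + 1, nfilters);
    size_t shmem = (threads + B_SIZE) * sizeof(float);

    multiFilterGpuDevice_2dgrid<<<blocks, threads, shmem>>>(
        d_in, d_out, d_filters, osize, isize, B_SIZE);
    checkCudaErrors(cudaDeviceSynchronize());
}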
void cudaMultiFilterFirInit(float **in, float **out, float *filters,
                            params *params, gpu_arrays *arrays)
{
    /**
     * Perform all memory allocations and transfers required for a GPU
     * multiple filter operation.  This also allows keeping allocated memory
     * and reusable data on the GPU, in order to repeat CUDA calls along the
     * whole input signal.
     *
     * @param in: the input buffers (2D array), for resampled inputs
     * @param out: the output buffers (2D array), different sampling
     * @param filters: a two-dimensional array containing the coefficients
     *                 for multiple filters
     * @param params: structure for the filter parameters, which contains the
     *                input and output sizes for each filter as well as the
     *                number of filters and sampling rates to be processed.
     * @param arrays: a structure of pointers to the GPU arrays for reuse
     */
    int fsize = params->fsize;
    int rnumfs;                  // number of filters for a given sampling rate
    int nrates = params->nrates;
    int pos;                     // starting position of filters for a given rate

    // initialise card (helper_cuda.h routine)
    findCudaDevice(0, 0);

    // set up streams
    if (params->streams){
        for (int i = 0; i < nrates; ++i)
            checkCudaErrors(cudaStreamCreate(&arrays->stream[i]));
    }

    // allocate memory in separate locations for the different sampling rates

    // filters
    pos = 0;
    for (int i = 0; i < nrates; i++){
        rnumfs = params->rnumf[i];  // number of filters for this rate

        // allocate memory for these filters
        checkCudaErrors(cudaMalloc((void**)&arrays->d_filters[i],
                                   fsize * rnumfs * sizeof(float)));

        // copy this filter array to GPU memory
        checkCudaErrors(cudaMemcpy(arrays->d_filters[i], filters + pos*fsize,
                                   fsize * rnumfs * sizeof(float),
                                   cudaMemcpyHostToDevice));

        pos += params->rnumf[i];
    }

    // inputs and outputs
    for (int i = 0; i < nrates; i++){
        // Allocate host page-locked memory for staging, to allow CUDA
        // asynchronous transfers if we are working with streams.
        // If we are not using streams, the input and output pointers
        // from the host are passed through.
        if (params->streams){
            checkCudaErrors(cudaMallocHost((void**)&arrays->h_in[i],
                                           arrays->isize[i] * sizeof(float)));
            checkCudaErrors(cudaMallocHost((void**)&arrays->h_out[i],
                                           arrays->osize[i] * params->rnumf[i] * sizeof(float)));
        }
        else {
            arrays->h_in[i] = in[i];
            arrays->h_out[i] = out[i];
        }

        // allocate device arrays
        checkCudaErrors(cudaMalloc((void**)&arrays->d_in[i],
                                   arrays->isize[i] * sizeof(float)));
        checkCudaErrors(cudaMalloc((void**)&arrays->d_out[i],
                                   arrays->osize[i] * params->rnumf[i] * sizeof(float)));
    }
}

void cudaMultiFilterFirClose(params *params, gpu_arrays *arrays)
{
    /**
     * Clean up CUDA resources.
     *
     * @param params: structure for the filter parameters, which contains the
     *                input and output sizes for each filter as well as the
     *                number of filters and sampling rates to be processed.
     * @param arrays: a structure of pointers to the GPU arrays for reuse
     */
    int nrates = params->nrates;

    for (int i = 0; i < nrates; i++){
        checkCudaErrors(cudaFree(arrays->d_filters[i]));
        checkCudaErrors(cudaFree(arrays->d_in[i]));
        checkCudaErrors(cudaFree(arrays->d_out[i]));
        if (params->streams){
            checkCudaErrors(cudaFreeHost(arrays->h_in[i]));
            checkCudaErrors(cudaFreeHost(arrays->h_out[i]));
            // destroy streams
            cudaStreamDestroy(arrays->stream[i]);
        }
    }
}
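/*
  Illustrative sketch (not part of the original file): the host-side layout
  that cudaMultiFilterFirInit() above expects for the flat `filters` array.
  Filters are packed back to back, fsize coefficients each, grouped by
  sampling rate in the same order as params->rnumf, which matches the
  per-rate copy offsets (filters + pos*fsize) used during initialisation.
  The function name and the `coeffs` source array are hypothetical.
*/
void examplePackFilters(float *filters, float **coeffs, params *params)
{
    int f = 0;  // global filter index across all rates
    for (int r = 0; r < params->nrates; r++)
        for (int j = 0; j < params->rnumf[r]; j++, f++)
            for (int k = 0; k < params->fsize; k++)
                filters[f * params->fsize + k] = coeffs[f][k];
}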
void cudaMultiFilterFirStreams(gpu_arrays *arrays, params *params)
{
    /**
     * This function performs multiple filters with multiple input sampling
     * rates on a GPU.  The required memory is pre-allocated on the GPU and
     * the filters are also copied in advance.  The different rates are
     * executed asynchronously on the GPU using CUDA streams.
     *
     * @param arrays: all the GPU and host pinned-memory input and output
     *                arrays we need for asynchronous operation
     * @param params: structure for the filter parameters, which contains the
     *                input and output sizes for each filter as well as the
     *                number of filters and sampling rates to be processed.
     */
    int nrates = params->nrates;

    // set up execution configuration
    int threads[nrates];
    dim3 blocks[nrates];
    size_t shmem[nrates];

    for (int i = 0; i < nrates; ++i){
        if (arrays->osize[i] < 64)
            threads[i] = arrays->osize[i];
        else {
            // The filter size must be less than the number of threads:
            // round up above B_SIZE to a multiple of 32.
            threads[i] = B_SIZE + (64 - B_SIZE % 32);
        }
        blocks[i].x = arrays->osize[i] / threads[i] + 1;
        blocks[i].y = params->rnumf[i];
        shmem[i] = (threads[i] + params->fsize) * sizeof(float);
    }

    // copy inputs with different sampling rates asynchronously to the GPU
    for (int i = 0; i < nrates; ++i){
        cudaMemcpyAsync(arrays->d_in[i], arrays->h_in[i],
                        arrays->isize[i] * sizeof(float),
                        cudaMemcpyHostToDevice, arrays->stream[i]);

        // run kernels for different rates concurrently
        if (!CUDA_SHM)
            multiFilterGpuDevice_2dgrid
                <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
                (arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
                 arrays->osize[i], arrays->isize[i], params->fsize);
        else
            multiFilterGpuDevice_2dgrid_shmem
                <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
                (arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
                 arrays->osize[i], arrays->isize[i], params->fsize);

        // transfer data back
        cudaMemcpyAsync(arrays->h_out[i], arrays->d_out[i],
                        arrays->osize[i] * params->rnumf[i] * sizeof(float),
                        cudaMemcpyDeviceToHost, arrays->stream[i]);
    }
}
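/*
  Illustrative sketch (not part of the original file): the call sequence the
  functions above appear designed for.  Only fields of `params` and
  `gpu_arrays` referenced in this file are used; how they are filled in (and
  the exact struct definitions in filters.h) are assumptions, and
  `runFilterBank` / `nblocks` are hypothetical names.
*/
void runFilterBank(float **in, float **out, float *filters,
                   params *params, gpu_arrays *arrays, int nblocks)
{
    // allocate device/pinned memory and copy the filters to the GPU once
    cudaMultiFilterFirInit(in, out, filters, params, arrays);

    // process the signal block by block, reusing the GPU-resident filters
    for (int b = 0; b < nblocks; b++){
        // ... caller copies the next block of samples into arrays->h_in[i]
        //     (or provides them via in[i] when params->streams == 0) ...
        cudaMultiFilterFirStreams(arrays, params);
        checkCudaErrors(cudaDeviceSynchronize());  // wait for all streams
        // ... caller consumes arrays->h_out[i] ...
    }

    // release streams, pinned host buffers and device memory
    cudaMultiFilterFirClose(params, arrays);
}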