2 GPU multi-rate FIR filter bank example software
4 Oxford e-Research Centre, Oxford University
6 Centre for Digital Music, Queen Mary, University of London.
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License,
11 or (at your option) any later version.
12 See the file COPYING included with this distribution for more information.
16 #include <helper_cuda.h>
19 ///////////////////////////////////////////////////////////////////////////////
21 ///////////////////////////////////////////////////////////////////////////////
23 __global__ void multiFilterGpuDevice_2dgrid( float *in, float *out, float *filter, int osize, int isize, int bsize)
/*
 * Multi-filter FIR kernel: one filter per grid row (y direction), one
 * output sample per thread.  Filter taps are staged into a per-thread
 * local array; the input signal (plus a B_SIZE-sample history halo) is
 * staged into dynamically allocated shared memory.
 *
 * NOTE(review): this view is an elided extraction -- declarations of
 * `tx`, `by` and `outdata`, the blockIdx.x == 0 branch header, any
 * __syncthreads() barriers, and the closing braces are in lines not
 * visible here.  Comments below hedge accordingly.
 */
26  * This GPU kernel applies multiple filters, one per y direction.
27  * It is assumed that input data arrays are contiguous in memory.
// Flat global index of this thread's output/input sample along x.
32     int tidx = threadIdx.x + blockIdx.x*blockDim.x;
34     // dynamically allocated shared memory, passed at kernel call
// Shared staging buffer: blockDim.x samples plus a B_SIZE halo
// (sized by the third kernel-launch argument).
35     extern __shared__ float indata[];
// Per-thread copy of this row's B_SIZE filter coefficients.
36     float tempdata[B_SIZE];
// Load the taps of filter row `by` (presumably blockIdx.y -- declared
// in an elided line; TODO confirm).  Parameter `bsize` is unused in
// the visible lines; B_SIZE appears to be a compile-time constant.
44     for (int i=0; i< B_SIZE; i++)
45       tempdata[i] = filter[by * B_SIZE + i];
// First block: samples beyond the first B_SIZE (the leading-halo case
// for blockIdx.x == 0 is handled in elided lines).
55 	indata[tx + B_SIZE ] = in[tx + B_SIZE];
60     // copy rest of blocks in overlapped fashion
// Non-first blocks re-read the previous block's last B_SIZE samples
// as the halo, so the convolution can look back across the boundary.
61     if (blockIdx.x > 0 ) {
62       if (tx < B_SIZE && tidx < isize)
63 	indata[tx] = in[tidx];
66 	indata[tx + B_SIZE] = in[tidx + B_SIZE];
69     //finished loading signal
// NOTE(review): a __syncthreads() must separate the shared-memory
// writes above from the reads below; it is not visible in this
// extraction -- confirm it exists in the elided lines.
// FIR dot product: out[n] = sum_i h[i] * x[n - i]; the +B_SIZE offset
// indexes past the halo.  `outdata` is declared/zeroed in elided lines.
73     for (int i=0; i<B_SIZE; i++){
74       outdata += tempdata[i] * indata[tx + B_SIZE - i];
// One output sample into this filter's row of the 2D output array.
// NOTE(review): a guard such as `tidx < osize` is not visible here --
// verify the elided lines bound this store.
78     out[by * osize + tidx] = outdata;
81 __global__ void multiFilterGpuDevice_2dgrid_shmem( float *in, float *out, float *filter, int osize, int isize, int bsize)
/*
 * Variant of multiFilterGpuDevice_2dgrid that keeps the filter taps in
 * a statically sized __shared__ array (one copy per block) instead of
 * per-thread local storage; otherwise the same one-filter-per-y-row,
 * one-sample-per-thread FIR scheme.
 *
 * NOTE(review): elided extraction -- `tx`, `by`, `outdata`
 * declarations, __syncthreads() barriers and closing braces are in
 * lines not visible here.
 */
84 This kernel applies multiple filters, one per y direction
85 It is assumed that input data arrays are contiguous in memory
// Flat global sample index along x.
90     int tidx = threadIdx.x + blockIdx.x*blockDim.x;
// Dynamic shared staging buffer for the signal (+ B_SIZE halo).
92     extern __shared__ float indata[];
// Block-shared copy of this row's filter taps.
93     __shared__ float tempdata[B_SIZE];
// Every thread writes the same values into tempdata (same `by`, same
// loop) -- redundant but value-consistent.  A __syncthreads() before
// the taps are consumed is required; not visible here, TODO confirm.
101     for (int i=0; i< B_SIZE; i++)
102       tempdata[i] = filter[by * B_SIZE + i];
// First block along x: no predecessor, leading halo handled in the
// elided lines of this branch.
107     if (blockIdx.x == 0){
113 	indata[tx + B_SIZE ] = in[tx + B_SIZE];
118     // copy rest of blocks in overlapped fashion
// Later blocks reload the previous B_SIZE input samples as the halo.
119     if (blockIdx.x > 0 ) {
120     if (tx < B_SIZE && tidx < isize)
121 	indata[tx] = in[tidx];
124 	indata[tx + B_SIZE] = in[tidx + B_SIZE];
127     //finished loading signal
// FIR accumulation; `outdata` is declared/zeroed in elided lines.
// NOTE(review): confirm a __syncthreads() precedes this loop.
131     for (int i=0; i<B_SIZE; i++){
132       outdata += tempdata[i] * indata[tx + B_SIZE - i];
// Store one sample into this filter's output row.  NOTE(review): an
// `tidx < osize` guard is not visible in this extraction -- verify.
136     out[by * osize + tidx] = outdata;
139 void cudaMultiFilterFirInit(float **in, float **out, float *filters,
140 			    params *params, gpu_arrays *arrays)
/*
 * One-time setup for the multi-rate filter bank: selects a CUDA
 * device, optionally creates one stream per sampling rate, copies all
 * filter coefficient sets to the GPU, and allocates the per-rate
 * input/output buffers (device memory always; pinned host staging
 * buffers only when streams are enabled).
 *
 * NOTE(review): elided extraction -- braces and some statements
 * (including the visible-variable initialisations flagged below) are
 * in lines not shown here.
 */
143    Perform all memory allocations and transfers required
144    for a GPU multiple filter operation.
145    This also allows to keep allocated memory and reusable
146    data on the GPU, in order to repeat CUDA calls
147    along the whole input signal.
151    @param in: the input buffers (2D array), for resampled inputs
152    @param out: the output buffers (2D array), different sampling
153    @param filters: a two dimensional array containing the coefficients
155    @param params: structure for the filters parameters, which
156            contains the input and output sizes for each filter
157            as well as the number of filters and sampling rates
159    @param arrays: a structure of pointers to the GPU arrays for reuse
// Length (taps) of each filter; all filters share this size.
162   int fsize = params->fsize;
163   int rnumfs; // number of filters for a given sampling rate
164   int nrates = params->nrates;
166   int pos; //starting position of filters for a given rate
// NOTE(review): `pos` must start at 0 before the loop below; its
// initialisation is in an elided line -- confirm.
168   // initialise card (cuda_helper.h routine)
169   findCudaDevice(0, 0);
// One stream per sampling rate so copies/kernels for different rates
// can overlap (see cudaMultiFilterFirStreams).
172   if (params->streams){
173     for (int i = 0; i < nrates; ++i)
174       checkCudaErrors(cudaStreamCreate(&arrays->stream[i]));
177   //allocate memory in separate locations for the different sampling rates
181   for (int i=0; i<nrates; i++){
182     rnumfs = params->rnumf[i]; // number of filters for this rate
184     //allocate memory for these filters
185     checkCudaErrors(cudaMalloc((void**)&arrays->d_filters[i], fsize * rnumfs * sizeof(float)));
187     //copy this filter array to GPU memory
// `filters` is row-major [filter][tap]; pos*fsize skips the filters
// belonging to the previous rates.
188     checkCudaErrors(cudaMemcpy(arrays->d_filters[i], filters + pos*fsize,
189 			       fsize * rnumfs * sizeof(float), cudaMemcpyHostToDevice));
// Advance past this rate's filters for the next iteration.
191     pos += params->rnumf[i];
195   // Inputs and outputs
197   for (int i=0; i<nrates; i++){
198     // allocate host page locked memory for staging
199     // to allow CUDA asynchronous transfers if we are working with steams
200     // if we are not using streams the input and output pointers
201     // from the host are passed.
202     if (params->streams){
// Pinned (page-locked) host buffers: required for true async
// cudaMemcpyAsync overlap; output holds osize samples per filter.
203       checkCudaErrors(cudaMallocHost((void**)&arrays->h_in[i],
204 				     arrays->isize[i] * sizeof(float)));
205       checkCudaErrors(cudaMallocHost((void**)&arrays->h_out[i],
206 				     arrays->osize[i] * params->rnumf[i] * sizeof(float)));
// Non-stream path: reuse the caller's buffers directly (the `else`
// keyword for this branch is in an elided line).
208       arrays->h_in[i] = in[i];
209       arrays->h_out[i] = out[i];
211     // allocate device arrays
212     checkCudaErrors(cudaMalloc((void**)&arrays->d_in[i],
213 			       arrays->isize[i] * sizeof(float)));
214     checkCudaErrors(cudaMalloc((void**)&arrays->d_out[i],
215 			       arrays->osize[i] * params->rnumf[i] * sizeof(float)));
220 void cudaMultiFilterFirClose(params *params, gpu_arrays *arrays)
/*
 * Release every per-rate resource created by cudaMultiFilterFirInit:
 * device filter/input/output buffers always; pinned host buffers and
 * streams only when streaming was enabled (mirrors the allocation
 * logic, so caller-owned buffers assigned in the non-stream path are
 * never freed here).
 */
223  * Clean up CUDA resources
225    @param params: structure for the filters parameters, which
226            contains the input and output sizes for each filter
227            as well as the number of filters and sampling rates
229    @param arrays: a structure of pointers to the GPU arrays for reuse
231   int nrates = params->nrates;
233   for (int i=0; i<nrates; i++){
234     checkCudaErrors(cudaFree(arrays->d_filters[i]));
235     checkCudaErrors(cudaFree(arrays->d_in[i]));
236     checkCudaErrors(cudaFree(arrays->d_out[i]));
// Only stream mode owns pinned host staging memory and streams.
238     if (params->streams){
239       checkCudaErrors(cudaFreeHost(arrays->h_in[i]));
240       checkCudaErrors(cudaFreeHost(arrays->h_out[i]));
// NOTE(review): unlike the calls above, this return status is not
// routed through checkCudaErrors -- a destroy failure is silent.
242       cudaStreamDestroy(arrays->stream[i]);
248 void cudaMultiFilterFirStreams(gpu_arrays *arrays, params *params)
/*
 * Per-call driver: for each sampling rate, asynchronously copy the
 * input to the GPU, launch the 2D-grid filter-bank kernel(s)
 * (grid.y = one row per filter of that rate), and copy the results
 * back -- all on that rate's dedicated stream so the rates overlap.
 *
 * NOTE(review): elided extraction -- the declarations of the
 * `threads` and `blocks` arrays, loop braces, and the function's tail
 * (including any final stream synchronisation) are in lines not
 * visible here, and the function is truncated at the bottom of this
 * view.
 */
251    This function performs multiple filters with multiple input
252    sampling rates on a GPU.
253    The required memory is pre-allocated on the GPU
254    and filters also copied in advance
255    The different rates are executed asynchronously on the GPU
258    @param arrays: all the GPU and host pinned memory input
259            and output arrays we need for asynchronous operation
260    @param params: structure for the filters parameters, which
261            contains the input and output sizes for each filter
262            as well as the number of filters and sampling rates
266   int nrates = params->nrates;
268   //setup execution configuration
// Per-rate dynamic shared-memory size, passed as the 3rd launch arg.
271   size_t shmem[nrates];
273   for (int i = 0; i < nrates; ++i){
// Tiny outputs: one thread per output sample in a single block.
274     if ( arrays->osize[i] < 64)
275       threads[i] = arrays->osize[i];
277       // filter size must be less than number of threads
278       //nearest multiple of 32
// B_SIZE + (64 - B_SIZE % 32) is always a multiple of 32 and strictly
// greater than B_SIZE, satisfying the constraint above (an `else` for
// this assignment is presumably in an elided line -- TODO confirm).
279       threads[i] = B_SIZE + (64 - B_SIZE % 32);
// NOTE(review): osize/threads + 1 launches one extra block when osize
// divides evenly -- the kernels must tolerate the overshoot; verify.
282     blocks[i].x = ( arrays->osize[i]) / threads[i] + 1;
// One grid row per filter at this sampling rate.
283     blocks[i].y = params->rnumf[i];
// Signal tile plus fsize-sample halo staged in shared memory.
284     shmem[i] = (threads[i] + params->fsize) * sizeof(float);
288   // copy inputs with different sampling rates asynchronously to the GPU
// h_in is pinned (stream mode), so these H2D copies truly overlap
// across the per-rate streams.
289   for (int i = 0; i < nrates; ++i){
290     cudaMemcpyAsync(arrays->d_in[i], arrays->h_in[i],
291 		    arrays->isize[i] * sizeof(float),
292 		    cudaMemcpyHostToDevice, arrays->stream[i]);
294     //run kernels for different rates concurrently
// NOTE(review): both kernel variants appear here back-to-back with
// identical arguments; the elided lines around them (295, 299) may
// comment one out or select between them -- confirm before assuming
// both execute (running both would redundantly overwrite d_out).
296     multiFilterGpuDevice_2dgrid <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
297       (arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
298        arrays->osize[i], arrays->isize[i], params->fsize);
300     multiFilterGpuDevice_2dgrid_shmem <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
301       (arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
302        arrays->osize[i], arrays->isize[i], params->fsize);
304     /* // transfer data back */
// D2H copy of all filter rows for this rate, stream-ordered after the
// kernel(s).  NOTE(review): no cudaGetLastError()/synchronisation is
// visible in this truncated view -- the caller must sync before
// reading h_out; confirm in the elided tail.
305     cudaMemcpyAsync(arrays->h_out[i], arrays->d_out[i],
306 		    arrays->osize[i] * params->rnumf[i] * sizeof(float),
307 		    cudaMemcpyDeviceToHost, arrays->stream[i]);