view filters_cuda.cu @ 1:ae0773634e0b tip

project report.
author Sofia Dimoudi <sofia.dimoudi@gmail.com>
date Fri, 16 Sep 2016 15:49:19 +0100
parents 2b63f74a3010
children
line wrap: on
line source
/*
  GPU multi-rate FIR filter bank example software

  Oxford e-Research Centre, Oxford University

  Centre for Digital Music, Queen Mary, University of London.

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, 
  or (at your option) any later version.
  See the file COPYING included with this distribution for more information.
*/

#include "filters.h"
#include <helper_cuda.h>


///////////////////////////////////////////////////////////////////////////////
// GPU kernels
///////////////////////////////////////////////////////////////////////////////

__global__ void multiFilterGpuDevice_2dgrid( float *in, float *out, float *filter, int osize, int isize, int bsize)
{
  /**
   * This GPU kernel applies multiple filters, one per y direction.
   * It is assumed that input data arrays are contiguous in memory.
  */

  int tx = threadIdx.x;
  int by = blockIdx.y;
  int tidx = threadIdx.x + blockIdx.x*blockDim.x;

// dynamically allocated shared memory, passed at kernel call
  extern __shared__ float indata[]; 
  float tempdata[B_SIZE];
  float outdata;

  //initialize output 
  outdata = 0.0f;
  
  //load filter
#pragma unroll 20
  for (int i=0; i< B_SIZE; i++)
    tempdata[i] = filter[by * B_SIZE + i];

  //load signal
  //--first block 
  if (blockIdx.x == 0){

    if (tx < B_SIZE)
      indata[tx] = in[tx];
    __syncthreads();

    indata[tx + B_SIZE ] = in[tx + B_SIZE];
  }
  __syncthreads();
 
//--
  // copy rest of blocks in overlapped fashion
  if (blockIdx.x > 0 ) { 
    if (tx < B_SIZE && tidx < isize)
      indata[tx] = in[tidx];
    __syncthreads();
    if (tidx < isize)
      indata[tx + B_SIZE] = in[tidx + B_SIZE];
  }
  __syncthreads();
  //finished loading signal

  //convolution loop
#pragma unroll 20
  for (int i=0; i<B_SIZE; i++){
    outdata += tempdata[i] * indata[tx + B_SIZE - i];
}

  if (tidx < osize)
    out[by * osize + tidx] = outdata;  
}

__global__ void multiFilterGpuDevice_2dgrid_shmem( float *in, float *out, float *filter, int osize, int isize, int bsize)
{
  /**
     This kernel applies multiple filters, one per y direction
     It is assumed that input data arrays are contiguous in memory
  */

  int tx = threadIdx.x;
  int by = blockIdx.y;
  int tidx = threadIdx.x + blockIdx.x*blockDim.x;

  extern __shared__ float indata[];
  __shared__ float tempdata[B_SIZE];
  float outdata;

  //initialize output 
  outdata = 0.0f;
  
  //load filter
#pragma unroll 
  for (int i=0; i< B_SIZE; i++)
    tempdata[i] = filter[by * B_SIZE + i];
  __syncthreads();

  //load signal
  //--first block 
  if (blockIdx.x == 0){

    if (tx < B_SIZE)
      indata[tx] = in[tx];
    __syncthreads();

    indata[tx + B_SIZE ] = in[tx + B_SIZE];
  }
  __syncthreads();
 
//--
  // copy rest of blocks in overlapped fashion
  if (blockIdx.x > 0 ) {
    if (tx < B_SIZE && tidx < isize)
      indata[tx] = in[tidx];
    __syncthreads();
    if (tidx < isize)
      indata[tx + B_SIZE] = in[tidx + B_SIZE];
  }
  __syncthreads();
  //finished loading signal

  //convolution loop
#pragma unroll 
  for (int i=0; i<B_SIZE; i++){
    outdata += tempdata[i] * indata[tx + B_SIZE - i];
}

  if (tidx < osize)
    out[by * osize + tidx] = outdata;  
}

void cudaMultiFilterFirInit(float **in, float **out, float *filters, 
			    params *params, gpu_arrays *arrays)
{
  /**
     Perform all memory allocations and transfers required  
     for a GPU multiple filter operation.  
     This also allows to keep allocated memory and reusable  
     data on the GPU, in order to repeat CUDA calls  
     along the whole input signal. 
  */

  /**
    @param in:      the input buffers (2D array), for resampled inputs 
    @param out:     the output buffers (2D array), different sampling 
    @param filters: a two dimensional array containing the coefficients 
    for multiple filters 
    @param params:  structure for the filters parameters, which 
    contains the input and output sizes for each filter 
    as well as the number of filters and sampling rates 
    to be processed. 
     @param arrays:  a structure of pointers to the GPU arrays for reuse 
  */

  int fsize = params->fsize;
  int rnumfs; // number of filters for a given sampling rate
  int nrates = params->nrates;

  int pos; //starting position of filters for a given rate 

  // initialise card (cuda_helper.h routine)
  findCudaDevice(0, 0);

  // setup streams
  if (params->streams){
    for (int i = 0; i < nrates; ++i)
      checkCudaErrors(cudaStreamCreate(&arrays->stream[i]));
  }
  
  //allocate memory in separate locations for the different sampling rates 

  // filters
  pos = 0;
  for (int i=0; i<nrates; i++){
    rnumfs = params->rnumf[i]; // number of filters for this rate

    //allocate memory for these filters 
    checkCudaErrors(cudaMalloc((void**)&arrays->d_filters[i], fsize * rnumfs * sizeof(float)));

    //copy this filter array to GPU memory
    checkCudaErrors(cudaMemcpy(arrays->d_filters[i], filters + pos*fsize, 
			       fsize * rnumfs * sizeof(float), cudaMemcpyHostToDevice));
 
   pos += params->rnumf[i];

  }

    // Inputs and outputs

  for (int i=0; i<nrates; i++){
    // allocate host page locked memory for staging 
    // to allow CUDA asynchronous transfers if we are working with steams
    // if we are not using streams the input and output pointers 
    // from the host are passed.
    if (params->streams){
      checkCudaErrors(cudaMallocHost((void**)&arrays->h_in[i], 
				      arrays->isize[i] * sizeof(float)));
      checkCudaErrors(cudaMallocHost((void**)&arrays->h_out[i], 
				     arrays->osize[i] * params->rnumf[i] * sizeof(float)));
    } else {
      arrays->h_in[i] = in[i]; 
      arrays->h_out[i] = out[i]; 
    }
    // allocate device arrays
    checkCudaErrors(cudaMalloc((void**)&arrays->d_in[i], 
			       arrays->isize[i] * sizeof(float)));
    checkCudaErrors(cudaMalloc((void**)&arrays->d_out[i], 
			       arrays->osize[i] * params->rnumf[i] * sizeof(float)));
  }
  
}

void cudaMultiFilterFirClose(params *params, gpu_arrays *arrays)
{
  /**
   * Clean up CUDA resources 

    @param params:  structure for the filters parameters, which 
              contains the input and output sizes for each filter 
              as well as the number of filters and sampling rates 
              to be processed. 
     @param arrays:  a structure of pointers to the GPU arrays for reuse 
   */
  int nrates = params->nrates;

  for (int i=0; i<nrates; i++){
    checkCudaErrors(cudaFree(arrays->d_filters[i]));
    checkCudaErrors(cudaFree(arrays->d_in[i]));
    checkCudaErrors(cudaFree(arrays->d_out[i]));
    
    if (params->streams){
      checkCudaErrors(cudaFreeHost(arrays->h_in[i]));
      checkCudaErrors(cudaFreeHost(arrays->h_out[i]));
      //destroy streams
      cudaStreamDestroy(arrays->stream[i]);

    }
  }
}

void cudaMultiFilterFirStreams(gpu_arrays *arrays, params *params)
{
  /**
     This function performs multiple filters with multiple input 
     sampling rates on a GPU. 
     The required memory is pre-allocated on the GPU  
     and filters also copied in advance 
     The different rates are executed asynchronously on the GPU  
     using CUDA streams. 
     
     @param arrays:  all the GPU and host pinned memory  input 
              and output arrays we need for asynchronous operation  
     @param params:  structure for the filters parameters, which 
              contains the input and output sizes for each filter 
              as well as the number of filters and sampling rates 
              to be processed. 
  */

  int nrates = params->nrates;
 
  //setup execution configuration
  int threads[nrates];
  dim3 blocks[nrates];
  size_t shmem[nrates];

  for (int i = 0; i < nrates; ++i){
    if ( arrays->osize[i] < 64)
      threads[i] = arrays->osize[i]; 
    else { 
      // filter size must be less than number of threads
      //nearest multiple of 32
      threads[i] =  B_SIZE + (64 - B_SIZE % 32);
    }

    blocks[i].x = ( arrays->osize[i]) / threads[i] + 1;
    blocks[i].y = params->rnumf[i];
    shmem[i] = (threads[i] + params->fsize) * sizeof(float);

  }

  // copy inputs with different sampling rates asynchronously to the GPU
  for (int i = 0; i < nrates; ++i){
    cudaMemcpyAsync(arrays->d_in[i], arrays->h_in[i],
		    arrays->isize[i] * sizeof(float),
		    cudaMemcpyHostToDevice, arrays->stream[i]);
    
    //run kernels for different rates concurrently
    if(!CUDA_SHM)
      multiFilterGpuDevice_2dgrid <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
    	(arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
    	 arrays->osize[i], arrays->isize[i], params->fsize);
    else
      multiFilterGpuDevice_2dgrid_shmem <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
    	(arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
    	 arrays->osize[i], arrays->isize[i], params->fsize);
    
    /* // transfer data back  */
    cudaMemcpyAsync(arrays->h_out[i], arrays->d_out[i],
		    arrays->osize[i] * params->rnumf[i] * sizeof(float),
		    cudaMemcpyDeviceToHost, arrays->stream[i]);
  }

}