Mercurial > hg > gpufilter

/*
  GPU multi-rate FIR filter bank example software

  Oxford e-Research Centre, Oxford University

  Centre for Digital Music, Queen Mary, University of London.

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License,
  or (at your option) any later version.
  See the file COPYING included with this distribution for more information.
*/

#include "filters.h"
#include <helper_cuda.h>


///////////////////////////////////////////////////////////////////////////////
// GPU kernels
///////////////////////////////////////////////////////////////////////////////

__global__ void multiFilterGpuDevice_2dgrid( float *in, float *out, float *filter, int osize, int isize, int bsize)
{
  /**
   * This GPU kernel applies multiple filters, one per y direction.
   * It is assumed that input data arrays are contiguous in memory.
  */

  int tx = threadIdx.x;
  int by = blockIdx.y;
  int tidx = threadIdx.x + blockIdx.x*blockDim.x;

// dynamically allocated shared memory, passed at kernel call
  extern __shared__ float indata[];
  float tempdata[B_SIZE];
  float outdata;

  //initialize output
  outdata = 0.0f;

  //load filter
#pragma unroll 20
  for (int i=0; i< B_SIZE; i++)
    tempdata[i] = filter[by * B_SIZE + i];

  //load signal
  //--first block
  if (blockIdx.x == 0){

    if (tx < B_SIZE)
      indata[tx] = in[tx];
    __syncthreads();

    indata[tx + B_SIZE ] = in[tx + B_SIZE];
  }
  __syncthreads();

//--
  // copy rest of blocks in overlapped fashion
  if (blockIdx.x > 0 ) {
    if (tx < B_SIZE && tidx < isize)
      indata[tx] = in[tidx];
    __syncthreads();
    if (tidx < isize)
      indata[tx + B_SIZE] = in[tidx + B_SIZE];
  }
  __syncthreads();
  //finished loading signal

  //convolution loop
#pragma unroll 20
  for (int i=0; i<B_SIZE; i++){
    outdata += tempdata[i] * indata[tx + B_SIZE - i];
}

  if (tidx < osize)
    out[by * osize + tidx] = outdata;
}

__global__ void multiFilterGpuDevice_2dgrid_shmem( float *in, float *out, float *filter, int osize, int isize, int bsize)
{
  /**
     This kernel applies multiple filters, one per y direction
     It is assumed that input data arrays are contiguous in memory
  */

  int tx = threadIdx.x;
  int by = blockIdx.y;
  int tidx = threadIdx.x + blockIdx.x*blockDim.x;

  extern __shared__ float indata[];
  __shared__ float tempdata[B_SIZE];
  float outdata;

  //initialize output
  outdata = 0.0f;

  //load filter
#pragma unroll
  for (int i=0; i< B_SIZE; i++)
    tempdata[i] = filter[by * B_SIZE + i];
  __syncthreads();

  //load signal
  //--first block
  if (blockIdx.x == 0){

    if (tx < B_SIZE)
      indata[tx] = in[tx];
    __syncthreads();

    indata[tx + B_SIZE ] = in[tx + B_SIZE];
  }
  __syncthreads();

//--
  // copy rest of blocks in overlapped fashion
  if (blockIdx.x > 0 ) {
    if (tx < B_SIZE && tidx < isize)
      indata[tx] = in[tidx];
    __syncthreads();
    if (tidx < isize)
      indata[tx + B_SIZE] = in[tidx + B_SIZE];
  }
  __syncthreads();
  //finished loading signal

  //convolution loop
#pragma unroll
  for (int i=0; i<B_SIZE; i++){
    outdata += tempdata[i] * indata[tx + B_SIZE - i];
}

  if (tidx < osize)
    out[by * osize + tidx] = outdata;
}

void cudaMultiFilterFirInit(float **in, float **out, float *filters,
			    params *params, gpu_arrays *arrays)
{
  /**
     Perform all memory allocations and transfers required
     for a GPU multiple filter operation.
     This also allows to keep allocated memory and reusable
     data on the GPU, in order to repeat CUDA calls
     along the whole input signal.
  */

  /**
    @param in:      the input buffers (2D array), for resampled inputs
    @param out:     the output buffers (2D array), different sampling
    @param filters: a two dimensional array containing the coefficients
    for multiple filters
    @param params:  structure for the filters parameters, which
    contains the input and output sizes for each filter
    as well as the number of filters and sampling rates
    to be processed.
     @param arrays:  a structure of pointers to the GPU arrays for reuse
  */

  int fsize = params->fsize;
  int rnumfs; // number of filters for a given sampling rate
  int nrates = params->nrates;

  int pos; //starting position of filters for a given rate

  // initialise card (cuda_helper.h routine)
  findCudaDevice(0, 0);

  // setup streams
  if (params->streams){
    for (int i = 0; i < nrates; ++i)
      checkCudaErrors(cudaStreamCreate(&arrays->stream[i]));
  }

  //allocate memory in separate locations for the different sampling rates

  // filters
  pos = 0;
  for (int i=0; i<nrates; i++){
    rnumfs = params->rnumf[i]; // number of filters for this rate

    //allocate memory for these filters
    checkCudaErrors(cudaMalloc((void**)&arrays->d_filters[i], fsize * rnumfs * sizeof(float)));

    //copy this filter array to GPU memory
    checkCudaErrors(cudaMemcpy(arrays->d_filters[i], filters + pos*fsize,
			       fsize * rnumfs * sizeof(float), cudaMemcpyHostToDevice));

   pos += params->rnumf[i];

  }

    // Inputs and outputs

  for (int i=0; i<nrates; i++){
    // allocate host page locked memory for staging
    // to allow CUDA asynchronous transfers if we are working with steams
    // if we are not using streams the input and output pointers
    // from the host are passed.
    if (params->streams){
      checkCudaErrors(cudaMallocHost((void**)&arrays->h_in[i],
				      arrays->isize[i] * sizeof(float)));
      checkCudaErrors(cudaMallocHost((void**)&arrays->h_out[i],
				     arrays->osize[i] * params->rnumf[i] * sizeof(float)));
    } else {
      arrays->h_in[i] = in[i];
      arrays->h_out[i] = out[i];
    }
    // allocate device arrays
    checkCudaErrors(cudaMalloc((void**)&arrays->d_in[i],
			       arrays->isize[i] * sizeof(float)));
    checkCudaErrors(cudaMalloc((void**)&arrays->d_out[i],
			       arrays->osize[i] * params->rnumf[i] * sizeof(float)));
  }

}

void cudaMultiFilterFirClose(params *params, gpu_arrays *arrays)
{
  /**
   * Clean up CUDA resources

    @param params:  structure for the filters parameters, which
              contains the input and output sizes for each filter
              as well as the number of filters and sampling rates
              to be processed.
     @param arrays:  a structure of pointers to the GPU arrays for reuse
   */
  int nrates = params->nrates;

  for (int i=0; i<nrates; i++){
    checkCudaErrors(cudaFree(arrays->d_filters[i]));
    checkCudaErrors(cudaFree(arrays->d_in[i]));
    checkCudaErrors(cudaFree(arrays->d_out[i]));

    if (params->streams){
      checkCudaErrors(cudaFreeHost(arrays->h_in[i]));
      checkCudaErrors(cudaFreeHost(arrays->h_out[i]));
      //destroy streams
      cudaStreamDestroy(arrays->stream[i]);

    }
  }
}

void cudaMultiFilterFirStreams(gpu_arrays *arrays, params *params)
{
  /**
     This function performs multiple filters with multiple input
     sampling rates on a GPU.
     The required memory is pre-allocated on the GPU
     and filters also copied in advance
     The different rates are executed asynchronously on the GPU
     using CUDA streams.

     @param arrays:  all the GPU and host pinned memory  input
              and output arrays we need for asynchronous operation
     @param params:  structure for the filters parameters, which
              contains the input and output sizes for each filter
              as well as the number of filters and sampling rates
              to be processed.
  */

  int nrates = params->nrates;

  //setup execution configuration
  int threads[nrates];
  dim3 blocks[nrates];
  size_t shmem[nrates];

  for (int i = 0; i < nrates; ++i){
    if ( arrays->osize[i] < 64)
      threads[i] = arrays->osize[i];
    else {
      // filter size must be less than number of threads
      //nearest multiple of 32
      threads[i] =  B_SIZE + (64 - B_SIZE % 32);
    }

    blocks[i].x = ( arrays->osize[i]) / threads[i] + 1;
    blocks[i].y = params->rnumf[i];
    shmem[i] = (threads[i] + params->fsize) * sizeof(float);

  }

  // copy inputs with different sampling rates asynchronously to the GPU
  for (int i = 0; i < nrates; ++i){
    cudaMemcpyAsync(arrays->d_in[i], arrays->h_in[i],
		    arrays->isize[i] * sizeof(float),
		    cudaMemcpyHostToDevice, arrays->stream[i]);

    //run kernels for different rates concurrently
    if(!CUDA_SHM)
      multiFilterGpuDevice_2dgrid <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
    	(arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
    	 arrays->osize[i], arrays->isize[i], params->fsize);
    else
      multiFilterGpuDevice_2dgrid_shmem <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
    	(arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
    	 arrays->osize[i], arrays->isize[i], params->fsize);

    /* // transfer data back  */
    cudaMemcpyAsync(arrays->h_out[i], arrays->d_out[i],
		    arrays->osize[i] * params->rnumf[i] * sizeof(float),
		    cudaMemcpyDeviceToHost, arrays->stream[i]);
  }

}
author	Sofia Dimoudi <sofia.dimoudi@gmail.com>
date	Fri, 16 Sep 2016 15:49:19 +0100
parents	2b63f74a3010
children