/*
    GPU multi-rate FIR filter bank example software

    Oxford e-Research Centre, Oxford University

    Centre for Digital Music, Queen Mary, University of London.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License,
    or (at your option) any later version.
    See the file COPYING included with this distribution for more information.
*/

// NOTE(review): this file was recovered from `hg annotate` output that had been
// HTML-stripped: every span between an unescaped '<' and the next '>' was lost.
// Regions that could not be recovered verbatim are reconstructed below and
// marked with NOTE(review) — verify each against the original source.

#include "filters.h"
// NOTE(review): the header name after "filters.h" was lost in the mangling.
// findCudaDevice()/checkCudaErrors() below come from the CUDA samples'
// helper_cuda.h, so that is the most likely candidate — TODO confirm.
#include <helper_cuda.h>


///////////////////////////////////////////////////////////////////////////////
// GPU kernels
///////////////////////////////////////////////////////////////////////////////

__global__ void multiFilterGpuDevice_2dgrid(float *in, float *out, float *filter,
                                            int osize, int isize, int bsize)
{
    /**
     * Applies multiple FIR filters to one input signal; blockIdx.y selects
     * which filter a block row applies.  Input data arrays are assumed
     * contiguous in memory.
     *
     * Launch configuration (set up in cudaMultiFilterFirStreams):
     *   grid  : (ceil(osize/blockDim.x)+1, numFilters)
     *   shmem : (blockDim.x + fsize) * sizeof(float), passed at launch
     *
     * Filter taps are staged in per-thread registers (tempdata); the signal
     * window is staged in dynamic shared memory (indata).
     */

    int tx   = threadIdx.x;
    int by   = blockIdx.y;                              // filter index
    int tidx = threadIdx.x + blockIdx.x * blockDim.x;   // global output index

    // dynamically allocated shared memory, size passed at kernel call
    extern __shared__ float indata[];
    float tempdata[B_SIZE];
    float outdata;

    // initialize output accumulator
    outdata = 0.0f;

    // load this row's filter taps into registers
    #pragma unroll 20
    for (int i = 0; i < B_SIZE; i++)
        tempdata[i] = filter[by * B_SIZE + i];

    // load signal into shared memory
    // -- first block: no left overlap available, start at the signal origin.
    // blockIdx.x is uniform within a block, so these branches do not cause
    // intra-block divergence around the __syncthreads() barriers.
    if (blockIdx.x == 0) {

        if (tx < B_SIZE)
            indata[tx] = in[tx];
        __syncthreads();

        // NOTE(review): no bound check on tx + B_SIZE against isize here —
        // assumes isize >= blockDim.x + B_SIZE for the first block; verify.
        indata[tx + B_SIZE] = in[tx + B_SIZE];
    }
    __syncthreads();

    // -- copy rest of blocks in overlapped fashion (B_SIZE-sample halo)
    if (blockIdx.x > 0) {
        if (tx < B_SIZE && tidx < isize)
            indata[tx] = in[tidx];
        __syncthreads();
        if (tidx < isize)
            indata[tx + B_SIZE] = in[tidx + B_SIZE];
    }
    __syncthreads();
    // finished loading signal

    // convolution loop
    // NOTE(review): the tail of this kernel was lost in the mangling;
    // reconstructed as the standard FIR inner product plus a guarded store,
    // consistent with the osize/by indexing used elsewhere in this file —
    // TODO confirm against the original source.
    #pragma unroll 20
    for (int i = 0; i < bsize; i++)
        outdata += indata[tx + i] * tempdata[i];

    if (tidx < osize)
        out[by * osize + tidx] = outdata;
}


__global__ void multiFilterGpuDevice_2dgrid_shmem(float *in, float *out, float *filter,
                                                  int osize, int isize, int bsize)
{
    /**
     * Variant of multiFilterGpuDevice_2dgrid selected when CUDA_SHM is set.
     *
     * NOTE(review): the signature and opening of this kernel were lost in the
     * mangling.  The signature is reconstructed from its launch in
     * cudaMultiFilterFirStreams().  The surviving body matches the register
     * variant's signal-loading code; since the shared-memory allocation is the
     * same size for both kernels, this variant presumably reads the filter
     * taps directly from global memory instead of staging them in registers —
     * TODO confirm against the original source.
     */

    int tx   = threadIdx.x;
    int by   = blockIdx.y;
    int tidx = threadIdx.x + blockIdx.x * blockDim.x;

    extern __shared__ float indata[];
    float outdata = 0.0f;

    // load signal into shared memory (first block, no left halo)
    if (blockIdx.x == 0) {
        if (tx < B_SIZE)
            indata[tx] = in[tx];
        __syncthreads();
        indata[tx + B_SIZE] = in[tx + B_SIZE];
    }
    __syncthreads();

    // copy rest of blocks in overlapped fashion (this part survives verbatim
    // in the recovered file)
    if (blockIdx.x > 0) {
        if (tx < B_SIZE && tidx < isize)
            indata[tx] = in[tidx];
        __syncthreads();
        if (tidx < isize)
            indata[tx + B_SIZE] = in[tidx + B_SIZE];
    }
    __syncthreads();
    // finished loading signal

    // convolution loop
    // NOTE(review): reconstructed — taps read straight from global memory.
    #pragma unroll
    for (int i = 0; i < bsize; i++)
        outdata += indata[tx + i] * filter[by * bsize + i];

    if (tidx < osize)
        out[by * osize + tidx] = outdata;
}


///////////////////////////////////////////////////////////////////////////////
// Host-side setup / teardown / execution
///////////////////////////////////////////////////////////////////////////////

void cudaMultiFilterFirInit(float *filters, float **in, float **out,
                            params *params, gpu_arrays *arrays)
{
    /**
     * Allocate GPU (and, when streaming, pinned host) memory for every
     * sampling rate and upload the filter coefficients.
     *
     * NOTE(review): the signature and opening of this function were lost in
     * the mangling; the parameter list is reconstructed from the surviving
     * body (flat host filter array, per-rate in/out host buffers, params,
     * arrays) — verify against the original source and its callers.
     *
     * @param filters: host array of all filter taps, laid out rate-by-rate,
     *                 fsize floats per filter
     * @param in/out:  per-rate host input/output buffers, aliased into
     *                 arrays->h_in/h_out when streams are disabled
     * @param params:  filter parameters (fsize, nrates, rnumf[], streams flag)
     * @param arrays:  structure of pointers to the GPU arrays, filled in here
     */
    int fsize  = params->fsize;
    int rnumfs;              // number of filters for a given sampling rate
    int nrates = params->nrates;

    int pos;                 // starting position of filters for a given rate

    // initialise card (helper_cuda.h routine)
    findCudaDevice(0, 0);

    // setup streams
    if (params->streams) {
        for (int i = 0; i < nrates; ++i)
            checkCudaErrors(cudaStreamCreate(&arrays->stream[i]));
    }

    // allocate memory in separate locations for the different sampling rates

    // filters
    pos = 0;
    for (int i = 0; i < nrates; i++) {
        rnumfs = params->rnumf[i];   // number of filters for this rate

        // allocate memory for these filters
        checkCudaErrors(cudaMalloc((void**)&arrays->d_filters[i],
                                   fsize * rnumfs * sizeof(float)));

        // copy this filter array to GPU memory
        checkCudaErrors(cudaMemcpy(arrays->d_filters[i], filters + pos * fsize,
                                   fsize * rnumfs * sizeof(float),
                                   cudaMemcpyHostToDevice));

        pos += params->rnumf[i];
    }

    // Inputs and outputs
    for (int i = 0; i < nrates; i++) {
        // pinned host memory is required for truly asynchronous copies
        if (params->streams) {
            checkCudaErrors(cudaMallocHost((void**)&arrays->h_in[i],
                                           arrays->isize[i] * sizeof(float)));
            checkCudaErrors(cudaMallocHost((void**)&arrays->h_out[i],
                                           arrays->osize[i] * params->rnumf[i] * sizeof(float)));
        } else {
            // no streaming: point directly at the caller's buffers
            arrays->h_in[i]  = in[i];
            arrays->h_out[i] = out[i];
        }
        // allocate device arrays
        checkCudaErrors(cudaMalloc((void**)&arrays->d_in[i],
                                   arrays->isize[i] * sizeof(float)));
        checkCudaErrors(cudaMalloc((void**)&arrays->d_out[i],
                                   arrays->osize[i] * params->rnumf[i] * sizeof(float)));
    }
}


void cudaMultiFilterFirClose(params *params, gpu_arrays *arrays)
{
    /**
     * Clean up CUDA resources allocated by cudaMultiFilterFirInit.
     *
     * @param params: structure for the filters parameters, which contains the
     *                input and output sizes for each filter as well as the
     *                number of filters and sampling rates to be processed.
     * @param arrays: a structure of pointers to the GPU arrays for reuse
     */
    int nrates = params->nrates;

    for (int i = 0; i < nrates; i++) {
        checkCudaErrors(cudaFree(arrays->d_filters[i]));
        checkCudaErrors(cudaFree(arrays->d_in[i]));
        checkCudaErrors(cudaFree(arrays->d_out[i]));

        if (params->streams) {
            checkCudaErrors(cudaFreeHost(arrays->h_in[i]));
            checkCudaErrors(cudaFreeHost(arrays->h_out[i]));
            // destroy streams — now error-checked, consistent with the other
            // CUDA calls in this file (the original left this unchecked)
            checkCudaErrors(cudaStreamDestroy(arrays->stream[i]));
        }
    }
}


void cudaMultiFilterFirStreams(gpu_arrays *arrays, params *params)
{
    /**
     * This function performs multiple filters with multiple input sampling
     * rates on a GPU.  The required memory is pre-allocated on the GPU and
     * filters also copied in advance (cudaMultiFilterFirInit).  The different
     * rates are executed asynchronously on the GPU using CUDA streams.
     *
     * No synchronization is performed here: the caller must synchronize the
     * streams before reading arrays->h_out.
     *
     * @param arrays: all the GPU and host pinned memory input and output
     *                arrays we need for asynchronous operation
     * @param params: structure for the filters parameters, which contains the
     *                input and output sizes for each filter as well as the
     *                number of filters and sampling rates to be processed.
     */

    int nrates = params->nrates;

    // setup execution configuration, one entry per sampling rate
    int threads[nrates];
    dim3 blocks[nrates];
    size_t shmem[nrates];

    for (int i = 0; i < nrates; ++i) {
        if (arrays->osize[i] < 64)
            threads[i] = arrays->osize[i];
        else {
            // filter size must be less than number of threads;
            // round up past B_SIZE to a multiple-of-32-friendly count
            threads[i] = B_SIZE + (64 - B_SIZE % 32);
        }

        blocks[i].x = arrays->osize[i] / threads[i] + 1;
        blocks[i].y = params->rnumf[i];                 // one row per filter
        shmem[i] = (threads[i] + params->fsize) * sizeof(float);
    }

    // copy inputs with different sampling rates asynchronously to the GPU
    for (int i = 0; i < nrates; ++i) {
        // error-check the async copies (the original ignored their status)
        checkCudaErrors(cudaMemcpyAsync(arrays->d_in[i], arrays->h_in[i],
                                        arrays->isize[i] * sizeof(float),
                                        cudaMemcpyHostToDevice, arrays->stream[i]));

        // run kernels for different rates concurrently
        if (!CUDA_SHM)
            multiFilterGpuDevice_2dgrid <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
                (arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
                 arrays->osize[i], arrays->isize[i], params->fsize);
        else
            multiFilterGpuDevice_2dgrid_shmem <<< blocks[i], threads[i], shmem[i], arrays->stream[i] >>>
                (arrays->d_in[i], arrays->d_out[i], arrays->d_filters[i],
                 arrays->osize[i], arrays->isize[i], params->fsize);
        // kernel launches return no status: catch bad launch configs here
        checkCudaErrors(cudaGetLastError());

        // transfer data back
        checkCudaErrors(cudaMemcpyAsync(arrays->h_out[i], arrays->d_out[i],
                                        arrays->osize[i] * params->rnumf[i] * sizeof(float),
                                        cudaMemcpyDeviceToHost, arrays->stream[i]));
    }

}