Mercurial > hg > gpufilter
view filtermain.cpp @ 0:2b63f74a3010
First commit
author | Sofia Dimoudi <sofia.dimoudi@gmail.com> |
---|---|
date | Fri, 16 Sep 2016 15:38:38 +0100 |
parents | |
children |
line wrap: on
line source
/* GPU multi-rate FIR filter bank example software Oxford e-Research Centre, Oxford University Centre for Digital Music, Queen Mary, University of London. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. See the file COPYING included with this distribution for more information. */ #include <cstdio> #include <cstdlib> #include <cstring> #include <sys/time.h> #include "filters.h" //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// using namespace std; int main( int argc, char** argv) { unsigned int N = 100; // total number of input blocks before resampling int bufb_size = B_SIZE + OFFSET; // the total input signal to loop through is // essentially used to check results and provide multiple // block iterations. float *h_in[MAX_RATES], *h_out[MAX_RATES], *h_reference[MAX_RATES]; // input and output arrays int out_sz[MAX_RATES]; //total output size for all filters //timing struct timeval t_start, t_end; double t_gpu = 0.0, t_mcpu = 0.0; // parameters, filter and GPU arrays structures // object pointers can be passed to these for C++ code filter_arrays farr; params gparams; gpu_arrays gpuarrays; int ratenumf[MAX_RATES]; // command line arguments cmd_args args; //deaults args.nrates = 3; args.nf = 60; args.insize = 1024; args.rconst = 0; args.tim=1; read_command_line(argc, argv, &args); //initialise parameters int nf = args.nf; int numrates = args.nrates; int rem= nf%numrates; printf("\nGPU FIR filter parameters\n------\n"); printf("\nTotal number of input blocks = %d\n", N); gparams.nfilters = nf; // not sure if this is needed printf("\nTotal number of filters = %d\n", gparams.nfilters); gparams.fsize = B_SIZE; printf("\nFilter size = %d\n", gparams.fsize); gparams.nrates = numrates; printf("\nNumber of sampling rates = %d\n", gparams.nrates); gparams.streams = 1; printf("\nCUDA streams flag = %d\n", gparams.streams); //dividing sampling rates equally... for (int i=0; i< numrates; ++i){ ratenumf[i] = nf/numrates; } if (rem > 0){ for (int i=0; i<rem; ++i) ratenumf[i]++; } int in_sz = args.insize; if (numrates == 1 && args.rconst==1){ gpuarrays.osize[0] = in_sz; } else{ for (int i=0; i< numrates; ++i){ gpuarrays.osize[i] = in_sz/rdiv[i]; if (gpuarrays.osize[i] < B_SIZE ){ printf("\n\nInput size for rate %d is shorter than filter size.\nChose a longer input block or a shorter filter.\n", i); printf("\nFilter size = %d\n", B_SIZE); printf("\nDecimated size = %d\n", gpuarrays.osize[i]); printf("\nInitial input block size = %d\n", in_sz); printf("\nDecimation factor = %d\n\n", rdiv[i]); exit(EXIT_FAILURE); } } } int out_blk_sz = 0; for (int i=0; i<numrates; i++){ gparams.rnumf[i] = ratenumf[i]; printf("\nNumber of filters for rate %d = %d\n", i, gparams.rnumf[i]); gpuarrays.isize[i] = gpuarrays.osize[i] + gparams.fsize; printf("\nGPU Input size for rate %d = %d\n", i, gpuarrays.isize[i]); out_sz[i] = N * gpuarrays.osize[i]; out_blk_sz +=ratenumf[i] *gpuarrays.osize[i]; printf("\nGPU output size for 1 block for this rate: %d\n", ratenumf[i] *gpuarrays.osize[i]); } printf("\ntotal output size for 1 block for all rates: %d\n", out_blk_sz); printf("----------------\n"); int oindex = 0; int pos[MAX_RATES]; int sum = 0; pos[0] = 0; for (int r=1; r<numrates; r++){ sum+=ratenumf[r-1]; pos[r] = sum; } // Initialize arrays for (int i=0; i<numrates; i++){ h_in[i] = (float*) malloc((gpuarrays.osize[i]*N + gparams.fsize)*sizeof(float)); h_out[i] = (float*) malloc(gpuarrays.osize[i]*ratenumf[i]*N*sizeof(float)); h_reference[i] = (float*) malloc(gpuarrays.osize[i]*ratenumf[i]*N*sizeof(float)); for (int n=0; n<gparams.fsize; ++n) h_in[i][n] = 0.0f; for (int n=0; n<gpuarrays.osize[i]*N; ++n) h_in[i][n + gparams.fsize] = rand() / (float)RAND_MAX; for (int n=0; n<gpuarrays.osize[i]*ratenumf[i]*N; ++n){ h_out[i][n] = 0.0f; h_reference[i][n] = 0.0f; } } // initialize filters with random numbers farr.bk = (float*) malloc(nf*gparams.fsize*sizeof(float)); for (int i=0; i < nf; ++i){ farr.m_offb[i] = OFFSET; for (int b=0; b < gparams.fsize; ++b) farr.bk[i*gparams.fsize+b] = rand() / (float)RAND_MAX; } //initialize cpu buffers for each filter for (int f=0; f < nf; ++f){ for (int i=0; i < bufb_size ; i++){ farr.buf_in[f][i] = 0.0; } } printf("\nRunning multirate filter bank test on GPU, CPU, and CPU OpenMP\n"); // printf("----------------\n"); // compute reference solution printf("\ncompute reference solution (1 CPU)\n"); compute_ref( h_in, h_reference, &gpuarrays, &gparams, &args, &farr, N); //reset CPU buffers for (int f=0; f < nf; ++f){ farr.m_offb[f] = OFFSET; for (int i=0; i < bufb_size ; ++i) farr.buf_in[f][i] = 0.0; } // compute OPENMP solution printf("\nCompute OPENMP solution\n"); compute_omp( h_in, h_reference, &gpuarrays, &gparams, &args, &farr, N); // compute CUDA solution printf("\nCompute cuda solution\n"); // init cudaMultiFilterFirInit(h_in, h_out, farr.bk, &gparams, &gpuarrays); for (int i=0; i<numrates; i++) memcpy(gpuarrays.h_in[i], &h_in[i][0], gpuarrays.isize[i]*sizeof(float) ); //time execution oindex = 0; if (args.tim) gettimeofday(&t_start, NULL); for (int ii=0; ii<N; ++ii){ // loop through input blocks //need to copy data to pinned memory for streams... for (int i=0; i<numrates; i++){ if (ii==0) memcpy(gpuarrays.h_in[i], &h_in[i][0], gpuarrays.isize[i]*sizeof(float) ); else memcpy(gpuarrays.h_in[i], &h_in[i][ii*gpuarrays.osize[i]], gpuarrays.isize[i]*sizeof(float) ); } //call GPU function cudaMultiFilterFirStreams(&gpuarrays, &gparams); // ... and copy data back from pinned memory... for (int i=0; i<numrates; i++){ oindex = ii*ratenumf[i]*gpuarrays.osize[i]; cudaDeviceSynchronize(); memcpy(&h_out[i][oindex], gpuarrays.h_out[i], gpuarrays.osize[i]* ratenumf[i] *sizeof(float) ); } } if (args.tim){ cudaDeviceSynchronize(); gettimeofday(&t_end, NULL); t_gpu = (double) (t_end.tv_sec + (t_end.tv_usec / 1000000.0) - t_start.tv_sec - (t_start.tv_usec/ 1000000.0)) * 1000.0; printf("Finished GPU FIR\nProcessing gpu took: %f ms\n", t_gpu/(double)N); } // check results check_results(h_reference, h_out, &gpuarrays, &gparams, N); //close GPU cudaMultiFilterFirClose(&gparams, &gpuarrays); // cleanup memory for (int i = 0; i < numrates; ++i){ free(h_out[i]); free(h_in[i]); free(h_reference[i]); } free(farr.bk); }