// Host-side test driver for the multirate FIR filter bank: it prints the run
// configuration, builds random inputs and filter coefficients, runs the CPU
// reference, the OpenMP version and the CUDA version, and times the GPU loop.
// NOTE(review): this listing is fragmentary -- the embedded original line
// numbers jump, so declarations, conditions and loop bodies between the
// visible fragments are not shown here. Comments below only describe what the
// visible lines do.
29 int main( 
int argc, 
char** argv) 
    // Wall-clock timestamps for gettimeofday(); t_gpu ends up holding the
    // elapsed GPU-loop time in milliseconds. t_mcpu is not used in the
    // visible lines.
    43   struct timeval t_start, t_end;
    44   double t_gpu = 0.0, t_mcpu = 0.0;
    // Number of sampling rates, taken from the parsed command-line arguments.
    67   int numrates = args.
nrates;
    // Print the run configuration.
    70   printf(
"\nGPU FIR filter parameters\n------\n");
    71   printf(
"\nTotal number of input blocks = %d\n", N);
    73   printf(
"\nTotal number of filters = %d\n", gparams.
nfilters);
    75   printf(
"\nFilter size = %d\n", gparams.
fsize);
    77   printf(
"\nNumber of sampling rates = %d\n", gparams.
nrates);
    79   printf(
"\nCUDA streams flag = %d\n", gparams.
streams);
    // Give every rate the same base share of the nf filters (integer division).
    82   for (
int i=0; i< numrates; ++i){
    83    ratenumf[i]  = nf/numrates;
    // Loop over `rem` entries; body not visible.
    // presumably distributes the division remainder -- TODO confirm
    86     for (
int i=0; i<rem; ++i)
    // Single rate with rconst set: the output size for rate 0 is the
    // unmodified input size in_sz.
    91   if (numrates == 1 && args.
rconst==1){
    92       gpuarrays.
osize[0] = in_sz;
    // Per-rate diagnostics. The condition guarding these messages is not
    // visible; judging by the message text it fires when the input for a rate
    // is shorter than the filter.
    // NOTE(review): the message contains the typo "Chose" (should be
    // "Choose"); left untouched here because it is a runtime string.
    95     for (
int i=0; i< numrates; ++i){
    98     printf(
"\n\nInput size for rate %d is  shorter than filter size.\nChose a longer input block or a shorter filter.\n", i);
    99     printf(
"\nFilter size = %d\n", 
B_SIZE);
   100     printf(
"\nDecimated  size = %d\n", gpuarrays.
osize[i]);
   101     printf(
"\nInitial input block size = %d\n", in_sz);
   102     printf(
"\nDecimation factor = %d\n\n", 
rdiv[i]);
   // For each rate: publish the filter count to gparams, report sizes, and
   // accumulate the output sizes.
   110   for (
int i=0; i<numrates; i++){
   111     gparams.
rnumf[i] = ratenumf[i];
   112     printf(
"\nNumber of filters for rate %d = %d\n", i, gparams.
rnumf[i]);
   114     printf(
"\nGPU Input size for rate %d = %d\n", i, gpuarrays.
isize[i]);
   // out_sz[i]: osize[i] scaled by the number of blocks N.
   115     out_sz[i] = N * gpuarrays.
osize[i];
   // out_blk_sz: running sum over rates of (filters for the rate) * osize.
   116     out_blk_sz +=ratenumf[i] *gpuarrays.
osize[i];
   117     printf(
"\nGPU output size for 1 block for this rate: %d\n", ratenumf[i] *gpuarrays.
osize[i]);
   119     printf(
"\ntotal output size for 1 block for all rates: %d\n", out_blk_sz);
   120     printf(
"----------------\n");
   // Loop over rates starting at 1; body not visible in this listing.
   126   for (
int r=1; r<numrates; r++){
   // Allocate and initialise the host buffers for every rate.
   // NOTE(review): no NULL check on the malloc results is visible here.
   132   for (
int i=0; i<numrates; i++){
   // Input: osize[i]*N samples plus fsize extra leading samples.
   133     h_in[i] = (
float*) malloc((gpuarrays.
osize[i]*N + gparams.
fsize)*
sizeof(float));
   // GPU output and CPU reference: osize[i]*ratenumf[i]*N floats each.
   135     h_out[i] = (
float*) malloc(gpuarrays.
osize[i]*ratenumf[i]*N*
sizeof(
float));
   137     h_reference[i] = (
float*) malloc(gpuarrays.
osize[i]*ratenumf[i]*N*
sizeof(
float));
   // Loop over the first fsize input samples; body not visible.
   139     for (
int n=0; n<gparams.
fsize; ++n)
   // Fill the samples after the first fsize entries with rand()/RAND_MAX,
   // i.e. values in [0, 1].
   142     for (
int n=0; n<gpuarrays.
osize[i]*N; ++n)
   143       h_in[i][n + gparams.
fsize] = rand() / (float)RAND_MAX; 
   // Zero the reference output (other statements in this loop are not visible).
   145     for (
int n=0; n<gpuarrays.
osize[i]*ratenumf[i]*N; ++n){
   147       h_reference[i][n] = 0.0f;
   // Filter coefficients: nf filters of fsize taps stored contiguously,
   // filter i at offset i*fsize, filled with rand()/RAND_MAX values.
   152   farr.
bk = (
float*) malloc(nf*gparams.
fsize*
sizeof(
float)); 
   153   for (
int i=0; i < nf; ++i){
   155     for (
int b=0; b < gparams.
fsize;  ++b)
   156       farr.
bk[i*gparams.
fsize+b] = rand() / (float)RAND_MAX;
   // Loop over bufb_size entries for each filter; body not visible.
   // presumably initialises the CPU filter buffers -- TODO confirm
   160   for (
int f=0; f < nf; ++f){
   161     for (
int i=0; i < bufb_size ; i++){
   166   printf(
"\nRunning multirate filter bank test on GPU, CPU, and CPU OpenMP\n");
   169   printf(
"\ncompute reference solution (1 CPU)\n");
   // Single-threaded CPU run; receives h_reference as its output argument.
   171   compute_ref( h_in, h_reference, &gpuarrays, &gparams, &args, &farr, N);
   // Second per-filter loop over bufb_size entries; body not visible.
   174   for (
int f=0; f < nf; ++f){
   176     for (
int i=0; i < bufb_size ; ++i)
   181   printf(
"\nCompute OPENMP solution\n");
   // NOTE(review): the OpenMP run is handed the same h_reference array as
   // compute_ref above -- confirm whether overwriting it is intended.
   183   compute_omp( h_in, h_reference, &gpuarrays, &gparams, &args, &farr, N);
   186  printf(
"\nCompute cuda solution\n");
   // Set up the GPU side, then stage the first isize[i] input samples of every
   // rate into gpuarrays.h_in[i].
   189  cudaMultiFilterFirInit(h_in, h_out, farr.
bk, &gparams, &gpuarrays);
   190  for (
int i=0; i<numrates; i++)
   191    memcpy(gpuarrays.
h_in[i], &h_in[i][0], gpuarrays.
isize[i]*
sizeof(
float) );
   // Start of the timed GPU section.
   195    gettimeofday(&t_start, NULL);
   // Process the N input blocks one at a time.
   197  for (
int ii=0; ii<N; ++ii){ 
   // Stage the input for block ii. Two copies are visible: one from the start
   // of h_in[i] and one from offset ii*osize[i]; the condition selecting
   // between them is not visible in this listing.
   200    for (
int i=0; i<numrates; i++){
   202        memcpy(gpuarrays.
h_in[i], &h_in[i][0], gpuarrays.
isize[i]*
sizeof(
float) );
   204        memcpy(gpuarrays.
h_in[i], &h_in[i][ii*gpuarrays.
osize[i]], gpuarrays.
isize[i]*
sizeof(
float) );
   // Run the GPU filter step for the staged block.
   208    cudaMultiFilterFirStreams(&gpuarrays, &gparams);
   // Copy this block's results into h_out[i] at offset
   // ii*ratenumf[i]*osize[i].
   // NOTE(review): cudaDeviceSynchronize() is called once per rate inside this
   // loop, before each host-side memcpy.
   211    for (
int i=0; i<numrates; i++){
   212      oindex = ii*ratenumf[i]*gpuarrays.
osize[i];
   213      cudaDeviceSynchronize(); 
   214      memcpy(&h_out[i][oindex], gpuarrays.
h_out[i], gpuarrays.
osize[i]* ratenumf[i] *
sizeof(
float) );
   // Synchronise with the device, stop the timer, and convert the elapsed
   // time from seconds to milliseconds.
   219    cudaDeviceSynchronize();
   220    gettimeofday(&t_end, NULL); 
   221    t_gpu = (double) (t_end.tv_sec + (t_end.tv_usec / 1000000.0)  - t_start.tv_sec - (t_start.tv_usec/ 1000000.0)) * 1000.0; 
   // Report the average time per input block (total divided by N).
   222    printf(
"Finished GPU FIR\nProcessing gpu took: %f ms\n", t_gpu/(
double)N);   
   // Tear down the GPU side and free the per-rate host buffers (only the
   // h_reference free is visible in this listing).
   229  cudaMultiFilterFirClose(&gparams, &gpuarrays);
   232  for (
int i = 0; i < numrates; ++i){
   235    free(h_reference[i]);
 static int rdiv[MAX_RATES]
Decimation factors for multiple input rates. 
int rnumf[MAX_RATES]
number of filters for each sampling rate 
float * bk
filter coefficients array 
void read_command_line(int argc, char *argv[], cmd_args *args)
float buf_in[MAX_FILTERS][B_SIZE+OFFSET]
CPU buffer. 
float * h_in[MAX_RATES]
host input arrays for GPU transfer 
int nrates
how many sampling rates to process 
void compute_omp(float *h_in[], float *h_reference[], gpu_arrays *gpuarrays, params *gparams, cmd_args *args, filter_arrays *farr, int N)
int rconst
when nrates is 1 and this flag is 1, keep the initial input size 
int nrates
number of input sampling rates to process 
int m_offb[MAX_FILTERS]
offset counter for CPU buffers 
int isize[MAX_RATES]
input size for each sampling rate 
int nf
total number of filters to process 
int main(int argc, char **argv)
int nfilters
total number of filters 
void compute_ref(float *h_in[], float *h_reference[], gpu_arrays *gpuarrays, params *gparams, cmd_args *args, filter_arrays *farr, int N)
int insize
input size before resampling 
void check_results(float *h_reference[], float *h_out[], gpu_arrays *gpuarrays, params *gparams, int N)
#define MAX_RATES
Maximum number of sampling rates. 
#define OFFSET
Offset for CPU filter input buffer. 
#define B_SIZE
filter length 
int osize[MAX_RATES]
total output size for each set of filters 
float * h_out[MAX_RATES]
host output arrays for GPU transfer