// Test driver for the multirate FIR filter bank: prints the run parameters,
// sizes and fills the host buffers, runs the CPU reference and OpenMP versions,
// then times the CUDA version over N input blocks.
// NOTE(review): this listing is fragmentary (several loop bodies, conditions and
// braces are not visible here); comments describe only the visible statements.
29 int main(
int argc,
char** argv)
// Wall-clock timestamps and elapsed-time accumulators (t_mcpu is not used in the visible lines).
43 struct timeval t_start, t_end;
44 double t_gpu = 0.0, t_mcpu = 0.0;
// Number of sampling rates requested on the command line.
67 int numrates = args.
nrates;
// Report the run configuration.
70 printf(
"\nGPU FIR filter parameters\n------\n");
71 printf(
"\nTotal number of input blocks = %d\n", N);
73 printf(
"\nTotal number of filters = %d\n", gparams.
nfilters);
75 printf(
"\nFilter size = %d\n", gparams.
fsize);
77 printf(
"\nNumber of sampling rates = %d\n", gparams.
nrates);
79 printf(
"\nCUDA streams flag = %d\n", gparams.
streams);
// Give every rate the same share of the nf filters (integer division).
82 for (
int i=0; i< numrates; ++i){
83 ratenumf[i] = nf/numrates;
// Loop over `rem` entries; body not visible here - presumably hands out the
// leftover filters from the division above (TODO confirm).
86 for (
int i=0; i<rem; ++i)
// Single-rate run with rconst set: the output size for rate 0 is the initial input size.
91 if (numrates == 1 && args.
rconst==1){
92 gpuarrays.
osize[0] = in_sz;
// Per-rate size diagnostics; the condition guarding these messages is not visible here.
95 for (
int i=0; i< numrates; ++i){
98 printf(
"\n\nInput size for rate %d is shorter than filter size.\nChose a longer input block or a shorter filter.\n", i);
99 printf(
"\nFilter size = %d\n",
B_SIZE);
100 printf(
"\nDecimated size = %d\n", gpuarrays.
osize[i]);
101 printf(
"\nInitial input block size = %d\n", in_sz);
102 printf(
"\nDecimation factor = %d\n\n",
rdiv[i]);
// Publish the per-rate filter counts to gparams and accumulate the output sizes.
110 for (
int i=0; i<numrates; i++){
111 gparams.
rnumf[i] = ratenumf[i];
112 printf(
"\nNumber of filters for rate %d = %d\n", i, gparams.
rnumf[i]);
114 printf(
"\nGPU Input size for rate %d = %d\n", i, gpuarrays.
isize[i]);
// out_sz[i]: osize[i] samples for each of the N blocks.
115 out_sz[i] = N * gpuarrays.
osize[i];
// out_blk_sz: running total of one block's output over all rates and their filters.
116 out_blk_sz +=ratenumf[i] *gpuarrays.
osize[i];
117 printf(
"\nGPU output size for 1 block for this rate: %d\n", ratenumf[i] *gpuarrays.
osize[i]);
119 printf(
"\ntotal output size for 1 block for all rates: %d\n", out_blk_sz);
120 printf(
"----------------\n");
// Loop over rates starting at index 1; body not visible here.
126 for (
int r=1; r<numrates; r++){
// Allocate and initialise the host buffers for each rate.
// NOTE(review): none of the malloc results are checked in the visible lines.
132 for (
int i=0; i<numrates; i++){
// Input: N blocks of osize[i] samples plus fsize extra samples at the front.
133 h_in[i] = (
float*) malloc((gpuarrays.
osize[i]*N + gparams.
fsize)*
sizeof(float));
// GPU output: osize[i] samples per filter per block.
135 h_out[i] = (
float*) malloc(gpuarrays.
osize[i]*ratenumf[i]*N*
sizeof(
float));
// CPU reference output, same size as h_out[i].
137 h_reference[i] = (
float*) malloc(gpuarrays.
osize[i]*ratenumf[i]*N*
sizeof(
float));
// Loop over the first fsize input samples; body not visible here.
139 for (
int n=0; n<gparams.
fsize; ++n)
// Fill the input after the first fsize samples with random values in [0, 1].
142 for (
int n=0; n<gpuarrays.
osize[i]*N; ++n)
143 h_in[i][n + gparams.
fsize] = rand() / (float)RAND_MAX;
// Zero the reference output (other statements of this loop body are not visible).
145 for (
int n=0; n<gpuarrays.
osize[i]*ratenumf[i]*N; ++n){
147 h_reference[i][n] = 0.0f;
// Filter coefficients: fsize taps for each of the nf filters, stored filter after filter.
// NOTE(review): malloc result is not checked in the visible lines.
152 farr.
bk = (
float*) malloc(nf*gparams.
fsize*
sizeof(
float));
153 for (
int i=0; i < nf; ++i){
// Random coefficients in [0, 1].
155 for (
int b=0; b < gparams.
fsize; ++b)
156 farr.
bk[i*gparams.
fsize+b] = rand() / (float)RAND_MAX;
// Loop over every filter and bufb_size entries; body not visible here.
160 for (
int f=0; f < nf; ++f){
161 for (
int i=0; i < bufb_size ; i++){
166 printf(
"\nRunning multirate filter bank test on GPU, CPU, and CPU OpenMP\n");
169 printf(
"\ncompute reference solution (1 CPU)\n");
// Single-CPU reference run; results go to h_reference.
171 compute_ref( h_in, h_reference, &gpuarrays, &gparams, &args, &farr, N);
// Same filter/bufb_size loop shape as before the reference run; body not visible here.
174 for (
int f=0; f < nf; ++f){
176 for (
int i=0; i < bufb_size ; ++i)
181 printf(
"\nCompute OPENMP solution\n");
// NOTE(review): the OpenMP run is handed the same h_reference arrays as compute_ref above.
183 compute_omp( h_in, h_reference, &gpuarrays, &gparams, &args, &farr, N);
186 printf(
"\nCompute cuda solution\n");
// Set up the GPU filter bank, then stage the first isize[i] samples of each rate
// into the gpuarrays.h_in transfer buffers.
189 cudaMultiFilterFirInit(h_in, h_out, farr.
bk, &gparams, &gpuarrays);
190 for (
int i=0; i<numrates; i++)
191 memcpy(gpuarrays.
h_in[i], &h_in[i][0], gpuarrays.
isize[i]*
sizeof(
float) );
// Start of the timed GPU section.
195 gettimeofday(&t_start, NULL);
// Process the N input blocks one at a time.
197 for (
int ii=0; ii<N; ++ii){
200 for (
int i=0; i<numrates; i++){
// Two staging copies are visible: from the start of h_in[i], and from offset
// ii*osize[i]. The statement choosing between them is not visible here.
202 memcpy(gpuarrays.
h_in[i], &h_in[i][0], gpuarrays.
isize[i]*
sizeof(
float) );
204 memcpy(gpuarrays.
h_in[i], &h_in[i][ii*gpuarrays.
osize[i]], gpuarrays.
isize[i]*
sizeof(
float) );
// Run the GPU filter bank on the staged block.
208 cudaMultiFilterFirStreams(&gpuarrays, &gparams);
// Copy this block's output for each rate into its slot in h_out[i].
211 for (
int i=0; i<numrates; i++){
// Offset of block ii: each block contributes ratenumf[i]*osize[i] samples.
212 oindex = ii*ratenumf[i]*gpuarrays.
osize[i];
// NOTE(review): device-wide sync runs once per rate per block inside the timed loop,
// and its return code is discarded.
213 cudaDeviceSynchronize();
214 memcpy(&h_out[i][oindex], gpuarrays.
h_out[i], gpuarrays.
osize[i]* ratenumf[i] *
sizeof(
float) );
// Final sync before stopping the timer (return code discarded).
219 cudaDeviceSynchronize();
220 gettimeofday(&t_end, NULL);
// Elapsed wall-clock time in milliseconds (seconds plus microseconds/1e6, times 1000).
221 t_gpu = (double) (t_end.tv_sec + (t_end.tv_usec / 1000000.0) - t_start.tv_sec - (t_start.tv_usec/ 1000000.0)) * 1000.0;
// Report the average time per input block.
222 printf(
"Finished GPU FIR\nProcessing gpu took: %f ms\n", t_gpu/(
double)N);
// Shut down the GPU filter bank.
229 cudaMultiFilterFirClose(&gparams, &gpuarrays);
// Release the per-rate host buffers (only the h_reference free is visible here).
232 for (
int i = 0; i < numrates; ++i){
235 free(h_reference[i]);
static int rdiv[MAX_RATES]
Decimation factors for multiple input rates.
int rnumf[MAX_RATES]
number of filters for each sampling rate
float * bk
filter coefficients array
void read_command_line(int argc, char *argv[], cmd_args *args)
float buf_in[MAX_FILTERS][B_SIZE+OFFSET]
CPU buffer.
float * h_in[MAX_RATES]
host input arrays for GPU transfer
int nrates
how many sampling rates to process
void compute_omp(float *h_in[], float *h_reference[], gpu_arrays *gpuarrays, params *gparams, cmd_args *args, filter_arrays *farr, int N)
int rconst
for nrates=1 keep the initial input size
int nrates
number of input sampling rates to process
int m_offb[MAX_FILTERS]
offset counter for CPU buffers
int isize[MAX_RATES]
input size for each sampling rate
int nf
total number of filters to process
int main(int argc, char **argv)
int nfilters
total number of filters
void compute_ref(float *h_in[], float *h_reference[], gpu_arrays *gpuarrays, params *gparams, cmd_args *args, filter_arrays *farr, int N)
int insize
input size before resampling
void check_results(float *h_reference[], float *h_out[], gpu_arrays *gpuarrays, params *gparams, int N)
#define MAX_RATES
Maximum number of sampling rates.
#define OFFSET
Offset for CPU filter input buffer.
#define B_SIZE
filter length
int osize[MAX_RATES]
total output size for each set of filters
float * h_out[MAX_RATES]
host output arrays for GPU transfer