sofia@0
|
1 /*
|
sofia@0
|
2 GPU multi-rate FIR filter bank example software
|
sofia@0
|
3
|
sofia@0
|
4 Oxford e-Research Centre, Oxford University
|
sofia@0
|
5
|
sofia@0
|
6 Centre for Digital Music, Queen Mary, University of London.
|
sofia@0
|
7
|
sofia@0
|
8 This program is free software: you can redistribute it and/or modify
|
sofia@0
|
9 it under the terms of the GNU General Public License as published by
|
sofia@0
|
10 the Free Software Foundation, either version 3 of the License,
|
sofia@0
|
11 or (at your option) any later version.
|
sofia@0
|
12 See the file COPYING included with this distribution for more information.
|
sofia@0
|
13 */
|
sofia@0
|
14
|
sofia@0
|
15 #include <cstdio>
|
sofia@0
|
16 #include <cstdlib>
|
sofia@0
|
17 #include <cstring>
|
sofia@0
|
18
|
sofia@0
|
19 #include <sys/time.h>
|
sofia@0
|
20 #include "filters.h"
|
sofia@0
|
21
|
sofia@0
|
22 ////////////////////////////////////////////////////////////////////////////////
|
sofia@0
|
23 // Program main
|
sofia@0
|
24 ////////////////////////////////////////////////////////////////////////////////
|
sofia@0
|
25
|
sofia@0
|
26 using namespace std;
|
sofia@0
|
27
|
sofia@0
|
28
|
sofia@0
|
29 int main( int argc, char** argv)
|
sofia@0
|
30 {
|
sofia@0
|
31 unsigned int N = 100; // total number of input blocks before resampling
|
sofia@0
|
32
|
sofia@0
|
33 int bufb_size = B_SIZE + OFFSET;
|
sofia@0
|
34
|
sofia@0
|
35 // the total input signal to loop through is
|
sofia@0
|
36 // essentially used to check results and provide multiple
|
sofia@0
|
37 // block iterations.
|
sofia@0
|
38
|
sofia@0
|
39 float *h_in[MAX_RATES], *h_out[MAX_RATES], *h_reference[MAX_RATES]; // input and output arrays
|
sofia@0
|
40 int out_sz[MAX_RATES]; //total output size for all filters
|
sofia@0
|
41
|
sofia@0
|
42 //timing
|
sofia@0
|
43 struct timeval t_start, t_end;
|
sofia@0
|
44 double t_gpu = 0.0, t_mcpu = 0.0;
|
sofia@0
|
45
|
sofia@0
|
46 // parameters, filter and GPU arrays structures
|
sofia@0
|
47 // object pointers can be passed to these for C++ code
|
sofia@0
|
48 filter_arrays farr;
|
sofia@0
|
49 params gparams;
|
sofia@0
|
50 gpu_arrays gpuarrays;
|
sofia@0
|
51 int ratenumf[MAX_RATES];
|
sofia@0
|
52
|
sofia@0
|
53 // command line arguments
|
sofia@0
|
54 cmd_args args;
|
sofia@0
|
55
|
sofia@0
|
56 //deaults
|
sofia@0
|
57 args.nrates = 3;
|
sofia@0
|
58 args.nf = 60;
|
sofia@0
|
59 args.insize = 1024;
|
sofia@0
|
60 args.rconst = 0;
|
sofia@0
|
61 args.tim=1;
|
sofia@0
|
62
|
sofia@0
|
63 read_command_line(argc, argv, &args);
|
sofia@0
|
64
|
sofia@0
|
65 //initialise parameters
|
sofia@0
|
66 int nf = args.nf;
|
sofia@0
|
67 int numrates = args.nrates;
|
sofia@0
|
68 int rem= nf%numrates;
|
sofia@0
|
69
|
sofia@0
|
70 printf("\nGPU FIR filter parameters\n------\n");
|
sofia@0
|
71 printf("\nTotal number of input blocks = %d\n", N);
|
sofia@0
|
72 gparams.nfilters = nf; // not sure if this is needed
|
sofia@0
|
73 printf("\nTotal number of filters = %d\n", gparams.nfilters);
|
sofia@0
|
74 gparams.fsize = B_SIZE;
|
sofia@0
|
75 printf("\nFilter size = %d\n", gparams.fsize);
|
sofia@0
|
76 gparams.nrates = numrates;
|
sofia@0
|
77 printf("\nNumber of sampling rates = %d\n", gparams.nrates);
|
sofia@0
|
78 gparams.streams = 1;
|
sofia@0
|
79 printf("\nCUDA streams flag = %d\n", gparams.streams);
|
sofia@0
|
80
|
sofia@0
|
81 //dividing sampling rates equally...
|
sofia@0
|
82 for (int i=0; i< numrates; ++i){
|
sofia@0
|
83 ratenumf[i] = nf/numrates;
|
sofia@0
|
84 }
|
sofia@0
|
85 if (rem > 0){
|
sofia@0
|
86 for (int i=0; i<rem; ++i)
|
sofia@0
|
87 ratenumf[i]++;
|
sofia@0
|
88 }
|
sofia@0
|
89
|
sofia@0
|
90 int in_sz = args.insize;
|
sofia@0
|
91 if (numrates == 1 && args.rconst==1){
|
sofia@0
|
92 gpuarrays.osize[0] = in_sz;
|
sofia@0
|
93 }
|
sofia@0
|
94 else{
|
sofia@0
|
95 for (int i=0; i< numrates; ++i){
|
sofia@0
|
96 gpuarrays.osize[i] = in_sz/rdiv[i];
|
sofia@0
|
97 if (gpuarrays.osize[i] < B_SIZE ){
|
sofia@0
|
98 printf("\n\nInput size for rate %d is shorter than filter size.\nChose a longer input block or a shorter filter.\n", i);
|
sofia@0
|
99 printf("\nFilter size = %d\n", B_SIZE);
|
sofia@0
|
100 printf("\nDecimated size = %d\n", gpuarrays.osize[i]);
|
sofia@0
|
101 printf("\nInitial input block size = %d\n", in_sz);
|
sofia@0
|
102 printf("\nDecimation factor = %d\n\n", rdiv[i]);
|
sofia@0
|
103 exit(EXIT_FAILURE);
|
sofia@0
|
104 }
|
sofia@0
|
105 }
|
sofia@0
|
106 }
|
sofia@0
|
107
|
sofia@0
|
108 int out_blk_sz = 0;
|
sofia@0
|
109
|
sofia@0
|
110 for (int i=0; i<numrates; i++){
|
sofia@0
|
111 gparams.rnumf[i] = ratenumf[i];
|
sofia@0
|
112 printf("\nNumber of filters for rate %d = %d\n", i, gparams.rnumf[i]);
|
sofia@0
|
113 gpuarrays.isize[i] = gpuarrays.osize[i] + gparams.fsize;
|
sofia@0
|
114 printf("\nGPU Input size for rate %d = %d\n", i, gpuarrays.isize[i]);
|
sofia@0
|
115 out_sz[i] = N * gpuarrays.osize[i];
|
sofia@0
|
116 out_blk_sz +=ratenumf[i] *gpuarrays.osize[i];
|
sofia@0
|
117 printf("\nGPU output size for 1 block for this rate: %d\n", ratenumf[i] *gpuarrays.osize[i]);
|
sofia@0
|
118 }
|
sofia@0
|
119 printf("\ntotal output size for 1 block for all rates: %d\n", out_blk_sz);
|
sofia@0
|
120 printf("----------------\n");
|
sofia@0
|
121
|
sofia@0
|
122 int oindex = 0;
|
sofia@0
|
123 int pos[MAX_RATES];
|
sofia@0
|
124 int sum = 0;
|
sofia@0
|
125 pos[0] = 0;
|
sofia@0
|
126 for (int r=1; r<numrates; r++){
|
sofia@0
|
127 sum+=ratenumf[r-1];
|
sofia@0
|
128 pos[r] = sum;
|
sofia@0
|
129 }
|
sofia@0
|
130
|
sofia@0
|
131 // Initialize arrays
|
sofia@0
|
132 for (int i=0; i<numrates; i++){
|
sofia@0
|
133 h_in[i] = (float*) malloc((gpuarrays.osize[i]*N + gparams.fsize)*sizeof(float));
|
sofia@0
|
134
|
sofia@0
|
135 h_out[i] = (float*) malloc(gpuarrays.osize[i]*ratenumf[i]*N*sizeof(float));
|
sofia@0
|
136
|
sofia@0
|
137 h_reference[i] = (float*) malloc(gpuarrays.osize[i]*ratenumf[i]*N*sizeof(float));
|
sofia@0
|
138
|
sofia@0
|
139 for (int n=0; n<gparams.fsize; ++n)
|
sofia@0
|
140 h_in[i][n] = 0.0f;
|
sofia@0
|
141
|
sofia@0
|
142 for (int n=0; n<gpuarrays.osize[i]*N; ++n)
|
sofia@0
|
143 h_in[i][n + gparams.fsize] = rand() / (float)RAND_MAX;
|
sofia@0
|
144
|
sofia@0
|
145 for (int n=0; n<gpuarrays.osize[i]*ratenumf[i]*N; ++n){
|
sofia@0
|
146 h_out[i][n] = 0.0f;
|
sofia@0
|
147 h_reference[i][n] = 0.0f;
|
sofia@0
|
148 }
|
sofia@0
|
149 }
|
sofia@0
|
150
|
sofia@0
|
151 // initialize filters with random numbers
|
sofia@0
|
152 farr.bk = (float*) malloc(nf*gparams.fsize*sizeof(float));
|
sofia@0
|
153 for (int i=0; i < nf; ++i){
|
sofia@0
|
154 farr.m_offb[i] = OFFSET;
|
sofia@0
|
155 for (int b=0; b < gparams.fsize; ++b)
|
sofia@0
|
156 farr.bk[i*gparams.fsize+b] = rand() / (float)RAND_MAX;
|
sofia@0
|
157 }
|
sofia@0
|
158
|
sofia@0
|
159 //initialize cpu buffers for each filter
|
sofia@0
|
160 for (int f=0; f < nf; ++f){
|
sofia@0
|
161 for (int i=0; i < bufb_size ; i++){
|
sofia@0
|
162 farr.buf_in[f][i] = 0.0;
|
sofia@0
|
163 }
|
sofia@0
|
164 }
|
sofia@0
|
165
|
sofia@0
|
166 printf("\nRunning multirate filter bank test on GPU, CPU, and CPU OpenMP\n");
|
sofia@0
|
167 // printf("----------------\n");
|
sofia@0
|
168 // compute reference solution
|
sofia@0
|
169 printf("\ncompute reference solution (1 CPU)\n");
|
sofia@0
|
170
|
sofia@0
|
171 compute_ref( h_in, h_reference, &gpuarrays, &gparams, &args, &farr, N);
|
sofia@0
|
172
|
sofia@0
|
173 //reset CPU buffers
|
sofia@0
|
174 for (int f=0; f < nf; ++f){
|
sofia@0
|
175 farr.m_offb[f] = OFFSET;
|
sofia@0
|
176 for (int i=0; i < bufb_size ; ++i)
|
sofia@0
|
177 farr.buf_in[f][i] = 0.0;
|
sofia@0
|
178 }
|
sofia@0
|
179
|
sofia@0
|
180 // compute OPENMP solution
|
sofia@0
|
181 printf("\nCompute OPENMP solution\n");
|
sofia@0
|
182
|
sofia@0
|
183 compute_omp( h_in, h_reference, &gpuarrays, &gparams, &args, &farr, N);
|
sofia@0
|
184
|
sofia@0
|
185 // compute CUDA solution
|
sofia@0
|
186 printf("\nCompute cuda solution\n");
|
sofia@0
|
187
|
sofia@0
|
188 // init
|
sofia@0
|
189 cudaMultiFilterFirInit(h_in, h_out, farr.bk, &gparams, &gpuarrays);
|
sofia@0
|
190 for (int i=0; i<numrates; i++)
|
sofia@0
|
191 memcpy(gpuarrays.h_in[i], &h_in[i][0], gpuarrays.isize[i]*sizeof(float) );
|
sofia@0
|
192 //time execution
|
sofia@0
|
193 oindex = 0;
|
sofia@0
|
194 if (args.tim)
|
sofia@0
|
195 gettimeofday(&t_start, NULL);
|
sofia@0
|
196
|
sofia@0
|
197 for (int ii=0; ii<N; ++ii){ // loop through input blocks
|
sofia@0
|
198
|
sofia@0
|
199 //need to copy data to pinned memory for streams...
|
sofia@0
|
200 for (int i=0; i<numrates; i++){
|
sofia@0
|
201 if (ii==0)
|
sofia@0
|
202 memcpy(gpuarrays.h_in[i], &h_in[i][0], gpuarrays.isize[i]*sizeof(float) );
|
sofia@0
|
203 else
|
sofia@0
|
204 memcpy(gpuarrays.h_in[i], &h_in[i][ii*gpuarrays.osize[i]], gpuarrays.isize[i]*sizeof(float) );
|
sofia@0
|
205 }
|
sofia@0
|
206
|
sofia@0
|
207 //call GPU function
|
sofia@0
|
208 cudaMultiFilterFirStreams(&gpuarrays, &gparams);
|
sofia@0
|
209
|
sofia@0
|
210 // ... and copy data back from pinned memory...
|
sofia@0
|
211 for (int i=0; i<numrates; i++){
|
sofia@0
|
212 oindex = ii*ratenumf[i]*gpuarrays.osize[i];
|
sofia@0
|
213 cudaDeviceSynchronize();
|
sofia@0
|
214 memcpy(&h_out[i][oindex], gpuarrays.h_out[i], gpuarrays.osize[i]* ratenumf[i] *sizeof(float) );
|
sofia@0
|
215 }
|
sofia@0
|
216 }
|
sofia@0
|
217
|
sofia@0
|
218 if (args.tim){
|
sofia@0
|
219 cudaDeviceSynchronize();
|
sofia@0
|
220 gettimeofday(&t_end, NULL);
|
sofia@0
|
221 t_gpu = (double) (t_end.tv_sec + (t_end.tv_usec / 1000000.0) - t_start.tv_sec - (t_start.tv_usec/ 1000000.0)) * 1000.0;
|
sofia@0
|
222 printf("Finished GPU FIR\nProcessing gpu took: %f ms\n", t_gpu/(double)N);
|
sofia@0
|
223 }
|
sofia@0
|
224
|
sofia@0
|
225 // check results
|
sofia@0
|
226 check_results(h_reference, h_out, &gpuarrays, &gparams, N);
|
sofia@0
|
227
|
sofia@0
|
228 //close GPU
|
sofia@0
|
229 cudaMultiFilterFirClose(&gparams, &gpuarrays);
|
sofia@0
|
230
|
sofia@0
|
231 // cleanup memory
|
sofia@0
|
232 for (int i = 0; i < numrates; ++i){
|
sofia@0
|
233 free(h_out[i]);
|
sofia@0
|
234 free(h_in[i]);
|
sofia@0
|
235 free(h_reference[i]);
|
sofia@0
|
236 }
|
sofia@0
|
237
|
sofia@0
|
238 free(farr.bk);
|
sofia@0
|
239
|
sofia@0
|
240 }
|