annotate src/vamp-sdk/ext/kiss_fft.c @ 434:e979a9c4ffb6 vampipe

Switch from Cross FFT with option of FFTW build, to KissFFT only (code bundled). This is much faster than the default build and simpler than managing two options.
author Chris Cannam
date Tue, 16 Aug 2016 16:04:09 +0100
parents
children b4addbeab790
rev   line source
Chris@434 1 /*
Chris@434 2 Copyright (c) 2003-2010, Mark Borgerding
Chris@434 3
Chris@434 4 All rights reserved.
Chris@434 5
Chris@434 6 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Chris@434 7
Chris@434 8 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Chris@434 9 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
Chris@434 10 * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
Chris@434 11
Chris@434 12 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Chris@434 13 */
Chris@434 14
Chris@434 15
Chris@434 16 #include "_kiss_fft_guts.h"
Chris@434 17 /* The guts header contains all the multiplication and addition macros that are defined for
Chris@434 18 fixed or floating point complex numbers. It also delares the kf_ internal functions.
Chris@434 19 */
Chris@434 20
Chris@434 21 static void kf_bfly2(
Chris@434 22 kiss_fft_cpx * Fout,
Chris@434 23 const size_t fstride,
Chris@434 24 const kiss_fft_cfg st,
Chris@434 25 int m
Chris@434 26 )
Chris@434 27 {
Chris@434 28 kiss_fft_cpx * Fout2;
Chris@434 29 kiss_fft_cpx * tw1 = st->twiddles;
Chris@434 30 kiss_fft_cpx t;
Chris@434 31 Fout2 = Fout + m;
Chris@434 32 do{
Chris@434 33 C_FIXDIV(*Fout,2); C_FIXDIV(*Fout2,2);
Chris@434 34
Chris@434 35 C_MUL (t, *Fout2 , *tw1);
Chris@434 36 tw1 += fstride;
Chris@434 37 C_SUB( *Fout2 , *Fout , t );
Chris@434 38 C_ADDTO( *Fout , t );
Chris@434 39 ++Fout2;
Chris@434 40 ++Fout;
Chris@434 41 }while (--m);
Chris@434 42 }
Chris@434 43
Chris@434 44 static void kf_bfly4(
Chris@434 45 kiss_fft_cpx * Fout,
Chris@434 46 const size_t fstride,
Chris@434 47 const kiss_fft_cfg st,
Chris@434 48 const size_t m
Chris@434 49 )
Chris@434 50 {
Chris@434 51 kiss_fft_cpx *tw1,*tw2,*tw3;
Chris@434 52 kiss_fft_cpx scratch[6];
Chris@434 53 size_t k=m;
Chris@434 54 const size_t m2=2*m;
Chris@434 55 const size_t m3=3*m;
Chris@434 56
Chris@434 57
Chris@434 58 tw3 = tw2 = tw1 = st->twiddles;
Chris@434 59
Chris@434 60 do {
Chris@434 61 C_FIXDIV(*Fout,4); C_FIXDIV(Fout[m],4); C_FIXDIV(Fout[m2],4); C_FIXDIV(Fout[m3],4);
Chris@434 62
Chris@434 63 C_MUL(scratch[0],Fout[m] , *tw1 );
Chris@434 64 C_MUL(scratch[1],Fout[m2] , *tw2 );
Chris@434 65 C_MUL(scratch[2],Fout[m3] , *tw3 );
Chris@434 66
Chris@434 67 C_SUB( scratch[5] , *Fout, scratch[1] );
Chris@434 68 C_ADDTO(*Fout, scratch[1]);
Chris@434 69 C_ADD( scratch[3] , scratch[0] , scratch[2] );
Chris@434 70 C_SUB( scratch[4] , scratch[0] , scratch[2] );
Chris@434 71 C_SUB( Fout[m2], *Fout, scratch[3] );
Chris@434 72 tw1 += fstride;
Chris@434 73 tw2 += fstride*2;
Chris@434 74 tw3 += fstride*3;
Chris@434 75 C_ADDTO( *Fout , scratch[3] );
Chris@434 76
Chris@434 77 if(st->inverse) {
Chris@434 78 Fout[m].r = scratch[5].r - scratch[4].i;
Chris@434 79 Fout[m].i = scratch[5].i + scratch[4].r;
Chris@434 80 Fout[m3].r = scratch[5].r + scratch[4].i;
Chris@434 81 Fout[m3].i = scratch[5].i - scratch[4].r;
Chris@434 82 }else{
Chris@434 83 Fout[m].r = scratch[5].r + scratch[4].i;
Chris@434 84 Fout[m].i = scratch[5].i - scratch[4].r;
Chris@434 85 Fout[m3].r = scratch[5].r - scratch[4].i;
Chris@434 86 Fout[m3].i = scratch[5].i + scratch[4].r;
Chris@434 87 }
Chris@434 88 ++Fout;
Chris@434 89 }while(--k);
Chris@434 90 }
Chris@434 91
Chris@434 92 static void kf_bfly3(
Chris@434 93 kiss_fft_cpx * Fout,
Chris@434 94 const size_t fstride,
Chris@434 95 const kiss_fft_cfg st,
Chris@434 96 size_t m
Chris@434 97 )
Chris@434 98 {
Chris@434 99 size_t k=m;
Chris@434 100 const size_t m2 = 2*m;
Chris@434 101 kiss_fft_cpx *tw1,*tw2;
Chris@434 102 kiss_fft_cpx scratch[5];
Chris@434 103 kiss_fft_cpx epi3;
Chris@434 104 epi3 = st->twiddles[fstride*m];
Chris@434 105
Chris@434 106 tw1=tw2=st->twiddles;
Chris@434 107
Chris@434 108 do{
Chris@434 109 C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);
Chris@434 110
Chris@434 111 C_MUL(scratch[1],Fout[m] , *tw1);
Chris@434 112 C_MUL(scratch[2],Fout[m2] , *tw2);
Chris@434 113
Chris@434 114 C_ADD(scratch[3],scratch[1],scratch[2]);
Chris@434 115 C_SUB(scratch[0],scratch[1],scratch[2]);
Chris@434 116 tw1 += fstride;
Chris@434 117 tw2 += fstride*2;
Chris@434 118
Chris@434 119 Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
Chris@434 120 Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
Chris@434 121
Chris@434 122 C_MULBYSCALAR( scratch[0] , epi3.i );
Chris@434 123
Chris@434 124 C_ADDTO(*Fout,scratch[3]);
Chris@434 125
Chris@434 126 Fout[m2].r = Fout[m].r + scratch[0].i;
Chris@434 127 Fout[m2].i = Fout[m].i - scratch[0].r;
Chris@434 128
Chris@434 129 Fout[m].r -= scratch[0].i;
Chris@434 130 Fout[m].i += scratch[0].r;
Chris@434 131
Chris@434 132 ++Fout;
Chris@434 133 }while(--k);
Chris@434 134 }
Chris@434 135
Chris@434 136 static void kf_bfly5(
Chris@434 137 kiss_fft_cpx * Fout,
Chris@434 138 const size_t fstride,
Chris@434 139 const kiss_fft_cfg st,
Chris@434 140 int m
Chris@434 141 )
Chris@434 142 {
Chris@434 143 kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
Chris@434 144 int u;
Chris@434 145 kiss_fft_cpx scratch[13];
Chris@434 146 kiss_fft_cpx * twiddles = st->twiddles;
Chris@434 147 kiss_fft_cpx *tw;
Chris@434 148 kiss_fft_cpx ya,yb;
Chris@434 149 ya = twiddles[fstride*m];
Chris@434 150 yb = twiddles[fstride*2*m];
Chris@434 151
Chris@434 152 Fout0=Fout;
Chris@434 153 Fout1=Fout0+m;
Chris@434 154 Fout2=Fout0+2*m;
Chris@434 155 Fout3=Fout0+3*m;
Chris@434 156 Fout4=Fout0+4*m;
Chris@434 157
Chris@434 158 tw=st->twiddles;
Chris@434 159 for ( u=0; u<m; ++u ) {
Chris@434 160 C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
Chris@434 161 scratch[0] = *Fout0;
Chris@434 162
Chris@434 163 C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
Chris@434 164 C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
Chris@434 165 C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
Chris@434 166 C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
Chris@434 167
Chris@434 168 C_ADD( scratch[7],scratch[1],scratch[4]);
Chris@434 169 C_SUB( scratch[10],scratch[1],scratch[4]);
Chris@434 170 C_ADD( scratch[8],scratch[2],scratch[3]);
Chris@434 171 C_SUB( scratch[9],scratch[2],scratch[3]);
Chris@434 172
Chris@434 173 Fout0->r += scratch[7].r + scratch[8].r;
Chris@434 174 Fout0->i += scratch[7].i + scratch[8].i;
Chris@434 175
Chris@434 176 scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
Chris@434 177 scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
Chris@434 178
Chris@434 179 scratch[6].r = S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
Chris@434 180 scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);
Chris@434 181
Chris@434 182 C_SUB(*Fout1,scratch[5],scratch[6]);
Chris@434 183 C_ADD(*Fout4,scratch[5],scratch[6]);
Chris@434 184
Chris@434 185 scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
Chris@434 186 scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
Chris@434 187 scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
Chris@434 188 scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);
Chris@434 189
Chris@434 190 C_ADD(*Fout2,scratch[11],scratch[12]);
Chris@434 191 C_SUB(*Fout3,scratch[11],scratch[12]);
Chris@434 192
Chris@434 193 ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
Chris@434 194 }
Chris@434 195 }
Chris@434 196
Chris@434 197 /* perform the butterfly for one stage of a mixed radix FFT */
Chris@434 198 static void kf_bfly_generic(
Chris@434 199 kiss_fft_cpx * Fout,
Chris@434 200 const size_t fstride,
Chris@434 201 const kiss_fft_cfg st,
Chris@434 202 int m,
Chris@434 203 int p
Chris@434 204 )
Chris@434 205 {
Chris@434 206 int u,k,q1,q;
Chris@434 207 kiss_fft_cpx * twiddles = st->twiddles;
Chris@434 208 kiss_fft_cpx t;
Chris@434 209 int Norig = st->nfft;
Chris@434 210
Chris@434 211 kiss_fft_cpx * scratch = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC(sizeof(kiss_fft_cpx)*p);
Chris@434 212
Chris@434 213 for ( u=0; u<m; ++u ) {
Chris@434 214 k=u;
Chris@434 215 for ( q1=0 ; q1<p ; ++q1 ) {
Chris@434 216 scratch[q1] = Fout[ k ];
Chris@434 217 C_FIXDIV(scratch[q1],p);
Chris@434 218 k += m;
Chris@434 219 }
Chris@434 220
Chris@434 221 k=u;
Chris@434 222 for ( q1=0 ; q1<p ; ++q1 ) {
Chris@434 223 int twidx=0;
Chris@434 224 Fout[ k ] = scratch[0];
Chris@434 225 for (q=1;q<p;++q ) {
Chris@434 226 twidx += fstride * k;
Chris@434 227 if (twidx>=Norig) twidx-=Norig;
Chris@434 228 C_MUL(t,scratch[q] , twiddles[twidx] );
Chris@434 229 C_ADDTO( Fout[ k ] ,t);
Chris@434 230 }
Chris@434 231 k += m;
Chris@434 232 }
Chris@434 233 }
Chris@434 234 KISS_FFT_TMP_FREE(scratch);
Chris@434 235 }
Chris@434 236
Chris@434 237 static
Chris@434 238 void kf_work(
Chris@434 239 kiss_fft_cpx * Fout,
Chris@434 240 const kiss_fft_cpx * f,
Chris@434 241 const size_t fstride,
Chris@434 242 int in_stride,
Chris@434 243 int * factors,
Chris@434 244 const kiss_fft_cfg st
Chris@434 245 )
Chris@434 246 {
Chris@434 247 kiss_fft_cpx * Fout_beg=Fout;
Chris@434 248 const int p=*factors++; /* the radix */
Chris@434 249 const int m=*factors++; /* stage's fft length/p */
Chris@434 250 const kiss_fft_cpx * Fout_end = Fout + p*m;
Chris@434 251
Chris@434 252 #ifdef _OPENMP
Chris@434 253 // use openmp extensions at the
Chris@434 254 // top-level (not recursive)
Chris@434 255 if (fstride==1 && p<=5)
Chris@434 256 {
Chris@434 257 int k;
Chris@434 258
Chris@434 259 // execute the p different work units in different threads
Chris@434 260 # pragma omp parallel for
Chris@434 261 for (k=0;k<p;++k)
Chris@434 262 kf_work( Fout +k*m, f+ fstride*in_stride*k,fstride*p,in_stride,factors,st);
Chris@434 263 // all threads have joined by this point
Chris@434 264
Chris@434 265 switch (p) {
Chris@434 266 case 2: kf_bfly2(Fout,fstride,st,m); break;
Chris@434 267 case 3: kf_bfly3(Fout,fstride,st,m); break;
Chris@434 268 case 4: kf_bfly4(Fout,fstride,st,m); break;
Chris@434 269 case 5: kf_bfly5(Fout,fstride,st,m); break;
Chris@434 270 default: kf_bfly_generic(Fout,fstride,st,m,p); break;
Chris@434 271 }
Chris@434 272 return;
Chris@434 273 }
Chris@434 274 #endif
Chris@434 275
Chris@434 276 if (m==1) {
Chris@434 277 do{
Chris@434 278 *Fout = *f;
Chris@434 279 f += fstride*in_stride;
Chris@434 280 }while(++Fout != Fout_end );
Chris@434 281 }else{
Chris@434 282 do{
Chris@434 283 // recursive call:
Chris@434 284 // DFT of size m*p performed by doing
Chris@434 285 // p instances of smaller DFTs of size m,
Chris@434 286 // each one takes a decimated version of the input
Chris@434 287 kf_work( Fout , f, fstride*p, in_stride, factors,st);
Chris@434 288 f += fstride*in_stride;
Chris@434 289 }while( (Fout += m) != Fout_end );
Chris@434 290 }
Chris@434 291
Chris@434 292 Fout=Fout_beg;
Chris@434 293
Chris@434 294 // recombine the p smaller DFTs
Chris@434 295 switch (p) {
Chris@434 296 case 2: kf_bfly2(Fout,fstride,st,m); break;
Chris@434 297 case 3: kf_bfly3(Fout,fstride,st,m); break;
Chris@434 298 case 4: kf_bfly4(Fout,fstride,st,m); break;
Chris@434 299 case 5: kf_bfly5(Fout,fstride,st,m); break;
Chris@434 300 default: kf_bfly_generic(Fout,fstride,st,m,p); break;
Chris@434 301 }
Chris@434 302 }
Chris@434 303
Chris@434 304 /* facbuf is populated by p1,m1,p2,m2, ...
Chris@434 305 where
Chris@434 306 p[i] * m[i] = m[i-1]
Chris@434 307 m0 = n */
Chris@434 308 static
Chris@434 309 void kf_factor(int n,int * facbuf)
Chris@434 310 {
Chris@434 311 int p=4;
Chris@434 312 double floor_sqrt;
Chris@434 313 floor_sqrt = floor( sqrt((double)n) );
Chris@434 314
Chris@434 315 /*factor out powers of 4, powers of 2, then any remaining primes */
Chris@434 316 do {
Chris@434 317 while (n % p) {
Chris@434 318 switch (p) {
Chris@434 319 case 4: p = 2; break;
Chris@434 320 case 2: p = 3; break;
Chris@434 321 default: p += 2; break;
Chris@434 322 }
Chris@434 323 if (p > floor_sqrt)
Chris@434 324 p = n; /* no more factors, skip to end */
Chris@434 325 }
Chris@434 326 n /= p;
Chris@434 327 *facbuf++ = p;
Chris@434 328 *facbuf++ = n;
Chris@434 329 } while (n > 1);
Chris@434 330 }
Chris@434 331
Chris@434 332 /*
Chris@434 333 *
Chris@434 334 * User-callable function to allocate all necessary storage space for the fft.
Chris@434 335 *
Chris@434 336 * The return value is a contiguous block of memory, allocated with malloc. As such,
Chris@434 337 * It can be freed with free(), rather than a kiss_fft-specific function.
Chris@434 338 * */
Chris@434 339 kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem )
Chris@434 340 {
Chris@434 341 kiss_fft_cfg st=NULL;
Chris@434 342 size_t memneeded = sizeof(struct kiss_fft_state)
Chris@434 343 + sizeof(kiss_fft_cpx)*(nfft-1); /* twiddle factors*/
Chris@434 344
Chris@434 345 if ( lenmem==NULL ) {
Chris@434 346 st = ( kiss_fft_cfg)KISS_FFT_MALLOC( memneeded );
Chris@434 347 }else{
Chris@434 348 if (mem != NULL && *lenmem >= memneeded)
Chris@434 349 st = (kiss_fft_cfg)mem;
Chris@434 350 *lenmem = memneeded;
Chris@434 351 }
Chris@434 352 if (st) {
Chris@434 353 int i;
Chris@434 354 st->nfft=nfft;
Chris@434 355 st->inverse = inverse_fft;
Chris@434 356
Chris@434 357 for (i=0;i<nfft;++i) {
Chris@434 358 const double pi=3.141592653589793238462643383279502884197169399375105820974944;
Chris@434 359 double phase = -2*pi*i / nfft;
Chris@434 360 if (st->inverse)
Chris@434 361 phase *= -1;
Chris@434 362 kf_cexp(st->twiddles+i, phase );
Chris@434 363 }
Chris@434 364
Chris@434 365 kf_factor(nfft,st->factors);
Chris@434 366 }
Chris@434 367 return st;
Chris@434 368 }
Chris@434 369
Chris@434 370
Chris@434 371 void kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int in_stride)
Chris@434 372 {
Chris@434 373 if (fin == fout) {
Chris@434 374 //NOTE: this is not really an in-place FFT algorithm.
Chris@434 375 //It just performs an out-of-place FFT into a temp buffer
Chris@434 376 kiss_fft_cpx * tmpbuf = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC( sizeof(kiss_fft_cpx)*st->nfft);
Chris@434 377 kf_work(tmpbuf,fin,1,in_stride, st->factors,st);
Chris@434 378 memcpy(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft);
Chris@434 379 KISS_FFT_TMP_FREE(tmpbuf);
Chris@434 380 }else{
Chris@434 381 kf_work( fout, fin, 1,in_stride, st->factors,st );
Chris@434 382 }
Chris@434 383 }
Chris@434 384
Chris@434 385 void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
Chris@434 386 {
Chris@434 387 kiss_fft_stride(cfg,fin,fout,1);
Chris@434 388 }
Chris@434 389
Chris@434 390
Chris@434 391 void kiss_fft_cleanup(void)
Chris@434 392 {
Chris@434 393 // nothing needed any more
Chris@434 394 }
Chris@434 395
Chris@434 396 int kiss_fft_next_fast_size(int n)
Chris@434 397 {
Chris@434 398 while(1) {
Chris@434 399 int m=n;
Chris@434 400 while ( (m%2) == 0 ) m/=2;
Chris@434 401 while ( (m%3) == 0 ) m/=3;
Chris@434 402 while ( (m%5) == 0 ) m/=5;
Chris@434 403 if (m<=1)
Chris@434 404 break; /* n is completely factorable by twos, threes, and fives */
Chris@434 405 n++;
Chris@434 406 }
Chris@434 407 return n;
Chris@434 408 }