annotate ext/kissfft/kiss_fft.c @ 515:08bcc06c38ec tip master

Remove fast-math
author Chris Cannam <cannam@all-day-breakfast.com>
date Tue, 28 Jan 2020 15:27:37 +0000
parents 624ddb622a5d
children
rev   line source
c@409 1 /*
c@409 2 Copyright (c) 2003-2010, Mark Borgerding
c@409 3
c@409 4 All rights reserved.
c@409 5
c@409 6 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
c@409 7
c@409 8 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
c@409 9 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
c@409 10 * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
c@409 11
c@409 12 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
c@409 13 */
c@409 14
c@409 15
c@409 16 #include "_kiss_fft_guts.h"
c@409 17 /* The guts header contains all the multiplication and addition macros that are defined for
c@409 18 fixed or floating point complex numbers. It also delares the kf_ internal functions.
c@409 19 */
c@409 20
c@409 21 static void kf_bfly2(
c@409 22 kiss_fft_cpx * Fout,
c@409 23 const size_t fstride,
c@409 24 const kiss_fft_cfg st,
c@409 25 int m
c@409 26 )
c@409 27 {
c@409 28 kiss_fft_cpx * Fout2;
c@409 29 kiss_fft_cpx * tw1 = st->twiddles;
c@409 30 kiss_fft_cpx t;
c@409 31 Fout2 = Fout + m;
c@409 32 do{
c@409 33 C_FIXDIV(*Fout,2); C_FIXDIV(*Fout2,2);
c@409 34
c@409 35 C_MUL (t, *Fout2 , *tw1);
c@409 36 tw1 += fstride;
c@409 37 C_SUB( *Fout2 , *Fout , t );
c@409 38 C_ADDTO( *Fout , t );
c@409 39 ++Fout2;
c@409 40 ++Fout;
c@409 41 }while (--m);
c@409 42 }
c@409 43
c@409 44 static void kf_bfly4(
c@409 45 kiss_fft_cpx * Fout,
c@409 46 const size_t fstride,
c@409 47 const kiss_fft_cfg st,
c@409 48 const size_t m
c@409 49 )
c@409 50 {
c@409 51 kiss_fft_cpx *tw1,*tw2,*tw3;
c@409 52 kiss_fft_cpx scratch[6];
c@409 53 size_t k=m;
c@409 54 const size_t m2=2*m;
c@409 55 const size_t m3=3*m;
c@409 56
c@409 57
c@409 58 tw3 = tw2 = tw1 = st->twiddles;
c@409 59
c@409 60 do {
c@409 61 C_FIXDIV(*Fout,4); C_FIXDIV(Fout[m],4); C_FIXDIV(Fout[m2],4); C_FIXDIV(Fout[m3],4);
c@409 62
c@409 63 C_MUL(scratch[0],Fout[m] , *tw1 );
c@409 64 C_MUL(scratch[1],Fout[m2] , *tw2 );
c@409 65 C_MUL(scratch[2],Fout[m3] , *tw3 );
c@409 66
c@409 67 C_SUB( scratch[5] , *Fout, scratch[1] );
c@409 68 C_ADDTO(*Fout, scratch[1]);
c@409 69 C_ADD( scratch[3] , scratch[0] , scratch[2] );
c@409 70 C_SUB( scratch[4] , scratch[0] , scratch[2] );
c@409 71 C_SUB( Fout[m2], *Fout, scratch[3] );
c@409 72 tw1 += fstride;
c@409 73 tw2 += fstride*2;
c@409 74 tw3 += fstride*3;
c@409 75 C_ADDTO( *Fout , scratch[3] );
c@409 76
c@409 77 if(st->inverse) {
c@409 78 Fout[m].r = scratch[5].r - scratch[4].i;
c@409 79 Fout[m].i = scratch[5].i + scratch[4].r;
c@409 80 Fout[m3].r = scratch[5].r + scratch[4].i;
c@409 81 Fout[m3].i = scratch[5].i - scratch[4].r;
c@409 82 }else{
c@409 83 Fout[m].r = scratch[5].r + scratch[4].i;
c@409 84 Fout[m].i = scratch[5].i - scratch[4].r;
c@409 85 Fout[m3].r = scratch[5].r - scratch[4].i;
c@409 86 Fout[m3].i = scratch[5].i + scratch[4].r;
c@409 87 }
c@409 88 ++Fout;
c@409 89 }while(--k);
c@409 90 }
c@409 91
c@409 92 static void kf_bfly3(
c@409 93 kiss_fft_cpx * Fout,
c@409 94 const size_t fstride,
c@409 95 const kiss_fft_cfg st,
c@409 96 size_t m
c@409 97 )
c@409 98 {
c@409 99 size_t k=m;
c@409 100 const size_t m2 = 2*m;
c@409 101 kiss_fft_cpx *tw1,*tw2;
c@409 102 kiss_fft_cpx scratch[5];
c@409 103 kiss_fft_cpx epi3;
c@409 104 epi3 = st->twiddles[fstride*m];
c@409 105
c@409 106 tw1=tw2=st->twiddles;
c@409 107
c@409 108 do{
c@409 109 C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);
c@409 110
c@409 111 C_MUL(scratch[1],Fout[m] , *tw1);
c@409 112 C_MUL(scratch[2],Fout[m2] , *tw2);
c@409 113
c@409 114 C_ADD(scratch[3],scratch[1],scratch[2]);
c@409 115 C_SUB(scratch[0],scratch[1],scratch[2]);
c@409 116 tw1 += fstride;
c@409 117 tw2 += fstride*2;
c@409 118
c@409 119 Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
c@409 120 Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
c@409 121
c@409 122 C_MULBYSCALAR( scratch[0] , epi3.i );
c@409 123
c@409 124 C_ADDTO(*Fout,scratch[3]);
c@409 125
c@409 126 Fout[m2].r = Fout[m].r + scratch[0].i;
c@409 127 Fout[m2].i = Fout[m].i - scratch[0].r;
c@409 128
c@409 129 Fout[m].r -= scratch[0].i;
c@409 130 Fout[m].i += scratch[0].r;
c@409 131
c@409 132 ++Fout;
c@409 133 }while(--k);
c@409 134 }
c@409 135
c@409 136 static void kf_bfly5(
c@409 137 kiss_fft_cpx * Fout,
c@409 138 const size_t fstride,
c@409 139 const kiss_fft_cfg st,
c@409 140 int m
c@409 141 )
c@409 142 {
c@409 143 kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
c@409 144 int u;
c@409 145 kiss_fft_cpx scratch[13];
c@409 146 kiss_fft_cpx * twiddles = st->twiddles;
c@409 147 kiss_fft_cpx *tw;
c@409 148 kiss_fft_cpx ya,yb;
c@409 149 ya = twiddles[fstride*m];
c@409 150 yb = twiddles[fstride*2*m];
c@409 151
c@409 152 Fout0=Fout;
c@409 153 Fout1=Fout0+m;
c@409 154 Fout2=Fout0+2*m;
c@409 155 Fout3=Fout0+3*m;
c@409 156 Fout4=Fout0+4*m;
c@409 157
c@409 158 tw=st->twiddles;
c@409 159 for ( u=0; u<m; ++u ) {
c@409 160 C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
c@409 161 scratch[0] = *Fout0;
c@409 162
c@409 163 C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
c@409 164 C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
c@409 165 C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
c@409 166 C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
c@409 167
c@409 168 C_ADD( scratch[7],scratch[1],scratch[4]);
c@409 169 C_SUB( scratch[10],scratch[1],scratch[4]);
c@409 170 C_ADD( scratch[8],scratch[2],scratch[3]);
c@409 171 C_SUB( scratch[9],scratch[2],scratch[3]);
c@409 172
c@409 173 Fout0->r += scratch[7].r + scratch[8].r;
c@409 174 Fout0->i += scratch[7].i + scratch[8].i;
c@409 175
c@409 176 scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
c@409 177 scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
c@409 178
c@409 179 scratch[6].r = S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
c@409 180 scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);
c@409 181
c@409 182 C_SUB(*Fout1,scratch[5],scratch[6]);
c@409 183 C_ADD(*Fout4,scratch[5],scratch[6]);
c@409 184
c@409 185 scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
c@409 186 scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
c@409 187 scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
c@409 188 scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);
c@409 189
c@409 190 C_ADD(*Fout2,scratch[11],scratch[12]);
c@409 191 C_SUB(*Fout3,scratch[11],scratch[12]);
c@409 192
c@409 193 ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
c@409 194 }
c@409 195 }
c@409 196
c@409 197 /* perform the butterfly for one stage of a mixed radix FFT */
c@409 198 static void kf_bfly_generic(
c@409 199 kiss_fft_cpx * Fout,
c@409 200 const size_t fstride,
c@409 201 const kiss_fft_cfg st,
c@409 202 int m,
c@409 203 int p
c@409 204 )
c@409 205 {
c@409 206 int u,k,q1,q;
c@409 207 kiss_fft_cpx * twiddles = st->twiddles;
c@409 208 kiss_fft_cpx t;
c@409 209 int Norig = st->nfft;
c@409 210
c@409 211 kiss_fft_cpx * scratch = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC(sizeof(kiss_fft_cpx)*p);
c@409 212
c@409 213 for ( u=0; u<m; ++u ) {
c@409 214 k=u;
c@409 215 for ( q1=0 ; q1<p ; ++q1 ) {
c@409 216 scratch[q1] = Fout[ k ];
c@409 217 C_FIXDIV(scratch[q1],p);
c@409 218 k += m;
c@409 219 }
c@409 220
c@409 221 k=u;
c@409 222 for ( q1=0 ; q1<p ; ++q1 ) {
c@409 223 int twidx=0;
c@409 224 Fout[ k ] = scratch[0];
c@409 225 for (q=1;q<p;++q ) {
c@409 226 twidx += fstride * k;
c@409 227 if (twidx>=Norig) twidx-=Norig;
c@409 228 C_MUL(t,scratch[q] , twiddles[twidx] );
c@409 229 C_ADDTO( Fout[ k ] ,t);
c@409 230 }
c@409 231 k += m;
c@409 232 }
c@409 233 }
c@409 234 KISS_FFT_TMP_FREE(scratch);
c@409 235 }
c@409 236
c@409 237 static
c@409 238 void kf_work(
c@409 239 kiss_fft_cpx * Fout,
c@409 240 const kiss_fft_cpx * f,
c@409 241 const size_t fstride,
c@409 242 int in_stride,
c@409 243 int * factors,
c@409 244 const kiss_fft_cfg st
c@409 245 )
c@409 246 {
c@409 247 kiss_fft_cpx * Fout_beg=Fout;
c@409 248 const int p=*factors++; /* the radix */
c@409 249 const int m=*factors++; /* stage's fft length/p */
c@409 250 const kiss_fft_cpx * Fout_end = Fout + p*m;
c@409 251
c@409 252 #ifdef _OPENMP
c@409 253 // use openmp extensions at the
c@409 254 // top-level (not recursive)
c@409 255 if (fstride==1 && p<=5)
c@409 256 {
c@409 257 int k;
c@409 258
c@409 259 // execute the p different work units in different threads
c@409 260 # pragma omp parallel for
c@409 261 for (k=0;k<p;++k)
c@409 262 kf_work( Fout +k*m, f+ fstride*in_stride*k,fstride*p,in_stride,factors,st);
c@409 263 // all threads have joined by this point
c@409 264
c@409 265 switch (p) {
c@409 266 case 2: kf_bfly2(Fout,fstride,st,m); break;
c@409 267 case 3: kf_bfly3(Fout,fstride,st,m); break;
c@409 268 case 4: kf_bfly4(Fout,fstride,st,m); break;
c@409 269 case 5: kf_bfly5(Fout,fstride,st,m); break;
c@409 270 default: kf_bfly_generic(Fout,fstride,st,m,p); break;
c@409 271 }
c@409 272 return;
c@409 273 }
c@409 274 #endif
c@409 275
c@409 276 if (m==1) {
c@409 277 do{
c@432 278 Fout->r = f->r;
c@432 279 Fout->i = f->i;
c@409 280 f += fstride*in_stride;
c@409 281 }while(++Fout != Fout_end );
c@409 282 }else{
c@409 283 do{
c@409 284 // recursive call:
c@409 285 // DFT of size m*p performed by doing
c@409 286 // p instances of smaller DFTs of size m,
c@409 287 // each one takes a decimated version of the input
c@409 288 kf_work( Fout , f, fstride*p, in_stride, factors,st);
c@409 289 f += fstride*in_stride;
c@409 290 }while( (Fout += m) != Fout_end );
c@409 291 }
c@409 292
c@409 293 Fout=Fout_beg;
c@409 294
c@409 295 // recombine the p smaller DFTs
c@409 296 switch (p) {
c@409 297 case 2: kf_bfly2(Fout,fstride,st,m); break;
c@409 298 case 3: kf_bfly3(Fout,fstride,st,m); break;
c@409 299 case 4: kf_bfly4(Fout,fstride,st,m); break;
c@409 300 case 5: kf_bfly5(Fout,fstride,st,m); break;
c@409 301 default: kf_bfly_generic(Fout,fstride,st,m,p); break;
c@409 302 }
c@409 303 }
c@409 304
c@409 305 /* facbuf is populated by p1,m1,p2,m2, ...
c@409 306 where
c@409 307 p[i] * m[i] = m[i-1]
c@409 308 m0 = n */
c@409 309 static
c@409 310 void kf_factor(int n,int * facbuf)
c@409 311 {
c@409 312 int p=4;
c@409 313 double floor_sqrt;
c@409 314 floor_sqrt = floor( sqrt((double)n) );
c@409 315
c@409 316 /*factor out powers of 4, powers of 2, then any remaining primes */
c@409 317 do {
c@409 318 while (n % p) {
c@409 319 switch (p) {
c@409 320 case 4: p = 2; break;
c@409 321 case 2: p = 3; break;
c@409 322 default: p += 2; break;
c@409 323 }
c@409 324 if (p > floor_sqrt)
c@409 325 p = n; /* no more factors, skip to end */
c@409 326 }
c@409 327 n /= p;
c@409 328 *facbuf++ = p;
c@409 329 *facbuf++ = n;
c@409 330 } while (n > 1);
c@409 331 }
c@409 332
c@409 333 /*
c@409 334 *
c@409 335 * User-callable function to allocate all necessary storage space for the fft.
c@409 336 *
c@409 337 * The return value is a contiguous block of memory, allocated with malloc. As such,
c@409 338 * It can be freed with free(), rather than a kiss_fft-specific function.
c@409 339 * */
c@409 340 kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem )
c@409 341 {
c@409 342 kiss_fft_cfg st=NULL;
c@409 343 size_t memneeded = sizeof(struct kiss_fft_state)
c@409 344 + sizeof(kiss_fft_cpx)*(nfft-1); /* twiddle factors*/
c@409 345
c@409 346 if ( lenmem==NULL ) {
c@409 347 st = ( kiss_fft_cfg)KISS_FFT_MALLOC( memneeded );
c@409 348 }else{
c@409 349 if (mem != NULL && *lenmem >= memneeded)
c@409 350 st = (kiss_fft_cfg)mem;
c@409 351 *lenmem = memneeded;
c@409 352 }
c@409 353 if (st) {
c@409 354 int i;
c@409 355 st->nfft=nfft;
c@409 356 st->inverse = inverse_fft;
c@409 357
c@409 358 for (i=0;i<nfft;++i) {
c@409 359 const double pi=3.141592653589793238462643383279502884197169399375105820974944;
c@409 360 double phase = -2*pi*i / nfft;
c@409 361 if (st->inverse)
c@409 362 phase *= -1;
c@409 363 kf_cexp(st->twiddles+i, phase );
c@409 364 }
c@409 365
c@409 366 kf_factor(nfft,st->factors);
c@409 367 }
c@409 368 return st;
c@409 369 }
c@409 370
c@409 371
c@409 372 void kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int in_stride)
c@409 373 {
c@409 374 if (fin == fout) {
c@409 375 //NOTE: this is not really an in-place FFT algorithm.
c@409 376 //It just performs an out-of-place FFT into a temp buffer
c@409 377 kiss_fft_cpx * tmpbuf = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC( sizeof(kiss_fft_cpx)*st->nfft);
c@409 378 kf_work(tmpbuf,fin,1,in_stride, st->factors,st);
c@409 379 memcpy(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft);
c@409 380 KISS_FFT_TMP_FREE(tmpbuf);
c@409 381 }else{
c@409 382 kf_work( fout, fin, 1,in_stride, st->factors,st );
c@409 383 }
c@409 384 }
c@409 385
c@409 386 void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
c@409 387 {
c@409 388 kiss_fft_stride(cfg,fin,fout,1);
c@409 389 }
c@409 390
c@409 391
c@409 392 void kiss_fft_cleanup(void)
c@409 393 {
c@409 394 // nothing needed any more
c@409 395 }
c@409 396
c@409 397 int kiss_fft_next_fast_size(int n)
c@409 398 {
c@409 399 while(1) {
c@409 400 int m=n;
c@409 401 while ( (m%2) == 0 ) m/=2;
c@409 402 while ( (m%3) == 0 ) m/=3;
c@409 403 while ( (m%5) == 0 ) m/=5;
c@409 404 if (m<=1)
c@409 405 break; /* n is completely factorable by twos, threes, and fives */
c@409 406 n++;
c@409 407 }
c@409 408 return n;
c@409 409 }