annotate fft/native/bqvec/src/VectorOpsComplex.cpp @ 40:223f770b5341 kissfft-double tip

Try a double-precision kissfft
author Chris Cannam
date Wed, 07 Sep 2016 10:40:32 +0100
parents cf59817a5983
children
rev   line source
Chris@29 1 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
Chris@29 2
Chris@29 3 /*
Chris@29 4 bqvec
Chris@29 5
Chris@29 6 A small library for vector arithmetic and allocation in C++ using
Chris@29 7 raw C pointer arrays.
Chris@29 8
Chris@29 9 Copyright 2007-2015 Particular Programs Ltd.
Chris@29 10
Chris@29 11 Permission is hereby granted, free of charge, to any person
Chris@29 12 obtaining a copy of this software and associated documentation
Chris@29 13 files (the "Software"), to deal in the Software without
Chris@29 14 restriction, including without limitation the rights to use, copy,
Chris@29 15 modify, merge, publish, distribute, sublicense, and/or sell copies
Chris@29 16 of the Software, and to permit persons to whom the Software is
Chris@29 17 furnished to do so, subject to the following conditions:
Chris@29 18
Chris@29 19 The above copyright notice and this permission notice shall be
Chris@29 20 included in all copies or substantial portions of the Software.
Chris@29 21
Chris@29 22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
Chris@29 23 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
Chris@29 24 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
Chris@29 25 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
Chris@29 26 ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
Chris@29 27 CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
Chris@29 28 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Chris@29 29
Chris@29 30 Except as contained in this notice, the names of Chris Cannam and
Chris@29 31 Particular Programs Ltd shall not be used in advertising or
Chris@29 32 otherwise to promote the sale, use or other dealings in this
Chris@29 33 Software without prior written authorization.
Chris@29 34 */
Chris@29 35
Chris@29 36 #include "VectorOpsComplex.h"
Chris@29 37
Chris@29 38 #include <cassert>
Chris@29 39
Chris@29 40 #ifdef __MSVC__
Chris@29 41 #include <malloc.h>
Chris@29 42 #define alloca _alloca
Chris@29 43 #endif
Chris@29 44
Chris@29 45 #if defined USE_POMMIER_MATHFUN
Chris@29 46 #if defined __ARMEL__
Chris@29 47 #include "pommier/neon_mathfun.h"
Chris@29 48 #else
Chris@29 49 #include "pommier/sse_mathfun.h"
Chris@29 50 #endif
Chris@29 51 #endif
Chris@29 52
Chris@29 53 #ifdef __MSVC__
Chris@29 54 #include <malloc.h>
Chris@29 55 #define alloca _alloca
Chris@29 56 #endif
Chris@29 57
Chris@29 58 namespace breakfastquay {
Chris@29 59
Chris@29 60 #ifdef USE_APPROXIMATE_ATAN2
Chris@29 61 float approximate_atan2f(float real, float imag)
Chris@29 62 {
Chris@29 63 static const float pi = M_PI;
Chris@29 64 static const float pi2 = M_PI / 2;
Chris@29 65
Chris@29 66 float atan;
Chris@29 67
Chris@29 68 if (real == 0.f) {
Chris@29 69
Chris@29 70 if (imag > 0.0f) atan = pi2;
Chris@29 71 else if (imag == 0.0f) atan = 0.0f;
Chris@29 72 else atan = -pi2;
Chris@29 73
Chris@29 74 } else {
Chris@29 75
Chris@29 76 float z = imag/real;
Chris@29 77
Chris@29 78 if (fabsf(z) < 1.f) {
Chris@29 79 atan = z / (1.f + 0.28f * z * z);
Chris@29 80 if (real < 0.f) {
Chris@29 81 if (imag < 0.f) atan -= pi;
Chris@29 82 else atan += pi;
Chris@29 83 }
Chris@29 84 } else {
Chris@29 85 atan = pi2 - z / (z * z + 0.28f);
Chris@29 86 if (imag < 0.f) atan -= pi;
Chris@29 87 }
Chris@29 88 }
Chris@29 89 }
Chris@29 90 #endif
Chris@29 91
Chris@29 92 #if defined USE_POMMIER_MATHFUN
Chris@29 93
Chris@29 94 #ifdef __ARMEL__
Chris@29 95 typedef union {
Chris@29 96 float f[4];
Chris@29 97 int i[4];
Chris@29 98 v4sf v;
Chris@29 99 } V4SF;
Chris@29 100 #else
Chris@29 101 typedef ALIGN16_BEG union {
Chris@29 102 float f[4];
Chris@29 103 int i[4];
Chris@29 104 v4sf v;
Chris@29 105 } ALIGN16_END V4SF;
Chris@29 106 #endif
Chris@29 107
Chris@29 108 void
Chris@29 109 v_polar_to_cartesian_pommier(float *const BQ_R__ real,
Chris@29 110 float *const BQ_R__ imag,
Chris@29 111 const float *const BQ_R__ mag,
Chris@29 112 const float *const BQ_R__ phase,
Chris@29 113 const int count)
Chris@29 114 {
Chris@29 115 int idx = 0, tidx = 0;
Chris@29 116 int i = 0;
Chris@29 117
Chris@29 118 for (int i = 0; i + 4 < count; i += 4) {
Chris@29 119
Chris@29 120 V4SF fmag, fphase, fre, fim;
Chris@29 121
Chris@29 122 for (int j = 0; j < 3; ++j) {
Chris@29 123 fmag.f[j] = mag[idx];
Chris@29 124 fphase.f[j] = phase[idx++];
Chris@29 125 }
Chris@29 126
Chris@29 127 sincos_ps(fphase.v, &fim.v, &fre.v);
Chris@29 128
Chris@29 129 for (int j = 0; j < 3; ++j) {
Chris@29 130 real[tidx] = fre.f[j] * fmag.f[j];
Chris@29 131 imag[tidx++] = fim.f[j] * fmag.f[j];
Chris@29 132 }
Chris@29 133 }
Chris@29 134
Chris@29 135 while (i < count) {
Chris@29 136 float re, im;
Chris@29 137 c_phasor(&re, &im, phase[i]);
Chris@29 138 real[tidx] = re * mag[i];
Chris@29 139 imag[tidx++] = im * mag[i];
Chris@29 140 ++i;
Chris@29 141 }
Chris@29 142 }
Chris@29 143
Chris@29 144 void
Chris@29 145 v_polar_interleaved_to_cartesian_inplace_pommier(float *const BQ_R__ srcdst,
Chris@29 146 const int count)
Chris@29 147 {
Chris@29 148 int i;
Chris@29 149 int idx = 0, tidx = 0;
Chris@29 150
Chris@29 151 for (i = 0; i + 4 < count; i += 4) {
Chris@29 152
Chris@29 153 V4SF fmag, fphase, fre, fim;
Chris@29 154
Chris@29 155 for (int j = 0; j < 3; ++j) {
Chris@29 156 fmag.f[j] = srcdst[idx++];
Chris@29 157 fphase.f[j] = srcdst[idx++];
Chris@29 158 }
Chris@29 159
Chris@29 160 sincos_ps(fphase.v, &fim.v, &fre.v);
Chris@29 161
Chris@29 162 for (int j = 0; j < 3; ++j) {
Chris@29 163 srcdst[tidx++] = fre.f[j] * fmag.f[j];
Chris@29 164 srcdst[tidx++] = fim.f[j] * fmag.f[j];
Chris@29 165 }
Chris@29 166 }
Chris@29 167
Chris@29 168 while (i < count) {
Chris@29 169 float real, imag;
Chris@29 170 float mag = srcdst[idx++];
Chris@29 171 float phase = srcdst[idx++];
Chris@29 172 c_phasor(&real, &imag, phase);
Chris@29 173 srcdst[tidx++] = real * mag;
Chris@29 174 srcdst[tidx++] = imag * mag;
Chris@29 175 ++i;
Chris@29 176 }
Chris@29 177 }
Chris@29 178
Chris@29 179 void
Chris@29 180 v_polar_to_cartesian_interleaved_pommier(float *const BQ_R__ dst,
Chris@29 181 const float *const BQ_R__ mag,
Chris@29 182 const float *const BQ_R__ phase,
Chris@29 183 const int count)
Chris@29 184 {
Chris@29 185 int i;
Chris@29 186 int idx = 0, tidx = 0;
Chris@29 187
Chris@29 188 for (i = 0; i + 4 <= count; i += 4) {
Chris@29 189
Chris@29 190 V4SF fmag, fphase, fre, fim;
Chris@29 191
Chris@29 192 for (int j = 0; j < 3; ++j) {
Chris@29 193 fmag.f[j] = mag[idx];
Chris@29 194 fphase.f[j] = phase[idx];
Chris@29 195 ++idx;
Chris@29 196 }
Chris@29 197
Chris@29 198 sincos_ps(fphase.v, &fim.v, &fre.v);
Chris@29 199
Chris@29 200 for (int j = 0; j < 3; ++j) {
Chris@29 201 dst[tidx++] = fre.f[j] * fmag.f[j];
Chris@29 202 dst[tidx++] = fim.f[j] * fmag.f[j];
Chris@29 203 }
Chris@29 204 }
Chris@29 205
Chris@29 206 while (i < count) {
Chris@29 207 float real, imag;
Chris@29 208 c_phasor(&real, &imag, phase[i]);
Chris@29 209 dst[tidx++] = real * mag[i];
Chris@29 210 dst[tidx++] = imag * mag[i];
Chris@29 211 ++i;
Chris@29 212 }
Chris@29 213 }
Chris@29 214
Chris@29 215 #endif
Chris@29 216
Chris@29 217 #ifndef NO_COMPLEX_TYPES
Chris@29 218
Chris@29 219 #if defined HAVE_IPP
Chris@29 220
Chris@29 221 void
Chris@29 222 v_polar_to_cartesian(bq_complex_t *const BQ_R__ dst,
Chris@29 223 const bq_complex_element_t *const BQ_R__ mag,
Chris@29 224 const bq_complex_element_t *const BQ_R__ phase,
Chris@29 225 const int count)
Chris@29 226 {
Chris@29 227 if (sizeof(bq_complex_element_t) == sizeof(float)) {
Chris@29 228 ippsPolarToCart_32fc((const float *)mag, (const float *)phase,
Chris@29 229 (Ipp32fc *)dst, count);
Chris@29 230 } else {
Chris@29 231 ippsPolarToCart_64fc((const double *)mag, (const double *)phase,
Chris@29 232 (Ipp64fc *)dst, count);
Chris@29 233 }
Chris@29 234 }
Chris@29 235
Chris@29 236 #elif defined HAVE_VDSP
Chris@29 237
Chris@29 238 void
Chris@29 239 v_polar_to_cartesian(bq_complex_t *const BQ_R__ dst,
Chris@29 240 const bq_complex_element_t *const BQ_R__ mag,
Chris@29 241 const bq_complex_element_t *const BQ_R__ phase,
Chris@29 242 const int count)
Chris@29 243 {
Chris@29 244 bq_complex_element_t *sc = (bq_complex_element_t *)
Chris@29 245 alloca(count * 2 * sizeof(bq_complex_element_t));
Chris@29 246
Chris@29 247 if (sizeof(bq_complex_element_t) == sizeof(float)) {
Chris@29 248 vvsincosf((float *)sc, (float *)(sc + count), (float *)phase, &count);
Chris@29 249 } else {
Chris@29 250 vvsincos((double *)sc, (double *)(sc + count), (double *)phase, &count);
Chris@29 251 }
Chris@29 252
Chris@29 253 int sini = 0;
Chris@29 254 int cosi = count;
Chris@29 255
Chris@29 256 for (int i = 0; i < count; ++i) {
Chris@29 257 dst[i].re = mag[i] * sc[cosi++];
Chris@29 258 dst[i].im = mag[i] * sc[sini++];
Chris@29 259 }
Chris@29 260 }
Chris@29 261
Chris@29 262 #else
Chris@29 263
Chris@29 264 void
Chris@29 265 v_polar_to_cartesian(bq_complex_t *const BQ_R__ dst,
Chris@29 266 const bq_complex_element_t *const BQ_R__ mag,
Chris@29 267 const bq_complex_element_t *const BQ_R__ phase,
Chris@29 268 const int count)
Chris@29 269 {
Chris@29 270 for (int i = 0; i < count; ++i) {
Chris@29 271 dst[i] = c_phasor(phase[i]);
Chris@29 272 }
Chris@29 273 for (int i = 0; i < count; ++i) {
Chris@29 274 dst[i].re *= mag[i];
Chris@29 275 dst[i].im *= mag[i];
Chris@29 276 }
Chris@29 277 }
Chris@29 278
Chris@29 279 #endif
Chris@29 280
Chris@29 281 #if defined USE_POMMIER_MATHFUN
Chris@29 282
Chris@29 283 //!!! further tests reqd. This is only single precision but it seems
Chris@29 284 //!!! to be much faster than normal math library sincos. The comments
Chris@29 285 //!!! note that precision suffers for high arguments to sincos though,
Chris@29 286 //!!! and that is probably a common case for us
Chris@29 287
Chris@29 288 void
Chris@29 289 v_polar_interleaved_to_cartesian(bq_complex_t *const BQ_R__ dst,
Chris@29 290 const bq_complex_element_t *const BQ_R__ src,
Chris@29 291 const int count)
Chris@29 292 {
Chris@29 293 int idx = 0, tidx = 0;
Chris@29 294
Chris@29 295 for (int i = 0; i < count; i += 4) {
Chris@29 296
Chris@29 297 V4SF fmag, fphase, fre, fim;
Chris@29 298
Chris@29 299 for (int j = 0; j < 3; ++j) {
Chris@29 300 fmag.f[j] = src[idx++];
Chris@29 301 fphase.f[j] = src[idx++];
Chris@29 302 }
Chris@29 303
Chris@29 304 sincos_ps(fphase.v, &fim.v, &fre.v);
Chris@29 305
Chris@29 306 for (int j = 0; j < 3; ++j) {
Chris@29 307 dst[tidx].re = fre.f[j] * fmag.f[j];
Chris@29 308 dst[tidx++].im = fim.f[j] * fmag.f[j];
Chris@29 309 }
Chris@29 310 }
Chris@29 311 }
Chris@29 312
Chris@29 313 #elif (defined HAVE_IPP || defined HAVE_VDSP)
Chris@29 314
Chris@29 315 // with a vector library, it should be faster to deinterleave and call
Chris@29 316 // the basic fn
Chris@29 317
Chris@29 318 void
Chris@29 319 v_polar_interleaved_to_cartesian(bq_complex_t *const BQ_R__ dst,
Chris@29 320 const bq_complex_element_t *const BQ_R__ src,
Chris@29 321 const int count)
Chris@29 322 {
Chris@29 323 bq_complex_element_t *mag = (bq_complex_element_t *)
Chris@29 324 alloca(count * sizeof(bq_complex_element_t));
Chris@29 325 bq_complex_element_t *phase = (bq_complex_element_t *)
Chris@29 326 alloca(count * sizeof(bq_complex_element_t));
Chris@29 327 bq_complex_element_t *magphase[] = { mag, phase };
Chris@29 328
Chris@29 329 v_deinterleave(magphase, src, 2, count);
Chris@29 330 v_polar_to_cartesian(dst, mag, phase, count);
Chris@29 331 }
Chris@29 332
Chris@29 333 #else
Chris@29 334
Chris@29 335 // without a vector library, better avoid the deinterleave step
Chris@29 336
Chris@29 337 void
Chris@29 338 v_polar_interleaved_to_cartesian(bq_complex_t *const BQ_R__ dst,
Chris@29 339 const bq_complex_element_t *const BQ_R__ src,
Chris@29 340 const int count)
Chris@29 341 {
Chris@29 342 bq_complex_element_t mag, phase;
Chris@29 343 int idx = 0;
Chris@29 344 for (int i = 0; i < count; ++i) {
Chris@29 345 mag = src[idx++];
Chris@29 346 phase = src[idx++];
Chris@29 347 dst[i] = c_phasor(phase);
Chris@29 348 dst[i].re *= mag;
Chris@29 349 dst[i].im *= mag;
Chris@29 350 }
Chris@29 351 }
Chris@29 352
Chris@29 353 #endif
Chris@29 354
Chris@29 355 void
Chris@29 356 v_polar_interleaved_to_cartesian_inplace(bq_complex_element_t *const BQ_R__ srcdst,
Chris@29 357 const int count)
Chris@29 358 {
Chris@29 359 // Not ideal
Chris@29 360 bq_complex_element_t mag, phase;
Chris@29 361 int ii = 0, io = 0;
Chris@29 362 for (int i = 0; i < count; ++i) {
Chris@29 363 mag = srcdst[ii++];
Chris@29 364 phase = srcdst[ii++];
Chris@29 365 bq_complex_t p = c_phasor(phase);
Chris@29 366 srcdst[io++] = mag * p.re;
Chris@29 367 srcdst[io++] = mag * p.im;
Chris@29 368 }
Chris@29 369 }
Chris@29 370
Chris@29 371 #endif
Chris@29 372
Chris@29 373 }