# HG changeset patch # User Chris Cannam # Date 1445090408 -3600 # Node ID cf59817a5983228644c9ff2291c2b0d248cc6233 # Parent 77fca9c01e4679ad51efd809b2a3afc213e6bdab Build stuff for native diff -r 77fca9c01e46 -r cf59817a5983 fft/native.cpp --- a/fft/native.cpp Wed Oct 07 21:31:02 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ - -#include - -#include -#include -#include -#include - -using namespace std; -using namespace breakfastquay; - -int main(int argc, char **argv) -{ - vector sizes { 512, 2048 }; - - int iterations = 2000; - int times = 100; - - for (auto size: sizes) { - - FFT fft(size); - double total = 0.0; - - auto start = chrono::high_resolution_clock::now(); - - for (int ti = 0; ti < times; ++ti) { - - total = 0.0; - - for (int i = 0; i < iterations; ++i) { - - vector ri(size), ro(size/2+1), io(size/2+1); - for (int j = 0; j < size; ++j) { - ri[j] = (j % 2) / 4.0; - } - - fft.forward(ri.data(), ro.data(), io.data()); - - for (int j = 0; j <= size/2; ++j) { - total += sqrt(ro[j] * ro[j] + io[j] * io[j]); - } - - // synthesise the conjugate half - for (int j = 1; j < size/2; ++j) { - total += sqrt(ro[j] * ro[j] + io[j] * io[j]); - } - } - } - - auto end = chrono::high_resolution_clock::now(); - - double ms = chrono::duration(end - start).count() / times; - - cerr << "for " << iterations << " * size " << size << ": total = " - << total << ", time = " << ms - << " ms (" << (iterations / (ms / 1000.0)) << " itr/sec)" << endl; - } -} - diff -r 77fca9c01e46 -r cf59817a5983 fft/native/Makefile.osx --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/Makefile.osx Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,6 @@ + +OBJECTS := bqfft/src/FFT.o native.o +CXXFLAGS := -O3 -ffast-math -DMALLOC_IS_ALIGNED -DHAVE_VDSP -Ibqfft -Ibqvec -std=c++11 + +native: $(OBJECTS) + $(CXX) -o $@ $^ -framework Accelerate diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqfft/COPYING --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqfft/COPYING Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,26 @@ + + Copyright 2007-2015 Particular Programs Ltd. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. 
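Editor's note: the angle-bracketed header names and template arguments in the deleted fft/native.cpp above appear to have been stripped in transit (e.g. the bare "#include" lines and the unparameterised "vector" declarations). The following is a minimal compilable sketch of the same benchmark usage, with the missing includes and template parameters (bqfft/FFT.h, vector<int>, vector<double>, chrono::duration<double, milli>) filled in as assumptions rather than recovered text:

    #include <bqfft/FFT.h>

    #include <vector>
    #include <chrono>
    #include <cmath>
    #include <iostream>

    using namespace std;
    using namespace breakfastquay;

    int main(int, char **)
    {
        vector<int> sizes { 512, 2048 };   // power-of-two sizes only
        const int iterations = 2000;

        for (auto size: sizes) {

            FFT fft(size);                 // may throw FFT::InvalidSize
            double total = 0.0;

            auto start = chrono::high_resolution_clock::now();

            for (int i = 0; i < iterations; ++i) {

                vector<double> ri(size), ro(size/2 + 1), io(size/2 + 1);
                for (int j = 0; j < size; ++j) ri[j] = (j % 2) / 4.0;

                // Real-to-complex transform: only the non-conjugate half
                // (size/2 + 1 bins) is returned
                fft.forward(ri.data(), ro.data(), io.data());

                for (int j = 0; j <= size/2; ++j) {
                    total += sqrt(ro[j] * ro[j] + io[j] * io[j]);
                }
            }

            auto end = chrono::high_resolution_clock::now();
            double ms = chrono::duration<double, milli>(end - start).count();

            cerr << "size " << size << ": total = " << total
                 << ", time = " << ms << " ms" << endl;
        }
    }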
diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqfft/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqfft/Makefile Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,90 @@ + +# Add to FFT_DEFINES the relevant options for your desired third-party +# library support. +# +# Available options are +# +# -DHAVE_IPP Intel's Integrated Performance Primitives are available +# -DHAVE_VDSP Apple's Accelerate framework is available +# -DHAVE_FFTW3 The FFTW library is available +# -DHAVE_KISSFFT The KissFFT library is available +# -DHAVE_MEDIALIB The Medialib library (from Sun) is available +# -DHAVE_OPENMAX The OpenMAX signal processing library is available +# -DUSE_BUILTIN_FFT Compile the built-in FFT code (which is very slow) +# +# You may define more than one of these. If you define +# USE_BUILTIN_FFT, the code will be compiled in but will only be used +# if no other option is available. The default, if no flags are +# supplied, is for the code to refuse to compile. +# +# Add any relevant -I flags for include paths as well. +# +# Note that you must supply the same flags when including bqfft +# headers later as you are using now when compiling the library. (You +# may find it simplest to just add the bqfft source files to your +# application's build system and not build a bqfft library at all.) + +# WARNING! The default option here is VERY SLOW! Read above for better +# alternatives! +FFT_DEFINES := -DHAVE_VDSP + + +# Add to ALLOCATOR_DEFINES options relating to aligned malloc. +# +# Available options are +# +# -DHAVE_POSIX_MEMALIGN The posix_memalign call is available in sys/mman.h +# -DLACK_POSIX_MEMALIGN The posix_memalign call is not available +# +# -DMALLOC_IS_ALIGNED The malloc call already returns aligned memory +# -DMALLOC_IS_NOT_ALIGNED The malloc call does not return aligned memory +# +# -DUSE_OWN_ALIGNED_MALLOC No aligned malloc is available, roll your own +# +# -DLACK_BAD_ALLOC The C++ library lacks the std::bad_alloc exception +# +# Here "aligned" is assumed to mean "aligned enough for whatever +# vector stuff the space will be used for" which most likely means +# 16-byte alignment. +# +# The default is to use _aligned_malloc when building with Visual C++, +# system malloc when building on OS/X, and posix_memalign otherwise. +# +# Note that you must supply the same flags when including bqfft +# headers later as you are using now when compiling the library. (You +# may find it simplest to just add the bqfft source files to your +# application's build system and not build a bqfft library at all.) + +ALLOCATOR_DEFINES := -DMALLOC_IS_ALIGNED + +SRC_DIR := src +HEADER_DIR := bqfft + +SOURCES := $(wildcard $(SRC_DIR)/*.cpp) +HEADERS := $(wildcard $(HEADER_DIR)/*.h) $(wildcard $(SRC_DIR)/*.h) + +OBJECTS := $(SOURCES:.cpp=.o) +OBJECTS := $(OBJECTS:.c=.o) + +CXXFLAGS := $(FFT_DEFINES) $(ALLOCATOR_DEFINES) -O3 -ffast-math -I. -I../bqvec -fpic + +LIBRARY := libbqfft.a + +all: $(LIBRARY) + +$(LIBRARY): $(OBJECTS) + $(AR) rc $@ $^ + +clean: + rm -f $(OBJECTS) + +distclean: clean + rm -f $(LIBRARY) + +depend: + makedepend -Y -fMakefile $(SOURCES) $(HEADERS) + + +# DO NOT DELETE + +src/FFT.o: bqfft/FFT.h diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqfft/README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqfft/README.txt Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,20 @@ + +bqfft +===== + +A small library wrapping various FFT implementations for some common +audio processing use cases. 
Note this is not a general FFT interface, +as it handles only power-of-two FFT sizes and real inputs. + +Requires the bqvec library. + +C++ standard required: C++98 (does not use C++11) + +Copyright 2007-2015 Particular Programs Ltd. + +This code originated as part of the Rubber Band Library written by the +same authors (see https://bitbucket.org/breakfastquay/rubberband/). +It has been pulled out into a separate library and relicensed under a +more permissive licence: a BSD/MIT-style licence, as opposed to the +GPL used for Rubber Band. See the file COPYING for details. + diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqfft/bqfft/FFT.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqfft/bqfft/FFT.h Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,145 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + bqfft + + A small library wrapping various FFT implementations for some + common audio processing use cases. + + Copyright 2007-2015 Particular Programs Ltd. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. +*/ + +#ifndef BQFFT_FFT_H +#define BQFFT_FFT_H + +#include + +#include +#include + +namespace breakfastquay { + +class FFTImpl; + +/** + * Provide basic FFT computations using one of a set of candidate FFT + * implementations (depending on compile flags). + * + * Implements real->complex FFTs of power-of-two sizes only. Note + * that only the first half of the output signal is returned (the + * complex conjugates half is omitted), so the "complex" arrays need + * room for size/2+1 elements. + * + * The "interleaved" functions use the format sometimes called CCS -- + * size/2+1 real+imaginary pairs. So, the array elements at indices 1 + * and size+1 will always be zero (since the signal is real). + * + * All pointer arguments must point to valid data. A NullArgument + * exception is thrown if any argument is NULL. + * + * Neither forward nor inverse transform is scaled. + * + * This class is reentrant but not thread safe: use a separate + * instance per thread, or use a mutex. 
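Editor's note: to make the buffer-size and layout rules in the comment above concrete, here is a small usage sketch (not part of the header itself; it assumes only the public API declared below). It shows the size/2+1 output convention, the interleaved "CCS" layout, and the consequence of neither direction being scaled:

    #include <bqfft/FFT.h>
    #include <vector>

    void example(int n)    // illustrative helper; n must be a power of two
    {
        using breakfastquay::FFT;

        FFT fft(n);

        std::vector<double> in(n, 0.0);
        in[0] = 1.0;       // an impulse

        // Separate real/imaginary outputs: n/2 + 1 values each
        // (the conjugate half is omitted).
        std::vector<double> re(n/2 + 1), im(n/2 + 1);
        fft.forward(in.data(), re.data(), im.data());

        // Interleaved ("CCS") output: n/2 + 1 re/im pairs, i.e. n + 2 values.
        // Elements 1 and n+1 (imaginary parts of DC and Nyquist) are zero.
        std::vector<double> ccs(n + 2);
        fft.forwardInterleaved(in.data(), ccs.data());

        // Neither direction is scaled, so a forward-then-inverse round trip
        // returns the input multiplied by n; divide by n to recover it.
        std::vector<double> back(n);
        fft.inverse(re.data(), im.data(), back.data());
        for (int i = 0; i < n; ++i) back[i] /= n;
    }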
+ */ +class FFT +{ +public: + enum Exception { + NullArgument, InvalidSize, InvalidImplementation, InternalError + }; + + FFT(int size, int debugLevel = 0); // may throw InvalidSize + ~FFT(); + + void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut); + void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut); + void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut); + void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut); + + void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut); + void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut); + void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut); + void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut); + + void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut); + void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut); + void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut); + void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut); + + void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut); + void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut); + void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut); + void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut); + + // Calling one or both of these is optional -- if neither is + // called, the first call to a forward or inverse method will call + // init(). You only need call these if you don't want to risk + // expensive allocations etc happening in forward or inverse. + void initFloat(); + void initDouble(); + + enum Precision { + SinglePrecision = 0x1, + DoublePrecision = 0x2 + }; + typedef int Precisions; + + /** + * Return the OR of all precisions supported by this + * implementation. All of the functions (float and double) are + * available regardless of the supported implementations, but they + * will be calculated at the proper precision only if it is + * available. (So float functions will be calculated using doubles + * and then truncated if single-precision is unavailable, and + * double functions will use single-precision arithmetic if double + * is unavailable.) + */ + Precisions getSupportedPrecisions() const; + + static std::set getImplementations(); + static std::string getDefaultImplementation(); + static void setDefaultImplementation(std::string); + +#ifdef FFT_MEASUREMENT + static std::string tune(); +#endif + +protected: + FFTImpl *d; + static std::string m_implementation; + static void pickDefaultImplementation(); + +private: + FFT(const FFT &); // not provided + FFT &operator=(const FFT &); // not provided +}; + +} + +#endif + diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqfft/src/FFT.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqfft/src/FFT.cpp Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,3585 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + bqfft + + A small library wrapping various FFT implementations for some + common audio processing use cases. + + Copyright 2007-2015 Particular Programs Ltd. 
+ + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. +*/ + +#include "bqfft/FFT.h" + +//#include "system/Thread.h" + +#include +#include +#include + +//#define FFT_MEASUREMENT 1 + +#ifdef FFT_MEASUREMENT +#include +#endif + +#ifdef HAVE_IPP +#include +#endif + +#ifdef HAVE_FFTW3 +#include +#endif + +#ifdef HAVE_VDSP +#include +#endif + +#ifdef HAVE_MEDIALIB +#include +#endif + +#ifdef HAVE_OPENMAX +#include +#endif + +#ifdef HAVE_SFFT +extern "C" { +#include +} +#endif + +#ifdef HAVE_KISSFFT +#include "kissfft/kiss_fftr.h" +#endif + +#ifndef HAVE_IPP +#ifndef HAVE_FFTW3 +#ifndef HAVE_KISSFFT +#ifndef USE_BUILTIN_FFT +#ifndef HAVE_VDSP +#ifndef HAVE_MEDIALIB +#ifndef HAVE_OPENMAX +#ifndef HAVE_SFFT +#error No FFT implementation selected! 
+#endif +#endif +#endif +#endif +#endif +#endif +#endif +#endif + +#include +#include +#include +#include +#include +#include + +#ifdef FFT_MEASUREMENT +#ifndef _WIN32 +#include +#endif +#endif + +namespace breakfastquay { + +class FFTImpl +{ +public: + virtual ~FFTImpl() { } + + virtual FFT::Precisions getSupportedPrecisions() const = 0; + + virtual void initFloat() = 0; + virtual void initDouble() = 0; + + virtual void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) = 0; + virtual void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) = 0; + virtual void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) = 0; + virtual void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) = 0; + + virtual void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) = 0; + virtual void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) = 0; + virtual void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) = 0; + virtual void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) = 0; + + virtual void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) = 0; + virtual void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) = 0; + virtual void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) = 0; + virtual void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) = 0; + + virtual void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) = 0; + virtual void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) = 0; + virtual void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) = 0; + virtual void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) = 0; +}; + +namespace FFTs { + +#ifdef HAVE_IPP + +class D_IPP : public FFTImpl +{ +public: + D_IPP(int size) : + m_size(size), m_fspec(0), m_dspec(0) + { + for (int i = 0; ; ++i) { + if (m_size & (1 << i)) { + m_order = i; + break; + } + } + } + + ~D_IPP() { + if (m_fspec) { + ippsFFTFree_R_32f(m_fspec); + ippsFree(m_fbuf); + ippsFree(m_fpacked); + ippsFree(m_fspare); + } + if (m_dspec) { + ippsFFTFree_R_64f(m_dspec); + ippsFree(m_dbuf); + ippsFree(m_dpacked); + ippsFree(m_dspare); + } + } + + FFT::Precisions + getSupportedPrecisions() const { + return FFT::SinglePrecision | FFT::DoublePrecision; + } + + //!!! 
rv check + + void initFloat() { + if (m_fspec) return; + int specSize, specBufferSize, bufferSize; + ippsFFTGetSize_R_32f(m_order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast, + &specSize, &specBufferSize, &bufferSize); + m_fbuf = ippsMalloc_8u(bufferSize); + m_fpacked = ippsMalloc_32f(m_size + 2); + m_fspare = ippsMalloc_32f(m_size / 2 + 1); + ippsFFTInitAlloc_R_32f(&m_fspec, m_order, IPP_FFT_NODIV_BY_ANY, + ippAlgHintFast); + } + + void initDouble() { + if (m_dspec) return; + int specSize, specBufferSize, bufferSize; + ippsFFTGetSize_R_64f(m_order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast, + &specSize, &specBufferSize, &bufferSize); + m_dbuf = ippsMalloc_8u(bufferSize); + m_dpacked = ippsMalloc_64f(m_size + 2); + m_dspare = ippsMalloc_64f(m_size / 2 + 1); + ippsFFTInitAlloc_R_64f(&m_dspec, m_order, IPP_FFT_NODIV_BY_ANY, + ippAlgHintFast); + } + + void packFloat(const float *BQ_R__ re, const float *BQ_R__ im) { + int index = 0; + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + m_fpacked[index++] = re[i]; + index++; + } + index = 0; + if (im) { + for (int i = 0; i <= hs; ++i) { + index++; + m_fpacked[index++] = im[i]; + } + } else { + for (int i = 0; i <= hs; ++i) { + index++; + m_fpacked[index++] = 0.f; + } + } + } + + void packDouble(const double *BQ_R__ re, const double *BQ_R__ im) { + int index = 0; + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + m_dpacked[index++] = re[i]; + index++; + } + index = 0; + if (im) { + for (int i = 0; i <= hs; ++i) { + index++; + m_dpacked[index++] = im[i]; + } + } else { + for (int i = 0; i <= hs; ++i) { + index++; + m_dpacked[index++] = 0.0; + } + } + } + + void unpackFloat(float *re, float *BQ_R__ im) { // re may be equal to m_fpacked + int index = 0; + const int hs = m_size/2; + if (im) { + for (int i = 0; i <= hs; ++i) { + index++; + im[i] = m_fpacked[index++]; + } + } + index = 0; + for (int i = 0; i <= hs; ++i) { + re[i] = m_fpacked[index++]; + index++; + } + } + + void unpackDouble(double *re, double *BQ_R__ im) { // re may be equal to m_dpacked + int index = 0; + const int hs = m_size/2; + if (im) { + for (int i = 0; i <= hs; ++i) { + index++; + im[i] = m_dpacked[index++]; + } + } + index = 0; + for (int i = 0; i <= hs; ++i) { + re[i] = m_dpacked[index++]; + index++; + } + } + + void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) { + if (!m_dspec) initDouble(); + ippsFFTFwd_RToCCS_64f(realIn, m_dpacked, m_dspec, m_dbuf); + unpackDouble(realOut, imagOut); + } + + void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) { + if (!m_dspec) initDouble(); + ippsFFTFwd_RToCCS_64f(realIn, complexOut, m_dspec, m_dbuf); + } + + void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) { + if (!m_dspec) initDouble(); + ippsFFTFwd_RToCCS_64f(realIn, m_dpacked, m_dspec, m_dbuf); + unpackDouble(m_dpacked, m_dspare); + ippsCartToPolar_64f(m_dpacked, m_dspare, magOut, phaseOut, m_size/2+1); + } + + void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) { + if (!m_dspec) initDouble(); + ippsFFTFwd_RToCCS_64f(realIn, m_dpacked, m_dspec, m_dbuf); + unpackDouble(m_dpacked, m_dspare); + ippsMagnitude_64f(m_dpacked, m_dspare, magOut, m_size/2+1); + } + + void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) { + if (!m_fspec) initFloat(); + ippsFFTFwd_RToCCS_32f(realIn, m_fpacked, m_fspec, m_fbuf); + unpackFloat(realOut, imagOut); + } + + void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ 
complexOut) { + if (!m_fspec) initFloat(); + ippsFFTFwd_RToCCS_32f(realIn, complexOut, m_fspec, m_fbuf); + } + + void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) { + if (!m_fspec) initFloat(); + ippsFFTFwd_RToCCS_32f(realIn, m_fpacked, m_fspec, m_fbuf); + unpackFloat(m_fpacked, m_fspare); + ippsCartToPolar_32f(m_fpacked, m_fspare, magOut, phaseOut, m_size/2+1); + } + + void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) { + if (!m_fspec) initFloat(); + ippsFFTFwd_RToCCS_32f(realIn, m_fpacked, m_fspec, m_fbuf); + unpackFloat(m_fpacked, m_fspare); + ippsMagnitude_32f(m_fpacked, m_fspare, magOut, m_size/2+1); + } + + void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) { + if (!m_dspec) initDouble(); + packDouble(realIn, imagIn); + ippsFFTInv_CCSToR_64f(m_dpacked, realOut, m_dspec, m_dbuf); + } + + void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) { + if (!m_dspec) initDouble(); + ippsFFTInv_CCSToR_64f(complexIn, realOut, m_dspec, m_dbuf); + } + + void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) { + if (!m_dspec) initDouble(); + ippsPolarToCart_64f(magIn, phaseIn, realOut, m_dspare, m_size/2+1); + packDouble(realOut, m_dspare); // to m_dpacked + ippsFFTInv_CCSToR_64f(m_dpacked, realOut, m_dspec, m_dbuf); + } + + void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) { + if (!m_dspec) initDouble(); + const int hs1 = m_size/2 + 1; + ippsCopy_64f(magIn, m_dspare, hs1); + ippsAddC_64f_I(0.000001, m_dspare, hs1); + ippsLn_64f_I(m_dspare, hs1); + packDouble(m_dspare, 0); + ippsFFTInv_CCSToR_64f(m_dpacked, cepOut, m_dspec, m_dbuf); + } + + void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) { + if (!m_fspec) initFloat(); + packFloat(realIn, imagIn); + ippsFFTInv_CCSToR_32f(m_fpacked, realOut, m_fspec, m_fbuf); + } + + void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) { + if (!m_fspec) initFloat(); + ippsFFTInv_CCSToR_32f(complexIn, realOut, m_fspec, m_fbuf); + } + + void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) { + if (!m_fspec) initFloat(); + ippsPolarToCart_32f(magIn, phaseIn, realOut, m_fspare, m_size/2+1); + packFloat(realOut, m_fspare); // to m_fpacked + ippsFFTInv_CCSToR_32f(m_fpacked, realOut, m_fspec, m_fbuf); + } + + void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) { + if (!m_fspec) initFloat(); + const int hs1 = m_size/2 + 1; + ippsCopy_32f(magIn, m_fspare, hs1); + ippsAddC_32f_I(0.000001f, m_fspare, hs1); + ippsLn_32f_I(m_fspare, hs1); + packFloat(m_fspare, 0); + ippsFFTInv_CCSToR_32f(m_fpacked, cepOut, m_fspec, m_fbuf); + } + +private: + const int m_size; + int m_order; + IppsFFTSpec_R_32f *m_fspec; + IppsFFTSpec_R_64f *m_dspec; + Ipp8u *m_fbuf; + Ipp8u *m_dbuf; + float *m_fpacked; + float *m_fspare; + double *m_dpacked; + double *m_dspare; +}; + +#endif /* HAVE_IPP */ + +#ifdef HAVE_VDSP + +class D_VDSP : public FFTImpl +{ +public: + D_VDSP(int size) : + m_size(size), m_fspec(0), m_dspec(0), + m_fpacked(0), m_fspare(0), + m_dpacked(0), m_dspare(0) + { + for (int i = 0; ; ++i) { + if (m_size & (1 << i)) { + m_order = i; + break; + } + } + } + + ~D_VDSP() { + if (m_fspec) { + vDSP_destroy_fftsetup(m_fspec); + deallocate(m_fspare); + deallocate(m_fspare2); + deallocate(m_fbuf->realp); + deallocate(m_fbuf->imagp); + delete m_fbuf; + 
deallocate(m_fpacked->realp); + deallocate(m_fpacked->imagp); + delete m_fpacked; + } + if (m_dspec) { + vDSP_destroy_fftsetupD(m_dspec); + deallocate(m_dspare); + deallocate(m_dspare2); + deallocate(m_dbuf->realp); + deallocate(m_dbuf->imagp); + delete m_dbuf; + deallocate(m_dpacked->realp); + deallocate(m_dpacked->imagp); + delete m_dpacked; + } + } + + FFT::Precisions + getSupportedPrecisions() const { + return FFT::SinglePrecision | FFT::DoublePrecision; + } + + //!!! rv check + + void initFloat() { + if (m_fspec) return; + m_fspec = vDSP_create_fftsetup(m_order, FFT_RADIX2); + m_fbuf = new DSPSplitComplex; + //!!! "If possible, tempBuffer->realp and tempBuffer->imagp should be 32-byte aligned for best performance." + m_fbuf->realp = allocate(m_size); + m_fbuf->imagp = allocate(m_size); + m_fpacked = new DSPSplitComplex; + m_fpacked->realp = allocate(m_size / 2 + 1); + m_fpacked->imagp = allocate(m_size / 2 + 1); + m_fspare = allocate(m_size + 2); + m_fspare2 = allocate(m_size + 2); + } + + void initDouble() { + if (m_dspec) return; + m_dspec = vDSP_create_fftsetupD(m_order, FFT_RADIX2); + m_dbuf = new DSPDoubleSplitComplex; + //!!! "If possible, tempBuffer->realp and tempBuffer->imagp should be 32-byte aligned for best performance." + m_dbuf->realp = allocate(m_size); + m_dbuf->imagp = allocate(m_size); + m_dpacked = new DSPDoubleSplitComplex; + m_dpacked->realp = allocate(m_size / 2 + 1); + m_dpacked->imagp = allocate(m_size / 2 + 1); + m_dspare = allocate(m_size + 2); + m_dspare2 = allocate(m_size + 2); + } + + void packReal(const float *BQ_R__ const re) { + // Pack input for forward transform + vDSP_ctoz((DSPComplex *)re, 2, m_fpacked, 1, m_size/2); + } + void packComplex(const float *BQ_R__ const re, const float *BQ_R__ const im) { + // Pack input for inverse transform + if (re) v_copy(m_fpacked->realp, re, m_size/2 + 1); + else v_zero(m_fpacked->realp, m_size/2 + 1); + if (im) v_copy(m_fpacked->imagp, im, m_size/2 + 1); + else v_zero(m_fpacked->imagp, m_size/2 + 1); + fnyq(); + } + + void unpackReal(float *BQ_R__ const re) { + // Unpack output for inverse transform + vDSP_ztoc(m_fpacked, 1, (DSPComplex *)re, 2, m_size/2); + } + void unpackComplex(float *BQ_R__ const re, float *BQ_R__ const im) { + // Unpack output for forward transform + // vDSP forward FFTs are scaled 2x (for some reason) + float two = 2.f; + vDSP_vsdiv(m_fpacked->realp, 1, &two, re, 1, m_size/2 + 1); + vDSP_vsdiv(m_fpacked->imagp, 1, &two, im, 1, m_size/2 + 1); + } + void unpackComplex(float *BQ_R__ const cplx) { + // Unpack output for forward transform + // vDSP forward FFTs are scaled 2x (for some reason) + const int hs1 = m_size/2 + 1; + for (int i = 0; i < hs1; ++i) { + cplx[i*2] = m_fpacked->realp[i] / 2.f; + cplx[i*2+1] = m_fpacked->imagp[i] / 2.f; + } + } + + void packReal(const double *BQ_R__ const re) { + // Pack input for forward transform + vDSP_ctozD((DSPDoubleComplex *)re, 2, m_dpacked, 1, m_size/2); + } + void packComplex(const double *BQ_R__ const re, const double *BQ_R__ const im) { + // Pack input for inverse transform + if (re) v_copy(m_dpacked->realp, re, m_size/2 + 1); + else v_zero(m_dpacked->realp, m_size/2 + 1); + if (im) v_copy(m_dpacked->imagp, im, m_size/2 + 1); + else v_zero(m_dpacked->imagp, m_size/2 + 1); + dnyq(); + } + + void unpackReal(double *BQ_R__ const re) { + // Unpack output for inverse transform + vDSP_ztocD(m_dpacked, 1, (DSPDoubleComplex *)re, 2, m_size/2); + } + void unpackComplex(double *BQ_R__ const re, double *BQ_R__ const im) { + // Unpack output for forward 
transform + // vDSP forward FFTs are scaled 2x (for some reason) + double two = 2.0; + vDSP_vsdivD(m_dpacked->realp, 1, &two, re, 1, m_size/2 + 1); + vDSP_vsdivD(m_dpacked->imagp, 1, &two, im, 1, m_size/2 + 1); + } + void unpackComplex(double *BQ_R__ const cplx) { + // Unpack output for forward transform + // vDSP forward FFTs are scaled 2x (for some reason) + const int hs1 = m_size/2 + 1; + for (int i = 0; i < hs1; ++i) { + cplx[i*2] = m_dpacked->realp[i] / 2.0; + cplx[i*2+1] = m_dpacked->imagp[i] / 2.0; + } + } + + void fdenyq() { + // for fft result in packed form, unpack the DC and Nyquist bins + const int hs = m_size/2; + m_fpacked->realp[hs] = m_fpacked->imagp[0]; + m_fpacked->imagp[hs] = 0.f; + m_fpacked->imagp[0] = 0.f; + } + void ddenyq() { + // for fft result in packed form, unpack the DC and Nyquist bins + const int hs = m_size/2; + m_dpacked->realp[hs] = m_dpacked->imagp[0]; + m_dpacked->imagp[hs] = 0.; + m_dpacked->imagp[0] = 0.; + } + + void fnyq() { + // for ifft input in packed form, pack the DC and Nyquist bins + const int hs = m_size/2; + m_fpacked->imagp[0] = m_fpacked->realp[hs]; + m_fpacked->realp[hs] = 0.f; + m_fpacked->imagp[hs] = 0.f; + } + void dnyq() { + // for ifft input in packed form, pack the DC and Nyquist bins + const int hs = m_size/2; + m_dpacked->imagp[0] = m_dpacked->realp[hs]; + m_dpacked->realp[hs] = 0.; + m_dpacked->imagp[hs] = 0.; + } + + void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) { + if (!m_dspec) initDouble(); + packReal(realIn); + vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_FORWARD); + ddenyq(); + unpackComplex(realOut, imagOut); + } + + void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) { + if (!m_dspec) initDouble(); + packReal(realIn); + vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_FORWARD); + ddenyq(); + unpackComplex(complexOut); + } + + void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) { + if (!m_dspec) initDouble(); + const int hs1 = m_size/2+1; + packReal(realIn); + vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_FORWARD); + ddenyq(); + // vDSP forward FFTs are scaled 2x (for some reason) + for (int i = 0; i < hs1; ++i) m_dpacked->realp[i] /= 2.0; + for (int i = 0; i < hs1; ++i) m_dpacked->imagp[i] /= 2.0; + v_cartesian_to_polar(magOut, phaseOut, + m_dpacked->realp, m_dpacked->imagp, hs1); + } + + void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) { + if (!m_dspec) initDouble(); + packReal(realIn); + vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_FORWARD); + ddenyq(); + const int hs1 = m_size/2+1; + vDSP_zvmagsD(m_dpacked, 1, m_dspare, 1, hs1); + vvsqrt(m_dspare2, m_dspare, &hs1); + // vDSP forward FFTs are scaled 2x (for some reason) + double two = 2.0; + vDSP_vsdivD(m_dspare2, 1, &two, magOut, 1, hs1); + } + + void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) { + if (!m_fspec) initFloat(); + packReal(realIn); + vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_FORWARD); + fdenyq(); + unpackComplex(realOut, imagOut); + } + + void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) { + if (!m_fspec) initFloat(); + packReal(realIn); + vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_FORWARD); + fdenyq(); + unpackComplex(complexOut); + } + + void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) { + if (!m_fspec) initFloat(); + const 
int hs1 = m_size/2+1; + packReal(realIn); + vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_FORWARD); + fdenyq(); + // vDSP forward FFTs are scaled 2x (for some reason) + for (int i = 0; i < hs1; ++i) m_fpacked->realp[i] /= 2.f; + for (int i = 0; i < hs1; ++i) m_fpacked->imagp[i] /= 2.f; + v_cartesian_to_polar(magOut, phaseOut, + m_fpacked->realp, m_fpacked->imagp, hs1); + } + + void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) { + if (!m_fspec) initFloat(); + packReal(realIn); + vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_FORWARD); + fdenyq(); + const int hs1 = m_size/2 + 1; + vDSP_zvmags(m_fpacked, 1, m_fspare, 1, hs1); + vvsqrtf(m_fspare2, m_fspare, &hs1); + // vDSP forward FFTs are scaled 2x (for some reason) + float two = 2.f; + vDSP_vsdiv(m_fspare2, 1, &two, magOut, 1, hs1); + } + + void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) { + if (!m_dspec) initDouble(); + packComplex(realIn, imagIn); + vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_INVERSE); + unpackReal(realOut); + } + + void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) { + if (!m_dspec) initDouble(); + double *d[2] = { m_dpacked->realp, m_dpacked->imagp }; + v_deinterleave(d, complexIn, 2, m_size/2 + 1); + vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_INVERSE); + unpackReal(realOut); + } + + void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) { + if (!m_dspec) initDouble(); + const int hs1 = m_size/2+1; + vvsincos(m_dpacked->imagp, m_dpacked->realp, phaseIn, &hs1); + double *const rp = m_dpacked->realp; + double *const ip = m_dpacked->imagp; + for (int i = 0; i < hs1; ++i) rp[i] *= magIn[i]; + for (int i = 0; i < hs1; ++i) ip[i] *= magIn[i]; + dnyq(); + vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_INVERSE); + unpackReal(realOut); + } + + void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) { + if (!m_dspec) initDouble(); + const int hs1 = m_size/2 + 1; + v_copy(m_dspare, magIn, hs1); + for (int i = 0; i < hs1; ++i) m_dspare[i] += 0.000001; + vvlog(m_dspare2, m_dspare, &hs1); + inverse(m_dspare2, 0, cepOut); + } + + void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) { + if (!m_fspec) initFloat(); + packComplex(realIn, imagIn); + vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_INVERSE); + unpackReal(realOut); + } + + void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) { + if (!m_fspec) initFloat(); + float *f[2] = { m_fpacked->realp, m_fpacked->imagp }; + v_deinterleave(f, complexIn, 2, m_size/2 + 1); + vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_INVERSE); + unpackReal(realOut); + } + + void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) { + if (!m_fspec) initFloat(); + + const int hs1 = m_size/2+1; + vvsincosf(m_fpacked->imagp, m_fpacked->realp, phaseIn, &hs1); + float *const rp = m_fpacked->realp; + float *const ip = m_fpacked->imagp; + for (int i = 0; i < hs1; ++i) rp[i] *= magIn[i]; + for (int i = 0; i < hs1; ++i) ip[i] *= magIn[i]; + fnyq(); + vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_INVERSE); + unpackReal(realOut); + } + + void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) { + if (!m_fspec) initFloat(); + const int hs1 = m_size/2 + 1; + v_copy(m_fspare, magIn, hs1); + for (int i = 0; i < hs1; ++i) m_fspare[i] += 
0.000001f; + vvlogf(m_fspare2, m_fspare, &hs1); + inverse(m_fspare2, 0, cepOut); + } + +private: + const int m_size; + int m_order; + FFTSetup m_fspec; + FFTSetupD m_dspec; + DSPSplitComplex *m_fbuf; + DSPDoubleSplitComplex *m_dbuf; + DSPSplitComplex *m_fpacked; + float *m_fspare; + float *m_fspare2; + DSPDoubleSplitComplex *m_dpacked; + double *m_dspare; + double *m_dspare2; +}; + +#endif /* HAVE_VDSP */ + +#ifdef HAVE_MEDIALIB + +class D_MEDIALIB : public FFTImpl +{ +public: + D_MEDIALIB(int size) : + m_size(size), + m_dpacked(0), m_fpacked(0) + { + for (int i = 0; ; ++i) { + if (m_size & (1 << i)) { + m_order = i; + break; + } + } + } + + ~D_MEDIALIB() { + if (m_dpacked) { + deallocate(m_dpacked); + } + if (m_fpacked) { + deallocate(m_fpacked); + } + } + + FFT::Precisions + getSupportedPrecisions() const { + return FFT::SinglePrecision | FFT::DoublePrecision; + } + + //!!! rv check + + void initFloat() { + m_fpacked = allocate(m_size*2); + } + + void initDouble() { + m_dpacked = allocate(m_size*2); + } + + void packFloatConjugates() { + const int hs = m_size / 2; + for (int i = 1; i <= hs; ++i) { + m_fpacked[(m_size-i)*2] = m_fpacked[2*i]; + m_fpacked[(m_size-i)*2 + 1] = -m_fpacked[2*i + 1]; + } + } + + void packDoubleConjugates() { + const int hs = m_size / 2; + for (int i = 1; i <= hs; ++i) { + m_dpacked[(m_size-i)*2] = m_dpacked[2*i]; + m_dpacked[(m_size-i)*2 + 1] = -m_dpacked[2*i + 1]; + } + } + + void packFloat(const float *BQ_R__ re, const float *BQ_R__ im) { + int index = 0; + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + m_fpacked[index++] = re[i]; + index++; + } + index = 0; + if (im) { + for (int i = 0; i <= hs; ++i) { + index++; + m_fpacked[index++] = im[i]; + } + } else { + for (int i = 0; i <= hs; ++i) { + index++; + m_fpacked[index++] = 0.f; + } + } + packFloatConjugates(); + } + + void packDouble(const double *BQ_R__ re, const double *BQ_R__ im) { + int index = 0; + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + m_dpacked[index++] = re[i]; + index++; + } + index = 0; + if (im) { + for (int i = 0; i <= hs; ++i) { + index++; + m_dpacked[index++] = im[i]; + } + } else { + for (int i = 0; i <= hs; ++i) { + index++; + m_dpacked[index++] = 0.0; + } + } + packDoubleConjugates(); + } + + void unpackFloat(float *re, float *BQ_R__ im) { // re may be equal to m_fpacked + int index = 0; + const int hs = m_size/2; + if (im) { + for (int i = 0; i <= hs; ++i) { + index++; + im[i] = m_fpacked[index++]; + } + } + index = 0; + for (int i = 0; i <= hs; ++i) { + re[i] = m_fpacked[index++]; + index++; + } + } + + void unpackDouble(double *re, double *BQ_R__ im) { // re may be equal to m_dpacked + int index = 0; + const int hs = m_size/2; + if (im) { + for (int i = 0; i <= hs; ++i) { + index++; + im[i] = m_dpacked[index++]; + } + } + index = 0; + for (int i = 0; i <= hs; ++i) { + re[i] = m_dpacked[index++]; + index++; + } + } + + void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) { + if (!m_dpacked) initDouble(); + mlib_SignalFFT_1_D64C_D64(m_dpacked, realIn, m_order); + unpackDouble(realOut, imagOut); + } + + void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) { + if (!m_dpacked) initDouble(); + // mlib FFT gives the whole redundant complex result + mlib_SignalFFT_1_D64C_D64(m_dpacked, realIn, m_order); + v_copy(complexOut, m_dpacked, m_size + 2); + } + + void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) { + if (!m_dpacked) initDouble(); + 
mlib_SignalFFT_1_D64C_D64(m_dpacked, realIn, m_order); + const int hs = m_size/2; + int index = 0; + for (int i = 0; i <= hs; ++i) { + int reali = index; + ++index; + magOut[i] = sqrt(m_dpacked[reali] * m_dpacked[reali] + + m_dpacked[index] * m_dpacked[index]); + phaseOut[i] = atan2(m_dpacked[index], m_dpacked[reali]) ; + ++index; + } + } + + void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) { + if (!m_dpacked) initDouble(); + mlib_SignalFFT_1_D64C_D64(m_dpacked, realIn, m_order); + const int hs = m_size/2; + int index = 0; + for (int i = 0; i <= hs; ++i) { + int reali = index; + ++index; + magOut[i] = sqrt(m_dpacked[reali] * m_dpacked[reali] + + m_dpacked[index] * m_dpacked[index]); + ++index; + } + } + + void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) { + if (!m_fpacked) initFloat(); + mlib_SignalFFT_1_F32C_F32(m_fpacked, realIn, m_order); + unpackFloat(realOut, imagOut); + } + + void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) { + if (!m_fpacked) initFloat(); + // mlib FFT gives the whole redundant complex result + mlib_SignalFFT_1_F32C_F32(m_fpacked, realIn, m_order); + v_copy(complexOut, m_fpacked, m_size + 2); + } + + void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) { + if (!m_fpacked) initFloat(); + mlib_SignalFFT_1_F32C_F32(m_fpacked, realIn, m_order); + const int hs = m_size/2; + int index = 0; + for (int i = 0; i <= hs; ++i) { + int reali = index; + ++index; + magOut[i] = sqrtf(m_fpacked[reali] * m_fpacked[reali] + + m_fpacked[index] * m_fpacked[index]); + phaseOut[i] = atan2f(m_fpacked[index], m_fpacked[reali]); + ++index; + } + } + + void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) { + if (!m_fpacked) initFloat(); + mlib_SignalFFT_1_F32C_F32(m_fpacked, realIn, m_order); + const int hs = m_size/2; + int index = 0; + for (int i = 0; i <= hs; ++i) { + int reali = index; + ++index; + magOut[i] = sqrtf(m_fpacked[reali] * m_fpacked[reali] + + m_fpacked[index] * m_fpacked[index]); + ++index; + } + } + + void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) { + if (!m_dpacked) initDouble(); + packDouble(realIn, imagIn); + mlib_SignalIFFT_2_D64_D64C(realOut, m_dpacked, m_order); + } + + void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) { + if (!m_dpacked) initDouble(); + v_copy(m_dpacked, complexIn, m_size + 2); + packDoubleConjugates(); + mlib_SignalIFFT_2_D64_D64C(realOut, m_dpacked, m_order); + } + + void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) { + if (!m_dpacked) initDouble(); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + double real = magIn[i] * cos(phaseIn[i]); + double imag = magIn[i] * sin(phaseIn[i]); + m_dpacked[i*2] = real; + m_dpacked[i*2 + 1] = imag; + } + packDoubleConjugates(); + mlib_SignalIFFT_2_D64_D64C(realOut, m_dpacked, m_order); + } + + void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) { + if (!m_dpacked) initDouble(); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + m_dpacked[i*2] = log(magIn[i] + 0.000001); + m_dpacked[i*2 + 1] = 0.0; + } + packDoubleConjugates(); + mlib_SignalIFFT_2_D64_D64C(cepOut, m_dpacked, m_order); + } + + void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) { + if (!m_fpacked) initFloat(); + packFloat(realIn, imagIn); + mlib_SignalIFFT_2_F32_F32C(realOut, m_fpacked, 
m_order); + } + + void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) { + if (!m_fpacked) initFloat(); + v_convert(m_fpacked, complexIn, m_size + 2); + packFloatConjugates(); + mlib_SignalIFFT_2_F32_F32C(realOut, m_fpacked, m_order); + } + + void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) { + if (!m_fpacked) initFloat(); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + double real = magIn[i] * cos(phaseIn[i]); + double imag = magIn[i] * sin(phaseIn[i]); + m_fpacked[i*2] = real; + m_fpacked[i*2 + 1] = imag; + } + packFloatConjugates(); + mlib_SignalIFFT_2_F32_F32C(realOut, m_fpacked, m_order); + } + + void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) { + if (!m_fpacked) initFloat(); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + m_fpacked[i*2] = logf(magIn[i] + 0.000001); + m_fpacked[i*2 + 1] = 0.f; + } + packFloatConjugates(); + mlib_SignalIFFT_2_F32_F32C(cepOut, m_fpacked, m_order); + } + +private: + const int m_size; + int m_order; + double *m_dpacked; + float *m_fpacked; +}; + +#endif /* HAVE_MEDIALIB */ + +#ifdef HAVE_OPENMAX + +class D_OPENMAX : public FFTImpl +{ + // Convert a signed 32-bit integer to a float in the range [-1,1) + static inline float i2f(OMX_S32 i) + { + return float(i) / float(OMX_MAX_S32); + } + + // Convert a signed 32-bit integer to a double in the range [-1,1) + static inline double i2d(OMX_S32 i) + { + return double(i) / double(OMX_MAX_S32); + } + + // Convert a float in the range [-1,1) to a signed 32-bit integer + static inline OMX_S32 f2i(float f) + { + return OMX_S32(f * OMX_MAX_S32); + } + + // Convert a double in the range [-1,1) to a signed 32-bit integer + static inline OMX_S32 d2i(double d) + { + return OMX_S32(d * OMX_MAX_S32); + } + +public: + D_OPENMAX(int size) : + m_size(size), + m_packed(0) + { + for (int i = 0; ; ++i) { + if (m_size & (1 << i)) { + m_order = i; + break; + } + } + } + + ~D_OPENMAX() { + if (m_packed) { + deallocate(m_packed); + deallocate(m_buf); + deallocate(m_fbuf); + deallocate(m_spec); + } + } + + FFT::Precisions + getSupportedPrecisions() const { + return FFT::SinglePrecision; + } + + //!!! rv check + + // The OpenMAX implementation uses a fixed-point representation in + // 32-bit signed integers, with a downward scaling factor (0-32 + // bits) supplied as an argument to the FFT function. 
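Editor's note: the i2f/f2i helpers above map between floating point in [-1,1) and the 32-bit signed fixed-point samples that the OpenMAX routines operate on. A tiny standalone sketch of the same mapping, using plain int32_t as an illustrative stand-in for OMX_S32 (the names here are not part of the OpenMAX API):

    #include <cstdint>
    #include <limits>

    // Fixed-point value -> float in [-1, 1)
    static inline float fixedToFloat(int32_t i) {
        return float(i) / float(std::numeric_limits<int32_t>::max());
    }

    // Float in [-1, 1) -> fixed-point value; inputs at or above 1.0 would
    // overflow, matching the assumption made by the helpers above
    static inline int32_t floatToFixed(float f) {
        return int32_t(f * std::numeric_limits<int32_t>::max());
    }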
+ + void initFloat() { + initDouble(); + } + + void initDouble() { + if (!m_packed) { + m_buf = allocate(m_size); + m_packed = allocate(m_size*2 + 2); + m_fbuf = allocate(m_size*2 + 2); + OMX_INT sz = 0; + omxSP_FFTGetBufSize_R_S32(m_order, &sz); + m_spec = (OMXFFTSpec_R_S32 *)allocate(sz); + omxSP_FFTInit_R_S32(m_spec, m_order); + } + } + + void packFloat(const float *BQ_R__ re) { + // prepare fixed point input for forward transform + for (int i = 0; i < m_size; ++i) { + m_buf[i] = f2i(re[i]); + } + } + + void packDouble(const double *BQ_R__ re) { + // prepare fixed point input for forward transform + for (int i = 0; i < m_size; ++i) { + m_buf[i] = d2i(re[i]); + } + } + + void unpackFloat(float *BQ_R__ re, float *BQ_R__ im) { + // convert fixed point output for forward transform + int index = 0; + const int hs = m_size/2; + if (im) { + for (int i = 0; i <= hs; ++i) { + index++; + im[i] = i2f(m_packed[index++]); + } + v_scale(im, m_size, hs + 1); + } + index = 0; + for (int i = 0; i <= hs; ++i) { + re[i] = i2f(m_packed[index++]); + index++; + } + v_scale(re, m_size, hs + 1); + } + + void unpackDouble(double *BQ_R__ re, double *BQ_R__ im) { + // convert fixed point output for forward transform + int index = 0; + const int hs = m_size/2; + if (im) { + for (int i = 0; i <= hs; ++i) { + index++; + im[i] = i2d(m_packed[index++]); + } + v_scale(im, m_size, hs + 1); + } + index = 0; + for (int i = 0; i <= hs; ++i) { + re[i] = i2d(m_packed[index++]); + index++; + } + v_scale(re, m_size, hs + 1); + } + + void unpackFloatInterleaved(float *BQ_R__ cplx) { + // convert fixed point output for forward transform + for (int i = 0; i < m_size + 2; ++i) { + cplx[i] = i2f(m_packed[i]); + } + v_scale(cplx, m_size, m_size + 2); + } + + void unpackDoubleInterleaved(double *BQ_R__ cplx) { + // convert fixed point output for forward transform + for (int i = 0; i < m_size + 2; ++i) { + cplx[i] = i2d(m_packed[i]); + } + v_scale(cplx, m_size, m_size + 2); + } + + void packFloat(const float *BQ_R__ re, const float *BQ_R__ im) { + // prepare fixed point input for inverse transform + int index = 0; + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + m_packed[index++] = f2i(re[i]); + index++; + } + index = 0; + if (im) { + for (int i = 0; i <= hs; ++i) { + index++; + m_packed[index++] = f2i(im[i]); + } + } else { + for (int i = 0; i <= hs; ++i) { + index++; + m_packed[index++] = 0; + } + } + } + + void packDouble(const double *BQ_R__ re, const double *BQ_R__ im) { + // prepare fixed point input for inverse transform + int index = 0; + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + m_packed[index++] = d2i(re[i]); + index++; + } + index = 0; + if (im) { + for (int i = 0; i <= hs; ++i) { + index++; + m_packed[index++] = d2i(im[i]); + } + } else { + for (int i = 0; i <= hs; ++i) { + index++; + m_packed[index++] = 0; + } + } + } + + void convertFloat(const float *BQ_R__ f) { + // convert interleaved input for inverse interleaved transform + const int n = m_size + 2; + for (int i = 0; i < n; ++i) { + m_packed[i] = f2i(f[i]); + } + } + + void convertDouble(const double *BQ_R__ d) { + // convert interleaved input for inverse interleaved transform + const int n = m_size + 2; + for (int i = 0; i < n; ++i) { + m_packed[i] = d2i(d[i]); + } + } + + void unpackFloat(float *BQ_R__ re) { + // convert fixed point output for inverse transform + for (int i = 0; i < m_size; ++i) { + re[i] = i2f(m_buf[i]) * m_size; + } + } + + void unpackDouble(double *BQ_R__ re) { + // convert fixed point output for inverse 
transform + for (int i = 0; i < m_size; ++i) { + re[i] = i2d(m_buf[i]) * m_size; + } + } + + void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) { + if (!m_packed) initDouble(); + packDouble(realIn); + omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order); + unpackDouble(realOut, imagOut); + } + + void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) { + if (!m_packed) initDouble(); + packDouble(realIn); + omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order); + unpackDoubleInterleaved(complexOut); + } + + void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) { + if (!m_packed) initDouble(); + packDouble(realIn); + omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order); + unpackDouble(magOut, phaseOut); // temporarily + // at this point we actually have real/imag in the mag/phase arrays + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + double real = magOut[i]; + double imag = phaseOut[i]; + c_magphase(magOut + i, phaseOut + i, real, imag); + } + } + + void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) { + if (!m_packed) initDouble(); + packDouble(realIn); + omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + int reali = i * 2; + int imagi = reali + 1; + double real = i2d(m_packed[reali]) * m_size; + double imag = i2d(m_packed[imagi]) * m_size; + magOut[i] = sqrt(real * real + imag * imag); + } + } + + void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) { + if (!m_packed) initFloat(); + packFloat(realIn); + omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order); + unpackFloat(realOut, imagOut); + } + + void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) { + if (!m_packed) initFloat(); + packFloat(realIn); + omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order); + unpackFloatInterleaved(complexOut); + } + + void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) { + if (!m_packed) initFloat(); + + packFloat(realIn); + omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order); + unpackFloat(magOut, phaseOut); // temporarily + // at this point we actually have real/imag in the mag/phase arrays + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + float real = magOut[i]; + float imag = phaseOut[i]; + c_magphase(magOut + i, phaseOut + i, real, imag); + } + } + + void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) { + if (!m_packed) initFloat(); + packFloat(realIn); + omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + int reali = i * 2; + int imagi = reali + 1; + float real = i2f(m_packed[reali]) * m_size; + float imag = i2f(m_packed[imagi]) * m_size; + magOut[i] = sqrtf(real * real + imag * imag); + } + } + + void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) { + if (!m_packed) initDouble(); + packDouble(realIn, imagIn); + omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0); + unpackDouble(realOut); + } + + void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) { + if (!m_packed) initDouble(); + convertDouble(complexIn); + omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0); + unpackDouble(realOut); + } + + void inversePolar(const double *BQ_R__ magIn, const double 
*BQ_R__ phaseIn, double *BQ_R__ realOut) { + if (!m_packed) initDouble(); + int index = 0; + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + double real, imag; + c_phasor(&real, &imag, phaseIn[i]); + m_fbuf[index++] = float(real); + m_fbuf[index++] = float(imag); + } + convertFloat(m_fbuf); + omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0); + unpackDouble(realOut); + } + + void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) { + if (!m_packed) initDouble(); + //!!! implement +#warning OpenMAX implementation lacks cepstral transforms + } + + void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) { + if (!m_packed) initFloat(); + packFloat(realIn, imagIn); + omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0); + unpackFloat(realOut); + } + + void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) { + if (!m_packed) initFloat(); + convertFloat(complexIn); + omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0); + unpackFloat(realOut); + } + + void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) { + if (!m_packed) initFloat(); + const int hs = m_size/2; + v_polar_to_cartesian_interleaved(m_fbuf, magIn, phaseIn, hs+1); + convertFloat(m_fbuf); + omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0); + unpackFloat(realOut); + } + + void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) { + if (!m_packed) initFloat(); + //!!! implement +#warning OpenMAX implementation lacks cepstral transforms + } + +private: + const int m_size; + int m_order; + OMX_S32 *m_packed; + OMX_S32 *m_buf; + float *m_fbuf; + OMXFFTSpec_R_S32 *m_spec; + +}; + +#endif /* HAVE_OPENMAX */ + +#ifdef HAVE_FFTW3 + +/* + Define FFTW_DOUBLE_ONLY to make all uses of FFTW functions be + double-precision (so "float" FFTs are calculated by casting to + doubles and using the double-precision FFTW function). + + Define FFTW_SINGLE_ONLY to make all uses of FFTW functions be + single-precision (so "double" FFTs are calculated by casting to + floats and using the single-precision FFTW function). + + Neither of these flags is desirable for either performance or + precision. The main reason to define either flag is to avoid linking + against both fftw3 and fftw3f libraries. 
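Editor's note: when one of these single-library builds is used, both the float and the double entry points still exist, but one of them is computed at the other precision and converted, as described in FFT.h above. A caller can check which precisions are native at run time through getSupportedPrecisions(); a small sketch using only the public API:

    #include <bqfft/FFT.h>
    #include <iostream>

    void reportPrecisions()
    {
        using breakfastquay::FFT;

        FFT fft(1024);
        FFT::Precisions p = fft.getSupportedPrecisions();

        // Both float and double calls always work; this only reports which
        // of them is computed natively rather than via conversion.
        if (p & FFT::SinglePrecision) std::cout << "native single-precision\n";
        if (p & FFT::DoublePrecision) std::cout << "native double-precision\n";
    }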
+*/ + +//#define FFTW_DOUBLE_ONLY 1 +//#define FFTW_SINGLE_ONLY 1 + +#if defined(FFTW_DOUBLE_ONLY) && defined(FFTW_SINGLE_ONLY) +// Can't meaningfully define both +#error Can only define one of FFTW_DOUBLE_ONLY and FFTW_SINGLE_ONLY +#endif + +#if defined(FFTW_FLOAT_ONLY) +#warning FFTW_FLOAT_ONLY is deprecated, use FFTW_SINGLE_ONLY instead +#define FFTW_SINGLE_ONLY 1 +#endif + +#ifdef FFTW_DOUBLE_ONLY +#define fft_float_type double +#define fftwf_complex fftw_complex +#define fftwf_plan fftw_plan +#define fftwf_plan_dft_r2c_1d fftw_plan_dft_r2c_1d +#define fftwf_plan_dft_c2r_1d fftw_plan_dft_c2r_1d +#define fftwf_destroy_plan fftw_destroy_plan +#define fftwf_malloc fftw_malloc +#define fftwf_free fftw_free +#define fftwf_execute fftw_execute +#define atan2f atan2 +#define sqrtf sqrt +#define cosf cos +#define sinf sin +#else +#define fft_float_type float +#endif /* FFTW_DOUBLE_ONLY */ + +#ifdef FFTW_SINGLE_ONLY +#define fft_double_type float +#define fftw_complex fftwf_complex +#define fftw_plan fftwf_plan +#define fftw_plan_dft_r2c_1d fftwf_plan_dft_r2c_1d +#define fftw_plan_dft_c2r_1d fftwf_plan_dft_c2r_1d +#define fftw_destroy_plan fftwf_destroy_plan +#define fftw_malloc fftwf_malloc +#define fftw_free fftwf_free +#define fftw_execute fftwf_execute +#define atan2 atan2f +#define sqrt sqrtf +#define cos cosf +#define sin sinf +#else +#define fft_double_type double +#endif /* FFTW_SINGLE_ONLY */ + +class D_FFTW : public FFTImpl +{ +public: + D_FFTW(int size) : + m_fplanf(0), m_dplanf(0), m_size(size) + { + initMutex(); + } + + ~D_FFTW() { + if (m_fplanf) { + lock(); + bool save = false; + if (m_extantf > 0 && --m_extantf == 0) save = true; +#ifndef FFTW_DOUBLE_ONLY + if (save) saveWisdom('f'); +#endif + fftwf_destroy_plan(m_fplanf); + fftwf_destroy_plan(m_fplani); + fftwf_free(m_fbuf); + fftwf_free(m_fpacked); + unlock(); + } + if (m_dplanf) { + lock(); + bool save = false; + if (m_extantd > 0 && --m_extantd == 0) save = true; +#ifndef FFTW_SINGLE_ONLY + if (save) saveWisdom('d'); +#endif + fftw_destroy_plan(m_dplanf); + fftw_destroy_plan(m_dplani); + fftw_free(m_dbuf); + fftw_free(m_dpacked); + unlock(); + } + destroyMutex(); + } + + FFT::Precisions + getSupportedPrecisions() const { +#ifdef FFTW_SINGLE_ONLY + return FFT::SinglePrecision; +#else +#ifdef FFTW_DOUBLE_ONLY + return FFT::DoublePrecision; +#else + return FFT::SinglePrecision | FFT::DoublePrecision; +#endif +#endif + } + + void initFloat() { + if (m_fplanf) return; + bool load = false; + lock(); + if (m_extantf++ == 0) load = true; +#ifdef FFTW_DOUBLE_ONLY + if (load) loadWisdom('d'); +#else + if (load) loadWisdom('f'); +#endif + m_fbuf = (fft_float_type *)fftw_malloc(m_size * sizeof(fft_float_type)); + m_fpacked = (fftwf_complex *)fftw_malloc + ((m_size/2 + 1) * sizeof(fftwf_complex)); + m_fplanf = fftwf_plan_dft_r2c_1d + (m_size, m_fbuf, m_fpacked, FFTW_MEASURE); + m_fplani = fftwf_plan_dft_c2r_1d + (m_size, m_fpacked, m_fbuf, FFTW_MEASURE); + unlock(); + } + + void initDouble() { + if (m_dplanf) return; + bool load = false; + lock(); + if (m_extantd++ == 0) load = true; +#ifdef FFTW_SINGLE_ONLY + if (load) loadWisdom('f'); +#else + if (load) loadWisdom('d'); +#endif + m_dbuf = (fft_double_type *)fftw_malloc(m_size * sizeof(fft_double_type)); + m_dpacked = (fftw_complex *)fftw_malloc + ((m_size/2 + 1) * sizeof(fftw_complex)); + m_dplanf = fftw_plan_dft_r2c_1d + (m_size, m_dbuf, m_dpacked, FFTW_MEASURE); + m_dplani = fftw_plan_dft_c2r_1d + (m_size, m_dpacked, m_dbuf, FFTW_MEASURE); + unlock(); + } + + void loadWisdom(char 
type) { wisdom(false, type); } + void saveWisdom(char type) { wisdom(true, type); } + + void wisdom(bool save, char type) { + +#ifdef FFTW_DOUBLE_ONLY + if (type == 'f') return; +#endif +#ifdef FFTW_SINGLE_ONLY + if (type == 'd') return; +#endif + + const char *home = getenv("HOME"); + if (!home) return; + + char fn[256]; + snprintf(fn, 256, "%s/%s.%c", home, ".turbot.wisdom", type); + + FILE *f = fopen(fn, save ? "wb" : "rb"); + if (!f) return; + + if (save) { + switch (type) { +#ifdef FFTW_DOUBLE_ONLY + case 'f': break; +#else + case 'f': fftwf_export_wisdom_to_file(f); break; +#endif +#ifdef FFTW_SINGLE_ONLY + case 'd': break; +#else + case 'd': fftw_export_wisdom_to_file(f); break; +#endif + default: break; + } + } else { + switch (type) { +#ifdef FFTW_DOUBLE_ONLY + case 'f': break; +#else + case 'f': fftwf_import_wisdom_from_file(f); break; +#endif +#ifdef FFTW_SINGLE_ONLY + case 'd': break; +#else + case 'd': fftw_import_wisdom_from_file(f); break; +#endif + default: break; + } + } + + fclose(f); + } + + void packFloat(const float *BQ_R__ re, const float *BQ_R__ im) { + const int hs = m_size/2; + fftwf_complex *const BQ_R__ fpacked = m_fpacked; + for (int i = 0; i <= hs; ++i) { + fpacked[i][0] = re[i]; + } + if (im) { + for (int i = 0; i <= hs; ++i) { + fpacked[i][1] = im[i]; + } + } else { + for (int i = 0; i <= hs; ++i) { + fpacked[i][1] = 0.f; + } + } + } + + void packDouble(const double *BQ_R__ re, const double *BQ_R__ im) { + const int hs = m_size/2; + fftw_complex *const BQ_R__ dpacked = m_dpacked; + for (int i = 0; i <= hs; ++i) { + dpacked[i][0] = re[i]; + } + if (im) { + for (int i = 0; i <= hs; ++i) { + dpacked[i][1] = im[i]; + } + } else { + for (int i = 0; i <= hs; ++i) { + dpacked[i][1] = 0.0; + } + } + } + + void unpackFloat(float *BQ_R__ re, float *BQ_R__ im) { + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + re[i] = m_fpacked[i][0]; + } + if (im) { + for (int i = 0; i <= hs; ++i) { + im[i] = m_fpacked[i][1]; + } + } + } + + void unpackDouble(double *BQ_R__ re, double *BQ_R__ im) { + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + re[i] = m_dpacked[i][0]; + } + if (im) { + for (int i = 0; i <= hs; ++i) { + im[i] = m_dpacked[i][1]; + } + } + } + + void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) { + if (!m_dplanf) initDouble(); + const int sz = m_size; + fft_double_type *const BQ_R__ dbuf = m_dbuf; +#ifndef FFTW_SINGLE_ONLY + if (realIn != dbuf) +#endif + for (int i = 0; i < sz; ++i) { + dbuf[i] = realIn[i]; + } + fftw_execute(m_dplanf); + unpackDouble(realOut, imagOut); + } + + void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) { + if (!m_dplanf) initDouble(); + const int sz = m_size; + fft_double_type *const BQ_R__ dbuf = m_dbuf; +#ifndef FFTW_SINGLE_ONLY + if (realIn != dbuf) +#endif + for (int i = 0; i < sz; ++i) { + dbuf[i] = realIn[i]; + } + fftw_execute(m_dplanf); + v_convert(complexOut, (fft_double_type *)m_dpacked, sz + 2); + } + + void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) { + if (!m_dplanf) initDouble(); + fft_double_type *const BQ_R__ dbuf = m_dbuf; + const int sz = m_size; +#ifndef FFTW_SINGLE_ONLY + if (realIn != dbuf) +#endif + for (int i = 0; i < sz; ++i) { + dbuf[i] = realIn[i]; + } + fftw_execute(m_dplanf); + v_cartesian_interleaved_to_polar(magOut, phaseOut, + (double *)m_dpacked, m_size/2+1); + } + + void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) { + if (!m_dplanf) 
initDouble(); + fft_double_type *const BQ_R__ dbuf = m_dbuf; + const int sz = m_size; +#ifndef FFTW_SINGLE_ONLY + if (realIn != m_dbuf) +#endif + for (int i = 0; i < sz; ++i) { + dbuf[i] = realIn[i]; + } + fftw_execute(m_dplanf); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + magOut[i] = sqrt(m_dpacked[i][0] * m_dpacked[i][0] + + m_dpacked[i][1] * m_dpacked[i][1]); + } + } + + void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) { + if (!m_fplanf) initFloat(); + fft_float_type *const BQ_R__ fbuf = m_fbuf; + const int sz = m_size; +#ifndef FFTW_DOUBLE_ONLY + if (realIn != fbuf) +#endif + for (int i = 0; i < sz; ++i) { + fbuf[i] = realIn[i]; + } + fftwf_execute(m_fplanf); + unpackFloat(realOut, imagOut); + } + + void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) { + if (!m_fplanf) initFloat(); + fft_float_type *const BQ_R__ fbuf = m_fbuf; + const int sz = m_size; +#ifndef FFTW_DOUBLE_ONLY + if (realIn != fbuf) +#endif + for (int i = 0; i < sz; ++i) { + fbuf[i] = realIn[i]; + } + fftwf_execute(m_fplanf); + v_convert(complexOut, (fft_float_type *)m_fpacked, sz + 2); + } + + void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) { + if (!m_fplanf) initFloat(); + fft_float_type *const BQ_R__ fbuf = m_fbuf; + const int sz = m_size; +#ifndef FFTW_DOUBLE_ONLY + if (realIn != fbuf) +#endif + for (int i = 0; i < sz; ++i) { + fbuf[i] = realIn[i]; + } + fftwf_execute(m_fplanf); + v_cartesian_interleaved_to_polar(magOut, phaseOut, + (float *)m_fpacked, m_size/2+1); + } + + void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) { + if (!m_fplanf) initFloat(); + fft_float_type *const BQ_R__ fbuf = m_fbuf; + const int sz = m_size; +#ifndef FFTW_DOUBLE_ONLY + if (realIn != fbuf) +#endif + for (int i = 0; i < sz; ++i) { + fbuf[i] = realIn[i]; + } + fftwf_execute(m_fplanf); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + magOut[i] = sqrtf(m_fpacked[i][0] * m_fpacked[i][0] + + m_fpacked[i][1] * m_fpacked[i][1]); + } + } + + void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) { + if (!m_dplanf) initDouble(); + packDouble(realIn, imagIn); + fftw_execute(m_dplani); + const int sz = m_size; + fft_double_type *const BQ_R__ dbuf = m_dbuf; +#ifndef FFTW_SINGLE_ONLY + if (realOut != dbuf) +#endif + for (int i = 0; i < sz; ++i) { + realOut[i] = dbuf[i]; + } + } + + void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) { + if (!m_dplanf) initDouble(); + v_convert((double *)m_dpacked, complexIn, m_size + 2); + fftw_execute(m_dplani); + const int sz = m_size; + fft_double_type *const BQ_R__ dbuf = m_dbuf; +#ifndef FFTW_SINGLE_ONLY + if (realOut != dbuf) +#endif + for (int i = 0; i < sz; ++i) { + realOut[i] = dbuf[i]; + } + } + + void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) { + if (!m_dplanf) initDouble(); + const int hs = m_size/2; + fftw_complex *const BQ_R__ dpacked = m_dpacked; + for (int i = 0; i <= hs; ++i) { + dpacked[i][0] = magIn[i] * cos(phaseIn[i]); + } + for (int i = 0; i <= hs; ++i) { + dpacked[i][1] = magIn[i] * sin(phaseIn[i]); + } + fftw_execute(m_dplani); + const int sz = m_size; + fft_double_type *const BQ_R__ dbuf = m_dbuf; +#ifndef FFTW_SINGLE_ONLY + if (realOut != dbuf) +#endif + for (int i = 0; i < sz; ++i) { + realOut[i] = dbuf[i]; + } + } + + void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) { + if (!m_dplanf) 
initDouble(); + fft_double_type *const BQ_R__ dbuf = m_dbuf; + fftw_complex *const BQ_R__ dpacked = m_dpacked; + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + dpacked[i][0] = log(magIn[i] + 0.000001); + } + for (int i = 0; i <= hs; ++i) { + dpacked[i][1] = 0.0; + } + fftw_execute(m_dplani); + const int sz = m_size; +#ifndef FFTW_SINGLE_ONLY + if (cepOut != dbuf) +#endif + for (int i = 0; i < sz; ++i) { + cepOut[i] = dbuf[i]; + } + } + + void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) { + if (!m_fplanf) initFloat(); + packFloat(realIn, imagIn); + fftwf_execute(m_fplani); + const int sz = m_size; + fft_float_type *const BQ_R__ fbuf = m_fbuf; +#ifndef FFTW_DOUBLE_ONLY + if (realOut != fbuf) +#endif + for (int i = 0; i < sz; ++i) { + realOut[i] = fbuf[i]; + } + } + + void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) { + if (!m_fplanf) initFloat(); + v_copy((float *)m_fpacked, complexIn, m_size + 2); + fftwf_execute(m_fplani); + const int sz = m_size; + fft_float_type *const BQ_R__ fbuf = m_fbuf; +#ifndef FFTW_DOUBLE_ONLY + if (realOut != fbuf) +#endif + for (int i = 0; i < sz; ++i) { + realOut[i] = fbuf[i]; + } + } + + void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) { + if (!m_fplanf) initFloat(); + const int hs = m_size/2; + fftwf_complex *const BQ_R__ fpacked = m_fpacked; + for (int i = 0; i <= hs; ++i) { + fpacked[i][0] = magIn[i] * cosf(phaseIn[i]); + } + for (int i = 0; i <= hs; ++i) { + fpacked[i][1] = magIn[i] * sinf(phaseIn[i]); + } + fftwf_execute(m_fplani); + const int sz = m_size; + fft_float_type *const BQ_R__ fbuf = m_fbuf; +#ifndef FFTW_DOUBLE_ONLY + if (realOut != fbuf) +#endif + for (int i = 0; i < sz; ++i) { + realOut[i] = fbuf[i]; + } + } + + void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) { + if (!m_fplanf) initFloat(); + const int hs = m_size/2; + fftwf_complex *const BQ_R__ fpacked = m_fpacked; + for (int i = 0; i <= hs; ++i) { + fpacked[i][0] = logf(magIn[i] + 0.000001f); + } + for (int i = 0; i <= hs; ++i) { + fpacked[i][1] = 0.f; + } + fftwf_execute(m_fplani); + const int sz = m_size; + fft_float_type *const BQ_R__ fbuf = m_fbuf; +#ifndef FFTW_DOUBLE_ONLY + if (cepOut != fbuf) +#endif + for (int i = 0; i < sz; ++i) { + cepOut[i] = fbuf[i]; + } + } + +private: + fftwf_plan m_fplanf; + fftwf_plan m_fplani; +#ifdef FFTW_DOUBLE_ONLY + double *m_fbuf; +#else + float *m_fbuf; +#endif + fftwf_complex *m_fpacked; + fftw_plan m_dplanf; + fftw_plan m_dplani; +#ifdef FFTW_SINGLE_ONLY + float *m_dbuf; +#else + double *m_dbuf; +#endif + fftw_complex *m_dpacked; + const int m_size; + static int m_extantf; + static int m_extantd; +#ifdef NO_THREADING + void initMutex() {} + void destroyMutex() {} + void lock() {} + void unlock() {} +#else +#ifdef _WIN32 + static HANDLE m_commonMutex; + void initMutex() { m_commonMutex = CreateMutex(NULL, FALSE, NULL); } + void destroyMutex() { CloseHandle(m_commonMutex); } + void lock() { WaitForSingleObject(m_commonMutex, INFINITE); } + void unlock() { ReleaseMutex(m_commonMutex); } +#else + static pthread_mutex_t m_commonMutex; + void initMutex() { pthread_mutex_init(&m_commonMutex, 0); } + void destroyMutex() { pthread_mutex_destroy(&m_commonMutex); } + void lock() { pthread_mutex_lock(&m_commonMutex); } + void unlock() { pthread_mutex_unlock(&m_commonMutex); } +#endif +#endif +}; + +int +D_FFTW::m_extantf = 0; + +int +D_FFTW::m_extantd = 0; + +#ifndef NO_THREADING +#ifdef _WIN32 +HANDLE 
D_FFTW::m_commonMutex; +#else +pthread_mutex_t D_FFTW::m_commonMutex; +#endif +#endif + +#endif /* HAVE_FFTW3 */ + +#ifdef HAVE_SFFT + +/* + Define SFFT_DOUBLE_ONLY to make all uses of SFFT functions be + double-precision (so "float" FFTs are calculated by casting to + doubles and using the double-precision SFFT function). + + Define SFFT_SINGLE_ONLY to make all uses of SFFT functions be + single-precision (so "double" FFTs are calculated by casting to + floats and using the single-precision SFFT function). + + Neither of these flags is desirable for either performance or + precision. +*/ + +//#define SFFT_DOUBLE_ONLY 1 +//#define SFFT_SINGLE_ONLY 1 + +#if defined(SFFT_DOUBLE_ONLY) && defined(SFFT_SINGLE_ONLY) +// Can't meaningfully define both +#error Can only define one of SFFT_DOUBLE_ONLY and SFFT_SINGLE_ONLY +#endif + +#ifdef SFFT_DOUBLE_ONLY +#define fft_float_type double +#define FLAG_SFFT_FLOAT SFFT_DOUBLE +#define atan2f atan2 +#define sqrtf sqrt +#define cosf cos +#define sinf sin +#define logf log +#else +#define FLAG_SFFT_FLOAT SFFT_FLOAT +#define fft_float_type float +#endif /* SFFT_DOUBLE_ONLY */ + +#ifdef SFFT_SINGLE_ONLY +#define fft_double_type float +#define FLAG_SFFT_DOUBLE SFFT_FLOAT +#define atan2 atan2f +#define sqrt sqrtf +#define cos cosf +#define sin sinf +#define log logf +#else +#define FLAG_SFFT_DOUBLE SFFT_DOUBLE +#define fft_double_type double +#endif /* SFFT_SINGLE_ONLY */ + +class D_SFFT : public FFTImpl +{ +public: + D_SFFT(int size) : + m_fplanf(0), m_fplani(0), m_dplanf(0), m_dplani(0), m_size(size) + { + } + + ~D_SFFT() { + if (m_fplanf) { + sfft_free(m_fplanf); + sfft_free(m_fplani); + deallocate(m_fbuf); + deallocate(m_fresult); + } + if (m_dplanf) { + sfft_free(m_dplanf); + sfft_free(m_dplani); + deallocate(m_dbuf); + deallocate(m_dresult); + } + } + + FFT::Precisions + getSupportedPrecisions() const { +#ifdef SFFT_SINGLE_ONLY + return FFT::SinglePrecision; +#else +#ifdef SFFT_DOUBLE_ONLY + return FFT::DoublePrecision; +#else + return FFT::SinglePrecision | FFT::DoublePrecision; +#endif +#endif + } + + void initFloat() { + if (m_fplanf) return; + m_fbuf = allocate(2 * m_size); + m_fresult = allocate(2 * m_size); + m_fplanf = sfft_init(m_size, SFFT_FORWARD | FLAG_SFFT_FLOAT); + m_fplani = sfft_init(m_size, SFFT_BACKWARD | FLAG_SFFT_FLOAT); + if (!m_fplanf || !m_fplani) { + if (!m_fplanf) { + std::cerr << "D_SFFT: Failed to construct forward float transform for size " << m_size << " (check SFFT library's target configuration)" << std::endl; + } else { + std::cerr << "D_SFFT: Failed to construct inverse float transform for size " << m_size << " (check SFFT library's target configuration)" << std::endl; + } +#ifndef NO_EXCEPTIONS + throw FFT::InternalError; +#else + abort(); +#endif + } + } + + void initDouble() { + if (m_dplanf) return; + m_dbuf = allocate(2 * m_size); + m_dresult = allocate(2 * m_size); + m_dplanf = sfft_init(m_size, SFFT_FORWARD | FLAG_SFFT_DOUBLE); + m_dplani = sfft_init(m_size, SFFT_BACKWARD | FLAG_SFFT_DOUBLE); + if (!m_dplanf || !m_dplani) { + if (!m_dplanf) { + std::cerr << "D_SFFT: Failed to construct forward double transform for size " << m_size << " (check SFFT library's target configuration)" << std::endl; + } else { + std::cerr << "D_SFFT: Failed to construct inverse double transform for size " << m_size << " (check SFFT library's target configuration)" << std::endl; + } +#ifndef NO_EXCEPTIONS + throw FFT::InternalError; +#else + abort(); +#endif + } + } + + void packFloat(const float *BQ_R__ re, const float *BQ_R__ im, 
fft_float_type *target, int n) { + for (int i = 0; i < n; ++i) target[i*2] = re[i]; + if (im) { + for (int i = 0; i < n; ++i) target[i*2+1] = im[i]; + } else { + for (int i = 0; i < n; ++i) target[i*2+1] = 0.f; + } + } + + void packDouble(const double *BQ_R__ re, const double *BQ_R__ im, fft_double_type *target, int n) { + for (int i = 0; i < n; ++i) target[i*2] = re[i]; + if (im) { + for (int i = 0; i < n; ++i) target[i*2+1] = im[i]; + } else { + for (int i = 0; i < n; ++i) target[i*2+1] = 0.0; + } + } + + void unpackFloat(const fft_float_type *source, float *BQ_R__ re, float *BQ_R__ im, int n) { + for (int i = 0; i < n; ++i) re[i] = source[i*2]; + if (im) { + for (int i = 0; i < n; ++i) im[i] = source[i*2+1]; + } + } + + void unpackDouble(const fft_double_type *source, double *BQ_R__ re, double *BQ_R__ im, int n) { + for (int i = 0; i < n; ++i) re[i] = source[i*2]; + if (im) { + for (int i = 0; i < n; ++i) im[i] = source[i*2+1]; + } + } + + template + void mirror(T *BQ_R__ cplx, int n) { + for (int i = 1; i <= n/2; ++i) { + int j = n-i; + cplx[j*2] = cplx[i*2]; + cplx[j*2+1] = -cplx[i*2+1]; + } + } + + void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) { + if (!m_dplanf) initDouble(); + packDouble(realIn, 0, m_dbuf, m_size); + sfft_execute(m_dplanf, m_dbuf, m_dresult); + unpackDouble(m_dresult, realOut, imagOut, m_size/2+1); + } + + void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) { + if (!m_dplanf) initDouble(); + packDouble(realIn, 0, m_dbuf, m_size); + sfft_execute(m_dplanf, m_dbuf, m_dresult); + v_convert(complexOut, m_dresult, m_size+2); // i.e. m_size/2+1 complex + } + + void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) { + if (!m_dplanf) initDouble(); + packDouble(realIn, 0, m_dbuf, m_size); + sfft_execute(m_dplanf, m_dbuf, m_dresult); + v_cartesian_interleaved_to_polar(magOut, phaseOut, + m_dresult, m_size/2+1); + } + + void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) { + if (!m_dplanf) initDouble(); + packDouble(realIn, 0, m_dbuf, m_size); + sfft_execute(m_dplanf, m_dbuf, m_dresult); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + magOut[i] = sqrt(m_dresult[i*2] * m_dresult[i*2] + + m_dresult[i*2+1] * m_dresult[i*2+1]); + } + } + + void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) { + if (!m_fplanf) initFloat(); + packFloat(realIn, 0, m_fbuf, m_size); + sfft_execute(m_fplanf, m_fbuf, m_fresult); + unpackFloat(m_fresult, realOut, imagOut, m_size/2+1); + } + + void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) { + if (!m_fplanf) initFloat(); + packFloat(realIn, 0, m_fbuf, m_size); + sfft_execute(m_fplanf, m_fbuf, m_fresult); + v_convert(complexOut, m_fresult, m_size+2); // i.e. 
m_size/2+1 complex + } + + void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) { + if (!m_fplanf) initFloat(); + packFloat(realIn, 0, m_fbuf, m_size); + sfft_execute(m_fplanf, m_fbuf, m_fresult); + v_cartesian_interleaved_to_polar(magOut, phaseOut, + m_fresult, m_size/2+1); + } + + void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) { + if (!m_fplanf) initFloat(); + packFloat(realIn, 0, m_fbuf, m_size); + sfft_execute(m_fplanf, m_fbuf, m_fresult); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + magOut[i] = sqrtf(m_fresult[i*2] * m_fresult[i*2] + + m_fresult[i*2+1] * m_fresult[i*2+1]); + } + } + + void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) { + if (!m_dplanf) initDouble(); + packDouble(realIn, imagIn, m_dbuf, m_size/2+1); + mirror(m_dbuf, m_size); + sfft_execute(m_dplani, m_dbuf, m_dresult); + for (int i = 0; i < m_size; ++i) { + realOut[i] = m_dresult[i*2]; + } + } + + void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) { + if (!m_dplanf) initDouble(); + v_convert((double *)m_dbuf, complexIn, m_size + 2); + mirror(m_dbuf, m_size); + sfft_execute(m_dplani, m_dbuf, m_dresult); + for (int i = 0; i < m_size; ++i) { + realOut[i] = m_dresult[i*2]; + } + } + + void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) { + if (!m_dplanf) initDouble(); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + m_dbuf[i*2] = magIn[i] * cos(phaseIn[i]); + m_dbuf[i*2+1] = magIn[i] * sin(phaseIn[i]); + } + mirror(m_dbuf, m_size); + sfft_execute(m_dplani, m_dbuf, m_dresult); + for (int i = 0; i < m_size; ++i) { + realOut[i] = m_dresult[i*2]; + } + } + + void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) { + if (!m_dplanf) initDouble(); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + m_dbuf[i*2] = log(magIn[i] + 0.000001); + m_dbuf[i*2+1] = 0.0; + } + mirror(m_dbuf, m_size); + sfft_execute(m_dplani, m_dbuf, m_dresult); + for (int i = 0; i < m_size; ++i) { + cepOut[i] = m_dresult[i*2]; + } + } + + void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) { + if (!m_fplanf) initFloat(); + packFloat(realIn, imagIn, m_fbuf, m_size/2+1); + mirror(m_fbuf, m_size); + sfft_execute(m_fplani, m_fbuf, m_fresult); + for (int i = 0; i < m_size; ++i) { + realOut[i] = m_fresult[i*2]; + } + } + + void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) { + if (!m_fplanf) initFloat(); + v_convert((float *)m_fbuf, complexIn, m_size + 2); + mirror(m_fbuf, m_size); + sfft_execute(m_fplani, m_fbuf, m_fresult); + for (int i = 0; i < m_size; ++i) { + realOut[i] = m_fresult[i*2]; + } + } + + void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) { + if (!m_fplanf) initFloat(); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + m_fbuf[i*2] = magIn[i] * cosf(phaseIn[i]); + m_fbuf[i*2+1] = magIn[i] * sinf(phaseIn[i]); + } + mirror(m_fbuf, m_size); + sfft_execute(m_fplani, m_fbuf, m_fresult); + for (int i = 0; i < m_size; ++i) { + realOut[i] = m_fresult[i*2]; + } + } + + void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) { + if (!m_fplanf) initFloat(); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + m_fbuf[i*2] = logf(magIn[i] + 0.00001); + m_fbuf[i*2+1] = 0.0f; + } + sfft_execute(m_fplani, m_fbuf, m_fresult); + for (int i = 0; i < m_size; ++i) { + cepOut[i] = 
m_fresult[i*2]; + } + } + +private: + sfft_plan_t *m_fplanf; + sfft_plan_t *m_fplani; + fft_float_type *m_fbuf; + fft_float_type *m_fresult; + + sfft_plan_t *m_dplanf; + sfft_plan_t *m_dplani; + fft_double_type *m_dbuf; + fft_double_type *m_dresult; + + const int m_size; +}; + +#endif /* HAVE_SFFT */ + +#ifdef HAVE_KISSFFT + +class D_KISSFFT : public FFTImpl +{ +public: + D_KISSFFT(int size) : + m_size(size), + m_fplanf(0), + m_fplani(0) + { +#ifdef FIXED_POINT +#error KISSFFT is not configured for float values +#endif + if (sizeof(kiss_fft_scalar) != sizeof(float)) { + std::cerr << "ERROR: KISSFFT is not configured for float values" + << std::endl; + } + + m_fbuf = new kiss_fft_scalar[m_size + 2]; + m_fpacked = new kiss_fft_cpx[m_size + 2]; + m_fplanf = kiss_fftr_alloc(m_size, 0, NULL, NULL); + m_fplani = kiss_fftr_alloc(m_size, 1, NULL, NULL); + } + + ~D_KISSFFT() { + kiss_fftr_free(m_fplanf); + kiss_fftr_free(m_fplani); + kiss_fft_cleanup(); + + delete[] m_fbuf; + delete[] m_fpacked; + } + + FFT::Precisions + getSupportedPrecisions() const { + return FFT::SinglePrecision; + } + + void initFloat() { } + void initDouble() { } + + void packFloat(const float *BQ_R__ re, const float *BQ_R__ im) { + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + m_fpacked[i].r = re[i]; + } + if (im) { + for (int i = 0; i <= hs; ++i) { + m_fpacked[i].i = im[i]; + } + } else { + for (int i = 0; i <= hs; ++i) { + m_fpacked[i].i = 0.f; + } + } + } + + void unpackFloat(float *BQ_R__ re, float *BQ_R__ im) { + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + re[i] = m_fpacked[i].r; + } + if (im) { + for (int i = 0; i <= hs; ++i) { + im[i] = m_fpacked[i].i; + } + } + } + + void packDouble(const double *BQ_R__ re, const double *BQ_R__ im) { + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + m_fpacked[i].r = float(re[i]); + } + if (im) { + for (int i = 0; i <= hs; ++i) { + m_fpacked[i].i = float(im[i]); + } + } else { + for (int i = 0; i <= hs; ++i) { + m_fpacked[i].i = 0.f; + } + } + } + + void unpackDouble(double *BQ_R__ re, double *BQ_R__ im) { + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + re[i] = double(m_fpacked[i].r); + } + if (im) { + for (int i = 0; i <= hs; ++i) { + im[i] = double(m_fpacked[i].i); + } + } + } + + void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) { + + v_convert(m_fbuf, realIn, m_size); + kiss_fftr(m_fplanf, m_fbuf, m_fpacked); + unpackDouble(realOut, imagOut); + } + + void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) { + + v_convert(m_fbuf, realIn, m_size); + kiss_fftr(m_fplanf, m_fbuf, m_fpacked); + v_convert(complexOut, (float *)m_fpacked, m_size + 2); + } + + void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) { + + for (int i = 0; i < m_size; ++i) { + m_fbuf[i] = float(realIn[i]); + } + + kiss_fftr(m_fplanf, m_fbuf, m_fpacked); + + const int hs = m_size/2; + + for (int i = 0; i <= hs; ++i) { + magOut[i] = sqrt(double(m_fpacked[i].r) * double(m_fpacked[i].r) + + double(m_fpacked[i].i) * double(m_fpacked[i].i)); + } + + for (int i = 0; i <= hs; ++i) { + phaseOut[i] = atan2(double(m_fpacked[i].i), double(m_fpacked[i].r)); + } + } + + void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) { + + for (int i = 0; i < m_size; ++i) { + m_fbuf[i] = float(realIn[i]); + } + + kiss_fftr(m_fplanf, m_fbuf, m_fpacked); + + const int hs = m_size/2; + + for (int i = 0; i <= hs; ++i) { + magOut[i] = 
sqrt(double(m_fpacked[i].r) * double(m_fpacked[i].r) + + double(m_fpacked[i].i) * double(m_fpacked[i].i)); + } + } + + void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) { + + kiss_fftr(m_fplanf, realIn, m_fpacked); + unpackFloat(realOut, imagOut); + } + + void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) { + + kiss_fftr(m_fplanf, realIn, (kiss_fft_cpx *)complexOut); + } + + void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) { + + kiss_fftr(m_fplanf, realIn, m_fpacked); + + const int hs = m_size/2; + + for (int i = 0; i <= hs; ++i) { + magOut[i] = sqrtf(m_fpacked[i].r * m_fpacked[i].r + + m_fpacked[i].i * m_fpacked[i].i); + } + + for (int i = 0; i <= hs; ++i) { + phaseOut[i] = atan2f(m_fpacked[i].i, m_fpacked[i].r); + } + } + + void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) { + + kiss_fftr(m_fplanf, realIn, m_fpacked); + + const int hs = m_size/2; + + for (int i = 0; i <= hs; ++i) { + magOut[i] = sqrtf(m_fpacked[i].r * m_fpacked[i].r + + m_fpacked[i].i * m_fpacked[i].i); + } + } + + void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) { + + packDouble(realIn, imagIn); + + kiss_fftri(m_fplani, m_fpacked, m_fbuf); + + for (int i = 0; i < m_size; ++i) { + realOut[i] = m_fbuf[i]; + } + } + + void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) { + + v_convert((float *)m_fpacked, complexIn, m_size + 2); + + kiss_fftri(m_fplani, m_fpacked, m_fbuf); + + for (int i = 0; i < m_size; ++i) { + realOut[i] = m_fbuf[i]; + } + } + + void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) { + + const int hs = m_size/2; + + for (int i = 0; i <= hs; ++i) { + m_fpacked[i].r = float(magIn[i] * cos(phaseIn[i])); + m_fpacked[i].i = float(magIn[i] * sin(phaseIn[i])); + } + + kiss_fftri(m_fplani, m_fpacked, m_fbuf); + + for (int i = 0; i < m_size; ++i) { + realOut[i] = m_fbuf[i]; + } + } + + void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) { + + const int hs = m_size/2; + + for (int i = 0; i <= hs; ++i) { + m_fpacked[i].r = float(log(magIn[i] + 0.000001)); + m_fpacked[i].i = 0.0f; + } + + kiss_fftri(m_fplani, m_fpacked, m_fbuf); + + for (int i = 0; i < m_size; ++i) { + cepOut[i] = m_fbuf[i]; + } + } + + void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) { + + packFloat(realIn, imagIn); + kiss_fftri(m_fplani, m_fpacked, realOut); + } + + void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) { + + v_copy((float *)m_fpacked, complexIn, m_size + 2); + kiss_fftri(m_fplani, m_fpacked, realOut); + } + + void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) { + + const int hs = m_size/2; + + for (int i = 0; i <= hs; ++i) { + m_fpacked[i].r = magIn[i] * cosf(phaseIn[i]); + m_fpacked[i].i = magIn[i] * sinf(phaseIn[i]); + } + + kiss_fftri(m_fplani, m_fpacked, realOut); + } + + void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) { + + const int hs = m_size/2; + + for (int i = 0; i <= hs; ++i) { + m_fpacked[i].r = logf(magIn[i] + 0.000001f); + m_fpacked[i].i = 0.0f; + } + + kiss_fftri(m_fplani, m_fpacked, cepOut); + } + +private: + const int m_size; + kiss_fftr_cfg m_fplanf; + kiss_fftr_cfg m_fplani; + kiss_fft_scalar *m_fbuf; + kiss_fft_cpx *m_fpacked; +}; + +#endif /* HAVE_KISSFFT */ + +#ifdef USE_BUILTIN_FFT + +class D_Cross : public 
FFTImpl +{ +public: + D_Cross(int size) : m_size(size), m_table(0) { + + m_a = new double[size]; + m_b = new double[size]; + m_c = new double[size]; + m_d = new double[size]; + + m_table = new int[m_size]; + + int bits; + int i, j, k, m; + + for (i = 0; ; ++i) { + if (m_size & (1 << i)) { + bits = i; + break; + } + } + + for (i = 0; i < m_size; ++i) { + + m = i; + + for (j = k = 0; j < bits; ++j) { + k = (k << 1) | (m & 1); + m >>= 1; + } + + m_table[i] = k; + } + } + + ~D_Cross() { + delete[] m_table; + delete[] m_a; + delete[] m_b; + delete[] m_c; + delete[] m_d; + } + + FFT::Precisions + getSupportedPrecisions() const { + return FFT::DoublePrecision; + } + + void initFloat() { } + void initDouble() { } + + void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) { + basefft(false, realIn, 0, m_c, m_d); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) realOut[i] = m_c[i]; + if (imagOut) { + for (int i = 0; i <= hs; ++i) imagOut[i] = m_d[i]; + } + } + + void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) { + basefft(false, realIn, 0, m_c, m_d); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) complexOut[i*2] = m_c[i]; + for (int i = 0; i <= hs; ++i) complexOut[i*2+1] = m_d[i]; + } + + void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) { + basefft(false, realIn, 0, m_c, m_d); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + magOut[i] = sqrt(m_c[i] * m_c[i] + m_d[i] * m_d[i]); + phaseOut[i] = atan2(m_d[i], m_c[i]) ; + } + } + + void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) { + basefft(false, realIn, 0, m_c, m_d); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + magOut[i] = sqrt(m_c[i] * m_c[i] + m_d[i] * m_d[i]); + } + } + + void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) { + for (int i = 0; i < m_size; ++i) m_a[i] = realIn[i]; + basefft(false, m_a, 0, m_c, m_d); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) realOut[i] = m_c[i]; + if (imagOut) { + for (int i = 0; i <= hs; ++i) imagOut[i] = m_d[i]; + } + } + + void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) { + for (int i = 0; i < m_size; ++i) m_a[i] = realIn[i]; + basefft(false, m_a, 0, m_c, m_d); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) complexOut[i*2] = m_c[i]; + for (int i = 0; i <= hs; ++i) complexOut[i*2+1] = m_d[i]; + } + + void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) { + for (int i = 0; i < m_size; ++i) m_a[i] = realIn[i]; + basefft(false, m_a, 0, m_c, m_d); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + magOut[i] = sqrt(m_c[i] * m_c[i] + m_d[i] * m_d[i]); + phaseOut[i] = atan2(m_d[i], m_c[i]) ; + } + } + + void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) { + for (int i = 0; i < m_size; ++i) m_a[i] = realIn[i]; + basefft(false, m_a, 0, m_c, m_d); + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + magOut[i] = sqrt(m_c[i] * m_c[i] + m_d[i] * m_d[i]); + } + } + + void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) { + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + double real = realIn[i]; + double imag = imagIn[i]; + m_a[i] = real; + m_b[i] = imag; + if (i > 0) { + m_a[m_size-i] = real; + m_b[m_size-i] = -imag; + } + } + basefft(true, m_a, m_b, realOut, m_d); + } + + void inverseInterleaved(const double 
*BQ_R__ complexIn, double *BQ_R__ realOut) { + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + double real = complexIn[i*2]; + double imag = complexIn[i*2+1]; + m_a[i] = real; + m_b[i] = imag; + if (i > 0) { + m_a[m_size-i] = real; + m_b[m_size-i] = -imag; + } + } + basefft(true, m_a, m_b, realOut, m_d); + } + + void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) { + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + double real = magIn[i] * cos(phaseIn[i]); + double imag = magIn[i] * sin(phaseIn[i]); + m_a[i] = real; + m_b[i] = imag; + if (i > 0) { + m_a[m_size-i] = real; + m_b[m_size-i] = -imag; + } + } + basefft(true, m_a, m_b, realOut, m_d); + } + + void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) { + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + double real = log(magIn[i] + 0.000001); + m_a[i] = real; + m_b[i] = 0.0; + if (i > 0) { + m_a[m_size-i] = real; + m_b[m_size-i] = 0.0; + } + } + basefft(true, m_a, m_b, cepOut, m_d); + } + + void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) { + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + float real = realIn[i]; + float imag = imagIn[i]; + m_a[i] = real; + m_b[i] = imag; + if (i > 0) { + m_a[m_size-i] = real; + m_b[m_size-i] = -imag; + } + } + basefft(true, m_a, m_b, m_c, m_d); + for (int i = 0; i < m_size; ++i) realOut[i] = m_c[i]; + } + + void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) { + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + float real = complexIn[i*2]; + float imag = complexIn[i*2+1]; + m_a[i] = real; + m_b[i] = imag; + if (i > 0) { + m_a[m_size-i] = real; + m_b[m_size-i] = -imag; + } + } + basefft(true, m_a, m_b, m_c, m_d); + for (int i = 0; i < m_size; ++i) realOut[i] = m_c[i]; + } + + void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) { + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + float real = magIn[i] * cosf(phaseIn[i]); + float imag = magIn[i] * sinf(phaseIn[i]); + m_a[i] = real; + m_b[i] = imag; + if (i > 0) { + m_a[m_size-i] = real; + m_b[m_size-i] = -imag; + } + } + basefft(true, m_a, m_b, m_c, m_d); + for (int i = 0; i < m_size; ++i) realOut[i] = m_c[i]; + } + + void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) { + const int hs = m_size/2; + for (int i = 0; i <= hs; ++i) { + float real = logf(magIn[i] + 0.000001); + m_a[i] = real; + m_b[i] = 0.0; + if (i > 0) { + m_a[m_size-i] = real; + m_b[m_size-i] = 0.0; + } + } + basefft(true, m_a, m_b, m_c, m_d); + for (int i = 0; i < m_size; ++i) cepOut[i] = m_c[i]; + } + +private: + const int m_size; + int *m_table; + double *m_a; + double *m_b; + double *m_c; + double *m_d; + void basefft(bool inverse, const double *BQ_R__ ri, const double *BQ_R__ ii, double *BQ_R__ ro, double *BQ_R__ io); +}; + +void +D_Cross::basefft(bool inverse, const double *BQ_R__ ri, const double *BQ_R__ ii, double *BQ_R__ ro, double *BQ_R__ io) +{ + if (!ri || !ro || !io) return; + + int i, j, k, m; + int blockSize, blockEnd; + + double tr, ti; + + double angle = 2.0 * M_PI; + if (inverse) angle = -angle; + + const int n = m_size; + + if (ii) { + for (i = 0; i < n; ++i) { + ro[m_table[i]] = ri[i]; + } + for (i = 0; i < n; ++i) { + io[m_table[i]] = ii[i]; + } + } else { + for (i = 0; i < n; ++i) { + ro[m_table[i]] = ri[i]; + } + for (i = 0; i < n; ++i) { + io[m_table[i]] = 0.0; + } + } + + blockEnd = 1; + + for 
(blockSize = 2; blockSize <= n; blockSize <<= 1) { + + double delta = angle / (double)blockSize; + double sm2 = -sin(-2 * delta); + double sm1 = -sin(-delta); + double cm2 = cos(-2 * delta); + double cm1 = cos(-delta); + double w = 2 * cm1; + double ar[3], ai[3]; + + for (i = 0; i < n; i += blockSize) { + + ar[2] = cm2; + ar[1] = cm1; + + ai[2] = sm2; + ai[1] = sm1; + + for (j = i, m = 0; m < blockEnd; j++, m++) { + + ar[0] = w * ar[1] - ar[2]; + ar[2] = ar[1]; + ar[1] = ar[0]; + + ai[0] = w * ai[1] - ai[2]; + ai[2] = ai[1]; + ai[1] = ai[0]; + + k = j + blockEnd; + tr = ar[0] * ro[k] - ai[0] * io[k]; + ti = ar[0] * io[k] + ai[0] * ro[k]; + + ro[k] = ro[j] - tr; + io[k] = io[j] - ti; + + ro[j] += tr; + io[j] += ti; + } + } + + blockEnd = blockSize; + } + +/* fftw doesn't rescale, so nor will we + + if (inverse) { + + double denom = (double)n; + + for (i = 0; i < n; i++) { + ro[i] /= denom; + io[i] /= denom; + } + } +*/ +} + +#endif /* USE_BUILTIN_FFT */ + +} /* end namespace FFTs */ + +std::string +FFT::m_implementation; + +std::set +FFT::getImplementations() +{ + std::set impls; +#ifdef HAVE_IPP + impls.insert("ipp"); +#endif +#ifdef HAVE_FFTW3 + impls.insert("fftw"); +#endif +#ifdef HAVE_KISSFFT + impls.insert("kissfft"); +#endif +#ifdef HAVE_VDSP + impls.insert("vdsp"); +#endif +#ifdef HAVE_MEDIALIB + impls.insert("medialib"); +#endif +#ifdef HAVE_OPENMAX + impls.insert("openmax"); +#endif +#ifdef HAVE_SFFT + impls.insert("sfft"); +#endif +#ifdef USE_BUILTIN_FFT + impls.insert("cross"); +#endif + return impls; +} + +void +FFT::pickDefaultImplementation() +{ + if (m_implementation != "") return; + + std::set impls = getImplementations(); + + std::string best = "cross"; + if (impls.find("kissfft") != impls.end()) best = "kissfft"; + if (impls.find("medialib") != impls.end()) best = "medialib"; + if (impls.find("openmax") != impls.end()) best = "openmax"; + if (impls.find("sfft") != impls.end()) best = "sfft"; + if (impls.find("fftw") != impls.end()) best = "fftw"; + if (impls.find("vdsp") != impls.end()) best = "vdsp"; + if (impls.find("ipp") != impls.end()) best = "ipp"; + + m_implementation = best; +} + +std::string +FFT::getDefaultImplementation() +{ + return m_implementation; +} + +void +FFT::setDefaultImplementation(std::string i) +{ + m_implementation = i; +} + +FFT::FFT(int size, int debugLevel) : + d(0) +{ + if ((size < 2) || + (size & (size-1))) { + std::cerr << "FFT::FFT(" << size << "): power-of-two sizes only supported, minimum size 2" << std::endl; +#ifndef NO_EXCEPTIONS + throw InvalidSize; +#else + abort(); +#endif + } + + if (m_implementation == "") pickDefaultImplementation(); + std::string impl = m_implementation; + + if (debugLevel > 0) { + std::cerr << "FFT::FFT(" << size << "): using implementation: " + << impl << std::endl; + } + + if (impl == "ipp") { +#ifdef HAVE_IPP + d = new FFTs::D_IPP(size); +#endif + } else if (impl == "fftw") { +#ifdef HAVE_FFTW3 + d = new FFTs::D_FFTW(size); +#endif + } else if (impl == "kissfft") { +#ifdef HAVE_KISSFFT + d = new FFTs::D_KISSFFT(size); +#endif + } else if (impl == "vdsp") { +#ifdef HAVE_VDSP + d = new FFTs::D_VDSP(size); +#endif + } else if (impl == "medialib") { +#ifdef HAVE_MEDIALIB + d = new FFTs::D_MEDIALIB(size); +#endif + } else if (impl == "openmax") { +#ifdef HAVE_OPENMAX + d = new FFTs::D_OPENMAX(size); +#endif + } else if (impl == "sfft") { +#ifdef HAVE_SFFT + d = new FFTs::D_SFFT(size); +#endif + } else if (impl == "cross") { +#ifdef USE_BUILTIN_FFT + d = new FFTs::D_Cross(size); +#endif + } + + if (!d) { + 
std::cerr << "FFT::FFT(" << size << "): ERROR: implementation " + << impl << " is not compiled in" << std::endl; +#ifndef NO_EXCEPTIONS + throw InvalidImplementation; +#else + abort(); +#endif + } +} + +FFT::~FFT() +{ + delete d; +} + +#ifndef NO_EXCEPTIONS +#define CHECK_NOT_NULL(x) \ + if (!(x)) { \ + std::cerr << "FFT: ERROR: Null argument " #x << std::endl; \ + throw NullArgument; \ + } +#else +#define CHECK_NOT_NULL(x) \ + if (!(x)) { \ + std::cerr << "FFT: ERROR: Null argument " #x << std::endl; \ + std::cerr << "FFT: Would be throwing NullArgument here, if exceptions were not disabled" << std::endl; \ + return; \ + } +#endif + +void +FFT::forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) +{ + CHECK_NOT_NULL(realIn); + CHECK_NOT_NULL(realOut); + CHECK_NOT_NULL(imagOut); + d->forward(realIn, realOut, imagOut); +} + +void +FFT::forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) +{ + CHECK_NOT_NULL(realIn); + CHECK_NOT_NULL(complexOut); + d->forwardInterleaved(realIn, complexOut); +} + +void +FFT::forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) +{ + CHECK_NOT_NULL(realIn); + CHECK_NOT_NULL(magOut); + CHECK_NOT_NULL(phaseOut); + d->forwardPolar(realIn, magOut, phaseOut); +} + +void +FFT::forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) +{ + CHECK_NOT_NULL(realIn); + CHECK_NOT_NULL(magOut); + d->forwardMagnitude(realIn, magOut); +} + +void +FFT::forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) +{ + CHECK_NOT_NULL(realIn); + CHECK_NOT_NULL(realOut); + CHECK_NOT_NULL(imagOut); + d->forward(realIn, realOut, imagOut); +} + +void +FFT::forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) +{ + CHECK_NOT_NULL(realIn); + CHECK_NOT_NULL(complexOut); + d->forwardInterleaved(realIn, complexOut); +} + +void +FFT::forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) +{ + CHECK_NOT_NULL(realIn); + CHECK_NOT_NULL(magOut); + CHECK_NOT_NULL(phaseOut); + d->forwardPolar(realIn, magOut, phaseOut); +} + +void +FFT::forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) +{ + CHECK_NOT_NULL(realIn); + CHECK_NOT_NULL(magOut); + d->forwardMagnitude(realIn, magOut); +} + +void +FFT::inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) +{ + CHECK_NOT_NULL(realIn); + CHECK_NOT_NULL(imagIn); + CHECK_NOT_NULL(realOut); + d->inverse(realIn, imagIn, realOut); +} + +void +FFT::inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) +{ + CHECK_NOT_NULL(complexIn); + CHECK_NOT_NULL(realOut); + d->inverseInterleaved(complexIn, realOut); +} + +void +FFT::inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) +{ + CHECK_NOT_NULL(magIn); + CHECK_NOT_NULL(phaseIn); + CHECK_NOT_NULL(realOut); + d->inversePolar(magIn, phaseIn, realOut); +} + +void +FFT::inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) +{ + CHECK_NOT_NULL(magIn); + CHECK_NOT_NULL(cepOut); + d->inverseCepstral(magIn, cepOut); +} + +void +FFT::inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) +{ + CHECK_NOT_NULL(realIn); + CHECK_NOT_NULL(imagIn); + CHECK_NOT_NULL(realOut); + d->inverse(realIn, imagIn, realOut); +} + +void +FFT::inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) +{ + CHECK_NOT_NULL(complexIn); + CHECK_NOT_NULL(realOut); + d->inverseInterleaved(complexIn, realOut); 
+} + +void +FFT::inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) +{ + CHECK_NOT_NULL(magIn); + CHECK_NOT_NULL(phaseIn); + CHECK_NOT_NULL(realOut); + d->inversePolar(magIn, phaseIn, realOut); +} + +void +FFT::inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) +{ + CHECK_NOT_NULL(magIn); + CHECK_NOT_NULL(cepOut); + d->inverseCepstral(magIn, cepOut); +} + +void +FFT::initFloat() +{ + d->initFloat(); +} + +void +FFT::initDouble() +{ + d->initDouble(); +} + +FFT::Precisions +FFT::getSupportedPrecisions() const +{ + return d->getSupportedPrecisions(); +} + +#ifdef FFT_MEASUREMENT + +std::string +FFT::tune() +{ + std::ostringstream os; + os << "FFT::tune()..." << std::endl; + + std::vector sizes; + std::map candidates; + std::map wins; + + sizes.push_back(512); + sizes.push_back(1024); + sizes.push_back(4096); + + for (unsigned int si = 0; si < sizes.size(); ++si) { + + int size = sizes[si]; + + while (!candidates.empty()) { + delete candidates.begin()->first; + candidates.erase(candidates.begin()); + } + + FFTImpl *d; + +#ifdef HAVE_IPP + std::cerr << "Constructing new IPP FFT object for size " << size << "..." << std::endl; + d = new FFTs::D_IPP(size); + d->initFloat(); + d->initDouble(); + candidates[d] = 0; +#endif + +#ifdef HAVE_FFTW3 + os << "Constructing new FFTW3 FFT object for size " << size << "..." << std::endl; + d = new FFTs::D_FFTW(size); + d->initFloat(); + d->initDouble(); + candidates[d] = 1; +#endif + +#ifdef HAVE_KISSFFT + os << "Constructing new KISSFFT object for size " << size << "..." << std::endl; + d = new FFTs::D_KISSFFT(size); + d->initFloat(); + d->initDouble(); + candidates[d] = 2; +#endif + +#ifdef USE_BUILTIN_FFT + os << "Constructing new Cross FFT object for size " << size << "..." << std::endl; + d = new FFTs::D_Cross(size); + d->initFloat(); + d->initDouble(); + candidates[d] = 3; +#endif + +#ifdef HAVE_VDSP + os << "Constructing new vDSP FFT object for size " << size << "..." << std::endl; + d = new FFTs::D_VDSP(size); + d->initFloat(); + d->initDouble(); + candidates[d] = 4; +#endif + +#ifdef HAVE_MEDIALIB + std::cerr << "Constructing new MediaLib FFT object for size " << size << "..." << std::endl; + d = new FFTs::D_MEDIALIB(size); + d->initFloat(); + d->initDouble(); + candidates[d] = 5; +#endif + +#ifdef HAVE_OPENMAX + os << "Constructing new OpenMAX FFT object for size " << size << "..." << std::endl; + d = new FFTs::D_OPENMAX(size); + d->initFloat(); + d->initDouble(); + candidates[d] = 6; +#endif + +#ifdef HAVE_SFFT + os << "Constructing new SFFT FFT object for size " << size << "..." << std::endl; + d = new FFTs::D_SFFT(size); +// d->initFloat(); + d->initDouble(); + candidates[d] = 6; +#endif + + os << "CLOCKS_PER_SEC = " << CLOCKS_PER_SEC << std::endl; + float divisor = float(CLOCKS_PER_SEC) / 1000.f; + + os << "Timing order is: "; + for (std::map::iterator ci = candidates.begin(); + ci != candidates.end(); ++ci) { + os << ci->second << " "; + } + os << std::endl; + + int iterations = 500; + os << "Iterations: " << iterations << std::endl; + + double *da = new double[size]; + double *db = new double[size]; + double *dc = new double[size]; + double *dd = new double[size]; + double *di = new double[size + 2]; + double *dj = new double[size + 2]; + + float *fa = new float[size]; + float *fb = new float[size]; + float *fc = new float[size]; + float *fd = new float[size]; + float *fi = new float[size + 2]; + float *fj = new float[size + 2]; + + for (int type = 0; type < 16; ++type) { + + //!!! 
+ if ((type > 3 && type < 8) || + (type > 11)) { + continue; + } + + if (type > 7) { + // inverse transform: bigger inputs, to simulate the + // fact that the forward transform is unscaled + for (int i = 0; i < size; ++i) { + da[i] = drand48() * size; + fa[i] = da[i]; + db[i] = drand48() * size; + fb[i] = db[i]; + } + } else { + for (int i = 0; i < size; ++i) { + da[i] = drand48(); + fa[i] = da[i]; + db[i] = drand48(); + fb[i] = db[i]; + } + } + + for (int i = 0; i < size + 2; ++i) { + di[i] = drand48(); + fi[i] = di[i]; + } + + int low = -1; + int lowscore = 0; + + const char *names[] = { + + "Forward Cartesian Double", + "Forward Interleaved Double", + "Forward Polar Double", + "Forward Magnitude Double", + "Forward Cartesian Float", + "Forward Interleaved Float", + "Forward Polar Float", + "Forward Magnitude Float", + + "Inverse Cartesian Double", + "Inverse Interleaved Double", + "Inverse Polar Double", + "Inverse Cepstral Double", + "Inverse Cartesian Float", + "Inverse Interleaved Float", + "Inverse Polar Float", + "Inverse Cepstral Float" + }; + os << names[type] << " :: "; + + for (std::map::iterator ci = candidates.begin(); + ci != candidates.end(); ++ci) { + + FFTImpl *d = ci->first; + + double mean = 0; + + clock_t start = clock(); + + for (int i = 0; i < iterations; ++i) { + + if (i == 0) { + for (int j = 0; j < size; ++j) { + dc[j] = 0; + dd[j] = 0; + fc[j] = 0; + fd[j] = 0; + fj[j] = 0; + dj[j] = 0; + } + } + + switch (type) { + case 0: d->forward(da, dc, dd); break; + case 1: d->forwardInterleaved(da, dj); break; + case 2: d->forwardPolar(da, dc, dd); break; + case 3: d->forwardMagnitude(da, dc); break; + case 4: d->forward(fa, fc, fd); break; + case 5: d->forwardInterleaved(fa, fj); break; + case 6: d->forwardPolar(fa, fc, fd); break; + case 7: d->forwardMagnitude(fa, fc); break; + case 8: d->inverse(da, db, dc); break; + case 9: d->inverseInterleaved(di, dc); break; + case 10: d->inversePolar(da, db, dc); break; + case 11: d->inverseCepstral(da, dc); break; + case 12: d->inverse(fa, fb, fc); break; + case 13: d->inverseInterleaved(fi, fc); break; + case 14: d->inversePolar(fa, fb, fc); break; + case 15: d->inverseCepstral(fa, fc); break; + } + + if (i == 0) { + mean = 0; + for (int j = 0; j < size; ++j) { + mean += dc[j]; + mean += dd[j]; + mean += fc[j]; + mean += fd[j]; + mean += fj[j]; + mean += dj[j]; + } + mean /= size * 6; + } + } + + clock_t end = clock(); + + os << float(end - start)/divisor << " (" << mean << ") "; + + if (low == -1 || (end - start) < lowscore) { + low = ci->second; + lowscore = end - start; + } + } + + os << std::endl; + + os << " size " << size << ", type " << type << ": fastest is " << low << " (time " << float(lowscore)/divisor << ")" << std::endl; + + wins[low]++; + } + + delete[] fa; + delete[] fb; + delete[] fc; + delete[] fd; + delete[] da; + delete[] db; + delete[] dc; + delete[] dd; + } + + while (!candidates.empty()) { + delete candidates.begin()->first; + candidates.erase(candidates.begin()); + } + + int bestscore = 0; + int best = -1; + + for (std::map::iterator wi = wins.begin(); wi != wins.end(); ++wi) { + if (best == -1 || wi->second > bestscore) { + best = wi->first; + bestscore = wi->second; + } + } + + os << "overall winner is " << best << " with " << bestscore << " wins" << std::endl; + + return os.str(); +} + +#endif + +} diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqfft/test/Compares.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqfft/test/Compares.h Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,77 @@ +/* 
-*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + bqfft + + A small library wrapping various FFT implementations for some + common audio processing use cases. + + Copyright 2007-2015 Particular Programs Ltd. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. +*/ + +#ifndef TEST_COMPARES_H +#define TEST_COMPARES_H + +// These macros are used for comparing generated results, and they +// aren't always going to be exact. Adding 0.1 to each value gives +// us a little more fuzz in qFuzzyCompare (which ultimately does +// the comparison). + +#define COMPARE_ZERO(a) \ + QCOMPARE(a + 0.1, 0.1) + +#define COMPARE_ZERO_F(a) \ + QCOMPARE(a + 0.1f, 0.1f) + +#define COMPARE_FUZZIER(a, b) \ + QCOMPARE(a + 0.1, b + 0.1) + +#define COMPARE_FUZZIER_F(a, b) \ + QCOMPARE(a + 0.1f, b + 0.1f) + +#define COMPARE_ALL(a, n) \ + for (int cmp_i = 0; cmp_i < (int)(sizeof(a)/sizeof(a[0])); ++cmp_i) { \ + COMPARE_FUZZIER(a[cmp_i], n); \ + } + +#define COMPARE_SCALED(a, b, s) \ + for (int cmp_i = 0; cmp_i < (int)(sizeof(a)/sizeof(a[0])); ++cmp_i) { \ + COMPARE_FUZZIER(a[cmp_i] / s, b[cmp_i]); \ + } + +#define COMPARE_ALL_F(a, n) \ + for (int cmp_i = 0; cmp_i < (int)(sizeof(a)/sizeof(a[0])); ++cmp_i) { \ + COMPARE_FUZZIER_F(a[cmp_i], n); \ + } + +#define COMPARE_SCALED_F(a, b, s) \ + for (int cmp_i = 0; cmp_i < (int)(sizeof(a)/sizeof(a[0])); ++cmp_i) { \ + COMPARE_FUZZIER_F(a[cmp_i] / s, b[cmp_i]); \ + } + +#endif + diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqfft/test/TestFFT.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqfft/test/TestFFT.h Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,510 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + bqfft + + A small library wrapping various FFT implementations for some + common audio processing use cases. + + Copyright 2007-2015 Particular Programs Ltd. 
+ + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. +*/ + +#ifndef TEST_FFT_H +#define TEST_FFT_H + +#include "bqfft/FFT.h" + +#include +#include + +#include + +#include "Compares.h" + +namespace breakfastquay { + +class TestFFT : public QObject +{ + Q_OBJECT + +private: + void idat() { + QTest::addColumn("implementation"); + std::set impls = FFT::getImplementations(); + foreach (std::string i, impls) { + QTest::newRow(i.c_str()) << i.c_str(); + } + } + QString ifetch() { + QFETCH(QString, implementation); + FFT::setDefaultImplementation(implementation.toLocal8Bit().data()); + return implementation; + } + + bool lackSingle() { + return !(FFT(4).getSupportedPrecisions() & FFT::SinglePrecision); + } + bool lackDouble() { + return !(FFT(4).getSupportedPrecisions() & FFT::DoublePrecision); + } + +private slots: + + void checkD() { + QString impl = ifetch(); + } + + void dc() { + ifetch(); + // DC-only signal. The DC bin is purely real + double in[] = { 1, 1, 1, 1 }; + double re[3], im[3]; + FFT(4).forward(in, re, im); + QCOMPARE(re[0], 4.0); + COMPARE_ZERO(re[1]); + COMPARE_ZERO(re[2]); + COMPARE_ALL(im, 0.0); + double back[4]; + FFT(4).inverse(re, im, back); + COMPARE_SCALED(back, in, 4); + } + + void sine() { + ifetch(); + // Sine. Output is purely imaginary + double in[] = { 0, 1, 0, -1 }; + double re[3], im[3]; + FFT(4).forward(in, re, im); + COMPARE_ALL(re, 0.0); + COMPARE_ZERO(im[0]); + QCOMPARE(im[1], -2.0); + COMPARE_ZERO(im[2]); + double back[4]; + FFT(4).inverse(re, im, back); + COMPARE_SCALED(back, in, 4); + } + + void cosine() { + ifetch(); + // Cosine. 
Output is purely real + double in[] = { 1, 0, -1, 0 }; + double re[3], im[3]; + FFT(4).forward(in, re, im); + COMPARE_ZERO(re[0]); + QCOMPARE(re[1], 2.0); + COMPARE_ZERO(re[2]); + COMPARE_ALL(im, 0.0); + double back[4]; + FFT(4).inverse(re, im, back); + COMPARE_SCALED(back, in, 4); + } + + void sineCosine() { + ifetch(); + // Sine and cosine mixed + double in[] = { 0.5, 1, -0.5, -1 }; + double re[3], im[3]; + FFT(4).forward(in, re, im); + COMPARE_ZERO(re[0]); + QCOMPARE(re[1], 1.0); + COMPARE_ZERO(re[2]); + COMPARE_ZERO(im[0]); + QCOMPARE(im[1], -2.0); + COMPARE_ZERO(im[2]); + double back[4]; + FFT(4).inverse(re, im, back); + COMPARE_SCALED(back, in, 4); + } + + void nyquist() { + ifetch(); + double in[] = { 1, -1, 1, -1 }; + double re[3], im[3]; + FFT(4).forward(in, re, im); + COMPARE_ZERO(re[0]); + COMPARE_ZERO(re[1]); + QCOMPARE(re[2], 4.0); + COMPARE_ALL(im, 0.0); + double back[4]; + FFT(4).inverse(re, im, back); + COMPARE_SCALED(back, in, 4); + } + + void interleaved() { + ifetch(); + // Sine and cosine mixed, test output format + double in[] = { 0.5, 1, -0.5, -1 }; + double out[6]; + FFT(4).forwardInterleaved(in, out); + COMPARE_ZERO(out[0]); + COMPARE_ZERO(out[1]); + QCOMPARE(out[2], 1.0); + QCOMPARE(out[3], -2.0); + COMPARE_ZERO(out[4]); + COMPARE_ZERO(out[5]); + double back[4]; + FFT(4).inverseInterleaved(out, back); + COMPARE_SCALED(back, in, 4); + } + + void cosinePolar() { + ifetch(); + double in[] = { 1, 0, -1, 0 }; + double mag[3], phase[3]; + FFT(4).forwardPolar(in, mag, phase); + COMPARE_ZERO(mag[0]); + QCOMPARE(mag[1], 2.0); + COMPARE_ZERO(mag[2]); + // No meaningful tests for phase[i] where mag[i]==0 (phase + // could legitimately be anything) + COMPARE_ZERO(phase[1]); + double back[4]; + FFT(4).inversePolar(mag, phase, back); + COMPARE_SCALED(back, in, 4); + } + + void sinePolar() { + ifetch(); + double in[] = { 0, 1, 0, -1 }; + double mag[3], phase[3]; + FFT(4).forwardPolar(in, mag, phase); + COMPARE_ZERO(mag[0]); + QCOMPARE(mag[1], 2.0); + COMPARE_ZERO(mag[2]); + // No meaningful tests for phase[i] where mag[i]==0 (phase + // could legitimately be anything) + QCOMPARE(phase[1], -M_PI/2.0); + double back[4]; + FFT(4).inversePolar(mag, phase, back); + COMPARE_SCALED(back, in, 4); + } + + void magnitude() { + ifetch(); + // Sine and cosine mixed + double in[] = { 0.5, 1, -0.5, -1 }; + double out[3]; + FFT(4).forwardMagnitude(in, out); + COMPARE_ZERO(out[0]); + QCOMPARE(float(out[1]), sqrtf(5.0)); + COMPARE_ZERO(out[2]); + } + + void dirac() { + ifetch(); + double in[] = { 1, 0, 0, 0 }; + double re[3], im[3]; + FFT(4).forward(in, re, im); + QCOMPARE(re[0], 1.0); + QCOMPARE(re[1], 1.0); + QCOMPARE(re[2], 1.0); + COMPARE_ALL(im, 0.0); + double back[4]; + FFT(4).inverse(re, im, back); + COMPARE_SCALED(back, in, 4); + } + + void cepstrum() { + ifetch(); + double in[] = { 1, 0, 0, 0, 1, 0, 0, 0 }; + double mag[5]; + FFT(8).forwardMagnitude(in, mag); + double cep[8]; + FFT(8).inverseCepstral(mag, cep); + COMPARE_ZERO(cep[1]); + COMPARE_ZERO(cep[2]); + COMPARE_ZERO(cep[3]); + COMPARE_ZERO(cep[5]); + COMPARE_ZERO(cep[6]); + COMPARE_ZERO(cep[7]); + QVERIFY(fabs(-6.561181 - cep[0]/8) < 0.000001); + QVERIFY(fabs( 7.254329 - cep[4]/8) < 0.000001); + } + + void forwardArrayBounds() { + ifetch(); + // initialise bins to something recognisable, so we can tell + // if they haven't been written + double in[] = { 1, 1, -1, -1 }; + double re[] = { 999, 999, 999, 999, 999 }; + double im[] = { 999, 999, 999, 999, 999 }; + FFT(4).forward(in, re+1, im+1); + // And check we haven't overrun the 
arrays + QCOMPARE(re[0], 999.0); + QCOMPARE(im[0], 999.0); + QCOMPARE(re[4], 999.0); + QCOMPARE(im[4], 999.0); + } + + void inverseArrayBounds() { + ifetch(); + // initialise bins to something recognisable, so we can tell + // if they haven't been written + double re[] = { 0, 1, 0 }; + double im[] = { 0, -2, 0 }; + double out[] = { 999, 999, 999, 999, 999, 999 }; + FFT(4).inverse(re, im, out+1); + // And check we haven't overrun the arrays + QCOMPARE(out[0], 999.0); + QCOMPARE(out[5], 999.0); + } + + void checkF() { + QString impl = ifetch(); + } + + void dcF() { + ifetch(); + // DC-only signal. The DC bin is purely real + float in[] = { 1, 1, 1, 1 }; + float re[3], im[3]; + FFT(4).forward(in, re, im); + QCOMPARE(re[0], 4.0f); + COMPARE_ZERO_F(re[1]); + COMPARE_ZERO_F(re[2]); + COMPARE_ALL_F(im, 0.0f); + float back[4]; + FFT(4).inverse(re, im, back); + COMPARE_SCALED_F(back, in, 4); + } + + void sineF() { + ifetch(); + // Sine. Output is purely imaginary + float in[] = { 0, 1, 0, -1 }; + float re[3], im[3]; + FFT(4).forward(in, re, im); + COMPARE_ALL_F(re, 0.0f); + COMPARE_ZERO_F(im[0]); + QCOMPARE(im[1], -2.0f); + COMPARE_ZERO_F(im[2]); + float back[4]; + FFT(4).inverse(re, im, back); + COMPARE_SCALED_F(back, in, 4); + } + + void cosineF() { + ifetch(); + // Cosine. Output is purely real + float in[] = { 1, 0, -1, 0 }; + float re[3], im[3]; + FFT(4).forward(in, re, im); + COMPARE_ZERO_F(re[0]); + QCOMPARE(re[1], 2.0f); + COMPARE_ZERO_F(re[2]); + COMPARE_ALL_F(im, 0.0f); + float back[4]; + FFT(4).inverse(re, im, back); + COMPARE_SCALED_F(back, in, 4); + } + + void sineCosineF() { + ifetch(); + // Sine and cosine mixed + float in[] = { 0.5, 1, -0.5, -1 }; + float re[3], im[3]; + FFT(4).forward(in, re, im); + COMPARE_ZERO_F(re[0]); + QCOMPARE(re[1], 1.0f); + COMPARE_ZERO_F(re[2]); + COMPARE_ZERO_F(im[0]); + QCOMPARE(im[1], -2.0f); + COMPARE_ZERO_F(im[2]); + float back[4]; + FFT(4).inverse(re, im, back); + COMPARE_SCALED_F(back, in, 4); + } + + void nyquistF() { + ifetch(); + float in[] = { 1, -1, 1, -1 }; + float re[3], im[3]; + FFT(4).forward(in, re, im); + COMPARE_ZERO_F(re[0]); + COMPARE_ZERO_F(re[1]); + QCOMPARE(re[2], 4.0f); + COMPARE_ALL_F(im, 0.0f); + float back[4]; + FFT(4).inverse(re, im, back); + COMPARE_SCALED_F(back, in, 4); + } + + void interleavedF() { + ifetch(); + // Sine and cosine mixed, test output format + float in[] = { 0.5, 1, -0.5, -1 }; + float out[6]; + FFT(4).forwardInterleaved(in, out); + COMPARE_ZERO_F(out[0]); + COMPARE_ZERO_F(out[1]); + QCOMPARE(out[2], 1.0f); + QCOMPARE(out[3], -2.0f); + COMPARE_ZERO_F(out[4]); + COMPARE_ZERO_F(out[5]); + float back[4]; + FFT(4).inverseInterleaved(out, back); + COMPARE_SCALED_F(back, in, 4); + } + + void cosinePolarF() { + ifetch(); + float in[] = { 1, 0, -1, 0 }; + float mag[3], phase[3]; + FFT(4).forwardPolar(in, mag, phase); + COMPARE_ZERO_F(mag[0]); + QCOMPARE(mag[1], 2.0f); + COMPARE_ZERO_F(mag[2]); + // No meaningful tests for phase[i] where mag[i]==0 (phase + // could legitimately be anything) + COMPARE_ZERO_F(phase[1]); + float back[4]; + FFT(4).inversePolar(mag, phase, back); + COMPARE_SCALED_F(back, in, 4); + } + + void sinePolarF() { + ifetch(); + float in[] = { 0, 1, 0, -1 }; + float mag[3], phase[3]; + FFT(4).forwardPolar(in, mag, phase); + COMPARE_ZERO_F(mag[0]); + QCOMPARE(mag[1], 2.0f); + COMPARE_ZERO_F(mag[2]); + // No meaningful tests for phase[i] where mag[i]==0 (phase + // could legitimately be anything) + QCOMPARE(phase[1], -float(M_PI)/2.0f); + float back[4]; + FFT(4).inversePolar(mag, phase, back); + 
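The *Polar tests above check the magnitude/phase form of the same spectra. The following helper is hypothetical (not part of the bqfft API) and only illustrates the relationship being tested: magnitude is the Euclidean length of each bin and phase is atan2(im, re), so the pure-sine bin 0 - 2i gives magnitude 2 and phase -pi/2, exactly as sinePolar()/sinePolarF() expect. Where the magnitude is zero the phase is arbitrary, which is why those bins are deliberately left unchecked.

    #include <cmath>
    #include <cstddef>

    // Hypothetical helper: derive the polar half-spectrum from the
    // rectangular one produced by forward().
    void rect_to_polar(const double *re, const double *im,
                       double *mag, double *phase, size_t bins)
    {
        for (size_t i = 0; i < bins; ++i) {
            mag[i]   = std::sqrt(re[i] * re[i] + im[i] * im[i]);
            phase[i] = std::atan2(im[i], re[i]);  // arbitrary when mag[i] == 0
        }
    }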
COMPARE_SCALED_F(back, in, 4); + } + + void magnitudeF() { + ifetch(); + // Sine and cosine mixed + float in[] = { 0.5, 1, -0.5, -1 }; + float out[3]; + FFT(4).forwardMagnitude(in, out); + COMPARE_ZERO_F(out[0]); + QCOMPARE(float(out[1]), sqrtf(5.0f)); + COMPARE_ZERO_F(out[2]); + } + + void diracF() { + ifetch(); + float in[] = { 1, 0, 0, 0 }; + float re[3], im[3]; + FFT(4).forward(in, re, im); + QCOMPARE(re[0], 1.0f); + QCOMPARE(re[1], 1.0f); + QCOMPARE(re[2], 1.0f); + COMPARE_ALL_F(im, 0.0f); + float back[4]; + FFT(4).inverse(re, im, back); + COMPARE_SCALED_F(back, in, 4); + } + + void cepstrumF() { + ifetch(); + float in[] = { 1, 0, 0, 0, 1, 0, 0, 0 }; + float mag[5]; + FFT(8).forwardMagnitude(in, mag); + float cep[8]; + FFT(8).inverseCepstral(mag, cep); + COMPARE_ZERO_F(cep[1]); + COMPARE_ZERO_F(cep[2]); + COMPARE_ZERO_F(cep[3]); + COMPARE_ZERO_F(cep[5]); + COMPARE_ZERO_F(cep[6]); + COMPARE_ZERO_F(cep[7]); + QVERIFY(fabsf(-6.561181 - cep[0]/8) < 0.000001); + QVERIFY(fabsf( 7.254329 - cep[4]/8) < 0.000001); + } + + void forwardArrayBoundsF() { + ifetch(); + // initialise bins to something recognisable, so we can tell + // if they haven't been written + float in[] = { 1, 1, -1, -1 }; + float re[] = { 999, 999, 999, 999, 999 }; + float im[] = { 999, 999, 999, 999, 999 }; + FFT(4).forward(in, re+1, im+1); + // And check we haven't overrun the arrays + QCOMPARE(re[0], 999.0f); + QCOMPARE(im[0], 999.0f); + QCOMPARE(re[4], 999.0f); + QCOMPARE(im[4], 999.0f); + } + + void inverseArrayBoundsF() { + ifetch(); + // initialise bins to something recognisable, so we can tell + // if they haven't been written + float re[] = { 0, 1, 0 }; + float im[] = { 0, -2, 0 }; + float out[] = { 999, 999, 999, 999, 999, 999 }; + FFT(4).inverse(re, im, out+1); + // And check we haven't overrun the arrays + QCOMPARE(out[0], 999.0f); + QCOMPARE(out[5], 999.0f); + } + + void checkD_data() { idat(); } + void dc_data() { idat(); } + void sine_data() { idat(); } + void cosine_data() { idat(); } + void sineCosine_data() { idat(); } + void sineCosineDC_data() { idat(); } + void nyquist_data() { idat(); } + void interleaved_data() { idat(); } + void cosinePolar_data() { idat(); } + void sinePolar_data() { idat(); } + void magnitude_data() { idat(); } + void dirac_data() { idat(); } + void cepstrum_data() { idat(); } + void forwardArrayBounds_data() { idat(); } + void inverseArrayBounds_data() { idat(); } + + void checkF_data() { idat(); } + void dcF_data() { idat(); } + void sineF_data() { idat(); } + void cosineF_data() { idat(); } + void sineCosineF_data() { idat(); } + void sineCosineDCF_data() { idat(); } + void nyquistF_data() { idat(); } + void interleavedF_data() { idat(); } + void cosinePolarF_data() { idat(); } + void sinePolarF_data() { idat(); } + void magnitudeF_data() { idat(); } + void diracF_data() { idat(); } + void cepstrumF_data() { idat(); } + void forwardArrayBoundsF_data() { idat(); } + void inverseArrayBoundsF_data() { idat(); } +}; + +} + +#endif diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqfft/test/main.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqfft/test/main.cpp Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,27 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ +/* Copyright Chris Cannam - All Rights Reserved */ + +#include "TestFFT.h" +#include + +#include + +int main(int argc, char *argv[]) +{ + int good = 0, bad = 0; + + QCoreApplication app(argc, argv); + + breakfastquay::TestFFT tf; + if (QTest::qExec(&tf, argc, argv) == 0) ++good; + else 
++bad; + + if (bad > 0) { + std::cerr << "\n********* " << bad << " test suite(s) failed!\n" << std::endl; + return 1; + } else { + std::cerr << "All tests passed" << std::endl; + return 0; + } +} + diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqfft/test/test.pro --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqfft/test/test.pro Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,30 @@ + +TEMPLATE = app +TARGET = test-fft +win*: TARGET = "TestFFT" + +QT += testlib +QT -= gui + +DESTDIR = . +QMAKE_LIBDIR += . .. + +LIBS += -lbqfft -lfftw3 -lfftw3f + +INCLUDEPATH += . .. ../../bqvec +DEPENDPATH += . .. ../../bqvec + +HEADERS += TestFFT.h +SOURCES += main.cpp + +!win32 { + !macx* { + QMAKE_POST_LINK=valgrind $${DESTDIR}/$${TARGET} + } + macx* { + QMAKE_POST_LINK=$${DESTDIR}/$${TARGET}.app/Contents/MacOS/$${TARGET} + } +} + +win32-g++:QMAKE_POST_LINK=$${DESTDIR}$${TARGET}.exe + diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/COPYING --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqvec/COPYING Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,50 @@ + +The following terms apply to the code in the src/ directory: + + Copyright 2007-2014 Particular Programs Ltd. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. + + +The following terms apply to the code in the pommier/ directory: + + Copyright (C) 2011 Julien Pommier + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. 
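The per-implementation fan-out in TestFFT comes from Qt's data-driven test mechanism: each fooBar_data() slot (the idat() calls listed above) registers one row per available FFT implementation, and ifetch() selects that implementation before the matching test body runs. A stripped-down sketch of the same pattern follows, with hypothetical class and slot names; in a real build the usual moc step and QTEST_MAIN (or a driver like main.cpp above) supply the entry point.

    #include <QtTest>

    class MiniDataDriven : public QObject
    {
        Q_OBJECT
    private slots:
        // Registers the rows; QTest then runs doubled() once per row.
        void doubled_data() {
            QTest::addColumn<int>("input");
            QTest::addColumn<int>("expected");
            QTest::newRow("two")   << 2 << 4;
            QTest::newRow("seven") << 7 << 14;
        }
        void doubled() {
            QFETCH(int, input);
            QFETCH(int, expected);
            QCOMPARE(input * 2, expected);
        }
    };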
+ diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqvec/Makefile Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,90 @@ + +# Add to VECTOR_DEFINES the relevant options for your desired +# third-party library support. +# +# Available options are +# +# -DHAVE_IPP Intel's Integrated Performance Primitives are available +# -DHAVE_VDSP Apple's Accelerate framework is available +# +# These are optional (they affect performance, not function) and you +# may define more than one of them. +# +# Add any relevant -I flags for include paths as well. +# +# Note that you must supply the same flags when including bqvec +# headers later as you are using now when compiling the library. (You +# may find it simplest to just add the bqvec source files to your +# application's build system and not build a bqvec library at all.) + +VECTOR_DEFINES := -DHAVE_VDSP + + +# Add to ALLOCATOR_DEFINES options relating to aligned malloc. +# +# Available options are +# +# -DHAVE_POSIX_MEMALIGN The posix_memalign call is available in sys/mman.h +# -DLACK_POSIX_MEMALIGN The posix_memalign call is not available +# +# -DMALLOC_IS_ALIGNED The malloc call already returns aligned memory +# -DMALLOC_IS_NOT_ALIGNED The malloc call does not return aligned memory +# +# -DUSE_OWN_ALIGNED_MALLOC No aligned malloc is available, roll your own +# +# -DLACK_BAD_ALLOC The C++ library lacks the std::bad_alloc exception +# +# Here "aligned" is assumed to mean "aligned enough for whatever +# vector stuff the space will be used for" which most likely means +# 16-byte alignment. +# +# The default is to use _aligned_malloc when building with Visual C++, +# system malloc when building on OS/X, and posix_memalign otherwise. +# +# Note that you must supply the same flags when including bqvec +# headers later as you are using now when compiling the library. (You +# may find it simplest to just add the bqvec source files to your +# application's build system and not build a bqvec library at all.) + +ALLOCATOR_DEFINES := -DMALLOC_IS_ALIGNED + + +SRC_DIR := src +HEADER_DIR := bqvec + +SOURCES := $(wildcard $(SRC_DIR)/*.cpp) +HEADERS := $(wildcard $(HEADER_DIR)/*.h) $(wildcard $(SRC_DIR)/*.h) + +OBJECTS := $(SOURCES:.cpp=.o) +OBJECTS := $(OBJECTS:.c=.o) + +CXXFLAGS := $(VECTOR_DEFINES) $(ALLOCATOR_DEFINES) -I$(HEADER_DIR) -O3 -ffast-math -Wall -Werror -fpic + +LIBRARY := libbqvec.a + +all: $(LIBRARY) + +$(LIBRARY): $(OBJECTS) + $(AR) rc $@ $^ + +clean: + rm -f $(OBJECTS) + +distclean: clean + rm -f $(LIBRARY) + +depend: + makedepend -Y -fMakefile $(SOURCES) $(HEADERS) + + +# DO NOT DELETE + +src/VectorOpsComplex.o: bqvec/VectorOpsComplex.h bqvec/VectorOps.h +src/VectorOpsComplex.o: bqvec/Restrict.h bqvec/ComplexTypes.h +src/Allocators.o: bqvec/Allocators.h bqvec/VectorOps.h bqvec/Restrict.h +bqvec/RingBuffer.o: bqvec/Barrier.h bqvec/Allocators.h bqvec/VectorOps.h +bqvec/RingBuffer.o: bqvec/Restrict.h +bqvec/VectorOpsComplex.o: bqvec/VectorOps.h bqvec/Restrict.h +bqvec/VectorOpsComplex.o: bqvec/ComplexTypes.h +bqvec/VectorOps.o: bqvec/Restrict.h +bqvec/Allocators.o: bqvec/VectorOps.h bqvec/Restrict.h diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqvec/README.txt Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,17 @@ + +bqvec +===== + +A small library for efficient vector arithmetic and allocation in C++ +using raw C pointer arrays. 
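The ALLOCATOR_DEFINES comments above ask the builder to assert whether plain malloc already returns sufficiently aligned memory. Before setting -DMALLOC_IS_ALIGNED on a new platform, a quick probe like the following can be run; it is purely illustrative, not part of the build, and a passing run is suggestive rather than a guarantee.

    #include <cstdlib>
    #include <cstdio>
    #include <stdint.h>

    int main()
    {
        const unsigned alignment = 16;   // "aligned enough" per the comment above
        for (int i = 0; i < 8; ++i) {
            void *p = malloc(1 + i * 37);            // assorted odd sizes
            uintptr_t addr = (uintptr_t)p;
            printf("%p -> %s\n", p,
                   (addr % alignment == 0) ? "16-byte aligned" : "NOT aligned");
            free(p);
        }
        return 0;
    }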
+ +C++ standard required: C++98 (does not use C++11) + +Copyright 2007-2015 Particular Programs Ltd. + +This code originated as part of the Rubber Band Library written by the +same authors (see https://bitbucket.org/breakfastquay/rubberband/). +It has been pulled out into a separate library and relicensed under a +more permissive licence: a BSD/MIT-style licence, as opposed to the +GPL used for Rubber Band. See the file COPYING for details. + diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/bqvec/Allocators.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqvec/bqvec/Allocators.h Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,286 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + bqvec + + A small library for vector arithmetic and allocation in C++ using + raw C pointer arrays. + + Copyright 2007-2015 Particular Programs Ltd. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. +*/ + +#ifndef BQVEC_ALLOCATORS_H +#define BQVEC_ALLOCATORS_H + +/* + * Aligned and per-channel allocators and deallocators for raw C array + * buffers. + */ + +#include // for std::bad_alloc +#include + +#ifndef HAVE_POSIX_MEMALIGN +#ifndef _WIN32 +#ifndef __APPLE__ +#ifndef LACK_POSIX_MEMALIGN +#define HAVE_POSIX_MEMALIGN +#endif +#endif +#endif +#endif + +#ifndef MALLOC_IS_NOT_ALIGNED +#ifdef __APPLE_ +#ifndef MALLOC_IS_ALIGNED +#define MALLOC_IS_ALIGNED +#endif +#endif +#endif + +#ifdef HAVE_POSIX_MEMALIGN +#include +#endif + +#ifdef LACK_BAD_ALLOC +namespace std { struct bad_alloc { }; } +#endif + +namespace breakfastquay { + +template +T *allocate(size_t count) +{ + void *ptr = 0; + // 32-byte alignment is required for at least OpenMAX + static const int alignment = 32; +#ifdef USE_OWN_ALIGNED_MALLOC + // Alignment must be a power of two, bigger than the pointer + // size. Stuff the actual malloc'd pointer in just before the + // returned value. 
This is the least desirable way to do this -- + // the other options below are all better + size_t allocd = count * sizeof(T) + alignment; + void *buf = malloc(allocd); + if (buf) { + char *adj = (char *)buf; + while ((unsigned long long)adj & (alignment-1)) --adj; + ptr = ((char *)adj) + alignment; + ((void **)ptr)[-1] = buf; + } +#else /* !USE_OWN_ALIGNED_MALLOC */ +#ifdef HAVE_POSIX_MEMALIGN + if (posix_memalign(&ptr, alignment, count * sizeof(T))) { + ptr = malloc(count * sizeof(T)); + } +#else /* !HAVE_POSIX_MEMALIGN */ +#ifdef __MSVC__ + ptr = _aligned_malloc(count * sizeof(T), alignment); +#else /* !__MSVC__ */ +#ifndef MALLOC_IS_ALIGNED +#error "No aligned malloc available: define MALLOC_IS_ALIGNED to stick with system malloc, HAVE_POSIX_MEMALIGN if posix_memalign is available, or USE_OWN_ALIGNED_MALLOC to roll our own" +#endif + // Note that malloc always aligns to 16 byte boundaries on OS/X + ptr = malloc(count * sizeof(T)); + (void)alignment; // avoid compiler warning for unused +#endif /* !__MSVC__ */ +#endif /* !HAVE_POSIX_MEMALIGN */ +#endif /* !USE_OWN_ALIGNED_MALLOC */ + if (!ptr) { +#ifndef NO_EXCEPTIONS + throw(std::bad_alloc()); +#else + abort(); +#endif + } + return (T *)ptr; +} + +#ifdef HAVE_IPP + +template <> +float *allocate(size_t count); + +template <> +double *allocate(size_t count); + +#endif + +template +T *allocate_and_zero(size_t count) +{ + T *ptr = allocate(count); + for (size_t i = 0; i < count; ++i) { + ptr[i] = T(); + } + return ptr; +} + +template +void deallocate(T *ptr) +{ +#ifdef USE_OWN_ALIGNED_MALLOC + if (ptr) free(((void **)ptr)[-1]); +#else /* !USE_OWN_ALIGNED_MALLOC */ +#ifdef __MSVC__ + if (ptr) _aligned_free((void *)ptr); +#else /* !__MSVC__ */ + if (ptr) free((void *)ptr); +#endif /* !__MSVC__ */ +#endif /* !USE_OWN_ALIGNED_MALLOC */ +} + +#ifdef HAVE_IPP + +template <> +void deallocate(float *); + +template <> +void deallocate(double *); + +#endif + +/// Reallocate preserving contents but leaving additional memory uninitialised +template +T *reallocate(T *ptr, size_t oldcount, size_t count) +{ + T *newptr = allocate(count); + if (oldcount && ptr) { + for (size_t i = 0; i < oldcount && i < count; ++i) { + newptr[i] = ptr[i]; + } + } + if (ptr) deallocate(ptr); + return newptr; +} + +/// Reallocate, zeroing all contents +template +T *reallocate_and_zero(T *ptr, size_t oldcount, size_t count) +{ + ptr = reallocate(ptr, oldcount, count); + for (size_t i = 0; i < count; ++i) { + ptr[i] = T(); + } + return ptr; +} + +/// Reallocate preserving contents and zeroing any additional memory +template +T *reallocate_and_zero_extension(T *ptr, size_t oldcount, size_t count) +{ + ptr = reallocate(ptr, oldcount, count); + if (count > oldcount) { + for (size_t i = oldcount; i < count; ++i) { + ptr[i] = T(); + } + } + return ptr; +} + +template +T **allocate_channels(size_t channels, size_t count) +{ + // We don't want to use the aligned allocate for the channel + // pointers, it might even make things slower + T **ptr = new T *[channels]; + for (size_t c = 0; c < channels; ++c) { + ptr[c] = allocate(count); + } + return ptr; +} + +template +T **allocate_and_zero_channels(size_t channels, size_t count) +{ + // We don't want to use the aligned allocate for the channel + // pointers, it might even make things slower + T **ptr = new T *[channels]; + for (size_t c = 0; c < channels; ++c) { + ptr[c] = allocate_and_zero(count); + } + return ptr; +} + +template +void deallocate_channels(T **ptr, size_t channels) +{ + if (!ptr) return; + for (size_t c = 0; c < 
channels; ++c) { + deallocate(ptr[c]); + } + delete[] ptr; +} + +template +T **reallocate_channels(T **ptr, + size_t oldchannels, size_t oldcount, + size_t channels, size_t count) +{ + T **newptr = allocate_channels(channels, count); + if (oldcount && ptr) { + for (size_t c = 0; c < channels; ++c) { + for (size_t i = 0; i < oldcount && i < count; ++i) { + newptr[c][i] = ptr[c][i]; + } + } + } + if (ptr) deallocate_channels(ptr, oldchannels); + return newptr; +} + +template +T **reallocate_and_zero_extend_channels(T **ptr, + size_t oldchannels, size_t oldcount, + size_t channels, size_t count) +{ + T **newptr = allocate_and_zero_channels(channels, count); + if (oldcount && ptr) { + for (size_t c = 0; c < channels; ++c) { + for (size_t i = 0; i < oldcount && i < count; ++i) { + newptr[c][i] = ptr[c][i]; + } + } + } + if (ptr) deallocate_channels(ptr, oldchannels); + return newptr; +} + +/// RAII class to call deallocate() on destruction +template +class Deallocator +{ +public: + Deallocator(T *t) : m_t(t) { } + ~Deallocator() { deallocate(m_t); } +private: + T *m_t; +}; + +} + +#endif + diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/bqvec/Barrier.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqvec/bqvec/Barrier.h Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,62 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + bqvec + + A small library for vector arithmetic and allocation in C++ using + raw C pointer arrays. + + Copyright 2007-2015 Particular Programs Ltd. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. 
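A minimal usage sketch of the allocators just added (the function names process_block, chans and scratch are illustrative; the allocator calls are the ones defined in Allocators.h above):

    #include "bqvec/Allocators.h"

    using namespace breakfastquay;

    void process_block(int channels, int blocksize)
    {
        // Per-channel aligned buffers, zero-initialised
        float **chans = allocate_and_zero_channels<float>(channels, blocksize);

        // Single aligned scratch buffer, freed automatically on scope exit
        float *scratch = allocate<float>(blocksize);
        Deallocator<float> guard(scratch);

        // ... fill and process chans / scratch here ...

        deallocate_channels(chans, channels);
    }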
+*/ + +#ifndef BQVEC_BARRIER_H +#define BQVEC_BARRIER_H + +namespace breakfastquay { + extern void system_memorybarrier(); +} + +#if defined _WIN32 + +#define BQ_MBARRIER() ::breakfastquay::system_memorybarrier() + +#elif defined __APPLE__ + +#include +#define BQ_MBARRIER() OSMemoryBarrier() + +#elif (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1) + +#define BQ_MBARRIER() __sync_synchronize() + +#else + +#define BQ_MBARRIER() ::breakfastquay::system_memorybarrier() + +#endif + +#endif diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/bqvec/ComplexTypes.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqvec/bqvec/ComplexTypes.h Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,52 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + bqvec + + A small library for vector arithmetic and allocation in C++ using + raw C pointer arrays. + + Copyright 2007-2015 Particular Programs Ltd. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. +*/ + +#ifndef BQVEC_COMPLEX_TYPES_H +#define BQVEC_COMPLEX_TYPES_H + +namespace breakfastquay { + +#ifndef NO_COMPLEX_TYPES +// Convertible with other complex types that store re+im consecutively +typedef double bq_complex_element_t; +struct bq_complex_t { + bq_complex_element_t re; + bq_complex_element_t im; +}; +#endif + +} + +#endif diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/bqvec/Range.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqvec/bqvec/Range.h Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,57 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + bqvec + + A small library for vector arithmetic and allocation in C++ using + raw C pointer arrays. + + Copyright 2007-2015 Particular Programs Ltd. 
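As an illustration of the bq_complex_t layout described above (re then im stored consecutively, so a buffer of them matches an interleaved re/im array), a hypothetical helper computing bin magnitudes might look like this:

    #include "bqvec/ComplexTypes.h"
    #include <cmath>

    using namespace breakfastquay;

    void magnitudes(const bq_complex_t *cplx, double *mag, int bins)
    {
        for (int i = 0; i < bins; ++i) {
            mag[i] = std::sqrt(cplx[i].re * cplx[i].re +
                               cplx[i].im * cplx[i].im);
        }
    }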
+ + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. +*/ + +#ifndef BQVEC_RANGE_H +#define BQVEC_RANGE_H + +namespace breakfastquay { + +/** Check whether an integer index is in range for a container, + avoiding overflows and signed/unsigned comparison warnings. +*/ +template +bool in_range_for(const C &container, T i) +{ + if (i < 0) return false; + if (sizeof(T) > sizeof(typename C::size_type)) { + return i < static_cast(container.size()); + } else { + return static_cast(i) < container.size(); + } +} + +} + +#endif diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/bqvec/Restrict.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqvec/bqvec/Restrict.h Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,51 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + bqvec + + A small library for vector arithmetic and allocation in C++ using + raw C pointer arrays. + + Copyright 2007-2015 Particular Programs Ltd. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. 
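A short usage sketch of in_range_for() as defined above (the wrapper name sample_or_zero is illustrative): it guards a possibly negative signed index against a container whose size_type is unsigned, without tripping signed/unsigned comparison warnings.

    #include "bqvec/Range.h"
    #include <vector>

    double sample_or_zero(const std::vector<double> &v, int i)
    {
        return breakfastquay::in_range_for(v, i) ? v[i] : 0.0;
    }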
+*/ + +#ifndef BQVEC_RESTRICT_H +#define BQVEC_RESTRICT_H + +#ifdef __MSVC__ +#define BQ_R__ __restrict +#endif + +#ifdef __GNUC__ +#define BQ_R__ __restrict__ +#endif + +#ifndef BQ_R__ +#define BQ_R__ +#endif + +#endif diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/bqvec/RingBuffer.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqvec/bqvec/RingBuffer.h Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,516 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + bqvec + + A small library for vector arithmetic and allocation in C++ using + raw C pointer arrays. + + Copyright 2007-2015 Particular Programs Ltd. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. +*/ + +#ifndef BQVEC_RINGBUFFER_H +#define BQVEC_RINGBUFFER_H + +#include + +//#define DEBUG_RINGBUFFER 1 + +#include "Barrier.h" +#include "Allocators.h" + +#include + +namespace breakfastquay { + +/** + * RingBuffer implements a lock-free ring buffer for one writer and + * one reader, that is to be used to store a sample type T. + * + * RingBuffer is thread-safe provided only one thread writes and only + * one thread reads. + */ +template +class RingBuffer +{ +public: + /** + * Create a ring buffer with room to write n samples. + * + * Note that the internal storage size will actually be n+1 + * samples, as one element is unavailable for administrative + * reasons. Since the ring buffer performs best if its size is a + * power of two, this means n should ideally be some power of two + * minus one. + */ + RingBuffer(int n); + + virtual ~RingBuffer(); + + /** + * Return the total capacity of the ring buffer in samples. + * (This is the argument n passed to the constructor.) + */ + int getSize() const; + + /** + * Return a new ring buffer (allocated with "new" -- caller must + * delete when no longer needed) of the given size, containing the + * same data as this one. If another thread reads from or writes + * to this buffer during the call, the results may be incomplete + * or inconsistent. If this buffer's data will not fit in the new + * size, the contents are undefined. + */ + RingBuffer *resized(int newSize) const; + + /** + * Reset read and write pointers, thus emptying the buffer. + * Should be called from the write thread. 
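A sizing sketch for the constructor documented above, assuming the single sample-type template parameter (RingBuffer<float>): the argument is the usable capacity and the internal storage is one sample larger, so choosing a power-of-two-minus-one capacity keeps the underlying allocation at a power of two, as the comment recommends.

    #include "bqvec/RingBuffer.h"

    breakfastquay::RingBuffer<float> *make_buffer()
    {
        const int capacity = (1 << 14) - 1;   // 16383 usable samples
        return new breakfastquay::RingBuffer<float>(capacity);
    }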
+ */ + void reset(); + + /** + * Return the amount of data available for reading, in samples. + */ + int getReadSpace() const; + + /** + * Return the amount of space available for writing, in samples. + */ + int getWriteSpace() const; + + /** + * Read n samples from the buffer. If fewer than n are available, + * the remainder will be zeroed out. Returns the number of + * samples actually read. + * + * This is a template function, taking an argument S for the target + * sample type, which is permitted to differ from T if the two + * types are compatible for arithmetic operations. + */ + template + int read(S *const BQ_R__ destination, int n); + + /** + * Read n samples from the buffer, adding them to the destination. + * If fewer than n are available, the remainder will be left + * alone. Returns the number of samples actually read. + * + * This is a template function, taking an argument S for the target + * sample type, which is permitted to differ from T if the two + * types are compatible for arithmetic operations. + */ + template + int readAdding(S *const BQ_R__ destination, int n); + + /** + * Read one sample from the buffer. If no sample is available, + * this will silently return zero. Calling this repeatedly is + * obviously slower than calling read once, but it may be good + * enough if you don't want to allocate a buffer to read into. + */ + T readOne(); + + /** + * Read n samples from the buffer, if available, without advancing + * the read pointer -- i.e. a subsequent read() or skip() will be + * necessary to empty the buffer. If fewer than n are available, + * the remainder will be zeroed out. Returns the number of + * samples actually read. + */ + int peek(T *const BQ_R__ destination, int n) const; + + /** + * Read one sample from the buffer, if available, without + * advancing the read pointer -- i.e. a subsequent read() or + * skip() will be necessary to empty the buffer. Returns zero if + * no sample was available. + */ + T peekOne() const; + + /** + * Pretend to read n samples from the buffer, without actually + * returning them (i.e. discard the next n samples). Returns the + * number of samples actually available for discarding. + */ + int skip(int n); + + /** + * Write n samples to the buffer. If insufficient space is + * available, not all samples may actually be written. Returns + * the number of samples actually written. + * + * This is a template function, taking an argument S for the source + * sample type, which is permitted to differ from T if the two + * types are compatible for assignment. + */ + template + int write(const S *const BQ_R__ source, int n); + + /** + * Write n zero-value samples to the buffer. If insufficient + * space is available, not all zeros may actually be written. + * Returns the number of zeroes actually written. 
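A usage sketch of the one-writer/one-reader contract described above (function names are illustrative, and RingBuffer<float> again assumes the sample-type template parameter): the producer calls write() from one thread, the consumer calls read() from another, and no lock is needed as long as each side stays in its own role.

    #include "bqvec/RingBuffer.h"

    void producer(breakfastquay::RingBuffer<float> &rb,
                  const float *block, int n)
    {
        if (rb.getWriteSpace() >= n) {
            rb.write(block, n);
        }
    }

    void consumer(breakfastquay::RingBuffer<float> &rb,
                  float *out, int n)
    {
        // read() zero-fills any shortfall and returns the count actually read
        rb.read(out, n);
    }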
+ */ + int zero(int n); + +protected: + T *const BQ_R__ m_buffer; + int m_writer; + int m_reader; + const int m_size; + + int readSpaceFor(int w, int r) const { + int space; + if (w > r) space = w - r; + else if (w < r) space = (w + m_size) - r; + else space = 0; + return space; + } + + int writeSpaceFor(int w, int r) const { + int space = (r + m_size - w - 1); + if (space >= m_size) space -= m_size; + return space; + } + +private: + RingBuffer(const RingBuffer &); // not provided + RingBuffer &operator=(const RingBuffer &); // not provided +}; + +template +RingBuffer::RingBuffer(int n) : + m_buffer(allocate(n + 1)), + m_writer(0), + m_size(n + 1) +{ +#ifdef DEBUG_RINGBUFFER + std::cerr << "RingBuffer[" << this << "]::RingBuffer(" << n << ")" << std::endl; +#endif + + m_reader = 0; +} + +template +RingBuffer::~RingBuffer() +{ +#ifdef DEBUG_RINGBUFFER + std::cerr << "RingBuffer[" << this << "]::~RingBuffer" << std::endl; +#endif + + deallocate(m_buffer); +} + +template +int +RingBuffer::getSize() const +{ +#ifdef DEBUG_RINGBUFFER + std::cerr << "RingBuffer[" << this << "]::getSize(): " << m_size-1 << std::endl; +#endif + + return m_size - 1; +} + +template +RingBuffer * +RingBuffer::resized(int newSize) const +{ + RingBuffer *newBuffer = new RingBuffer(newSize); + + int w = m_writer; + int r = m_reader; + + while (r != w) { + T value = m_buffer[r]; + newBuffer->write(&value, 1); + if (++r == m_size) r = 0; + } + + return newBuffer; +} + +template +void +RingBuffer::reset() +{ +#ifdef DEBUG_RINGBUFFER + std::cerr << "RingBuffer[" << this << "]::reset" << std::endl; +#endif + + m_reader = m_writer; +} + +template +int +RingBuffer::getReadSpace() const +{ + return readSpaceFor(m_writer, m_reader); +} + +template +int +RingBuffer::getWriteSpace() const +{ + return writeSpaceFor(m_writer, m_reader); +} + +template +template +int +RingBuffer::read(S *const BQ_R__ destination, int n) +{ + int w = m_writer; + int r = m_reader; + + int available = readSpaceFor(w, r); + if (n > available) { + std::cerr << "WARNING: RingBuffer::read: " << n << " requested, only " + << available << " available" << std::endl; + n = available; + } + if (n == 0) return n; + + int here = m_size - r; + T *const BQ_R__ bufbase = m_buffer + r; + + if (here >= n) { + v_convert(destination, bufbase, n); + } else { + v_convert(destination, bufbase, here); + v_convert(destination + here, m_buffer, n - here); + } + + r += n; + while (r >= m_size) r -= m_size; + + BQ_MBARRIER(); + m_reader = r; + + return n; +} + +template +template +int +RingBuffer::readAdding(S *const BQ_R__ destination, int n) +{ + int w = m_writer; + int r = m_reader; + + int available = readSpaceFor(w, r); + if (n > available) { + std::cerr << "WARNING: RingBuffer::read: " << n << " requested, only " + << available << " available" << std::endl; + n = available; + } + if (n == 0) return n; + + int here = m_size - r; + T *const BQ_R__ bufbase = m_buffer + r; + + if (here >= n) { + v_add(destination, bufbase, n); + } else { + v_add(destination, bufbase, here); + v_add(destination + here, m_buffer, n - here); + } + + r += n; + while (r >= m_size) r -= m_size; + + BQ_MBARRIER(); + m_reader = r; + + return n; +} + +template +T +RingBuffer::readOne() +{ + int w = m_writer; + int r = m_reader; + + if (w == r) { + std::cerr << "WARNING: RingBuffer::readOne: no sample available" + << std::endl; + return T(); + } + + T value = m_buffer[r]; + if (++r == m_size) r = 0; + + BQ_MBARRIER(); + m_reader = r; + + return value; +} + +template +int +RingBuffer::peek(T *const 
BQ_R__ destination, int n) const +{ + int w = m_writer; + int r = m_reader; + + int available = readSpaceFor(w, r); + if (n > available) { + std::cerr << "WARNING: RingBuffer::peek: " << n << " requested, only " + << available << " available" << std::endl; + memset(destination + available, 0, (n - available) * sizeof(T)); + n = available; + } + if (n == 0) return n; + + int here = m_size - r; + const T *const BQ_R__ bufbase = m_buffer + r; + + if (here >= n) { + v_copy(destination, bufbase, n); + } else { + v_copy(destination, bufbase, here); + v_copy(destination + here, m_buffer, n - here); + } + + return n; +} + +template +T +RingBuffer::peekOne() const +{ + int w = m_writer; + int r = m_reader; + + if (w == r) { + std::cerr << "WARNING: RingBuffer::peekOne: no sample available" + << std::endl; + return 0; + } + + T value = m_buffer[r]; + return value; +} + +template +int +RingBuffer::skip(int n) +{ + int w = m_writer; + int r = m_reader; + + int available = readSpaceFor(w, r); + if (n > available) { + std::cerr << "WARNING: RingBuffer::skip: " << n << " requested, only " + << available << " available" << std::endl; + n = available; + } + if (n == 0) return n; + + r += n; + while (r >= m_size) r -= m_size; + + // No memory barrier required, because we didn't read any data + m_reader = r; + + return n; +} + +template +template +int +RingBuffer::write(const S *const BQ_R__ source, int n) +{ + int w = m_writer; + int r = m_reader; + + int available = writeSpaceFor(w, r); + if (n > available) { + std::cerr << "WARNING: RingBuffer::write: " << n + << " requested, only room for " << available << std::endl; + n = available; + } + if (n == 0) return n; + + int here = m_size - w; + T *const BQ_R__ bufbase = m_buffer + w; + + if (here >= n) { + v_convert(bufbase, source, n); + } else { + v_convert(bufbase, source, here); + v_convert(m_buffer, source + here, n - here); + } + + w += n; + while (w >= m_size) w -= m_size; + + BQ_MBARRIER(); + m_writer = w; + + return n; +} + +template +int +RingBuffer::zero(int n) +{ + int w = m_writer; + int r = m_reader; + + int available = writeSpaceFor(w, r); + if (n > available) { + std::cerr << "WARNING: RingBuffer::zero: " << n + << " requested, only room for " << available << std::endl; + n = available; + } + if (n == 0) return n; + + int here = m_size - w; + T *const BQ_R__ bufbase = m_buffer + w; + + if (here >= n) { + v_zero(bufbase, n); + } else { + v_zero(bufbase, here); + v_zero(m_buffer, n - here); + } + + w += n; + while (w >= m_size) w -= m_size; + + BQ_MBARRIER(); + m_writer = w; + + return n; +} + +} + +#endif // BQVEC_RINGBUFFER_H diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/bqvec/VectorOps.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqvec/bqvec/VectorOps.h Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,1110 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + bqvec + + A small library for vector arithmetic and allocation in C++ using + raw C pointer arrays. + + Copyright 2007-2015 Particular Programs Ltd. 
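The space calculations in readSpaceFor()/writeSpaceFor() above keep one slot permanently unused, so a full buffer (writer just behind reader) can be distinguished from an empty one (writer == reader). A small self-checking sketch of the same arithmetic, for m_size = 8, i.e. a buffer constructed with n = 7:

    #include <cassert>

    static int read_space(int w, int r, int size)
    {
        return (w >= r) ? (w - r) : (w + size - r);
    }
    static int write_space(int w, int r, int size)
    {
        int space = r + size - w - 1;
        return (space >= size) ? (space - size) : space;
    }

    int main()
    {
        const int size = 8;
        assert(read_space(0, 0, size) == 0 && write_space(0, 0, size) == 7);
        assert(read_space(5, 2, size) == 3 && write_space(5, 2, size) == 4);
        assert(read_space(2, 5, size) == 5 && write_space(2, 5, size) == 2);
        return 0;
    }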
+ + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. +*/ + +#ifndef BQVEC_VECTOR_OPS_H +#define BQVEC_VECTOR_OPS_H + +#ifdef HAVE_IPP +#ifndef _MSC_VER +#include +#endif +#include +#include +#endif + +#ifdef HAVE_VDSP +#include +#endif + +#include + +#include "Restrict.h" + +namespace breakfastquay { + +/** + * General principle: + * + * Write basic vector-manipulation loops in such a way as to promote + * the likelihood that a good current C++ compiler can auto-vectorize + * them (e.g. gcc-4.x with -ftree-vectorize). Provide calls out to + * supported vector libraries (e.g. IPP, Accelerate) where useful. + * No intrinsics or assembly. + * + * Note that all size and index arguments are plain machine ints, to + * facilitate compiler optimization and vectorization. In general + * these functions should be used only with buffers whose sizes are + * calculated from known processing parameters and that are known to + * be much smaller than 32-bit int range. For security reasons you + * should not use these functions with buffers whose sizes may be + * under control of the user or external input. + * + * Argument ordering: + * + * If a function operates on one or more vector sequences in memory, + * they appear as pointers at the start of the argument list. If one + * vector is the "destination", it appears first, with "source" second + * if present (i.e. argument ordering follows memcpy). + * + * The final argument is always a count of the number of elements in + * each vector. + * + * Some functions operate on a set of vectors at once: their names all + * contain the text _channels, and the number of channels (i.e. number + * of vectors) is the argument before last. + * + * Any additional arguments, e.g. scale factors, appear between the + * vector pointers at the start and the counts at the end. + * + * The caller takes responsibility for ensuring that vector pointer + * arguments are not aliased, i.e. that vectors provided as separate + * arguments to the same function are distinct and do not overlap, + * except where documented. + */ + +/** + * v_zero + * + * Zero the elements in the given vector, of length \arg count. 
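An orientation sketch for the calling conventions described above (destination first, source second in memcpy order, count last), using v_set, v_copy and v_scale as defined later in this header together with the bqvec allocators; the function name convention_example is illustrative.

    #include "bqvec/VectorOps.h"
    #include "bqvec/Allocators.h"

    using namespace breakfastquay;

    void convention_example(int n)
    {
        float *a = allocate_and_zero<float>(n);
        float *b = allocate<float>(n);

        v_set(b, 1.0f, n);       // vec, value, count
        v_copy(a, b, n);         // dst, src, count
        v_scale(a, 0.5f, n);     // dst, gain, count

        deallocate(b);
        deallocate(a);
    }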
+ */ +template +inline void v_zero(T *const BQ_R__ vec, + const int count) +{ + const T value = T(0); + for (int i = 0; i < count; ++i) { + vec[i] = value; + } +} + +#if defined HAVE_IPP +template<> +inline void v_zero(float *const BQ_R__ vec, + const int count) +{ + ippsZero_32f(vec, count); +} +template<> +inline void v_zero(double *const BQ_R__ vec, + const int count) +{ + ippsZero_64f(vec, count); +} +#elif defined HAVE_VDSP +template<> +inline void v_zero(float *const BQ_R__ vec, + const int count) +{ + vDSP_vclr(vec, 1, count); +} +template<> +inline void v_zero(double *const BQ_R__ vec, + const int count) +{ + vDSP_vclrD(vec, 1, count); +} +#endif + +/** + * v_zero_channels + * + * Zero the elements in the given set of \arg channels vectors, each + * of length \arg count. + */ +template +inline void v_zero_channels(T *const BQ_R__ *const BQ_R__ vec, + const int channels, + const int count) +{ + for (int c = 0; c < channels; ++c) { + v_zero(vec[c], count); + } +} + +/** + * v_set + * + * Set all of the elements in the given vector, of length \arg count, + * to \arg value. + */ +template +inline void v_set(T *const BQ_R__ vec, + const T value, + const int count) +{ + for (int i = 0; i < count; ++i) { + vec[i] = value; + } +} + +/** + * v_copy + * + * Copy the contents of the vector \arg src to the vector \arg dst, + * both of length \arg count. + * + * Caller guarantees that \arg src and \arg dst are non-overlapping. + */ +template +inline void v_copy(T *const BQ_R__ dst, + const T *const BQ_R__ src, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] = src[i]; + } +} + +#if defined HAVE_IPP +template<> +inline void v_copy(float *const BQ_R__ dst, + const float *const BQ_R__ src, + const int count) +{ + ippsCopy_32f(src, dst, count); +} +template<> +inline void v_copy(double *const BQ_R__ dst, + const double *const BQ_R__ src, + const int count) +{ + ippsCopy_64f(src, dst, count); +} +#endif + +/** + * v_copy_channels + * + * Copy the contents of the individual vectors in the set \arg src to + * the corresponding vectors in the set \arg dst. All vectors have + * length \arg count and there are \arg channels vectors in each set. + * + * Caller guarantees that all of the \arg src and \arg dst vectors are + * non-overlapping with each other. + */ +template +inline void v_copy_channels(T *const BQ_R__ *const BQ_R__ dst, + const T *const BQ_R__ *const BQ_R__ src, + const int channels, + const int count) +{ + for (int c = 0; c < channels; ++c) { + v_copy(dst[c], src[c], count); + } +} + +/** + * v_move + * + * Copy the contents of vector \arg src to the vector \arg dst, both + * of length \arg count. The two vectors may overlap. (If you know + * that they cannot overlap, use v_copy instead.) + */ +template +inline void v_move(T *const dst, // not BQ_R__ (aliased) + const T *const src, // not BQ_R__ (aliased) + const int count) +{ + memmove(dst, src, count * sizeof(T)); +} + +#if defined HAVE_IPP +template<> +inline void v_move(float *const dst, + const float *const src, + const int count) +{ + ippsMove_32f(src, dst, count); +} +template<> +inline void v_move(double *const dst, + const double *const src, + const int count) +{ + ippsMove_64f(src, dst, count); +} +#endif + +/** + * v_convert + * + * Copy the contents of vector \arg src to the vector \arg dst, both + * of length \arg count. The two vectors may have different value + * types, e.g. double and float. If they have the same type, this is + * equivalent to v_copy. 
+ * + * Caller guarantees that \arg src and \arg dst are non-overlapping. + */ +template +inline void v_convert(U *const BQ_R__ dst, + const T *const BQ_R__ src, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] = U(src[i]); + } +} + +template<> +inline void v_convert(float *const BQ_R__ dst, + const float *const BQ_R__ src, + const int count) +{ + v_copy(dst, src, count); +} +template<> +inline void v_convert(double *const BQ_R__ dst, + const double *const BQ_R__ src, + const int count) +{ + v_copy(dst, src, count); +} + +#if defined HAVE_IPP +template<> +inline void v_convert(double *const BQ_R__ dst, + const float *const BQ_R__ src, + const int count) +{ + ippsConvert_32f64f(src, dst, count); +} +template<> +inline void v_convert(float *const BQ_R__ dst, + const double *const BQ_R__ src, + const int count) +{ + ippsConvert_64f32f(src, dst, count); +} +#elif defined HAVE_VDSP +template<> +inline void v_convert(double *const BQ_R__ dst, + const float *const BQ_R__ src, + const int count) +{ + vDSP_vspdp((float *)src, 1, dst, 1, count); +} +template<> +inline void v_convert(float *const BQ_R__ dst, + const double *const BQ_R__ src, + const int count) +{ + vDSP_vdpsp((double *)src, 1, dst, 1, count); +} +#endif + +/** + * v_convert_channels + * + * Copy the contents of the individual vectors in the set \arg src to + * the corresponding vectors in the set \arg dst. All vectors have + * length \arg count, and there are \arg channels vectors in each + * set. The source and destination vectors may have different value + * types, e.g. double and float. If they have the same type, this is + * equivalent to v_copy_channels. + * + * Caller guarantees that all of the \arg src and \arg dst vectors are + * non-overlapping with each other. + */ +template +inline void v_convert_channels(U *const BQ_R__ *const BQ_R__ dst, + const T *const BQ_R__ *const BQ_R__ src, + const int channels, + const int count) +{ + for (int c = 0; c < channels; ++c) { + v_convert(dst[c], src[c], count); + } +} + +/** + * v_add + * + * Add the elements in the vector \arg src to the corresponding + * elements in the vector \arg dst, both of length arg \count, leaving + * the result in \arg dst. + * + * Caller guarantees that \arg src and \arg dst are non-overlapping. + */ +template +inline void v_add(T *const BQ_R__ dst, + const T *const BQ_R__ src, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] += src[i]; + } +} + +/** + * v_add + * + * Add the constant \arg value to every element of the vector \arg + * dst, of length arg \count, leaving the result in \arg dst. + */ +template +inline void v_add(T *const BQ_R__ dst, + const T value, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] += value; + } +} + +#if defined HAVE_IPP +template<> +inline void v_add(float *const BQ_R__ dst, + const float *const BQ_R__ src, + const int count) +{ + ippsAdd_32f_I(src, dst, count); +} +inline void v_add(double *const BQ_R__ dst, + const double *const BQ_R__ src, + const int count) +{ + ippsAdd_64f_I(src, dst, count); +} +#endif + +/** + * v_add_channels + * + * Add the elements in the individual vectors in the set \arg src to + * the corresponding elements of the corresponding vectors in \arg + * dst, leaving the results in \arg dst. All vectors have length \arg + * count, and there are \arg channels vectors in each set. + * + * Caller guarantees that all of the \arg src and \arg dst vectors are + * non-overlapping with each other. 
+ */ +template +inline void v_add_channels(T *const BQ_R__ *const BQ_R__ dst, + const T *const BQ_R__ *const BQ_R__ src, + const int channels, const int count) +{ + for (int c = 0; c < channels; ++c) { + v_add(dst[c], src[c], count); + } +} + +/** + * v_add_with_gain + * + * Add the elements in the vector \arg src, each multiplied by the + * constant factor \arg gain, to the corresponding elements in the + * vector \arg dst, both of length arg \count, leaving the result in + * \arg dst. + * + * Caller guarantees that \arg src and \arg dst are non-overlapping. + */ +template +inline void v_add_with_gain(T *const BQ_R__ dst, + const T *const BQ_R__ src, + const G gain, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] += src[i] * gain; + } +} + +/** + * v_add_channels_with_gain + * + * Add the elements in the individual vectors in the set \arg src, + * each multiplied by the constant factor \arg gain, to the + * corresponding elements of the corresponding vectors in \arg dst, + * leaving the results in \arg dst. All vectors have length \arg + * count, and there are \arg channels vectors in each set. + * + * Caller guarantees that all of the \arg src and \arg dst vectors are + * non-overlapping with each other. + */ +template +inline void v_add_channels_with_gain(T *const BQ_R__ *const BQ_R__ dst, + const T *const BQ_R__ *const BQ_R__ src, + const G gain, + const int channels, + const int count) +{ + for (int c = 0; c < channels; ++c) { + v_add_with_gain(dst[c], src[c], gain, count); + } +} + +/** + * v_subtract + * + * Subtract the elements in the vector \arg src from the corresponding + * elements in the vector \arg dst, both of length arg \count, leaving + * the result in \arg dst. + * + * Caller guarantees that \arg src and \arg dst are non-overlapping. + */ +template +inline void v_subtract(T *const BQ_R__ dst, + const T *const BQ_R__ src, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] -= src[i]; + } +} + +#if defined HAVE_IPP +template<> +inline void v_subtract(float *const BQ_R__ dst, + const float *const BQ_R__ src, + const int count) +{ + ippsSub_32f_I(src, dst, count); +} +inline void v_subtract(double *const BQ_R__ dst, + const double *const BQ_R__ src, + const int count) +{ + ippsSub_64f_I(src, dst, count); +} +#endif + +/** + * v_scale + * + * Scale the elements in the vector \arg dst, of length \arg count, by + * the factor \arg gain. + */ +template +inline void v_scale(T *const BQ_R__ dst, + const G gain, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] *= gain; + } +} + +#if defined HAVE_IPP +template<> +inline void v_scale(float *const BQ_R__ dst, + const float gain, + const int count) +{ + ippsMulC_32f_I(gain, dst, count); +} +template<> +inline void v_scale(double *const BQ_R__ dst, + const double gain, + const int count) +{ + ippsMulC_64f_I(gain, dst, count); +} +#endif + +/** + * v_multiply + * + * Multiply the elements in the vector \arg dst by the corresponding + * elements in the vector \arg src, both of length arg \count, leaving + * the result in \arg dst. + * + * Caller guarantees that \arg src and \arg dst are non-overlapping. 
+ */ +template +inline void v_multiply(T *const BQ_R__ dst, + const T *const BQ_R__ src, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] *= src[i]; + } +} + +#if defined HAVE_IPP +template<> +inline void v_multiply(float *const BQ_R__ dst, + const float *const BQ_R__ src, + const int count) +{ + ippsMul_32f_I(src, dst, count); +} +template<> +inline void v_multiply(double *const BQ_R__ dst, + const double *const BQ_R__ src, + const int count) +{ + ippsMul_64f_I(src, dst, count); +} +#endif + +/** + * v_multiply + * + * Multiply the corresponding elements of the vectors \arg src1 and + * \arg src2, both of length arg \count, and write the results into + * \arg dst. + * + * Caller guarantees that \arg src1, \arg src2 and \arg dst are + * non-overlapping. + */ +template +inline void v_multiply(T *const BQ_R__ dst, + const T *const BQ_R__ src1, + const T *const BQ_R__ src2, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] = src1[i] * src2[i]; + } +} + +#if defined HAVE_IPP +template<> +inline void v_multiply(float *const BQ_R__ dst, + const float *const BQ_R__ src1, + const float *const BQ_R__ src2, + const int count) +{ + ippsMul_32f(src1, src2, dst, count); +} +template<> +inline void v_multiply(double *const BQ_R__ dst, + const double *const BQ_R__ src1, + const double *const BQ_R__ src2, + const int count) +{ + ippsMul_64f(src1, src2, dst, count); +} +#endif + +/** + * v_divide + * + * Divide the elements in the vector \arg dst by the corresponding + * elements in the vector \arg src, both of length arg \count, leaving + * the result in \arg dst. + * + * Caller guarantees that \arg src and \arg dst are non-overlapping. + */ +template +inline void v_divide(T *const BQ_R__ dst, + const T *const BQ_R__ src, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] /= src[i]; + } +} + +#if defined HAVE_IPP +template<> +inline void v_divide(float *const BQ_R__ dst, + const float *const BQ_R__ src, + const int count) +{ + ippsDiv_32f_I(src, dst, count); +} +template<> +inline void v_divide(double *const BQ_R__ dst, + const double *const BQ_R__ src, + const int count) +{ + ippsDiv_64f_I(src, dst, count); +} +#endif + +/** + * v_multiply_and_add + * + * Multiply the corresponding elements of the vectors \arg src1 and + * \arg src2, both of length arg \count, and add the results to the + * corresponding elements of vector \arg dst. + * + * Caller guarantees that \arg src1, \arg src2 and \arg dst are + * non-overlapping. + */ +template +inline void v_multiply_and_add(T *const BQ_R__ dst, + const T *const BQ_R__ src1, + const T *const BQ_R__ src2, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] += src1[i] * src2[i]; + } +} + +#if defined HAVE_IPP +template<> +inline void v_multiply_and_add(float *const BQ_R__ dst, + const float *const BQ_R__ src1, + const float *const BQ_R__ src2, + const int count) +{ + ippsAddProduct_32f(src1, src2, dst, count); +} +template<> +inline void v_multiply_and_add(double *const BQ_R__ dst, + const double *const BQ_R__ src1, + const double *const BQ_R__ src2, + const int count) +{ + ippsAddProduct_64f(src1, src2, dst, count); +} +#endif + +/** + * v_sum + * + * Return the sum of the elements in vector \arg src, of length \arg + * count. 
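The in-place arithmetic helpers above compose naturally for common audio tasks. As an illustration (not part of the library), a simple wet/dry mix can be built from v_scale and v_add_with_gain, both defined above; out already holds the dry signal on entry and receives the mixed result.

    #include "bqvec/VectorOps.h"

    using namespace breakfastquay;

    // out = dry * (1 - mix) + wet * mix
    void wet_dry_mix(float *const BQ_R__ out,
                     const float *const BQ_R__ wet,
                     float mix, int n)
    {
        v_scale(out, 1.0f - mix, n);           // attenuate the dry part
        v_add_with_gain(out, wet, mix, n);     // add the scaled wet part
    }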
+ */ +template +inline T v_sum(const T *const BQ_R__ src, + const int count) +{ + T result = T(); + for (int i = 0; i < count; ++i) { + result += src[i]; + } + return result; +} + +/** + * v_multiply_and_sum + * + * Multiply the corresponding elements of the vectors \arg src1 and + * \arg src2, both of length arg \count, sum the results, and return + * the sum as a scalar value. + * + * Caller guarantees that \arg src1 and \arg src2 are non-overlapping. + */ +template +inline T v_multiply_and_sum(const T *const BQ_R__ src1, + const T *const BQ_R__ src2, + const int count) +{ + T result = T(); + for (int i = 0; i < count; ++i) { + result += src1[i] * src2[i]; + } + return result; +} + +/** + * v_log + * + * Replace each element in vector \arg dst, of length \arg count, with + * its natural logarithm. + */ +template +inline void v_log(T *const BQ_R__ dst, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] = log(dst[i]); + } +} + +#if defined HAVE_IPP +template<> +inline void v_log(float *const BQ_R__ dst, + const int count) +{ + ippsLn_32f_I(dst, count); +} +template<> +inline void v_log(double *const BQ_R__ dst, + const int count) +{ + ippsLn_64f_I(dst, count); +} +#elif defined HAVE_VDSP +// no in-place vForce functions for these -- can we use the +// out-of-place functions with equal input and output vectors? can we +// use an out-of-place one with temporary buffer and still be faster +// than doing it any other way? +template<> +inline void v_log(float *const BQ_R__ dst, + const int count) +{ + float tmp[count]; + vvlogf(tmp, dst, &count); + v_copy(dst, tmp, count); +} +template<> +inline void v_log(double *const BQ_R__ dst, + const int count) +{ + double tmp[count]; + vvlog(tmp, dst, &count); + v_copy(dst, tmp, count); +} +#endif + +/** + * v_exp + * + * Replace each element in vector \arg dst, of length \arg count, with + * its base-e exponential. + */ +template +inline void v_exp(T *const BQ_R__ dst, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] = exp(dst[i]); + } +} + +#if defined HAVE_IPP +template<> +inline void v_exp(float *const BQ_R__ dst, + const int count) +{ + ippsExp_32f_I(dst, count); +} +template<> +inline void v_exp(double *const BQ_R__ dst, + const int count) +{ + ippsExp_64f_I(dst, count); +} +#elif defined HAVE_VDSP +// no in-place vForce functions for these -- can we use the +// out-of-place functions with equal input and output vectors? can we +// use an out-of-place one with temporary buffer and still be faster +// than doing it any other way? +template<> +inline void v_exp(float *const BQ_R__ dst, + const int count) +{ + float tmp[count]; + vvexpf(tmp, dst, &count); + v_copy(dst, tmp, count); +} +template<> +inline void v_exp(double *const BQ_R__ dst, + const int count) +{ + double tmp[count]; + vvexp(tmp, dst, &count); + v_copy(dst, tmp, count); +} +#endif + +/** + * v_sqrt + * + * Replace each element in vector \arg dst, of length \arg count, with + * its square root. + */ +template +inline void v_sqrt(T *const BQ_R__ dst, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] = sqrt(dst[i]); + } +} + +#if defined HAVE_IPP +template<> +inline void v_sqrt(float *const BQ_R__ dst, + const int count) +{ + ippsSqrt_32f_I(dst, count); +} +template<> +inline void v_sqrt(double *const BQ_R__ dst, + const int count) +{ + ippsSqrt_64f_I(dst, count); +} +#elif defined HAVE_VDSP +// no in-place vForce functions for these -- can we use the +// out-of-place functions with equal input and output vectors? 
can we +// use an out-of-place one with temporary buffer and still be faster +// than doing it any other way? +template<> +inline void v_sqrt(float *const BQ_R__ dst, + const int count) +{ + float tmp[count]; + vvsqrtf(tmp, dst, &count); + v_copy(dst, tmp, count); +} +template<> +inline void v_sqrt(double *const BQ_R__ dst, + const int count) +{ + double tmp[count]; + vvsqrt(tmp, dst, &count); + v_copy(dst, tmp, count); +} +#endif + +/** + * v_square + * + * Replace each element in vector \arg dst, of length \arg count, with + * its square. + */ +template +inline void v_square(T *const BQ_R__ dst, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] = dst[i] * dst[i]; + } +} + +#if defined HAVE_IPP +template<> +inline void v_square(float *const BQ_R__ dst, + const int count) +{ + ippsSqr_32f_I(dst, count); +} +template<> +inline void v_square(double *const BQ_R__ dst, + const int count) +{ + ippsSqr_64f_I(dst, count); +} +#endif + +/** + * v_abs + * + * Replace each element in vector \arg dst, of length \arg count, with + * its absolute value. + */ +template +inline void v_abs(T *const BQ_R__ dst, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] = fabs(dst[i]); + } +} + +#if defined HAVE_IPP +template<> +inline void v_abs(float *const BQ_R__ dst, + const int count) +{ + ippsAbs_32f_I(dst, count); +} +template<> +inline void v_abs(double *const BQ_R__ dst, + const int count) +{ + ippsAbs_64f_I(dst, count); +} +#elif defined HAVE_VDSP +template<> +inline void v_abs(float *const BQ_R__ dst, + const int count) +{ + float tmp[count]; +#if (defined(MACOSX_DEPLOYMENT_TARGET) && MACOSX_DEPLOYMENT_TARGET <= 1070 && MAC_OS_X_VERSION_MIN_REQUIRED <= 1070) + vvfabf(tmp, dst, &count); +#else + vvfabsf(tmp, dst, &count); +#endif + v_copy(dst, tmp, count); +} +#endif + +/** + * v_interleave + * + * Interleave (zip) the \arg channels vectors in \arg src, each of + * length \arg count, into the single vector \arg dst of length \arg + * channels * \arg count. + * + * Caller guarantees that the \arg src and \arg dst vectors are + * non-overlapping. + */ +template +inline void v_interleave(T *const BQ_R__ dst, + const T *const BQ_R__ *const BQ_R__ src, + const int channels, + const int count) +{ + int idx = 0; + switch (channels) { + case 2: + // common case, may be vectorized by compiler if hardcoded + for (int i = 0; i < count; ++i) { + for (int j = 0; j < 2; ++j) { + dst[idx++] = src[j][i]; + } + } + return; + case 1: + v_copy(dst, src[0], count); + return; + default: + for (int i = 0; i < count; ++i) { + for (int j = 0; j < channels; ++j) { + dst[idx++] = src[j][i]; + } + } + } +} + +#if defined HAVE_IPP +template<> +inline void v_interleave(float *const BQ_R__ dst, + const float *const BQ_R__ *const BQ_R__ src, + const int channels, + const int count) +{ + ippsInterleave_32f((const Ipp32f **)src, channels, count, dst); +} +// IPP does not (currently?) provide double-precision interleave +#endif + +/** + * v_deinterleave + * + * Deinterleave (unzip) the single vector \arg src, of length \arg + * channels * \arg count, into the \arg channels vectors in \arg dst, + * each of length \arg count. + * + * Caller guarantees that the \arg src and \arg dst vectors are + * non-overlapping. 
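+ *
+ * An illustrative sketch of caller-side use, splitting a stereo
+ * interleaved buffer into two channel buffers (the buffers and
+ * values are assumptions, not taken from the library):
+ *
+ *     float interleaved[6] = { 0.f, 10.f, 1.f, 11.f, 2.f, 12.f };
+ *     float left[3], right[3];
+ *     float *channels[2] = { left, right };
+ *     v_deinterleave(channels, interleaved, 2, 3);
+ *     // left == { 0, 1, 2 }, right == { 10, 11, 12 }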
+ */ +template +inline void v_deinterleave(T *const BQ_R__ *const BQ_R__ dst, + const T *const BQ_R__ src, + const int channels, + const int count) +{ + int idx = 0; + switch (channels) { + case 2: + // common case, may be vectorized by compiler if hardcoded + for (int i = 0; i < count; ++i) { + for (int j = 0; j < 2; ++j) { + dst[j][i] = src[idx++]; + } + } + return; + case 1: + v_copy(dst[0], src, count); + return; + default: + for (int i = 0; i < count; ++i) { + for (int j = 0; j < channels; ++j) { + dst[j][i] = src[idx++]; + } + } + } +} + +#if defined HAVE_IPP +template<> +inline void v_deinterleave(float *const BQ_R__ *const BQ_R__ dst, + const float *const BQ_R__ src, + const int channels, + const int count) +{ + ippsDeinterleave_32f((const Ipp32f *)src, channels, count, (Ipp32f **)dst); +} +// IPP does not (currently?) provide double-precision deinterleave +#endif + +/** + * v_fftshift + * + * Perform an in-place left-right shift of the vector \arg vec of + * length \arg count, swapping the first and second halves of the + * vector. + */ +template +inline void v_fftshift(T *const BQ_R__ vec, + const int count) +{ + const int hs = count/2; + for (int i = 0; i < hs; ++i) { + T t = vec[i]; + vec[i] = vec[i + hs]; + vec[i + hs] = t; + } +} + +/** + * v_mean + * + * Return the mean of the values in the vector \arg vec, of length + * \arg count. + */ +template +inline T v_mean(const T *const BQ_R__ vec, const int count) +{ + T t = T(0); + for (int i = 0; i < count; ++i) { + t += vec[i]; + } + t /= T(count); + return t; +} + +/** + * v_mean_channels + * + * Return the mean of all the values in the set of \arg channels + * vectors in \arg vec, each of length \arg count. (This is the single + * mean of all values across all channels, not one mean per channel.) + */ +template +inline T v_mean_channels(const T *const BQ_R__ *const BQ_R__ vec, + const int channels, + const int count) +{ + T t = T(0); + for (int c = 0; c < channels; ++c) { + t += v_mean(vec[c], count); + } + t /= T(channels); + return t; +} + +} + +#endif diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/bqvec/VectorOpsComplex.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqvec/bqvec/VectorOpsComplex.h Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,548 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + bqvec + + A small library for vector arithmetic and allocation in C++ using + raw C pointer arrays. + + Copyright 2007-2015 Particular Programs Ltd. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. +*/ + +#ifndef BQVEC_VECTOR_OPS_COMPLEX_H +#define BQVEC_VECTOR_OPS_COMPLEX_H + +#include "VectorOps.h" +#include "ComplexTypes.h" + +#include + +namespace breakfastquay { + +#ifndef NO_COMPLEX_TYPES + +template<> +inline void v_zero(bq_complex_t *const BQ_R__ ptr, + const int count) +{ +#if defined HAVE_IPP + if (sizeof(bq_complex_element_t) == sizeof(float)) { + ippsZero_32fc((Ipp32fc *)ptr, count); + } else { + ippsZero_64fc((Ipp64fc *)ptr, count); + } +#elif defined HAVE_VDSP + if (sizeof(bq_complex_element_t) == sizeof(float)) { + vDSP_vclr((float *)ptr, 1, count * 2); + } else { + vDSP_vclrD((double *)ptr, 1, count * 2); + } +#else + const bq_complex_element_t value = 0.0; + for (int i = 0; i < count; ++i) { + ptr[i].re = value; + ptr[i].im = value; + } +#endif +} + +#if defined HAVE_IPP +template<> +inline void v_copy(bq_complex_t *const BQ_R__ dst, + const bq_complex_t *const BQ_R__ src, + const int count) +{ + if (sizeof(bq_complex_element_t) == sizeof(float)) { + ippsCopy_32fc((const Ipp32fc *)src, (Ipp32fc *)dst, count); + } else { + ippsCopy_64fc((const Ipp64fc *)src, (Ipp64fc *)dst, count); + } +} +template<> +inline void v_move(bq_complex_t *const BQ_R__ dst, + const bq_complex_t *const BQ_R__ src, + const int count) +{ + if (sizeof(bq_complex_element_t) == sizeof(float)) { + ippsMove_32fc((const Ipp32fc *)src, (Ipp32fc *)dst, count); + } else { + ippsMove_64fc((const Ipp64fc *)src, (Ipp64fc *)dst, count); + } +} +#endif + +template<> +inline void v_convert(bq_complex_t *const BQ_R__ dst, + const bq_complex_element_t *const BQ_R__ src, + const int srccount) +{ + const int targetcount = srccount / 2; + int srcidx = 0; + for (int i = 0; i < targetcount; ++i) { + dst[i].re = src[srcidx++]; + dst[i].im = src[srcidx++]; + } +} + +template<> +inline void v_convert(bq_complex_element_t *const BQ_R__ dst, + const bq_complex_t *const BQ_R__ src, + const int srccount) +{ + int targetidx = 0; + for (int i = 0; i < srccount; ++i) { + dst[targetidx++] = src[i].re; + dst[targetidx++] = src[i].im; + } +} + +inline void c_add(bq_complex_t &dst, + const bq_complex_t &src) +{ + dst.re += src.re; + dst.im += src.im; +} + +inline void c_add_with_gain(bq_complex_t &dst, + const bq_complex_t &src, + const bq_complex_element_t gain) +{ + dst.re += src.re * gain; + dst.im += src.im * gain; +} + +inline void c_multiply(bq_complex_t &dst, + const bq_complex_t &src1, + const bq_complex_t &src2) +{ + // Note dst may alias src1 or src2. + + // The usual formula -- four multiplies, one add and one subtract + // + // (x1 + y1i)(x2 + y2i) = (x1x2 - y1y2) + (x1y2 + y1x2)i + // + // Alternative formula -- three multiplies, two adds, three + // subtracts + // + // (x1 + y1i)(x2 + y2i) = (x1x2 - y1y2) + ((x1 + y1)(x2 + y2) - x1x2 - y1y2)i + // + // The first formulation tests marginally quicker here. 
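+    //
+    // For reference, the three-multiply variant would read roughly as
+    // below. This is an illustrative sketch only, not the code used
+    // here:
+    //
+    //     bq_complex_element_t k1 = src1.re * src2.re;
+    //     bq_complex_element_t k2 = src1.im * src2.im;
+    //     bq_complex_element_t k3 = (src1.re + src1.im) * (src2.re + src2.im);
+    //     dst.re = k1 - k2;
+    //     dst.im = k3 - k1 - k2;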
+ + bq_complex_element_t real = src1.re * src2.re - src1.im * src2.im; + bq_complex_element_t imag = src1.re * src2.im + src1.im * src2.re; + + dst.re = real; + dst.im = imag; +} + +inline void c_multiply(bq_complex_t &dst, + const bq_complex_t &src) +{ + c_multiply(dst, dst, src); +} + +inline void c_multiply_and_add(bq_complex_t &dst, + const bq_complex_t &src1, + const bq_complex_t &src2) +{ + bq_complex_t tmp; + c_multiply(tmp, src1, src2); + c_add(dst, tmp); +} + +template<> +inline void v_add(bq_complex_t *const BQ_R__ dst, + const bq_complex_t *const BQ_R__ src, + const int count) +{ +#if defined HAVE_IPP + if (sizeof(bq_complex_element_t) == sizeof(float)) { + ippsAdd_32fc_I((Ipp32fc *)src, (Ipp32fc *)dst, count); + } else { + ippsAdd_64fc_I((Ipp64fc *)src, (Ipp64fc *)dst, count); + } +#else + for (int i = 0; i < count; ++i) { + dst[i].re += src[i].re; + dst[i].im += src[i].im; + } +#endif +} + +template<> +inline void v_add_with_gain(bq_complex_t *const BQ_R__ dst, + const bq_complex_t *const BQ_R__ src, + const bq_complex_element_t gain, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i].re += src[i].re * gain; + dst[i].im += src[i].im * gain; + } +} + +template<> +inline void v_multiply(bq_complex_t *const BQ_R__ dst, + const bq_complex_t *const BQ_R__ src, + const int count) +{ +#ifdef HAVE_IPP + if (sizeof(bq_complex_element_t) == sizeof(float)) { + ippsMul_32fc_I((const Ipp32fc *)src, (Ipp32fc *)dst, count); + } else { + ippsMul_64fc_I((const Ipp64fc *)src, (Ipp64fc *)dst, count); + } +#else + for (int i = 0; i < count; ++i) { + c_multiply(dst[i], src[i]); + } +#endif +} + +template<> +inline void v_multiply(bq_complex_t *const BQ_R__ dst, + const bq_complex_t *const BQ_R__ src1, + const bq_complex_t *const BQ_R__ src2, + const int count) +{ +#ifdef HAVE_IPP + if (sizeof(bq_complex_element_t) == sizeof(float)) { + ippsMul_32fc((const Ipp32fc *)src1, (const Ipp32fc *)src2, + (Ipp32fc *)dst, count); + } else { + ippsMul_64fc((const Ipp64fc *)src1, (const Ipp64fc *)src2, + (Ipp64fc *)dst, count); + } +#else + for (int i = 0; i < count; ++i) { + c_multiply(dst[i], src1[i], src2[i]); + } +#endif +} + +template<> +inline void v_multiply_and_add(bq_complex_t *const BQ_R__ dst, + const bq_complex_t *const BQ_R__ src1, + const bq_complex_t *const BQ_R__ src2, + const int count) +{ +#ifdef HAVE_IPP + if (sizeof(bq_complex_element_t) == sizeof(float)) { + ippsAddProduct_32fc((const Ipp32fc *)src1, (const Ipp32fc *)src2, + (Ipp32fc *)dst, count); + } else { + ippsAddProduct_64fc((const Ipp64fc *)src1, (const Ipp64fc *)src2, + (Ipp64fc *)dst, count); + } +#else + for (int i = 0; i < count; ++i) { + c_multiply_and_add(dst[i], src1[i], src2[i]); + } +#endif +} + +#if defined( __GNUC__ ) && defined( _WIN32 ) +// MinGW doesn't appear to have sincos, so define it -- it's +// a single x87 instruction anyway +static inline void sincos(double x, double *sin, double *cos) { + __asm__ ("fsincos;" : "=t" (*cos), "=u" (*sin) : "0" (x) : "st(7)"); +} +static inline void sincosf(float fx, float *fsin, float *fcos) { + double sin, cos; + sincos(fx, &sin, &cos); + *fsin = sin; + *fcos = cos; +} +#endif + +#endif /* !NO_COMPLEX_TYPES */ + +template +inline void c_phasor(T *real, T *imag, T phase) +{ + //!!! IPP contains ippsSinCos_xxx in ippvm.h -- these are + //!!! 
fixed-accuracy, test and compare +#if defined HAVE_VDSP + int one = 1; + if (sizeof(T) == sizeof(float)) { + vvsincosf((float *)imag, (float *)real, (const float *)&phase, &one); + } else { + vvsincos((double *)imag, (double *)real, (const double *)&phase, &one); + } +#elif defined LACK_SINCOS + if (sizeof(T) == sizeof(float)) { + *real = cosf(phase); + *imag = sinf(phase); + } else { + *real = cos(phase); + *imag = sin(phase); + } +#elif defined __GNUC__ + if (sizeof(T) == sizeof(float)) { + sincosf(float(phase), (float *)imag, (float *)real); + } else { + sincos(phase, (double *)imag, (double *)real); + } +#else + if (sizeof(T) == sizeof(float)) { + *real = cosf(phase); + *imag = sinf(phase); + } else { + *real = cos(phase); + *imag = sin(phase); + } +#endif +} + +template +inline void c_magphase(T *mag, T *phase, T real, T imag) +{ + *mag = sqrt(real * real + imag * imag); + *phase = atan2(imag, real); +} + +#ifdef USE_APPROXIMATE_ATAN2 +// NB arguments in opposite order from usual for atan2f +extern float approximate_atan2f(float real, float imag); +template<> +inline void c_magphase(float *mag, float *phase, float real, float imag) +{ + float atan = approximate_atan2f(real, imag); + *phase = atan; + *mag = sqrtf(real * real + imag * imag); +} +#else +template<> +inline void c_magphase(float *mag, float *phase, float real, float imag) +{ + *mag = sqrtf(real * real + imag * imag); + *phase = atan2f(imag, real); +} +#endif + +#ifndef NO_COMPLEX_TYPES + +inline bq_complex_t c_phasor(bq_complex_element_t phase) +{ + bq_complex_t c; + c_phasor(&c.re, &c.im, phase); + return c; +} + +inline void c_magphase(bq_complex_element_t *mag, bq_complex_element_t *phase, + bq_complex_t c) +{ + c_magphase(mag, phase, c.re, c.im); +} + +void v_polar_to_cartesian(bq_complex_t *const BQ_R__ dst, + const bq_complex_element_t *const BQ_R__ mag, + const bq_complex_element_t *const BQ_R__ phase, + const int count); + +void v_polar_interleaved_to_cartesian(bq_complex_t *const BQ_R__ dst, + const bq_complex_element_t *const BQ_R__ src, + const int count); + +inline void v_cartesian_to_polar(bq_complex_element_t *const BQ_R__ mag, + bq_complex_element_t *const BQ_R__ phase, + const bq_complex_t *const BQ_R__ src, + const int count) +{ + for (int i = 0; i < count; ++i) { + c_magphase(mag + i, phase + i, src[i].re, src[i].im); + } +} + +inline void v_cartesian_to_polar_interleaved(bq_complex_element_t *const BQ_R__ dst, + const bq_complex_t *const BQ_R__ src, + const int count) +{ + for (int i = 0; i < count; ++i) { + c_magphase(&dst[i*2], &dst[i*2+1], + src[i].re, src[i].im); + } +} + +#endif /* !NO_COMPLEX_TYPES */ + +template // S source, T target +void v_polar_to_cartesian(T *const BQ_R__ real, + T *const BQ_R__ imag, + const S *const BQ_R__ mag, + const S *const BQ_R__ phase, + const int count) +{ + for (int i = 0; i < count; ++i) { + c_phasor(real + i, imag + i, phase[i]); + } + v_multiply(real, mag, count); + v_multiply(imag, mag, count); +} + +template +void v_polar_interleaved_to_cartesian_inplace(T *const BQ_R__ srcdst, + const int count) +{ + T real, imag; + for (int i = 0; i < count*2; i += 2) { + c_phasor(&real, &imag, srcdst[i+1]); + real *= srcdst[i]; + imag *= srcdst[i]; + srcdst[i] = real; + srcdst[i+1] = imag; + } +} + +template // S source, T target +void v_polar_to_cartesian_interleaved(T *const BQ_R__ dst, + const S *const BQ_R__ mag, + const S *const BQ_R__ phase, + const int count) +{ + T real, imag; + for (int i = 0; i < count; ++i) { + c_phasor(&real, &imag, phase[i]); + real *= mag[i]; + 
imag *= mag[i]; + dst[i*2] = real; + dst[i*2+1] = imag; + } +} + +#if defined USE_POMMIER_MATHFUN +void v_polar_to_cartesian_pommier(float *const BQ_R__ real, + float *const BQ_R__ imag, + const float *const BQ_R__ mag, + const float *const BQ_R__ phase, + const int count); +void v_polar_interleaved_to_cartesian_inplace_pommier(float *const BQ_R__ srcdst, + const int count); +void v_polar_to_cartesian_interleaved_pommier(float *const BQ_R__ dst, + const float *const BQ_R__ mag, + const float *const BQ_R__ phase, + const int count); + +template<> +inline void v_polar_to_cartesian(float *const BQ_R__ real, + float *const BQ_R__ imag, + const float *const BQ_R__ mag, + const float *const BQ_R__ phase, + const int count) +{ + v_polar_to_cartesian_pommier(real, imag, mag, phase, count); +} + +template<> +inline void v_polar_interleaved_to_cartesian_inplace(float *const BQ_R__ srcdst, + const int count) +{ + v_polar_interleaved_to_cartesian_inplace_pommier(srcdst, count); +} + +template<> +inline void v_polar_to_cartesian_interleaved(float *const BQ_R__ dst, + const float *const BQ_R__ mag, + const float *const BQ_R__ phase, + const int count) +{ + v_polar_to_cartesian_interleaved_pommier(dst, mag, phase, count); +} + +#endif + +template // S source, T target +void v_cartesian_to_polar(T *const BQ_R__ mag, + T *const BQ_R__ phase, + const S *const BQ_R__ real, + const S *const BQ_R__ imag, + const int count) +{ + for (int i = 0; i < count; ++i) { + c_magphase(mag + i, phase + i, real[i], imag[i]); + } +} + +template // S source, T target +void v_cartesian_interleaved_to_polar(T *const BQ_R__ mag, + T *const BQ_R__ phase, + const S *const BQ_R__ src, + const int count) +{ + for (int i = 0; i < count; ++i) { + c_magphase(mag + i, phase + i, src[i*2], src[i*2+1]); + } +} + +#ifdef HAVE_VDSP +template<> +inline void v_cartesian_to_polar(float *const BQ_R__ mag, + float *const BQ_R__ phase, + const float *const BQ_R__ real, + const float *const BQ_R__ imag, + const int count) +{ + DSPSplitComplex c; + c.realp = const_cast(real); + c.imagp = const_cast(imag); + vDSP_zvmags(&c, 1, phase, 1, count); // using phase as a temporary dest + vvsqrtf(mag, phase, &count); // using phase as the source + vvatan2f(phase, imag, real, &count); +} +template<> +inline void v_cartesian_to_polar(double *const BQ_R__ mag, + double *const BQ_R__ phase, + const double *const BQ_R__ real, + const double *const BQ_R__ imag, + const int count) +{ + // double precision, this is significantly faster than using vDSP_polar + DSPDoubleSplitComplex c; + c.realp = const_cast(real); + c.imagp = const_cast(imag); + vDSP_zvmagsD(&c, 1, phase, 1, count); // using phase as a temporary dest + vvsqrt(mag, phase, &count); // using phase as the source + vvatan2(phase, imag, real, &count); +} +#endif + +template +void v_cartesian_to_polar_interleaved_inplace(T *const BQ_R__ srcdst, + const int count) +{ + T mag, phase; + for (int i = 0; i < count * 2; i += 2) { + c_magphase(&mag, &phase, srcdst[i], srcdst[i+1]); + srcdst[i] = mag; + srcdst[i+1] = phase; + } +} + +} + +#endif + diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/pommier/neon_mathfun.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqvec/pommier/neon_mathfun.h Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,301 @@ +/* NEON implementation of sin, cos, exp and log + + Inspired by Intel Approximate Math library, and based on the + corresponding algorithms of the cephes math library +*/ + +/* Copyright (C) 2011 Julien Pommier + + This software is provided 'as-is', 
without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +#include + +typedef float32x4_t v4sf; // vector of 4 float +typedef uint32x4_t v4su; // vector of 4 uint32 +typedef int32x4_t v4si; // vector of 4 uint32 + +#define c_inv_mant_mask ~0x7f800000u +#define c_cephes_SQRTHF 0.707106781186547524 +#define c_cephes_log_p0 7.0376836292E-2 +#define c_cephes_log_p1 - 1.1514610310E-1 +#define c_cephes_log_p2 1.1676998740E-1 +#define c_cephes_log_p3 - 1.2420140846E-1 +#define c_cephes_log_p4 + 1.4249322787E-1 +#define c_cephes_log_p5 - 1.6668057665E-1 +#define c_cephes_log_p6 + 2.0000714765E-1 +#define c_cephes_log_p7 - 2.4999993993E-1 +#define c_cephes_log_p8 + 3.3333331174E-1 +#define c_cephes_log_q1 -2.12194440e-4 +#define c_cephes_log_q2 0.693359375 + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +v4sf log_ps(v4sf x) { + v4sf one = vdupq_n_f32(1); + + x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ + v4su invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); + + v4si ux = vreinterpretq_s32_f32(x); + + v4si emm0 = vshrq_n_s32(ux, 23); + + /* keep only the fractional part */ + ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask)); + ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); + x = vreinterpretq_f32_s32(ux); + + emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); + v4sf e = vcvtq_f32_s32(emm0); + + e = vaddq_f32(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + v4su mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); + v4sf tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); + x = vsubq_f32(x, one); + e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); + x = vaddq_f32(x, tmp); + + v4sf z = vmulq_f32(x,x); + + v4sf y = vdupq_n_f32(c_cephes_log_p0); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); + y = vmulq_f32(y, x); + + y = vmulq_f32(y, z); + + + tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); + y = vaddq_f32(y, tmp); + + + tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); + y = vsubq_f32(y, tmp); + + tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); + x = vaddq_f32(x, y); + x = vaddq_f32(x, tmp); + x = 
vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN + return x; +} + +#define c_exp_hi 88.3762626647949f +#define c_exp_lo -88.3762626647949f + +#define c_cephes_LOG2EF 1.44269504088896341 +#define c_cephes_exp_C1 0.693359375 +#define c_cephes_exp_C2 -2.12194440e-4 + +#define c_cephes_exp_p0 1.9875691500E-4 +#define c_cephes_exp_p1 1.3981999507E-3 +#define c_cephes_exp_p2 8.3334519073E-3 +#define c_cephes_exp_p3 4.1665795894E-2 +#define c_cephes_exp_p4 1.6666665459E-1 +#define c_cephes_exp_p5 5.0000001201E-1 + +/* exp() computed for 4 float at once */ +v4sf exp_ps(v4sf x) { + v4sf tmp, fx; + + v4sf one = vdupq_n_f32(1); + x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); + x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); + + /* perform a floorf */ + tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); + + /* if greater, substract 1 */ + v4su mask = vcgtq_f32(tmp, fx); + mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); + + + fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); + + tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); + v4sf z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); + x = vsubq_f32(x, tmp); + x = vsubq_f32(x, z); + + static const float32_t cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 }; + v4sf y = vld1q_dup_f32(cephes_exp_p+0); + v4sf c1 = vld1q_dup_f32(cephes_exp_p+1); + v4sf c2 = vld1q_dup_f32(cephes_exp_p+2); + v4sf c3 = vld1q_dup_f32(cephes_exp_p+3); + v4sf c4 = vld1q_dup_f32(cephes_exp_p+4); + v4sf c5 = vld1q_dup_f32(cephes_exp_p+5); + + y = vmulq_f32(y, x); + z = vmulq_f32(x,x); + y = vaddq_f32(y, c1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c5); + + y = vmulq_f32(y, z); + y = vaddq_f32(y, x); + y = vaddq_f32(y, one); + + /* build 2^n */ + int32x4_t mm; + mm = vcvtq_s32_f32(fx); + mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); + mm = vshlq_n_s32(mm, 23); + v4sf pow2n = vreinterpretq_f32_s32(mm); + + y = vmulq_f32(y, pow2n); + return y; +} + +#define c_minus_cephes_DP1 -0.78515625 +#define c_minus_cephes_DP2 -2.4187564849853515625e-4 +#define c_minus_cephes_DP3 -3.77489497744594108e-8 +#define c_sincof_p0 -1.9515295891E-4 +#define c_sincof_p1 8.3321608736E-3 +#define c_sincof_p2 -1.6666654611E-1 +#define c_coscof_p0 2.443315711809948E-005 +#define c_coscof_p1 -1.388731625493765E-003 +#define c_coscof_p2 4.166664568298827E-002 +#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI + +/* evaluation of 4 sines & cosines at once. + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. + + Note also that when you compute sin(x), cos(x) is available at + almost no extra price so both sin_ps and cos_ps make use of + sincos_ps.. 
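+
+   Illustrative use (a caller-side sketch; the phase values are
+   assumptions, not taken from this file):
+
+     float phases[4] = { 0.f, 0.5f, 1.f, 2.f };
+     v4sf s, c;
+     sincos_ps(vld1q_f32(phases), &s, &c);  // s holds the sines, c the cosines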
+ */ +void sincos_ps(v4sf x, v4sf *ysin, v4sf *ycos) { // any x + v4sf xmm1, xmm2, xmm3, y; + + v4su emm2; + + v4su sign_mask_sin, sign_mask_cos; + sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); + x = vabsq_f32(x); + + /* scale by 4/Pi */ + y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); + + /* store the integer part of y in mm0 */ + emm2 = vcvtq_u32_f32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); + emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); + y = vcvtq_f32_u32(emm2); + + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 + +/* yes I know, the top of this file is quite ugly */ + +#ifdef _MSC_VER /* visual c++ */ +# define ALIGN16_BEG __declspec(align(16)) +# define ALIGN16_END +#else /* gcc or icc */ +# define ALIGN16_BEG +# define ALIGN16_END __attribute__((aligned(16))) +#endif + +/* __m128 is ugly to write */ +typedef __m128 v4sf; // vector of 4 float (sse1) + +#ifdef USE_SSE2 +# include +typedef __m128i v4si; // vector of 4 int (sse2) +#else +typedef __m64 v2si; // vector of 2 int (mmx) +#endif + +/* declare some SSE constants -- why can't I figure a better way to do that? */ +#define _PS_CONST(Name, Val) \ + static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val } +#define _PI32_CONST(Name, Val) \ + static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val } +#define _PS_CONST_TYPE(Name, Type, Val) \ + static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val } + +_PS_CONST(1 , 1.0f); +_PS_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS_CONST_TYPE(sign_mask, int, 0x80000000); +_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST(1, 1); +_PI32_CONST(inv1, ~1); +_PI32_CONST(2, 2); +_PI32_CONST(4, 4); +_PI32_CONST(0x7f, 0x7f); + +_PS_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS_CONST(cephes_log_p0, 7.0376836292E-2); +_PS_CONST(cephes_log_p1, - 1.1514610310E-1); +_PS_CONST(cephes_log_p2, 1.1676998740E-1); +_PS_CONST(cephes_log_p3, - 1.2420140846E-1); +_PS_CONST(cephes_log_p4, + 1.4249322787E-1); +_PS_CONST(cephes_log_p5, - 1.6668057665E-1); +_PS_CONST(cephes_log_p6, + 2.0000714765E-1); +_PS_CONST(cephes_log_p7, - 2.4999993993E-1); +_PS_CONST(cephes_log_p8, + 3.3333331174E-1); +_PS_CONST(cephes_log_q1, -2.12194440e-4); +_PS_CONST(cephes_log_q2, 0.693359375); + +#if defined (__MINGW32__) + +/* the ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics + The movehl_ps is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were completely + broken on my mingw gcc 3.4.5 ... 
+ + Note that the bug on _mm_cmp* does occur only at -O0 optimization level +*/ + +inline __m128 my_movehl_ps(__m128 a, const __m128 b) { + asm ( + "movhlps %2,%0\n\t" + : "=x" (a) + : "0" (a), "x"(b) + ); + return a; } +#warning "redefined _mm_movehl_ps (see gcc bug 21179)" +#define _mm_movehl_ps my_movehl_ps + +inline __m128 my_cmplt_ps(__m128 a, const __m128 b) { + asm ( + "cmpltps %2,%0\n\t" + : "=x" (a) + : "0" (a), "x"(b) + ); + return a; + } +inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) { + asm ( + "cmpnleps %2,%0\n\t" + : "=x" (a) + : "0" (a), "x"(b) + ); + return a; +} +inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) { + asm ( + "cmpeqps %2,%0\n\t" + : "=x" (a) + : "0" (a), "x"(b) + ); + return a; +} +#warning "redefined _mm_cmpxx_ps functions..." +#define _mm_cmplt_ps my_cmplt_ps +#define _mm_cmpgt_ps my_cmpgt_ps +#define _mm_cmpeq_ps my_cmpeq_ps +#endif + +#ifndef USE_SSE2 +typedef union xmm_mm_union { + __m128 xmm; + __m64 mm[2]; +} xmm_mm_union; + +#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \ + xmm_mm_union u; u.xmm = xmm_; \ + mm0_ = u.mm[0]; \ + mm1_ = u.mm[1]; \ +} + +#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \ + xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \ + } + +#endif // USE_SSE2 + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +v4sf log_ps(v4sf x) { +#ifdef USE_SSE2 + v4si emm0; +#else + v2si mm0, mm1; +#endif + v4sf one = *(v4sf*)_ps_1; + + v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); + + x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */ + +#ifndef USE_SSE2 + /* part 1: x = frexpf(x, &e); */ + COPY_XMM_TO_MM(x, mm0, mm1); + mm0 = _mm_srli_pi32(mm0, 23); + mm1 = _mm_srli_pi32(mm1, 23); +#else + emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); +#endif + /* keep only the fractional part */ + x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask); + x = _mm_or_ps(x, *(v4sf*)_ps_0p5); + +#ifndef USE_SSE2 + /* now e=mm0:mm1 contain the really base-2 exponent */ + mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f); + mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f); + v4sf e = _mm_cvtpi32x2_ps(mm0, mm1); + _mm_empty(); /* bye bye mmx */ +#else + emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f); + v4sf e = _mm_cvtepi32_ps(emm0); +#endif + + e = _mm_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF); + v4sf tmp = _mm_and_ps(x, mask); + x = _mm_sub_ps(x, one); + e = _mm_sub_ps(e, _mm_and_ps(one, mask)); + x = _mm_add_ps(x, tmp); + + + v4sf z = _mm_mul_ps(x,x); + + v4sf y = *(v4sf*)_ps_cephes_log_p0; + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8); + y = _mm_mul_ps(y, x); + + y = _mm_mul_ps(y, z); + + + tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1); + y = _mm_add_ps(y, tmp); + + + tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); + y = _mm_sub_ps(y, tmp); + + tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2); + x = _mm_add_ps(x, y); + x = _mm_add_ps(x, tmp); + x = _mm_or_ps(x, 
invalid_mask); // negative arg will be NAN + return x; +} + +_PS_CONST(exp_hi, 88.3762626647949f); +_PS_CONST(exp_lo, -88.3762626647949f); + +_PS_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS_CONST(cephes_exp_C1, 0.693359375); +_PS_CONST(cephes_exp_C2, -2.12194440e-4); + +_PS_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS_CONST(cephes_exp_p5, 5.0000001201E-1); + +v4sf exp_ps(v4sf x) { + v4sf tmp = _mm_setzero_ps(), fx; +#ifdef USE_SSE2 + v4si emm0; +#else + v2si mm0, mm1; +#endif + v4sf one = *(v4sf*)_ps_1; + + x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi); + x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF); + fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5); + + /* how to perform a floorf with SSE: just below */ +#ifndef USE_SSE2 + /* step 1 : cast to int */ + tmp = _mm_movehl_ps(tmp, fx); + mm0 = _mm_cvttps_pi32(fx); + mm1 = _mm_cvttps_pi32(tmp); + /* step 2 : cast back to float */ + tmp = _mm_cvtpi32x2_ps(mm0, mm1); +#else + emm0 = _mm_cvttps_epi32(fx); + tmp = _mm_cvtepi32_ps(emm0); +#endif + /* if greater, substract 1 */ + v4sf mask = _mm_cmpgt_ps(tmp, fx); + mask = _mm_and_ps(mask, one); + fx = _mm_sub_ps(tmp, mask); + + tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1); + v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2); + x = _mm_sub_ps(x, tmp); + x = _mm_sub_ps(x, z); + + z = _mm_mul_ps(x,x); + + v4sf y = *(v4sf*)_ps_cephes_exp_p0; + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, x); + y = _mm_add_ps(y, one); + + /* build 2^n */ +#ifndef USE_SSE2 + z = _mm_movehl_ps(z, fx); + mm0 = _mm_cvttps_pi32(fx); + mm1 = _mm_cvttps_pi32(z); + mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f); + mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f); + mm0 = _mm_slli_pi32(mm0, 23); + mm1 = _mm_slli_pi32(mm1, 23); + + v4sf pow2n; + COPY_MM_TO_XMM(mm0, mm1, pow2n); + _mm_empty(); +#else + emm0 = _mm_cvttps_epi32(fx); + emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f); + emm0 = _mm_slli_epi32(emm0, 23); + v4sf pow2n = _mm_castsi128_ps(emm0); +#endif + y = _mm_mul_ps(y, pow2n); + return y; +} + +_PS_CONST(minus_cephes_DP1, -0.78515625); +_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS_CONST(sincof_p0, -1.9515295891E-4); +_PS_CONST(sincof_p1, 8.3321608736E-3); +_PS_CONST(sincof_p2, -1.6666654611E-1); +_PS_CONST(coscof_p0, 2.443315711809948E-005); +_PS_CONST(coscof_p1, -1.388731625493765E-003); +_PS_CONST(coscof_p2, 4.166664568298827E-002); +_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + + +/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so + it runs also on old athlons XPs and the pentium III of your grand + mother. + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). 
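+
+   Illustrative use (a caller-side sketch; the input values are
+   assumptions, not taken from this file):
+
+     ALIGN16_BEG float in[4] ALIGN16_END = { 0.f, 0.5f, 1.f, 2.f };
+     v4sf y = sin_ps(_mm_load_ps(in));  // four sines at once
+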
+ + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. + + Performance is also surprisingly good, 1.33 times faster than the + macos vsinf SSE2 function, and 1.5 times faster than the + __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not + too bad for an SSE1 function (with no special tuning) ! + However the latter libraries probably have a much better handling of NaN, + Inf, denormalized and other special arguments.. + + On my core 1 duo, the execution of this function takes approximately 95 cycles. + + From what I have observed on the experiments with Intel AMath lib, switching to an + SSE2 version would improve the perf by only 10%. + + Since it is based on SSE intrinsics, it has to be compiled at -O2 to + deliver full speed. +*/ +v4sf sin_ps(v4sf x) { // any x + v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y; + +#ifdef USE_SSE2 + v4si emm0, emm2; +#else + v2si mm0, mm1, mm2, mm3; +#endif + sign_bit = x; + /* take the absolute value */ + x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask); + + /* scale by 4/Pi */ + y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); + + //printf("plop:"); print4(y); +#ifdef USE_SSE2 + /* store the integer part of y in mm0 */ + emm2 = _mm_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); + emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); + y = _mm_cvtepi32_ps(emm2); + /* get the swap sign flag */ + emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4); + emm0 = _mm_slli_epi32(emm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 +#endif + +#include +using std::cerr; +using std::endl; + +namespace breakfastquay { + +#ifdef HAVE_IPP + +template <> +float *allocate(size_t count) +{ + float *ptr = ippsMalloc_32f(count); + if (!ptr) throw (std::bad_alloc()); + return ptr; +} + +template <> +double *allocate(size_t count) +{ + double *ptr = ippsMalloc_64f(count); + if (!ptr) throw (std::bad_alloc()); + return ptr; +} + +template <> +void deallocate(float *ptr) +{ + if (ptr) ippsFree((void *)ptr); +} + +template <> +void deallocate(double *ptr) +{ + if (ptr) ippsFree((void *)ptr); +} + +#endif + +} + diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/src/Barrier.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqvec/src/Barrier.cpp Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,67 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + bqvec + + A small library for vector arithmetic and allocation in C++ using + raw C pointer arrays. + + Copyright 2007-2015 Particular Programs Ltd. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. +*/ + +#include "Barrier.h" + +namespace breakfastquay { + +void system_memorybarrier() +{ +#if defined __APPLE__ + + OSMemoryBarrier(); + +#elif (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1) + + __sync_synchronize(); + +#elif defined _WIN32 + +#if defined __MSVC__ + MemoryBarrier(); +#else /* (mingw) */ + LONG Barrier = 0; + __asm__ __volatile__("xchgl %%eax,%0 " + : "=r" (Barrier)); +#endif + +#else +#warning "No memory barrier defined" +#endif + +} + +} + diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/src/VectorOpsComplex.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft/native/bqvec/src/VectorOpsComplex.cpp Sat Oct 17 15:00:08 2015 +0100 @@ -0,0 +1,373 @@ +/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */ + +/* + bqvec + + A small library for vector arithmetic and allocation in C++ using + raw C pointer arrays. + + Copyright 2007-2015 Particular Programs Ltd. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, copy, + modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR + ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the names of Chris Cannam and + Particular Programs Ltd shall not be used in advertising or + otherwise to promote the sale, use or other dealings in this + Software without prior written authorization. 
+*/ + +#include "VectorOpsComplex.h" + +#include + +#ifdef __MSVC__ +#include +#define alloca _alloca +#endif + +#if defined USE_POMMIER_MATHFUN +#if defined __ARMEL__ +#include "pommier/neon_mathfun.h" +#else +#include "pommier/sse_mathfun.h" +#endif +#endif + +#ifdef __MSVC__ +#include +#define alloca _alloca +#endif + +namespace breakfastquay { + +#ifdef USE_APPROXIMATE_ATAN2 +float approximate_atan2f(float real, float imag) +{ + static const float pi = M_PI; + static const float pi2 = M_PI / 2; + + float atan; + + if (real == 0.f) { + + if (imag > 0.0f) atan = pi2; + else if (imag == 0.0f) atan = 0.0f; + else atan = -pi2; + + } else { + + float z = imag/real; + + if (fabsf(z) < 1.f) { + atan = z / (1.f + 0.28f * z * z); + if (real < 0.f) { + if (imag < 0.f) atan -= pi; + else atan += pi; + } + } else { + atan = pi2 - z / (z * z + 0.28f); + if (imag < 0.f) atan -= pi; + } + } +} +#endif + +#if defined USE_POMMIER_MATHFUN + +#ifdef __ARMEL__ +typedef union { + float f[4]; + int i[4]; + v4sf v; +} V4SF; +#else +typedef ALIGN16_BEG union { + float f[4]; + int i[4]; + v4sf v; +} ALIGN16_END V4SF; +#endif + +void +v_polar_to_cartesian_pommier(float *const BQ_R__ real, + float *const BQ_R__ imag, + const float *const BQ_R__ mag, + const float *const BQ_R__ phase, + const int count) +{ + int idx = 0, tidx = 0; + int i = 0; + + for (int i = 0; i + 4 < count; i += 4) { + + V4SF fmag, fphase, fre, fim; + + for (int j = 0; j < 3; ++j) { + fmag.f[j] = mag[idx]; + fphase.f[j] = phase[idx++]; + } + + sincos_ps(fphase.v, &fim.v, &fre.v); + + for (int j = 0; j < 3; ++j) { + real[tidx] = fre.f[j] * fmag.f[j]; + imag[tidx++] = fim.f[j] * fmag.f[j]; + } + } + + while (i < count) { + float re, im; + c_phasor(&re, &im, phase[i]); + real[tidx] = re * mag[i]; + imag[tidx++] = im * mag[i]; + ++i; + } +} + +void +v_polar_interleaved_to_cartesian_inplace_pommier(float *const BQ_R__ srcdst, + const int count) +{ + int i; + int idx = 0, tidx = 0; + + for (i = 0; i + 4 < count; i += 4) { + + V4SF fmag, fphase, fre, fim; + + for (int j = 0; j < 3; ++j) { + fmag.f[j] = srcdst[idx++]; + fphase.f[j] = srcdst[idx++]; + } + + sincos_ps(fphase.v, &fim.v, &fre.v); + + for (int j = 0; j < 3; ++j) { + srcdst[tidx++] = fre.f[j] * fmag.f[j]; + srcdst[tidx++] = fim.f[j] * fmag.f[j]; + } + } + + while (i < count) { + float real, imag; + float mag = srcdst[idx++]; + float phase = srcdst[idx++]; + c_phasor(&real, &imag, phase); + srcdst[tidx++] = real * mag; + srcdst[tidx++] = imag * mag; + ++i; + } +} + +void +v_polar_to_cartesian_interleaved_pommier(float *const BQ_R__ dst, + const float *const BQ_R__ mag, + const float *const BQ_R__ phase, + const int count) +{ + int i; + int idx = 0, tidx = 0; + + for (i = 0; i + 4 <= count; i += 4) { + + V4SF fmag, fphase, fre, fim; + + for (int j = 0; j < 3; ++j) { + fmag.f[j] = mag[idx]; + fphase.f[j] = phase[idx]; + ++idx; + } + + sincos_ps(fphase.v, &fim.v, &fre.v); + + for (int j = 0; j < 3; ++j) { + dst[tidx++] = fre.f[j] * fmag.f[j]; + dst[tidx++] = fim.f[j] * fmag.f[j]; + } + } + + while (i < count) { + float real, imag; + c_phasor(&real, &imag, phase[i]); + dst[tidx++] = real * mag[i]; + dst[tidx++] = imag * mag[i]; + ++i; + } +} + +#endif + +#ifndef NO_COMPLEX_TYPES + +#if defined HAVE_IPP + +void +v_polar_to_cartesian(bq_complex_t *const BQ_R__ dst, + const bq_complex_element_t *const BQ_R__ mag, + const bq_complex_element_t *const BQ_R__ phase, + const int count) +{ + if (sizeof(bq_complex_element_t) == sizeof(float)) { + ippsPolarToCart_32fc((const float *)mag, (const float 
*)phase, + (Ipp32fc *)dst, count); + } else { + ippsPolarToCart_64fc((const double *)mag, (const double *)phase, + (Ipp64fc *)dst, count); + } +} + +#elif defined HAVE_VDSP + +void +v_polar_to_cartesian(bq_complex_t *const BQ_R__ dst, + const bq_complex_element_t *const BQ_R__ mag, + const bq_complex_element_t *const BQ_R__ phase, + const int count) +{ + bq_complex_element_t *sc = (bq_complex_element_t *) + alloca(count * 2 * sizeof(bq_complex_element_t)); + + if (sizeof(bq_complex_element_t) == sizeof(float)) { + vvsincosf((float *)sc, (float *)(sc + count), (float *)phase, &count); + } else { + vvsincos((double *)sc, (double *)(sc + count), (double *)phase, &count); + } + + int sini = 0; + int cosi = count; + + for (int i = 0; i < count; ++i) { + dst[i].re = mag[i] * sc[cosi++]; + dst[i].im = mag[i] * sc[sini++]; + } +} + +#else + +void +v_polar_to_cartesian(bq_complex_t *const BQ_R__ dst, + const bq_complex_element_t *const BQ_R__ mag, + const bq_complex_element_t *const BQ_R__ phase, + const int count) +{ + for (int i = 0; i < count; ++i) { + dst[i] = c_phasor(phase[i]); + } + for (int i = 0; i < count; ++i) { + dst[i].re *= mag[i]; + dst[i].im *= mag[i]; + } +} + +#endif + +#if defined USE_POMMIER_MATHFUN + +//!!! further tests reqd. This is only single precision but it seems +//!!! to be much faster than normal math library sincos. The comments +//!!! note that precision suffers for high arguments to sincos though, +//!!! and that is probably a common case for us + +void +v_polar_interleaved_to_cartesian(bq_complex_t *const BQ_R__ dst, + const bq_complex_element_t *const BQ_R__ src, + const int count) +{ + int idx = 0, tidx = 0; + + for (int i = 0; i < count; i += 4) { + + V4SF fmag, fphase, fre, fim; + + for (int j = 0; j < 3; ++j) { + fmag.f[j] = src[idx++]; + fphase.f[j] = src[idx++]; + } + + sincos_ps(fphase.v, &fim.v, &fre.v); + + for (int j = 0; j < 3; ++j) { + dst[tidx].re = fre.f[j] * fmag.f[j]; + dst[tidx++].im = fim.f[j] * fmag.f[j]; + } + } +} + +#elif (defined HAVE_IPP || defined HAVE_VDSP) + +// with a vector library, it should be faster to deinterleave and call +// the basic fn + +void +v_polar_interleaved_to_cartesian(bq_complex_t *const BQ_R__ dst, + const bq_complex_element_t *const BQ_R__ src, + const int count) +{ + bq_complex_element_t *mag = (bq_complex_element_t *) + alloca(count * sizeof(bq_complex_element_t)); + bq_complex_element_t *phase = (bq_complex_element_t *) + alloca(count * sizeof(bq_complex_element_t)); + bq_complex_element_t *magphase[] = { mag, phase }; + + v_deinterleave(magphase, src, 2, count); + v_polar_to_cartesian(dst, mag, phase, count); +} + +#else + +// without a vector library, better avoid the deinterleave step + +void +v_polar_interleaved_to_cartesian(bq_complex_t *const BQ_R__ dst, + const bq_complex_element_t *const BQ_R__ src, + const int count) +{ + bq_complex_element_t mag, phase; + int idx = 0; + for (int i = 0; i < count; ++i) { + mag = src[idx++]; + phase = src[idx++]; + dst[i] = c_phasor(phase); + dst[i].re *= mag; + dst[i].im *= mag; + } +} + +#endif + +void +v_polar_interleaved_to_cartesian_inplace(bq_complex_element_t *const BQ_R__ srcdst, + const int count) +{ + // Not ideal + bq_complex_element_t mag, phase; + int ii = 0, io = 0; + for (int i = 0; i < count; ++i) { + mag = srcdst[ii++]; + phase = srcdst[ii++]; + bq_complex_t p = c_phasor(phase); + srcdst[io++] = mag * p.re; + srcdst[io++] = mag * p.im; + } +} + +#endif + +} diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/test/TestVectorOps.cpp --- /dev/null Thu Jan 
01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/test/TestVectorOps.cpp Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,254 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
+
+#include "bqvec/VectorOpsComplex.h"
+
+#include <iostream>
+#include <cstdlib>
+
+#include <cmath>
+#include <ctime>
+
+using namespace std;
+
+namespace breakfastquay {
+
+namespace Test {
+
+#ifdef _WIN32
+#define drand48() (-1+2*((float)rand())/RAND_MAX)
+#endif
+
+bool
+testMultiply()
+{
+    cerr << "testVectorOps: testing v_multiply complex" << endl;
+
+    const int N = 1024;
+    bq_complex_t target[N];
+    bq_complex_t src1[N];
+    bq_complex_t src2[N];
+
+    for (int i = 0; i < N; ++i) {
+        src1[i].re = drand48();
+        src1[i].im = drand48();
+        src2[i].re = drand48();
+        src2[i].im = drand48();
+    }
+
+    double mean, first, last, total = 0;
+    for (int i = 0; i < N; ++i) {
+        bq_complex_t result;
+        c_multiply(result, src1[i], src2[i]);
+        if (i == 0) first = result.re;
+        if (i == N-1) last = result.im;
+        total += result.re;
+        total += result.im;
+    }
+    mean = total / (N*2);
+    cerr << "Naive method: mean = " << mean << ", first = " << first
+         << ", last = " << last << endl;
+
+    v_multiply(target, src1, src2, N);
+    total = 0;
+
+    for (int i = 0; i < N; ++i) {
+        if (i == 0) first = target[i].re;
+        if (i == N-1) last = target[i].im;
+        total += target[i].re;
+        total += target[i].im;
+    }
+    mean = total / (N*2);
+    cerr << "v_multiply: mean = " << mean << ", first = " << first
+         << ", last = " << last << endl;
+
+    int iterations = 50000;
+    cerr << "Iterations: " << iterations << endl;
+
+    cerr << "CLOCKS_PER_SEC = " << CLOCKS_PER_SEC << endl;
+    float divisor = float(CLOCKS_PER_SEC) / 1000.f;
+
+    clock_t start = clock();
+
+    for (int j = 0; j < iterations; ++j) {
+        for (int i = 0; i < N; ++i) {
+            c_multiply(target[i], src1[i], src2[i]);
+        }
+    }
+
+    clock_t end = clock();
+
+    cerr << "Time for naive method: " << float(end - start)/divisor << endl;
+
+    start = clock();
+
+    for (int j = 0; j < iterations; ++j) {
+        v_multiply(target, src1, src2, N);
+    }
+
+    end = clock();
+
+    cerr << "Time for v_multiply: " << float(end - start)/divisor << endl;
+
+    return true;
+}
+
+bool
+testPolarToCart()
+{
+    cerr << "testVectorOps: testing v_polar_to_cartesian" << endl;
+
+    const int N = 1024;
+    bq_complex_t target[N];
+    double mag[N];
+    double phase[N];
+
+    for (int i = 0; i < N; ++i) {
+        mag[i] = drand48();
+        phase[i] = (drand48() * M_PI * 2) - M_PI;
+    }
+
+    double mean, first, last, total = 0;
+    for (int i = 0; i < N; ++i) {
+        double real = mag[i] * cos(phase[i]);
+        double imag = mag[i] * sin(phase[i]);
+        if (i == 0) first = real;
+        if (i == N-1) last = imag;
+        total += real;
+        total += imag;
+    }
+    mean = total / (N*2);
+    cerr << "Naive method: mean = " << mean << ", first = " << first
+         << ", last = " << last << endl;
+
+    v_polar_to_cartesian(target, mag, phase, N);
+
+    total = 0;
+
+    for (int i = 0; i < N; ++i) {
+        if (i == 0) first = target[i].re;
+        if (i == N-1) last = target[i].im;
+        total += target[i].re;
+        total += target[i].im;
+    }
+    mean = total / (N*2);
+    cerr << "v_polar_to_cartesian: mean = " << mean << ", first = " << first
+         << ", last = " << last << endl;
+
+    int iterations = 10000;
+    cerr << "Iterations: " << iterations << endl;
+
+    cerr << "CLOCKS_PER_SEC = " << CLOCKS_PER_SEC << endl;
+    float divisor = float(CLOCKS_PER_SEC) / 1000.f;
+
+    clock_t start = clock();
+
+    for (int j = 0; j < iterations; ++j) {
+        for (int i = 0; i < N; ++i) {
+            target[i].re = mag[i] * cos(phase[i]);
+            target[i].im = mag[i] * sin(phase[i]);
+        }
+    }
+
+    clock_t end = clock();
+
+    cerr << "Time for naive method: " << float(end - start)/divisor << endl;
+
+    start = clock();
+
+    for (int j = 0; j < iterations; ++j) {
+        v_polar_to_cartesian(target, mag, phase, N);
+    }
+
+    end = clock();
+
+    cerr << "Time for v_polar_to_cartesian: " << float(end - start)/divisor << endl;
+
+    return true;
+}
+
+bool
+testPolarToCartInterleaved()
+{
+    cerr << "testVectorOps: testing v_polar_interleaved_to_cartesian" << endl;
+
+    const int N = 1024;
+    bq_complex_t target[N];
+    double source[N*2];
+
+    for (int i = 0; i < N; ++i) {
+        source[i*2] = drand48();
+        source[i*2+1] = (drand48() * M_PI * 2) - M_PI;
+    }
+
+    double mean, first, last, total = 0;
+    for (int i = 0; i < N; ++i) {
+        double real = source[i*2] * cos(source[i*2+1]);
+        double imag = source[i*2] * sin(source[i*2+1]);
+        if (i == 0) first = real;
+        if (i == N-1) last = imag;
+        total += real;
+        total += imag;
+    }
+    mean = total / (N*2);
+    cerr << "Naive method: mean = " << mean << ", first = " << first
+         << ", last = " << last << endl;
+
+    v_polar_interleaved_to_cartesian(target, source, N);
+
+    total = 0;
+
+    for (int i = 0; i < N; ++i) {
+        if (i == 0) first = target[i].re;
+        if (i == N-1) last = target[i].im;
+        total += target[i].re;
+        total += target[i].im;
+    }
+    mean = total / (N*2);
+    cerr << "v_polar_interleaved_to_cartesian: mean = " << mean << ", first = " << first
+         << ", last = " << last << endl;
+
+    int iterations = 10000;
+    cerr << "Iterations: " << iterations << endl;
+
+    cerr << "CLOCKS_PER_SEC = " << CLOCKS_PER_SEC << endl;
+    float divisor = float(CLOCKS_PER_SEC) / 1000.f;
+
+    clock_t start = clock();
+
+    for (int j = 0; j < iterations; ++j) {
+        for (int i = 0; i < N; ++i) {
+            target[i].re = source[i*2] * cos(source[i*2+1]);
+            target[i].im = source[i*2] * sin(source[i*2+1]);
+        }
+    }
+
+    clock_t end = clock();
+
+    cerr << "Time for naive method: " << float(end - start)/divisor << endl;
+
+    start = clock();
+
+    for (int j = 0; j < iterations; ++j) {
+        v_polar_interleaved_to_cartesian(target, source, N);
+    }
+
+    end = clock();
+
+    cerr << "Time for v_polar_interleaved_to_cartesian: " << float(end - start)/divisor << endl;
+
+    return true;
+}
+
+bool
+testVectorOps()
+{
+    if (!testMultiply()) return false;
+    if (!testPolarToCart()) return false;
+    if (!testPolarToCartInterleaved()) return false;
+
+    return true;
+}
+
+}
+
+}
+
diff -r 77fca9c01e46 -r cf59817a5983 fft/native/bqvec/test/TestVectorOps.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/test/TestVectorOps.h Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,12 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
+
+namespace breakfastquay {
+
+namespace Test {
+
+bool testVectorOps();
+
+}
+
+}
+
diff -r 77fca9c01e46 -r cf59817a5983 fft/native/native.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/native.cpp Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,59 @@
+
+#include <bqfft/FFT.h>
+
+#include <vector>
+#include <chrono>
+#include <iostream>
+#include <cmath>
+
+using namespace std;
+using namespace breakfastquay;
+
+int main(int argc, char **argv)
+{
+    vector<int> sizes { 512, 2048 };
+
+    int iterations = 2000;
+    int times = 100;
+
+    for (auto size: sizes) {
+
+        FFT fft(size);
+        double total = 0.0;
+
+        auto start = chrono::high_resolution_clock::now();
+
+        for (int ti = 0; ti < times; ++ti) {
+
+            total = 0.0;
+
+            for (int i = 0; i < iterations; ++i) {
+
+                vector<double> ri(size), ro(size/2+1), io(size/2+1);
+                for (int j = 0; j < size; ++j) {
+                    ri[j] = (j % 2) / 4.0;
+                }
+
+                fft.forward(ri.data(), ro.data(), io.data());
+
+                for (int j = 0; j <= size/2; ++j) {
+                    total += sqrt(ro[j] * ro[j] + io[j] * io[j]);
+                }
+
+                // synthesise the conjugate half
+                for (int j = 1; j < size/2; ++j) {
+                    total += sqrt(ro[j] * ro[j] + io[j] * io[j]);
+                }
+            }
+        }
+
+        auto end = chrono::high_resolution_clock::now();
+
+        double ms = chrono::duration<double, milli>(end - start).count() / times;
+
+        cerr << "for " << iterations << " * size " << size << ": total = "
+             << total << ", time = " << ms
+             << " ms (" << (iterations / (ms / 1000.0)) << " itr/sec)" << endl;
+    }
+}
+