changeset 29:cf59817a5983

Build stuff for native
author Chris Cannam
date Sat, 17 Oct 2015 15:00:08 +0100
parents 77fca9c01e46
children 55f715002fe5
files fft/native.cpp fft/native/Makefile.osx fft/native/bqfft/COPYING fft/native/bqfft/Makefile fft/native/bqfft/README.txt fft/native/bqfft/bqfft/FFT.h fft/native/bqfft/src/FFT.cpp fft/native/bqfft/test/Compares.h fft/native/bqfft/test/TestFFT.h fft/native/bqfft/test/main.cpp fft/native/bqfft/test/test.pro fft/native/bqvec/COPYING fft/native/bqvec/Makefile fft/native/bqvec/README.txt fft/native/bqvec/bqvec/Allocators.h fft/native/bqvec/bqvec/Barrier.h fft/native/bqvec/bqvec/ComplexTypes.h fft/native/bqvec/bqvec/Range.h fft/native/bqvec/bqvec/Restrict.h fft/native/bqvec/bqvec/RingBuffer.h fft/native/bqvec/bqvec/VectorOps.h fft/native/bqvec/bqvec/VectorOpsComplex.h fft/native/bqvec/pommier/neon_mathfun.h fft/native/bqvec/pommier/sse_mathfun.h fft/native/bqvec/src/Allocators.cpp fft/native/bqvec/src/Barrier.cpp fft/native/bqvec/src/VectorOpsComplex.cpp fft/native/bqvec/test/TestVectorOps.cpp fft/native/bqvec/test/TestVectorOps.h fft/native/native.cpp
diffstat 30 files changed, 9268 insertions(+), 59 deletions(-)
--- a/fft/native.cpp	Wed Oct 07 21:31:02 2015 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,59 +0,0 @@
-
-#include <bqfft/FFT.h>
-
-#include <vector>
-#include <iostream>
-#include <cmath>
-#include <chrono>
-
-using namespace std;
-using namespace breakfastquay;
-
-int main(int argc, char **argv)
-{
-    vector<int> sizes { 512, 2048 };
-
-    int iterations = 2000;
-    int times = 100;
-
-    for (auto size: sizes) {
-
-	FFT fft(size);
-	double total = 0.0;
-
-	auto start = chrono::high_resolution_clock::now();
-
-	for (int ti = 0; ti < times; ++ti) {
-	    
-	    total = 0.0;
-
-	    for (int i = 0; i < iterations; ++i) {
-
-		vector<double> ri(size), ro(size/2+1), io(size/2+1);
-		for (int j = 0; j < size; ++j) {
-		    ri[j] = (j % 2) / 4.0;
-		}
-
-		fft.forward(ri.data(), ro.data(), io.data());
-
-		for (int j = 0; j <= size/2; ++j) {
-		    total += sqrt(ro[j] * ro[j] + io[j] * io[j]);
-		}
-
-		// synthesise the conjugate half
-		for (int j = 1; j < size/2; ++j) {
-		    total += sqrt(ro[j] * ro[j] + io[j] * io[j]);
-		}
-	    }
-	}
-
-	auto end = chrono::high_resolution_clock::now();
-
-	double ms = chrono::duration<double, milli>(end - start).count() / times;
-	
-	cerr << "for " << iterations << " * size " << size << ": total = "
-	     << total << ", time = " << ms
-	     << " ms (" << (iterations / (ms / 1000.0)) << " itr/sec)" << endl;
-    }
-}
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/Makefile.osx	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,6 @@
+
+OBJECTS	:= bqfft/src/FFT.o native.o
+CXXFLAGS   := -O3 -ffast-math -DMALLOC_IS_ALIGNED -DHAVE_VDSP -Ibqfft -Ibqvec -std=c++11
+
+native:	   $(OBJECTS)
+	   $(CXX) -o $@ $^ -framework Accelerate
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqfft/COPYING	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,26 @@
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqfft/Makefile	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,90 @@
+
+# Add to FFT_DEFINES the relevant options for your desired third-party
+# library support.
+#
+# Available options are
+#
+#  -DHAVE_IPP         Intel's Integrated Performance Primitives are available
+#  -DHAVE_VDSP        Apple's Accelerate framework is available
+#  -DHAVE_FFTW3       The FFTW library is available
+#  -DHAVE_KISSFFT     The KissFFT library is available
+#  -DHAVE_MEDIALIB    The Medialib library (from Sun) is available
+#  -DHAVE_OPENMAX     The OpenMAX signal processing library is available
+#  -DUSE_BUILTIN_FFT  Compile the built-in FFT code (which is very slow)
+#
+# You may define more than one of these. If you define
+# USE_BUILTIN_FFT, the code will be compiled in but will only be used
+# if no other option is available. The default, if no flags are
+# supplied, is for the code to refuse to compile.
+# 
+# Add any relevant -I flags for include paths as well.
+#
+# Note that you must supply the same flags when including bqfft
+# headers later as you are using now when compiling the library. (You
+# may find it simplest to just add the bqfft source files to your
+# application's build system and not build a bqfft library at all.)
+
+# The default set here is Apple's vDSP (the Accelerate framework).
+# Note that the built-in implementation (USE_BUILTIN_FFT) is VERY
+# SLOW; see the alternatives above before falling back to it.
+FFT_DEFINES	:= -DHAVE_VDSP
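+# For example (hypothetical paths, shown for illustration only), a
+# build against FFTW3 installed under /usr/local might instead use:
+#
+#   FFT_DEFINES := -DHAVE_FFTW3 -I/usr/local/include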
+
+
+# Add to ALLOCATOR_DEFINES options relating to aligned malloc.
+#
+# Available options are
+#
+#  -DHAVE_POSIX_MEMALIGN     The posix_memalign call is available in sys/mman.h
+#  -DLACK_POSIX_MEMALIGN     The posix_memalign call is not available
+#
+#  -DMALLOC_IS_ALIGNED       The malloc call already returns aligned memory
+#  -DMALLOC_IS_NOT_ALIGNED   The malloc call does not return aligned memory
+#
+#  -DUSE_OWN_ALIGNED_MALLOC  No aligned malloc is available, roll your own
+#
+#  -DLACK_BAD_ALLOC          The C++ library lacks the std::bad_alloc exception
+#
+# Here "aligned" is assumed to mean "aligned enough for whatever
+# vector stuff the space will be used for" which most likely means
+# 16-byte alignment.
+#
+# The default is to use _aligned_malloc when building with Visual C++,
+# system malloc when building on OS X, and posix_memalign otherwise.
+#
+# Note that you must supply the same flags when including bqfft
+# headers later as you are using now when compiling the library. (You
+# may find it simplest to just add the bqfft source files to your
+# application's build system and not build a bqfft library at all.)
+
+ALLOCATOR_DEFINES := -DMALLOC_IS_ALIGNED
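+# For example (an illustrative assumption, not a tested default), a
+# typical Linux build might instead use:
+#
+#   ALLOCATOR_DEFINES := -DHAVE_POSIX_MEMALIGN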
+
+SRC_DIR	:= src
+HEADER_DIR := bqfft
+
+SOURCES	:= $(wildcard $(SRC_DIR)/*.cpp)
+HEADERS	:= $(wildcard $(HEADER_DIR)/*.h) $(wildcard $(SRC_DIR)/*.h)
+
+OBJECTS	:= $(SOURCES:.cpp=.o)
+OBJECTS	:= $(OBJECTS:.c=.o)
+
+CXXFLAGS := $(FFT_DEFINES) $(ALLOCATOR_DEFINES) -O3 -ffast-math -I. -I../bqvec -fpic
+
+LIBRARY	:= libbqfft.a
+
+all:	$(LIBRARY)
+
+$(LIBRARY):	$(OBJECTS)
+	$(AR) rc $@ $^
+
+clean:		
+	rm -f $(OBJECTS)
+
+distclean:	clean
+	rm -f $(LIBRARY)
+
+depend:
+	makedepend -Y -fMakefile $(SOURCES) $(HEADERS)
+
+
+# DO NOT DELETE
+
+src/FFT.o: bqfft/FFT.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqfft/README.txt	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,20 @@
+
+bqfft
+=====
+
+A small library wrapping various FFT implementations for some common
+audio processing use cases. Note this is not a general FFT interface,
+as it handles only power-of-two FFT sizes and real inputs.
+
+Requires the bqvec library.
+
+C++ standard required: C++98 (does not use C++11)
+
+Copyright 2007-2015 Particular Programs Ltd.
+
+This code originated as part of the Rubber Band Library written by the
+same authors (see https://bitbucket.org/breakfastquay/rubberband/).
+It has been pulled out into a separate library and relicensed under a
+more permissive licence: a BSD/MIT-style licence, as opposed to the
+GPL used for Rubber Band.  See the file COPYING for details.
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqfft/bqfft/FFT.h	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,145 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqfft
+
+    A small library wrapping various FFT implementations for some
+    common audio processing use cases.
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#ifndef BQFFT_FFT_H
+#define BQFFT_FFT_H
+
+#include <bqvec/Restrict.h>
+
+#include <string>
+#include <set>
+
+namespace breakfastquay {
+
+class FFTImpl;
+
+/**
+ * Provide basic FFT computations using one of a set of candidate FFT
+ * implementations (depending on compile flags).
+ *
+ * Implements real->complex FFTs of power-of-two sizes only.  Note
+ * that only the first half of the output signal is returned (the
+ * complex conjugates half is omitted), so the "complex" arrays need
+ * room for size/2+1 elements.
+ *
+ * The "interleaved" functions use the format sometimes called CCS --
+ * size/2+1 real+imaginary pairs.  So, the array elements at indices 1
+ * and size+1 will always be zero (since the signal is real).
+ *
+ * All pointer arguments must point to valid data. A NullArgument
+ * exception is thrown if any argument is NULL.
+ *
+ * Neither forward nor inverse transform is scaled.
+ *
+ * This class is reentrant but not thread safe: use a separate
+ * instance per thread, or use a mutex.
+ */
+class FFT
+{
+public:
+    enum Exception {
+        NullArgument, InvalidSize, InvalidImplementation, InternalError
+    };
+
+    FFT(int size, int debugLevel = 0); // may throw InvalidSize
+    ~FFT();
+
+    void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut);
+    void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut);
+    void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut);
+    void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut);
+
+    void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut);
+    void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut);
+    void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut);
+    void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut);
+
+    void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut);
+    void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut);
+    void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut);
+    void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut);
+
+    void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut);
+    void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut);
+    void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut);
+    void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut);
+
+    // Calling one or both of these is optional -- if neither is
+    // called, the first call to a forward or inverse method will call
+    // init().  You only need call these if you don't want to risk
+    // expensive allocations etc happening in forward or inverse.
+    void initFloat();
+    void initDouble();
+
+    enum Precision {
+        SinglePrecision = 0x1,
+        DoublePrecision = 0x2
+    };
+    typedef int Precisions;
+
+    /**
+     * Return the OR of all precisions supported by this
+     * implementation. All of the functions (float and double) are
+     * available regardless of the supported implementations, but they
+     * will be calculated at the proper precision only if it is
+     * available. (So float functions will be calculated using doubles
+     * and then truncated if single-precision is unavailable, and
+     * double functions will use single-precision arithmetic if double
+     * is unavailable.)
+     */
+    Precisions getSupportedPrecisions() const;
+
+    static std::set<std::string> getImplementations();
+    static std::string getDefaultImplementation();
+    static void setDefaultImplementation(std::string);
+
+#ifdef FFT_MEASUREMENT
+    static std::string tune();
+#endif
+
+protected:
+    FFTImpl *d;
+    static std::string m_implementation;
+    static void pickDefaultImplementation();
+
+private:
+    FFT(const FFT &); // not provided
+    FFT &operator=(const FFT &); // not provided
+};
+
+}
+
+#endif
+
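+// ----------------------------------------------------------------------
+// Illustrative usage sketch for the FFT class declared above; not part
+// of the library. The size and buffer names here are assumptions chosen
+// for the example.
+//
+//     #include <bqfft/FFT.h>
+//     #include <vector>
+//
+//     void example()
+//     {
+//         const int size = 512;             // must be a power of two
+//         breakfastquay::FFT fft(size);     // may throw InvalidSize
+//         std::vector<double> in(size), re(size/2 + 1), im(size/2 + 1);
+//         in[0] = 1.0;                      // unit impulse
+//         fft.forward(&in[0], &re[0], &im[0]);
+//         // re/im now hold the size/2+1 non-redundant bins. Neither
+//         // direction is scaled, so a forward-then-inverse round trip
+//         // multiplies the signal by size.
+//     }
+// ----------------------------------------------------------------------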
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqfft/src/FFT.cpp	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,3585 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqfft
+
+    A small library wrapping various FFT implementations for some
+    common audio processing use cases.
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#include "bqfft/FFT.h"
+
+//#include "system/Thread.h"
+
+#include <bqvec/Allocators.h>
+#include <bqvec/VectorOps.h>
+#include <bqvec/VectorOpsComplex.h>
+
+//#define FFT_MEASUREMENT 1
+
+#ifdef FFT_MEASUREMENT
+#include <sstream>
+#endif
+
+#ifdef HAVE_IPP
+#include <ipps.h>
+#endif
+
+#ifdef HAVE_FFTW3
+#include <fftw3.h>
+#endif
+
+#ifdef HAVE_VDSP
+#include <Accelerate/Accelerate.h>
+#endif
+
+#ifdef HAVE_MEDIALIB
+#include <mlib_signal.h>
+#endif
+
+#ifdef HAVE_OPENMAX
+#include <omxSP.h>
+#endif
+
+#ifdef HAVE_SFFT
+extern "C" {
+#include <sfft.h>
+}
+#endif
+
+#ifdef HAVE_KISSFFT
+#include "kissfft/kiss_fftr.h"
+#endif
+
+#ifndef HAVE_IPP
+#ifndef HAVE_FFTW3
+#ifndef HAVE_KISSFFT
+#ifndef USE_BUILTIN_FFT
+#ifndef HAVE_VDSP
+#ifndef HAVE_MEDIALIB
+#ifndef HAVE_OPENMAX
+#ifndef HAVE_SFFT
+#error No FFT implementation selected!
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
+
+#include <cmath>
+#include <iostream>
+#include <map>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+#ifdef FFT_MEASUREMENT
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+#endif
+
+namespace breakfastquay {
+
+class FFTImpl
+{
+public:
+    virtual ~FFTImpl() { }
+
+    virtual FFT::Precisions getSupportedPrecisions() const = 0;
+
+    virtual void initFloat() = 0;
+    virtual void initDouble() = 0;
+
+    virtual void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) = 0;
+    virtual void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) = 0;
+    virtual void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) = 0;
+    virtual void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) = 0;
+
+    virtual void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) = 0;
+    virtual void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) = 0;
+    virtual void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) = 0;
+    virtual void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) = 0;
+
+    virtual void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) = 0;
+    virtual void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) = 0;
+    virtual void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) = 0;
+    virtual void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) = 0;
+
+    virtual void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) = 0;
+    virtual void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) = 0;
+    virtual void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) = 0;
+    virtual void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) = 0;
+};    
+
+namespace FFTs {
+
+#ifdef HAVE_IPP
+
+class D_IPP : public FFTImpl
+{
+public:
+    D_IPP(int size) :
+        m_size(size), m_fspec(0), m_dspec(0)
+    { 
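+        // m_size is required to be a power of two, so the index of its
+        // lowest set bit is the transform order, i.e. log2(m_size)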
+        for (int i = 0; ; ++i) {
+            if (m_size & (1 << i)) {
+                m_order = i;
+                break;
+            }
+        }
+    }
+
+    ~D_IPP() {
+        if (m_fspec) {
+            ippsFFTFree_R_32f(m_fspec);
+            ippsFree(m_fbuf);
+            ippsFree(m_fpacked);
+            ippsFree(m_fspare);
+        }
+        if (m_dspec) {
+            ippsFFTFree_R_64f(m_dspec);
+            ippsFree(m_dbuf);
+            ippsFree(m_dpacked);
+            ippsFree(m_dspare);
+        }
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+        return FFT::SinglePrecision | FFT::DoublePrecision;
+    }
+
+    //!!! rv check
+
+    void initFloat() {
+        if (m_fspec) return;
+        int specSize, specBufferSize, bufferSize;
+        ippsFFTGetSize_R_32f(m_order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast,
+                             &specSize, &specBufferSize, &bufferSize);
+        m_fbuf = ippsMalloc_8u(bufferSize);
+        m_fpacked = ippsMalloc_32f(m_size + 2);
+        m_fspare = ippsMalloc_32f(m_size / 2 + 1);
+        ippsFFTInitAlloc_R_32f(&m_fspec, m_order, IPP_FFT_NODIV_BY_ANY, 
+                               ippAlgHintFast);
+    }
+
+    void initDouble() {
+        if (m_dspec) return;
+        int specSize, specBufferSize, bufferSize;
+        ippsFFTGetSize_R_64f(m_order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast,
+                             &specSize, &specBufferSize, &bufferSize);
+        m_dbuf = ippsMalloc_8u(bufferSize);
+        m_dpacked = ippsMalloc_64f(m_size + 2);
+        m_dspare = ippsMalloc_64f(m_size / 2 + 1);
+        ippsFFTInitAlloc_R_64f(&m_dspec, m_order, IPP_FFT_NODIV_BY_ANY, 
+                               ippAlgHintFast);
+    }
+
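+    // IPP's RToCCS format is size/2+1 interleaved re/im pairs; the
+    // pack/unpack helpers below convert between that layout and the
+    // separate real/imaginary arrays used by this API.
+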
+    void packFloat(const float *BQ_R__ re, const float *BQ_R__ im) {
+        int index = 0;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[index++] = re[i];
+            index++;
+        }
+        index = 0;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_fpacked[index++] = im[i];
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_fpacked[index++] = 0.f;
+            }
+        }
+    }
+
+    void packDouble(const double *BQ_R__ re, const double *BQ_R__ im) {
+        int index = 0;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_dpacked[index++] = re[i];
+            index++;
+        }
+        index = 0;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_dpacked[index++] = im[i];
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_dpacked[index++] = 0.0;
+            }
+        }
+    }
+
+    void unpackFloat(float *re, float *BQ_R__ im) { // re may be equal to m_fpacked
+        int index = 0;
+        const int hs = m_size/2;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                im[i] = m_fpacked[index++];
+            }
+        }
+        index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = m_fpacked[index++];
+            index++;
+        }
+    }        
+
+    void unpackDouble(double *re, double *BQ_R__ im) { // re may be equal to m_dpacked
+        int index = 0;
+        const int hs = m_size/2;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                im[i] = m_dpacked[index++];
+            }
+        }
+        index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = m_dpacked[index++];
+            index++;
+        }
+    }        
+
+    void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) {
+        if (!m_dspec) initDouble();
+        ippsFFTFwd_RToCCS_64f(realIn, m_dpacked, m_dspec, m_dbuf);
+        unpackDouble(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) {
+        if (!m_dspec) initDouble();
+        ippsFFTFwd_RToCCS_64f(realIn, complexOut, m_dspec, m_dbuf);
+    }
+
+    void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) {
+        if (!m_dspec) initDouble();
+        ippsFFTFwd_RToCCS_64f(realIn, m_dpacked, m_dspec, m_dbuf);
+        unpackDouble(m_dpacked, m_dspare);
+        ippsCartToPolar_64f(m_dpacked, m_dspare, magOut, phaseOut, m_size/2+1);
+    }
+
+    void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) {
+        if (!m_dspec) initDouble();
+        ippsFFTFwd_RToCCS_64f(realIn, m_dpacked, m_dspec, m_dbuf);
+        unpackDouble(m_dpacked, m_dspare);
+        ippsMagnitude_64f(m_dpacked, m_dspare, magOut, m_size/2+1);
+    }
+
+    void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) {
+        if (!m_fspec) initFloat();
+        ippsFFTFwd_RToCCS_32f(realIn, m_fpacked, m_fspec, m_fbuf);
+        unpackFloat(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) {
+        if (!m_fspec) initFloat();
+        ippsFFTFwd_RToCCS_32f(realIn, complexOut, m_fspec, m_fbuf);
+    }
+
+    void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) {
+        if (!m_fspec) initFloat();
+        ippsFFTFwd_RToCCS_32f(realIn, m_fpacked, m_fspec, m_fbuf);
+        unpackFloat(m_fpacked, m_fspare);
+        ippsCartToPolar_32f(m_fpacked, m_fspare, magOut, phaseOut, m_size/2+1);
+    }
+
+    void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) {
+        if (!m_fspec) initFloat();
+        ippsFFTFwd_RToCCS_32f(realIn, m_fpacked, m_fspec, m_fbuf);
+        unpackFloat(m_fpacked, m_fspare);
+        ippsMagnitude_32f(m_fpacked, m_fspare, magOut, m_size/2+1);
+    }
+
+    void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) {
+        if (!m_dspec) initDouble();
+        packDouble(realIn, imagIn);
+        ippsFFTInv_CCSToR_64f(m_dpacked, realOut, m_dspec, m_dbuf);
+    }
+
+    void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) {
+        if (!m_dspec) initDouble();
+        ippsFFTInv_CCSToR_64f(complexIn, realOut, m_dspec, m_dbuf);
+    }
+
+    void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) {
+        if (!m_dspec) initDouble();
+        ippsPolarToCart_64f(magIn, phaseIn, realOut, m_dspare, m_size/2+1);
+        packDouble(realOut, m_dspare); // to m_dpacked
+        ippsFFTInv_CCSToR_64f(m_dpacked, realOut, m_dspec, m_dbuf);
+    }
+
+    void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) {
+        if (!m_dspec) initDouble();
+        const int hs1 = m_size/2 + 1;
+        ippsCopy_64f(magIn, m_dspare, hs1);
+        ippsAddC_64f_I(0.000001, m_dspare, hs1);
+        ippsLn_64f_I(m_dspare, hs1);
+        packDouble(m_dspare, 0);
+        ippsFFTInv_CCSToR_64f(m_dpacked, cepOut, m_dspec, m_dbuf);
+    }
+    
+    void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) {
+        if (!m_fspec) initFloat();
+        packFloat(realIn, imagIn);
+        ippsFFTInv_CCSToR_32f(m_fpacked, realOut, m_fspec, m_fbuf);
+    }
+
+    void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) {
+        if (!m_fspec) initFloat();
+        ippsFFTInv_CCSToR_32f(complexIn, realOut, m_fspec, m_fbuf);
+    }
+
+    void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) {
+        if (!m_fspec) initFloat();
+        ippsPolarToCart_32f(magIn, phaseIn, realOut, m_fspare, m_size/2+1);
+        packFloat(realOut, m_fspare); // to m_fpacked
+        ippsFFTInv_CCSToR_32f(m_fpacked, realOut, m_fspec, m_fbuf);
+    }
+
+    void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) {
+        if (!m_fspec) initFloat();
+        const int hs1 = m_size/2 + 1;
+        ippsCopy_32f(magIn, m_fspare, hs1);
+        ippsAddC_32f_I(0.000001f, m_fspare, hs1);
+        ippsLn_32f_I(m_fspare, hs1);
+        packFloat(m_fspare, 0);
+        ippsFFTInv_CCSToR_32f(m_fpacked, cepOut, m_fspec, m_fbuf);
+    }
+
+private:
+    const int m_size;
+    int m_order;
+    IppsFFTSpec_R_32f *m_fspec;
+    IppsFFTSpec_R_64f *m_dspec;
+    Ipp8u *m_fbuf;
+    Ipp8u *m_dbuf;
+    float *m_fpacked;
+    float *m_fspare;
+    double *m_dpacked;
+    double *m_dspare;
+};
+
+#endif /* HAVE_IPP */
+
+#ifdef HAVE_VDSP
+
+class D_VDSP : public FFTImpl
+{
+public:
+    D_VDSP(int size) :
+        m_size(size), m_fspec(0), m_dspec(0),
+        m_fpacked(0), m_fspare(0),
+        m_dpacked(0), m_dspare(0)
+    { 
+        for (int i = 0; ; ++i) {
+            if (m_size & (1 << i)) {
+                m_order = i;
+                break;
+            }
+        }
+    }
+
+    ~D_VDSP() {
+        if (m_fspec) {
+            vDSP_destroy_fftsetup(m_fspec);
+            deallocate(m_fspare);
+            deallocate(m_fspare2);
+            deallocate(m_fbuf->realp);
+            deallocate(m_fbuf->imagp);
+            delete m_fbuf;
+            deallocate(m_fpacked->realp);
+            deallocate(m_fpacked->imagp);
+            delete m_fpacked;
+        }
+        if (m_dspec) {
+            vDSP_destroy_fftsetupD(m_dspec);
+            deallocate(m_dspare);
+            deallocate(m_dspare2);
+            deallocate(m_dbuf->realp);
+            deallocate(m_dbuf->imagp);
+            delete m_dbuf;
+            deallocate(m_dpacked->realp);
+            deallocate(m_dpacked->imagp);
+            delete m_dpacked;
+        }
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+        return FFT::SinglePrecision | FFT::DoublePrecision;
+    }
+
+    //!!! rv check
+
+    void initFloat() {
+        if (m_fspec) return;
+        m_fspec = vDSP_create_fftsetup(m_order, FFT_RADIX2);
+        m_fbuf = new DSPSplitComplex;
+        //!!! "If possible, tempBuffer->realp and tempBuffer->imagp should be 32-byte aligned for best performance."
+        m_fbuf->realp = allocate<float>(m_size);
+        m_fbuf->imagp = allocate<float>(m_size);
+        m_fpacked = new DSPSplitComplex;
+        m_fpacked->realp = allocate<float>(m_size / 2 + 1);
+        m_fpacked->imagp = allocate<float>(m_size / 2 + 1);
+        m_fspare = allocate<float>(m_size + 2);
+        m_fspare2 = allocate<float>(m_size + 2);
+    }
+
+    void initDouble() {
+        if (m_dspec) return;
+        m_dspec = vDSP_create_fftsetupD(m_order, FFT_RADIX2);
+        m_dbuf = new DSPDoubleSplitComplex;
+        //!!! "If possible, tempBuffer->realp and tempBuffer->imagp should be 32-byte aligned for best performance."
+        m_dbuf->realp = allocate<double>(m_size);
+        m_dbuf->imagp = allocate<double>(m_size);
+        m_dpacked = new DSPDoubleSplitComplex;
+        m_dpacked->realp = allocate<double>(m_size / 2 + 1);
+        m_dpacked->imagp = allocate<double>(m_size / 2 + 1);
+        m_dspare = allocate<double>(m_size + 2);
+        m_dspare2 = allocate<double>(m_size + 2);
+    }
+
+    void packReal(const float *BQ_R__ const re) {
+        // Pack input for forward transform 
+        vDSP_ctoz((DSPComplex *)re, 2, m_fpacked, 1, m_size/2);
+    }
+    void packComplex(const float *BQ_R__ const re, const float *BQ_R__ const im) {
+        // Pack input for inverse transform 
+        if (re) v_copy(m_fpacked->realp, re, m_size/2 + 1);
+        else v_zero(m_fpacked->realp, m_size/2 + 1);
+        if (im) v_copy(m_fpacked->imagp, im, m_size/2 + 1);
+        else v_zero(m_fpacked->imagp, m_size/2 + 1);
+        fnyq();
+    }
+
+    void unpackReal(float *BQ_R__ const re) {
+        // Unpack output for inverse transform
+        vDSP_ztoc(m_fpacked, 1, (DSPComplex *)re, 2, m_size/2);
+    }
+    void unpackComplex(float *BQ_R__ const re, float *BQ_R__ const im) {
+        // Unpack output for forward transform
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        float two = 2.f;
+        vDSP_vsdiv(m_fpacked->realp, 1, &two, re, 1, m_size/2 + 1);
+        vDSP_vsdiv(m_fpacked->imagp, 1, &two, im, 1, m_size/2 + 1);
+    }
+    void unpackComplex(float *BQ_R__ const cplx) {
+        // Unpack output for forward transform
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        const int hs1 = m_size/2 + 1;
+        for (int i = 0; i < hs1; ++i) {
+            cplx[i*2] = m_fpacked->realp[i] / 2.f;
+            cplx[i*2+1] = m_fpacked->imagp[i] / 2.f;
+        }
+    }
+
+    void packReal(const double *BQ_R__ const re) {
+        // Pack input for forward transform
+        vDSP_ctozD((DSPDoubleComplex *)re, 2, m_dpacked, 1, m_size/2);
+    }
+    void packComplex(const double *BQ_R__ const re, const double *BQ_R__ const im) {
+        // Pack input for inverse transform
+        if (re) v_copy(m_dpacked->realp, re, m_size/2 + 1);
+        else v_zero(m_dpacked->realp, m_size/2 + 1);
+        if (im) v_copy(m_dpacked->imagp, im, m_size/2 + 1);
+        else v_zero(m_dpacked->imagp, m_size/2 + 1);
+        dnyq();
+    }
+
+    void unpackReal(double *BQ_R__ const re) {
+        // Unpack output for inverse transform
+        vDSP_ztocD(m_dpacked, 1, (DSPDoubleComplex *)re, 2, m_size/2);
+    }
+    void unpackComplex(double *BQ_R__ const re, double *BQ_R__ const im) {
+        // Unpack output for forward transform
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        double two = 2.0;
+        vDSP_vsdivD(m_dpacked->realp, 1, &two, re, 1, m_size/2 + 1);
+        vDSP_vsdivD(m_dpacked->imagp, 1, &two, im, 1, m_size/2 + 1);
+    }
+    void unpackComplex(double *BQ_R__ const cplx) {
+        // Unpack output for forward transform
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        const int hs1 = m_size/2 + 1;
+        for (int i = 0; i < hs1; ++i) {
+            cplx[i*2] = m_dpacked->realp[i] / 2.0;
+            cplx[i*2+1] = m_dpacked->imagp[i] / 2.0;
+        }
+    }
+
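+    // vDSP packs a real-signal spectrum into size/2 complex values,
+    // storing the purely-real Nyquist term in the imaginary slot of
+    // the DC bin. The four helpers below convert between that packed
+    // layout and the explicit size/2+1-bin form used by this API.
+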
+    void fdenyq() {
+        // for fft result in packed form, unpack the DC and Nyquist bins
+        const int hs = m_size/2;
+        m_fpacked->realp[hs] = m_fpacked->imagp[0];
+        m_fpacked->imagp[hs] = 0.f;
+        m_fpacked->imagp[0] = 0.f;
+    }
+    void ddenyq() {
+        // for fft result in packed form, unpack the DC and Nyquist bins
+        const int hs = m_size/2;
+        m_dpacked->realp[hs] = m_dpacked->imagp[0];
+        m_dpacked->imagp[hs] = 0.;
+        m_dpacked->imagp[0] = 0.;
+    }
+
+    void fnyq() {
+        // for ifft input in packed form, pack the DC and Nyquist bins
+        const int hs = m_size/2;
+        m_fpacked->imagp[0] = m_fpacked->realp[hs];
+        m_fpacked->realp[hs] = 0.f;
+        m_fpacked->imagp[hs] = 0.f;
+    }
+    void dnyq() {
+        // for ifft input in packed form, pack the DC and Nyquist bins
+        const int hs = m_size/2;
+        m_dpacked->imagp[0] = m_dpacked->realp[hs];
+        m_dpacked->realp[hs] = 0.;
+        m_dpacked->imagp[hs] = 0.;
+    }
+
+    void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) {
+        if (!m_dspec) initDouble();
+        packReal(realIn);
+        vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_FORWARD);
+        ddenyq();
+        unpackComplex(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) {
+        if (!m_dspec) initDouble();
+        packReal(realIn);
+        vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_FORWARD);
+        ddenyq();
+        unpackComplex(complexOut);
+    }
+
+    void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) {
+        if (!m_dspec) initDouble();
+        const int hs1 = m_size/2+1;
+        packReal(realIn);
+        vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_FORWARD);
+        ddenyq();
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        for (int i = 0; i < hs1; ++i) m_dpacked->realp[i] /= 2.0;
+        for (int i = 0; i < hs1; ++i) m_dpacked->imagp[i] /= 2.0;
+        v_cartesian_to_polar(magOut, phaseOut,
+                             m_dpacked->realp, m_dpacked->imagp, hs1);
+    }
+
+    void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) {
+        if (!m_dspec) initDouble();
+        packReal(realIn);
+        vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_FORWARD);
+        ddenyq();
+        const int hs1 = m_size/2+1;
+        vDSP_zvmagsD(m_dpacked, 1, m_dspare, 1, hs1);
+        vvsqrt(m_dspare2, m_dspare, &hs1);
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        double two = 2.0;
+        vDSP_vsdivD(m_dspare2, 1, &two, magOut, 1, hs1);
+    }
+
+    void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) {
+        if (!m_fspec) initFloat();
+        packReal(realIn);
+        vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_FORWARD);
+        fdenyq();
+        unpackComplex(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) {
+        if (!m_fspec) initFloat();
+        packReal(realIn);
+        vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_FORWARD);
+        fdenyq();
+        unpackComplex(complexOut);
+    }
+
+    void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) {
+        if (!m_fspec) initFloat();
+        const int hs1 = m_size/2+1;
+        packReal(realIn);
+        vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_FORWARD);
+        fdenyq();
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        for (int i = 0; i < hs1; ++i) m_fpacked->realp[i] /= 2.f;
+        for (int i = 0; i < hs1; ++i) m_fpacked->imagp[i] /= 2.f;
+        v_cartesian_to_polar(magOut, phaseOut,
+                             m_fpacked->realp, m_fpacked->imagp, hs1);
+    }
+
+    void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) {
+        if (!m_fspec) initFloat();
+        packReal(realIn);
+        vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_FORWARD);
+        fdenyq();
+        const int hs1 = m_size/2 + 1;
+        vDSP_zvmags(m_fpacked, 1, m_fspare, 1, hs1);
+        vvsqrtf(m_fspare2, m_fspare, &hs1);
+        // vDSP forward FFTs are scaled 2x (for some reason)
+        float two = 2.f;
+        vDSP_vsdiv(m_fspare2, 1, &two, magOut, 1, hs1);
+    }
+
+    void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) {
+        if (!m_dspec) initDouble();
+        packComplex(realIn, imagIn);
+        vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_INVERSE);
+        unpackReal(realOut);
+    }
+
+    void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) {
+        if (!m_dspec) initDouble();
+        double *d[2] = { m_dpacked->realp, m_dpacked->imagp };
+        v_deinterleave(d, complexIn, 2, m_size/2 + 1);
+        vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_INVERSE);
+        unpackReal(realOut);
+    }
+
+    void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) {
+        if (!m_dspec) initDouble();
+        const int hs1 = m_size/2+1;
+        vvsincos(m_dpacked->imagp, m_dpacked->realp, phaseIn, &hs1);
+        double *const rp = m_dpacked->realp;
+        double *const ip = m_dpacked->imagp;
+        for (int i = 0; i < hs1; ++i) rp[i] *= magIn[i];
+        for (int i = 0; i < hs1; ++i) ip[i] *= magIn[i];
+        dnyq();
+        vDSP_fft_zriptD(m_dspec, m_dpacked, 1, m_dbuf, m_order, FFT_INVERSE);
+        unpackReal(realOut);
+    }
+
+    void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) {
+        if (!m_dspec) initDouble();
+        const int hs1 = m_size/2 + 1;
+        v_copy(m_dspare, magIn, hs1);
+        for (int i = 0; i < hs1; ++i) m_dspare[i] += 0.000001;
+        vvlog(m_dspare2, m_dspare, &hs1);
+        inverse(m_dspare2, 0, cepOut);
+    }
+    
+    void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) {
+        if (!m_fspec) initFloat();
+        packComplex(realIn, imagIn);
+        vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_INVERSE);
+        unpackReal(realOut);
+    }
+
+    void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) {
+        if (!m_fspec) initFloat();
+        float *f[2] = { m_fpacked->realp, m_fpacked->imagp };
+        v_deinterleave(f, complexIn, 2, m_size/2 + 1);
+        vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_INVERSE);
+        unpackReal(realOut);
+    }
+
+    void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) {
+        if (!m_fspec) initFloat();
+
+        const int hs1 = m_size/2+1;
+        vvsincosf(m_fpacked->imagp, m_fpacked->realp, phaseIn, &hs1);
+        float *const rp = m_fpacked->realp;
+        float *const ip = m_fpacked->imagp;
+        for (int i = 0; i < hs1; ++i) rp[i] *= magIn[i];
+        for (int i = 0; i < hs1; ++i) ip[i] *= magIn[i];
+        fnyq();
+        vDSP_fft_zript(m_fspec, m_fpacked, 1, m_fbuf, m_order, FFT_INVERSE);
+        unpackReal(realOut);
+    }
+
+    void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) {
+        if (!m_fspec) initFloat();
+        const int hs1 = m_size/2 + 1;
+        v_copy(m_fspare, magIn, hs1);
+        for (int i = 0; i < hs1; ++i) m_fspare[i] += 0.000001f;
+        vvlogf(m_fspare2, m_fspare, &hs1);
+        inverse(m_fspare2, 0, cepOut);
+    }
+
+private:
+    const int m_size;
+    int m_order;
+    FFTSetup m_fspec;
+    FFTSetupD m_dspec;
+    DSPSplitComplex *m_fbuf;
+    DSPDoubleSplitComplex *m_dbuf;
+    DSPSplitComplex *m_fpacked;
+    float *m_fspare;
+    float *m_fspare2;
+    DSPDoubleSplitComplex *m_dpacked;
+    double *m_dspare;
+    double *m_dspare2;
+};
+
+#endif /* HAVE_VDSP */
+
+#ifdef HAVE_MEDIALIB
+
+class D_MEDIALIB : public FFTImpl
+{
+public:
+    D_MEDIALIB(int size) :
+        m_size(size),
+        m_dpacked(0), m_fpacked(0)
+    { 
+        for (int i = 0; ; ++i) {
+            if (m_size & (1 << i)) {
+                m_order = i;
+                break;
+            }
+        }
+    }
+
+    ~D_MEDIALIB() {
+        if (m_dpacked) {
+            deallocate(m_dpacked);
+        }
+        if (m_fpacked) {
+            deallocate(m_fpacked);
+        }
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+        return FFT::SinglePrecision | FFT::DoublePrecision;
+    }
+
+    //!!! rv check
+
+    void initFloat() {
+        m_fpacked = allocate<float>(m_size*2);
+    }
+
+    void initDouble() {
+        m_dpacked = allocate<double>(m_size*2);
+    }
+
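+    // mlib transforms the full complex spectrum, so before an inverse
+    // transform the redundant upper half is synthesised here from the
+    // conjugate symmetry of a real signal's spectrum.
+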
+    void packFloatConjugates() {
+        const int hs = m_size / 2;
+        for (int i = 1; i <= hs; ++i) {
+            m_fpacked[(m_size-i)*2] = m_fpacked[2*i];
+            m_fpacked[(m_size-i)*2 + 1] = -m_fpacked[2*i + 1];
+        }
+    }
+
+    void packDoubleConjugates() {
+        const int hs = m_size / 2;
+        for (int i = 1; i <= hs; ++i) {
+            m_dpacked[(m_size-i)*2] = m_dpacked[2*i];
+            m_dpacked[(m_size-i)*2 + 1] = -m_dpacked[2*i + 1];
+        }
+    }
+
+    void packFloat(const float *BQ_R__ re, const float *BQ_R__ im) {
+        int index = 0;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[index++] = re[i];
+            index++;
+        }
+        index = 0;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_fpacked[index++] = im[i];
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_fpacked[index++] = 0.f;
+            }
+        }
+        packFloatConjugates();
+    }
+
+    void packDouble(const double *BQ_R__ re, const double *BQ_R__ im) {
+        int index = 0;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_dpacked[index++] = re[i];
+            index++;
+        }
+        index = 0;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_dpacked[index++] = im[i];
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_dpacked[index++] = 0.0;
+            }
+        }
+        packDoubleConjugates();
+    }
+
+    void unpackFloat(float *re, float *BQ_R__ im) { // re may be equal to m_fpacked
+        int index = 0;
+        const int hs = m_size/2;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                im[i] = m_fpacked[index++];
+            }
+        }
+        index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = m_fpacked[index++];
+            index++;
+        }
+    }        
+
+    void unpackDouble(double *re, double *BQ_R__ im) { // re may be equal to m_dpacked
+        int index = 0;
+        const int hs = m_size/2;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                im[i] = m_dpacked[index++];
+            }
+        }
+        index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = m_dpacked[index++];
+            index++;
+        }
+    }
+
+    void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) {
+        if (!m_dpacked) initDouble();
+        mlib_SignalFFT_1_D64C_D64(m_dpacked, realIn, m_order);
+        unpackDouble(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) {
+        if (!m_dpacked) initDouble();
+        // mlib FFT gives the whole redundant complex result
+        mlib_SignalFFT_1_D64C_D64(m_dpacked, realIn, m_order);
+        v_copy(complexOut, m_dpacked, m_size + 2);
+    }
+
+    void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) {
+        if (!m_dpacked) initDouble();
+        mlib_SignalFFT_1_D64C_D64(m_dpacked, realIn, m_order);
+        const int hs = m_size/2;
+        int index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            int reali = index;
+            ++index;
+            magOut[i] = sqrt(m_dpacked[reali] * m_dpacked[reali] +
+                             m_dpacked[index] * m_dpacked[index]);
+            phaseOut[i] = atan2(m_dpacked[index], m_dpacked[reali]);
+            ++index;
+        }
+    }
+
+    void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) {
+        if (!m_dpacked) initDouble();
+        mlib_SignalFFT_1_D64C_D64(m_dpacked, realIn, m_order);
+        const int hs = m_size/2;
+        int index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            int reali = index;
+            ++index;
+            magOut[i] = sqrt(m_dpacked[reali] * m_dpacked[reali] +
+                             m_dpacked[index] * m_dpacked[index]);
+            ++index;
+        }
+    }
+
+    void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) {
+        if (!m_fpacked) initFloat();
+        mlib_SignalFFT_1_F32C_F32(m_fpacked, realIn, m_order);
+        unpackFloat(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) {
+        if (!m_fpacked) initFloat();
+        // mlib FFT gives the whole redundant complex result
+        mlib_SignalFFT_1_F32C_F32(m_fpacked, realIn, m_order);
+        v_copy(complexOut, m_fpacked, m_size + 2);
+    }
+
+    void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) {
+        if (!m_fpacked) initFloat();
+        mlib_SignalFFT_1_F32C_F32(m_fpacked, realIn, m_order);
+        const int hs = m_size/2;
+        int index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            int reali = index;
+            ++index;
+            magOut[i] = sqrtf(m_fpacked[reali] * m_fpacked[reali] +
+                              m_fpacked[index] * m_fpacked[index]);
+            phaseOut[i] = atan2f(m_fpacked[index], m_fpacked[reali]);
+            ++index;
+        }
+    }
+
+    void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) {
+        if (!m_fpacked) initFloat();
+        mlib_SignalFFT_1_F32C_F32(m_fpacked, realIn, m_order);
+        const int hs = m_size/2;
+        int index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            int reali = index;
+            ++index;
+            magOut[i] = sqrtf(m_fpacked[reali] * m_fpacked[reali] +
+                              m_fpacked[index] * m_fpacked[index]);
+            ++index;
+        }
+    }
+
+    void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) {
+        if (!m_dpacked) initDouble();
+        packDouble(realIn, imagIn);
+        mlib_SignalIFFT_2_D64_D64C(realOut, m_dpacked, m_order);
+    }
+
+    void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) {
+        if (!m_dpacked) initDouble();
+        v_copy(m_dpacked, complexIn, m_size + 2);
+        packDoubleConjugates();
+        mlib_SignalIFFT_2_D64_D64C(realOut, m_dpacked, m_order);
+    }
+
+    void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) {
+        if (!m_dpacked) initDouble();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real = magIn[i] * cos(phaseIn[i]);
+            double imag = magIn[i] * sin(phaseIn[i]);
+            m_dpacked[i*2] = real;
+            m_dpacked[i*2 + 1] = imag;
+        }
+        packDoubleConjugates();
+        mlib_SignalIFFT_2_D64_D64C(realOut, m_dpacked, m_order);
+    }
+
+    void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) {
+        if (!m_dpacked) initDouble();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_dpacked[i*2] = log(magIn[i] + 0.000001);
+            m_dpacked[i*2 + 1] = 0.0;
+        }
+        packDoubleConjugates();
+        mlib_SignalIFFT_2_D64_D64C(cepOut, m_dpacked, m_order);
+    }
+    
+    void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) {
+        if (!m_fpacked) initFloat();
+        packFloat(realIn, imagIn);
+        mlib_SignalIFFT_2_F32_F32C(realOut, m_fpacked, m_order);
+    }
+    
+    void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) {
+        if (!m_fpacked) initFloat();
+        v_convert(m_fpacked, complexIn, m_size + 2);
+        packFloatConjugates();
+        mlib_SignalIFFT_2_F32_F32C(realOut, m_fpacked, m_order);
+    }
+
+    void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) {
+        if (!m_fpacked) initFloat();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real = magIn[i] * cos(phaseIn[i]);
+            double imag = magIn[i] * sin(phaseIn[i]);
+            m_fpacked[i*2] = real;
+            m_fpacked[i*2 + 1] = imag;
+        }
+        packFloatConjugates();
+        mlib_SignalIFFT_2_F32_F32C(realOut, m_fpacked, m_order);
+    }
+
+    void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) {
+        if (!m_fpacked) initFloat();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[i*2] = logf(magIn[i] + 0.000001f);
+            m_fpacked[i*2 + 1] = 0.f;
+        }
+        packFloatConjugates();
+        mlib_SignalIFFT_2_F32_F32C(cepOut, m_fpacked, m_order);
+    }
+
+private:
+    const int m_size;
+    int m_order;
+    double *m_dpacked;
+    float *m_fpacked;
+};
+
+#endif /* HAVE_MEDIALIB */
+
+#ifdef HAVE_OPENMAX
+
+class D_OPENMAX : public FFTImpl
+{
+    // Convert a signed 32-bit integer to a float in the range [-1,1)
+    static inline float i2f(OMX_S32 i)
+    {
+        return float(i) / float(OMX_MAX_S32);
+    }
+
+    // Convert a signed 32-bit integer to a double in the range [-1,1)
+    static inline double i2d(OMX_S32 i)
+    {
+        return double(i) / double(OMX_MAX_S32);
+    }
+
+    // Convert a float in the range [-1,1) to a signed 32-bit integer
+    static inline OMX_S32 f2i(float f)
+    {
+        return OMX_S32(f * OMX_MAX_S32);
+    }
+
+    // Convert a double in the range [-1,1) to a signed 32-bit integer
+    static inline OMX_S32 d2i(double d)
+    {
+        return OMX_S32(d * OMX_MAX_S32);
+    }
+
+public:
+    D_OPENMAX(int size) :
+        m_size(size),
+        m_packed(0)
+    { 
+        for (int i = 0; ; ++i) {
+            if (m_size & (1 << i)) {
+                m_order = i;
+                break;
+            }
+        }
+    }
+
+    ~D_OPENMAX() {
+        if (m_packed) {
+            deallocate(m_packed);
+            deallocate(m_buf);
+            deallocate(m_fbuf);
+            deallocate(m_spec);
+        }
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+        return FFT::SinglePrecision;
+    }
+
+    //!!! rv check
+
+    // The OpenMAX implementation uses a fixed-point representation in
+    // 32-bit signed integers, with a downward scaling factor (0-32
+    // bits) supplied as an argument to the FFT function.
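+    //
+    // For example (values chosen purely for illustration): f2i(0.5f)
+    // yields roughly OMX_MAX_S32/2, and i2f recovers the original
+    // value to within about 1/2^31.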
+
+    void initFloat() {
+        initDouble();
+    }
+
+    void initDouble() {
+        if (!m_packed) {
+            m_buf = allocate<OMX_S32>(m_size);
+            m_packed = allocate<OMX_S32>(m_size*2 + 2);
+            m_fbuf = allocate<float>(m_size*2 + 2);
+            OMX_INT sz = 0;
+            omxSP_FFTGetBufSize_R_S32(m_order, &sz);
+            m_spec = (OMXFFTSpec_R_S32 *)allocate<char>(sz);
+            omxSP_FFTInit_R_S32(m_spec, m_order);
+        }
+    }
+
+    void packFloat(const float *BQ_R__ re) {
+        // prepare fixed point input for forward transform
+        for (int i = 0; i < m_size; ++i) {
+            m_buf[i] = f2i(re[i]);
+        }
+    }
+
+    void packDouble(const double *BQ_R__ re) {
+        // prepare fixed point input for forward transform
+        for (int i = 0; i < m_size; ++i) {
+            m_buf[i] = d2i(re[i]);
+        }
+    }
+
+    void unpackFloat(float *BQ_R__ re, float *BQ_R__ im) {
+        // convert fixed point output for forward transform
+        int index = 0;
+        const int hs = m_size/2;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                im[i] = i2f(m_packed[index++]);
+            }
+            v_scale(im, m_size, hs + 1);
+        }
+        index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = i2f(m_packed[index++]);
+            index++;
+        }
+        v_scale(re, m_size, hs + 1);
+    }        
+
+    void unpackDouble(double *BQ_R__ re, double *BQ_R__ im) {
+        // convert fixed point output for forward transform
+        int index = 0;
+        const int hs = m_size/2;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                im[i] = i2d(m_packed[index++]);
+            }
+            v_scale(im, m_size, hs + 1);
+        }
+        index = 0;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = i2d(m_packed[index++]);
+            index++;
+        }
+        v_scale(re, m_size, hs + 1);
+    }
+
+    void unpackFloatInterleaved(float *BQ_R__ cplx) {
+        // convert fixed point output for forward transform
+        for (int i = 0; i < m_size + 2; ++i) {
+            cplx[i] = i2f(m_packed[i]);
+        }            
+        v_scale(cplx, m_size, m_size + 2);
+    }
+
+    void unpackDoubleInterleaved(double *BQ_R__ cplx) {
+        // convert fixed point output for forward transform
+        for (int i = 0; i < m_size + 2; ++i) {
+            cplx[i] = i2d(m_packed[i]);
+        }            
+        v_scale(cplx, m_size, m_size + 2);
+    }
+
+    void packFloat(const float *BQ_R__ re, const float *BQ_R__ im) {
+        // prepare fixed point input for inverse transform
+        int index = 0;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_packed[index++] = f2i(re[i]);
+            index++;
+        }
+        index = 0;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_packed[index++] = f2i(im[i]);
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_packed[index++] = 0;
+            }
+        }
+    }
+
+    void packDouble(const double *BQ_R__ re, const double *BQ_R__ im) {
+        // prepare fixed point input for inverse transform
+        int index = 0;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_packed[index++] = d2i(re[i]);
+            index++;
+        }
+        index = 0;
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_packed[index++] = d2i(im[i]);
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                index++;
+                m_packed[index++] = 0;
+            }
+        }
+    }
+
+    void convertFloat(const float *BQ_R__ f) {
+        // convert interleaved input for inverse interleaved transform
+        const int n = m_size + 2;
+        for (int i = 0; i < n; ++i) {
+            m_packed[i] = f2i(f[i]);
+        }
+    }        
+
+    void convertDouble(const double *BQ_R__ d) {
+        // convert interleaved input for inverse interleaved transform
+        const int n = m_size + 2;
+        for (int i = 0; i < n; ++i) {
+            m_packed[i] = d2i(d[i]);
+        }
+    }        
+
+    void unpackFloat(float *BQ_R__ re) {
+        // convert fixed point output for inverse transform
+        for (int i = 0; i < m_size; ++i) {
+            re[i] = i2f(m_buf[i]) * m_size;
+        }
+    }
+
+    void unpackDouble(double *BQ_R__ re) {
+        // convert fixed point output for inverse transform
+        for (int i = 0; i < m_size; ++i) {
+            re[i] = i2d(m_buf[i]) * m_size;
+        }
+    }
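+
+    // A note on the m_size factors above: the OMX "_Sfs" transforms
+    // are scaled fixed-point operations (the forward pass is invoked
+    // with a scale exponent of m_order, i.e. an internal division by
+    // the FFT size), so the unpack steps multiply by m_size to restore
+    // the unscaled convention used by the other implementations in
+    // this file.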
+
+    void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) {
+        if (!m_packed) initDouble();
+        packDouble(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        unpackDouble(realOut, imagOut);
+    }
+    
+    void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) {
+        if (!m_packed) initDouble();
+        packDouble(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        unpackDoubleInterleaved(complexOut);
+    }
+
+    void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) {
+        if (!m_packed) initDouble();
+        packDouble(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        unpackDouble(magOut, phaseOut); // temporarily
+        // at this point we actually have real/imag in the mag/phase arrays
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real = magOut[i];
+            double imag = phaseOut[i];
+            c_magphase(magOut + i, phaseOut + i, real, imag);
+        }
+    }
+
+    void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) {
+        if (!m_packed) initDouble();
+        packDouble(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            int reali = i * 2;
+            int imagi = reali + 1;
+            double real = i2d(m_packed[reali]) * m_size;
+            double imag = i2d(m_packed[imagi]) * m_size;
+            magOut[i] = sqrt(real * real + imag * imag);
+        }
+    }
+
+    void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) {
+        if (!m_packed) initFloat();
+        packFloat(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        unpackFloat(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) {
+        if (!m_packed) initFloat();
+        packFloat(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        unpackFloatInterleaved(complexOut);
+    }
+
+    void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) {
+        if (!m_packed) initFloat();
+
+        packFloat(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        unpackFloat(magOut, phaseOut); // temporarily
+        // at this point we actually have real/imag in the mag/phase arrays
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            float real = magOut[i];
+            float imag = phaseOut[i];
+            c_magphase(magOut + i, phaseOut + i, real, imag);
+        }
+    }
+
+    void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) {
+        if (!m_packed) initFloat();
+        packFloat(realIn);
+        omxSP_FFTFwd_RToCCS_S32_Sfs(m_buf, m_packed, m_spec, m_order);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            int reali = i * 2;
+            int imagi = reali + 1;
+            float real = i2f(m_packed[reali]) * m_size;
+            float imag = i2f(m_packed[imagi]) * m_size;
+            magOut[i] = sqrtf(real * real + imag * imag);
+        }
+    }
+
+    void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) {
+        if (!m_packed) initDouble();
+        packDouble(realIn, imagIn);
+        omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0);
+        unpackDouble(realOut);
+    }
+
+    void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) {
+        if (!m_packed) initDouble();
+        convertDouble(complexIn);
+        omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0);
+        unpackDouble(realOut);
+    }
+
+    void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) {
+        if (!m_packed) initDouble();
+        int index = 0;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real, imag;
+            c_phasor(&real, &imag, phaseIn[i]);
+            m_fbuf[index++] = float(real);
+            m_fbuf[index++] = float(imag);
+        }
+        convertFloat(m_fbuf);
+        omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0);
+        unpackDouble(realOut);
+    }
+
+    void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) {
+        if (!m_packed) initDouble();
+        //!!! implement
+#warning OpenMAX implementation lacks cepstral transforms
+    }
+    
+    void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) {
+        if (!m_packed) initFloat();
+        packFloat(realIn, imagIn);
+        omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0);
+        unpackFloat(realOut);
+    }
+
+    void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) {
+        if (!m_packed) initFloat();
+        convertFloat(complexIn);
+        omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0);
+        unpackFloat(realOut);
+    }
+
+    void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) {
+        if (!m_packed) initFloat();
+        const int hs = m_size/2;
+        v_polar_to_cartesian_interleaved(m_fbuf, magIn, phaseIn, hs+1);
+        convertFloat(m_fbuf);
+        omxSP_FFTInv_CCSToR_S32_Sfs(m_packed, m_buf, m_spec, 0);
+        unpackFloat(realOut);
+    }
+
+    void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) {
+        if (!m_packed) initFloat();
+        //!!! implement
+#warning OpenMAX implementation lacks cepstral transforms
+    }
+
+private:
+    const int m_size;
+    int m_order;
+    OMX_S32 *m_packed;
+    OMX_S32 *m_buf;
+    float *m_fbuf;
+    OMXFFTSpec_R_S32 *m_spec;
+
+};
+
+#endif /* HAVE_OPENMAX */
+
+#ifdef HAVE_FFTW3
+
+/*
+ Define FFTW_DOUBLE_ONLY to make all uses of FFTW functions be
+ double-precision (so "float" FFTs are calculated by casting to
+ doubles and using the double-precision FFTW function).
+
+ Define FFTW_SINGLE_ONLY to make all uses of FFTW functions be
+ single-precision (so "double" FFTs are calculated by casting to
+ floats and using the single-precision FFTW function).
+
+ Neither of these flags is desirable for either performance or
+ precision. The main reason to define either flag is to avoid linking
+ against both fftw3 and fftw3f libraries.
+*/
+
+//#define FFTW_DOUBLE_ONLY 1
+//#define FFTW_SINGLE_ONLY 1
+
+#if defined(FFTW_DOUBLE_ONLY) && defined(FFTW_SINGLE_ONLY)
+// Can't meaningfully define both
+#error Can only define one of FFTW_DOUBLE_ONLY and FFTW_SINGLE_ONLY
+#endif
+
+#if defined(FFTW_FLOAT_ONLY)
+#warning FFTW_FLOAT_ONLY is deprecated, use FFTW_SINGLE_ONLY instead
+#define FFTW_SINGLE_ONLY 1
+#endif
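+
+// For illustration only (a hypothetical build line, not part of this
+// file): defining one of these flags changes which FFTW library must
+// be linked. With FFTW_SINGLE_ONLY, for example, the double-precision
+// API below still works, but only the single-precision library is
+// required at link time:
+//
+//   c++ -DHAVE_FFTW3 -DFFTW_SINGLE_ONLY ... FFT.cpp -lfftw3f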
+
+#ifdef FFTW_DOUBLE_ONLY
+#define fft_float_type double
+#define fftwf_complex fftw_complex
+#define fftwf_plan fftw_plan
+#define fftwf_plan_dft_r2c_1d fftw_plan_dft_r2c_1d
+#define fftwf_plan_dft_c2r_1d fftw_plan_dft_c2r_1d
+#define fftwf_destroy_plan fftw_destroy_plan
+#define fftwf_malloc fftw_malloc
+#define fftwf_free fftw_free
+#define fftwf_execute fftw_execute
+#define atan2f atan2
+#define sqrtf sqrt
+#define cosf cos
+#define sinf sin
+#else
+#define fft_float_type float
+#endif /* FFTW_DOUBLE_ONLY */
+
+#ifdef FFTW_SINGLE_ONLY
+#define fft_double_type float
+#define fftw_complex fftwf_complex
+#define fftw_plan fftwf_plan
+#define fftw_plan_dft_r2c_1d fftwf_plan_dft_r2c_1d
+#define fftw_plan_dft_c2r_1d fftwf_plan_dft_c2r_1d
+#define fftw_destroy_plan fftwf_destroy_plan
+#define fftw_malloc fftwf_malloc
+#define fftw_free fftwf_free
+#define fftw_execute fftwf_execute
+#define atan2 atan2f
+#define sqrt sqrtf
+#define cos cosf
+#define sin sinf
+#else
+#define fft_double_type double
+#endif /* FFTW_SINGLE_ONLY */
+
+class D_FFTW : public FFTImpl
+{
+public:
+    D_FFTW(int size) :
+        m_fplanf(0), m_dplanf(0), m_size(size)
+    {
+        initMutex();
+    }
+
+    ~D_FFTW() {
+        if (m_fplanf) {
+            lock();
+            bool save = false;
+            if (m_extantf > 0 && --m_extantf == 0) save = true;
+#ifndef FFTW_DOUBLE_ONLY
+            if (save) saveWisdom('f');
+#endif
+            fftwf_destroy_plan(m_fplanf);
+            fftwf_destroy_plan(m_fplani);
+            fftwf_free(m_fbuf);
+            fftwf_free(m_fpacked);
+            unlock();
+        }
+        if (m_dplanf) {
+            lock();
+            bool save = false;
+            if (m_extantd > 0 && --m_extantd == 0) save = true;
+#ifndef FFTW_SINGLE_ONLY
+            if (save) saveWisdom('d');
+#endif
+            fftw_destroy_plan(m_dplanf);
+            fftw_destroy_plan(m_dplani);
+            fftw_free(m_dbuf);
+            fftw_free(m_dpacked);
+            unlock();
+        }
+        destroyMutex();
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+#ifdef FFTW_SINGLE_ONLY
+        return FFT::SinglePrecision;
+#else
+#ifdef FFTW_DOUBLE_ONLY
+        return FFT::DoublePrecision;
+#else
+        return FFT::SinglePrecision | FFT::DoublePrecision;
+#endif
+#endif
+    }
+
+    void initFloat() {
+        if (m_fplanf) return;
+        bool load = false;
+        lock();
+        if (m_extantf++ == 0) load = true;
+#ifdef FFTW_DOUBLE_ONLY
+        if (load) loadWisdom('d');
+#else
+        if (load) loadWisdom('f');
+#endif
+        m_fbuf = (fft_float_type *)fftw_malloc(m_size * sizeof(fft_float_type));
+        m_fpacked = (fftwf_complex *)fftw_malloc
+            ((m_size/2 + 1) * sizeof(fftwf_complex));
+        m_fplanf = fftwf_plan_dft_r2c_1d
+            (m_size, m_fbuf, m_fpacked, FFTW_MEASURE);
+        m_fplani = fftwf_plan_dft_c2r_1d
+            (m_size, m_fpacked, m_fbuf, FFTW_MEASURE);
+        unlock();
+    }
+
+    void initDouble() {
+        if (m_dplanf) return;
+        bool load = false;
+        lock();
+        if (m_extantd++ == 0) load = true;
+#ifdef FFTW_SINGLE_ONLY
+        if (load) loadWisdom('f');
+#else
+        if (load) loadWisdom('d');
+#endif
+        m_dbuf = (fft_double_type *)fftw_malloc(m_size * sizeof(fft_double_type));
+        m_dpacked = (fftw_complex *)fftw_malloc
+            ((m_size/2 + 1) * sizeof(fftw_complex));
+        m_dplanf = fftw_plan_dft_r2c_1d
+            (m_size, m_dbuf, m_dpacked, FFTW_MEASURE);
+        m_dplani = fftw_plan_dft_c2r_1d
+            (m_size, m_dpacked, m_dbuf, FFTW_MEASURE);
+        unlock();
+    }
+
+    void loadWisdom(char type) { wisdom(false, type); }
+    void saveWisdom(char type) { wisdom(true, type); }
+
+    void wisdom(bool save, char type) {
+
+#ifdef FFTW_DOUBLE_ONLY
+        if (type == 'f') return;
+#endif
+#ifdef FFTW_SINGLE_ONLY
+        if (type == 'd') return;
+#endif
+
+        const char *home = getenv("HOME");
+        if (!home) return;
+
+        char fn[256];
+        snprintf(fn, 256, "%s/%s.%c", home, ".turbot.wisdom", type);
+
+        FILE *f = fopen(fn, save ? "wb" : "rb");
+        if (!f) return;
+
+        if (save) {
+            switch (type) {
+#ifdef FFTW_DOUBLE_ONLY
+            case 'f': break;
+#else
+            case 'f': fftwf_export_wisdom_to_file(f); break;
+#endif
+#ifdef FFTW_SINGLE_ONLY
+            case 'd': break;
+#else
+            case 'd': fftw_export_wisdom_to_file(f); break;
+#endif
+            default: break;
+            }
+        } else {
+            switch (type) {
+#ifdef FFTW_DOUBLE_ONLY
+            case 'f': break;
+#else
+            case 'f': fftwf_import_wisdom_from_file(f); break;
+#endif
+#ifdef FFTW_SINGLE_ONLY
+            case 'd': break;
+#else
+            case 'd': fftw_import_wisdom_from_file(f); break;
+#endif
+            default: break;
+            }
+        }
+
+        fclose(f);
+    }
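+
+    // With the snprintf format above, wisdom is kept in per-precision
+    // files named like "$HOME/.turbot.wisdom.f" (single precision) and
+    // "$HOME/.turbot.wisdom.d" (double precision). It is loaded when
+    // the first instance needing a given precision is initialised, and
+    // saved when the last such instance is destroyed.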
+
+    void packFloat(const float *BQ_R__ re, const float *BQ_R__ im) {
+        const int hs = m_size/2;
+        fftwf_complex *const BQ_R__ fpacked = m_fpacked; 
+        for (int i = 0; i <= hs; ++i) {
+            fpacked[i][0] = re[i];
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                fpacked[i][1] = im[i];
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                fpacked[i][1] = 0.f;
+            }
+        }                
+    }
+
+    void packDouble(const double *BQ_R__ re, const double *BQ_R__ im) {
+        const int hs = m_size/2;
+        fftw_complex *const BQ_R__ dpacked = m_dpacked; 
+        for (int i = 0; i <= hs; ++i) {
+            dpacked[i][0] = re[i];
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                dpacked[i][1] = im[i];
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                dpacked[i][1] = 0.0;
+            }
+        }
+    }
+
+    void unpackFloat(float *BQ_R__ re, float *BQ_R__ im) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = m_fpacked[i][0];
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                im[i] = m_fpacked[i][1];
+            }
+        }
+    }        
+
+    void unpackDouble(double *BQ_R__ re, double *BQ_R__ im) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = m_dpacked[i][0];
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                im[i] = m_dpacked[i][1];
+            }
+        }
+    }        
+
+    void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) {
+        if (!m_dplanf) initDouble();
+        const int sz = m_size;
+        fft_double_type *const BQ_R__ dbuf = m_dbuf;
+#ifndef FFTW_SINGLE_ONLY
+        if (realIn != dbuf) 
+#endif
+            for (int i = 0; i < sz; ++i) {
+                dbuf[i] = realIn[i];
+            }
+        fftw_execute(m_dplanf);
+        unpackDouble(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) {
+        if (!m_dplanf) initDouble();
+        const int sz = m_size;
+        fft_double_type *const BQ_R__ dbuf = m_dbuf;
+#ifndef FFTW_SINGLE_ONLY
+        if (realIn != dbuf) 
+#endif
+            for (int i = 0; i < sz; ++i) {
+                dbuf[i] = realIn[i];
+            }
+        fftw_execute(m_dplanf);
+        v_convert(complexOut, (fft_double_type *)m_dpacked, sz + 2);
+    }
+
+    void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) {
+        if (!m_dplanf) initDouble();
+        fft_double_type *const BQ_R__ dbuf = m_dbuf;
+        const int sz = m_size;
+#ifndef FFTW_SINGLE_ONLY
+        if (realIn != dbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                dbuf[i] = realIn[i];
+            }
+        fftw_execute(m_dplanf);
+        v_cartesian_interleaved_to_polar(magOut, phaseOut,
+                                         (fft_double_type *)m_dpacked, m_size/2+1);
+    }
+
+    void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) {
+        if (!m_dplanf) initDouble();
+        fft_double_type *const BQ_R__ dbuf = m_dbuf;
+        const int sz = m_size;
+#ifndef FFTW_SINGLE_ONLY
+        if (realIn != m_dbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                dbuf[i] = realIn[i];
+            }
+        fftw_execute(m_dplanf);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(m_dpacked[i][0] * m_dpacked[i][0] +
+                             m_dpacked[i][1] * m_dpacked[i][1]);
+        }
+    }
+
+    void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) {
+        if (!m_fplanf) initFloat();
+        fft_float_type *const BQ_R__ fbuf = m_fbuf;
+        const int sz = m_size;
+#ifndef FFTW_DOUBLE_ONLY
+        if (realIn != fbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                fbuf[i] = realIn[i];
+            }
+        fftwf_execute(m_fplanf);
+        unpackFloat(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) {
+        if (!m_fplanf) initFloat();
+        fft_float_type *const BQ_R__ fbuf = m_fbuf;
+        const int sz = m_size;
+#ifndef FFTW_DOUBLE_ONLY
+        if (realIn != fbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                fbuf[i] = realIn[i];
+            }
+        fftwf_execute(m_fplanf);
+        v_convert(complexOut, (fft_float_type *)m_fpacked, sz + 2);
+    }
+
+    void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) {
+        if (!m_fplanf) initFloat();
+        fft_float_type *const BQ_R__ fbuf = m_fbuf;
+        const int sz = m_size;
+#ifndef FFTW_DOUBLE_ONLY
+        if (realIn != fbuf) 
+#endif
+            for (int i = 0; i < sz; ++i) {
+                fbuf[i] = realIn[i];
+            }
+        fftwf_execute(m_fplanf);
+        v_cartesian_interleaved_to_polar(magOut, phaseOut,
+                                         (fft_float_type *)m_fpacked, m_size/2+1);
+    }
+
+    void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) {
+        if (!m_fplanf) initFloat();
+        fft_float_type *const BQ_R__ fbuf = m_fbuf;
+        const int sz = m_size;
+#ifndef FFTW_DOUBLE_ONLY
+        if (realIn != fbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                fbuf[i] = realIn[i];
+            }
+        fftwf_execute(m_fplanf);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrtf(m_fpacked[i][0] * m_fpacked[i][0] +
+                              m_fpacked[i][1] * m_fpacked[i][1]);
+        }
+    }
+
+    void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) {
+        if (!m_dplanf) initDouble();
+        packDouble(realIn, imagIn);
+        fftw_execute(m_dplani);
+        const int sz = m_size;
+        fft_double_type *const BQ_R__ dbuf = m_dbuf;
+#ifndef FFTW_SINGLE_ONLY
+        if (realOut != dbuf) 
+#endif
+            for (int i = 0; i < sz; ++i) {
+                realOut[i] = dbuf[i];
+            }
+    }
+
+    void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) {
+        if (!m_dplanf) initDouble();
+        v_convert((fft_double_type *)m_dpacked, complexIn, m_size + 2);
+        fftw_execute(m_dplani);
+        const int sz = m_size;
+        fft_double_type *const BQ_R__ dbuf = m_dbuf;
+#ifndef FFTW_SINGLE_ONLY
+        if (realOut != dbuf) 
+#endif
+            for (int i = 0; i < sz; ++i) {
+                realOut[i] = dbuf[i];
+            }
+    }
+
+    void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) {
+        if (!m_dplanf) initDouble();
+        const int hs = m_size/2;
+        fftw_complex *const BQ_R__ dpacked = m_dpacked;
+        for (int i = 0; i <= hs; ++i) {
+            dpacked[i][0] = magIn[i] * cos(phaseIn[i]);
+        }
+        for (int i = 0; i <= hs; ++i) {
+            dpacked[i][1] = magIn[i] * sin(phaseIn[i]);
+        }
+        fftw_execute(m_dplani);
+        const int sz = m_size;
+        fft_double_type *const BQ_R__ dbuf = m_dbuf;
+#ifndef FFTW_SINGLE_ONLY
+        if (realOut != dbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                realOut[i] = dbuf[i];
+            }
+    }
+
+    void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) {
+        if (!m_dplanf) initDouble();
+        fft_double_type *const BQ_R__ dbuf = m_dbuf;
+        fftw_complex *const BQ_R__ dpacked = m_dpacked;
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            dpacked[i][0] = log(magIn[i] + 0.000001);
+        }
+        for (int i = 0; i <= hs; ++i) {
+            dpacked[i][1] = 0.0;
+        }
+        fftw_execute(m_dplani);
+        const int sz = m_size;
+#ifndef FFTW_SINGLE_ONLY
+        if (cepOut != dbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                cepOut[i] = dbuf[i];
+            }
+    }
+
+    void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) {
+        if (!m_fplanf) initFloat();
+        packFloat(realIn, imagIn);
+        fftwf_execute(m_fplani);
+        const int sz = m_size;
+        fft_float_type *const BQ_R__ fbuf = m_fbuf;
+#ifndef FFTW_DOUBLE_ONLY
+        if (realOut != fbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                realOut[i] = fbuf[i];
+            }
+    }
+
+    void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) {
+        if (!m_fplanf) initFloat();
+        v_convert((fft_float_type *)m_fpacked, complexIn, m_size + 2);
+        fftwf_execute(m_fplani);
+        const int sz = m_size;
+        fft_float_type *const BQ_R__ fbuf = m_fbuf;
+#ifndef FFTW_DOUBLE_ONLY
+        if (realOut != fbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                realOut[i] = fbuf[i];
+            }
+    }
+
+    void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) {
+        if (!m_fplanf) initFloat();
+        const int hs = m_size/2;
+        fftwf_complex *const BQ_R__ fpacked = m_fpacked;
+        for (int i = 0; i <= hs; ++i) {
+            fpacked[i][0] = magIn[i] * cosf(phaseIn[i]);
+        }
+        for (int i = 0; i <= hs; ++i) {
+            fpacked[i][1] = magIn[i] * sinf(phaseIn[i]);
+        }
+        fftwf_execute(m_fplani);
+        const int sz = m_size;
+        fft_float_type *const BQ_R__ fbuf = m_fbuf;
+#ifndef FFTW_DOUBLE_ONLY
+        if (realOut != fbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                realOut[i] = fbuf[i];
+            }
+    }
+
+    void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) {
+        if (!m_fplanf) initFloat();
+        const int hs = m_size/2;
+        fftwf_complex *const BQ_R__ fpacked = m_fpacked;
+        for (int i = 0; i <= hs; ++i) {
+            fpacked[i][0] = logf(magIn[i] + 0.000001f);
+        }
+        for (int i = 0; i <= hs; ++i) {
+            fpacked[i][1] = 0.f;
+        }
+        fftwf_execute(m_fplani);
+        const int sz = m_size;
+        fft_float_type *const BQ_R__ fbuf = m_fbuf;
+#ifndef FFTW_DOUBLE_ONLY
+        if (cepOut != fbuf)
+#endif
+            for (int i = 0; i < sz; ++i) {
+                cepOut[i] = fbuf[i];
+            }
+    }
+
+private:
+    fftwf_plan m_fplanf;
+    fftwf_plan m_fplani;
+#ifdef FFTW_DOUBLE_ONLY
+    double *m_fbuf;
+#else
+    float *m_fbuf;
+#endif
+    fftwf_complex *m_fpacked;
+    fftw_plan m_dplanf;
+    fftw_plan m_dplani;
+#ifdef FFTW_SINGLE_ONLY
+    float *m_dbuf;
+#else
+    double *m_dbuf;
+#endif
+    fftw_complex *m_dpacked;
+    const int m_size;
+    static int m_extantf;
+    static int m_extantd;
+#ifdef NO_THREADING
+    void initMutex() {}
+    void destroyMutex() {}
+    void lock() {}
+    void unlock() {}
+#else
+#ifdef _WIN32
+    static HANDLE m_commonMutex;
+    void initMutex() { m_commonMutex = CreateMutex(NULL, FALSE, NULL); }
+    void destroyMutex() { CloseHandle(m_commonMutex); }
+    void lock() { WaitForSingleObject(m_commonMutex, INFINITE); }
+    void unlock() { ReleaseMutex(m_commonMutex); }
+#else
+    static pthread_mutex_t m_commonMutex;
+    void initMutex() { pthread_mutex_init(&m_commonMutex, 0); }
+    void destroyMutex() { pthread_mutex_destroy(&m_commonMutex); }
+    void lock() { pthread_mutex_lock(&m_commonMutex); }
+    void unlock() { pthread_mutex_unlock(&m_commonMutex); }
+#endif
+#endif
+};
+
+int
+D_FFTW::m_extantf = 0;
+
+int
+D_FFTW::m_extantd = 0;
+
+#ifndef NO_THREADING
+#ifdef _WIN32
+HANDLE D_FFTW::m_commonMutex;
+#else
+pthread_mutex_t D_FFTW::m_commonMutex;
+#endif
+#endif
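+
+// All D_FFTW instances share the single mutex defined above, because
+// the FFTW planner API is not thread-safe: plan construction and
+// destruction, and the wisdom I/O that accompanies them, must be
+// serialised across instances.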
+
+#endif /* HAVE_FFTW3 */
+
+#ifdef HAVE_SFFT
+
+/*
+ Define SFFT_DOUBLE_ONLY to make all uses of SFFT functions be
+ double-precision (so "float" FFTs are calculated by casting to
+ doubles and using the double-precision SFFT function).
+
+ Define SFFT_SINGLE_ONLY to make all uses of SFFT functions be
+ single-precision (so "double" FFTs are calculated by casting to
+ floats and using the single-precision SFFT function).
+
+ Neither of these flags is desirable for either performance or
+ precision.
+*/
+
+//#define SFFT_DOUBLE_ONLY 1
+//#define SFFT_SINGLE_ONLY 1
+
+#if defined(SFFT_DOUBLE_ONLY) && defined(SFFT_SINGLE_ONLY)
+// Can't meaningfully define both
+#error Can only define one of SFFT_DOUBLE_ONLY and SFFT_SINGLE_ONLY
+#endif
+
+#ifdef SFFT_DOUBLE_ONLY
+#define fft_float_type double
+#define FLAG_SFFT_FLOAT SFFT_DOUBLE
+#define atan2f atan2
+#define sqrtf sqrt
+#define cosf cos
+#define sinf sin
+#define logf log
+#else
+#define FLAG_SFFT_FLOAT SFFT_FLOAT
+#define fft_float_type float
+#endif /* SFFT_DOUBLE_ONLY */
+
+#ifdef SFFT_SINGLE_ONLY
+#define fft_double_type float
+#define FLAG_SFFT_DOUBLE SFFT_FLOAT
+#define atan2 atan2f
+#define sqrt sqrtf
+#define cos cosf
+#define sin sinf
+#define log logf
+#else
+#define FLAG_SFFT_DOUBLE SFFT_DOUBLE
+#define fft_double_type double
+#endif /* SFFT_SINGLE_ONLY */
+
+class D_SFFT : public FFTImpl
+{
+public:
+    D_SFFT(int size) :
+        m_fplanf(0), m_fplani(0), m_dplanf(0), m_dplani(0), m_size(size)
+    {
+    }
+
+    ~D_SFFT() {
+        if (m_fplanf) {
+            sfft_free(m_fplanf);
+            sfft_free(m_fplani);
+            deallocate(m_fbuf);
+            deallocate(m_fresult);
+        }
+        if (m_dplanf) {
+            sfft_free(m_dplanf);
+            sfft_free(m_dplani);
+            deallocate(m_dbuf);
+            deallocate(m_dresult);
+        }
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+#ifdef SFFT_SINGLE_ONLY
+        return FFT::SinglePrecision;
+#else
+#ifdef SFFT_DOUBLE_ONLY
+        return FFT::DoublePrecision;
+#else
+        return FFT::SinglePrecision | FFT::DoublePrecision;
+#endif
+#endif
+    }
+
+    void initFloat() {
+        if (m_fplanf) return;
+        m_fbuf = allocate<fft_float_type>(2 * m_size);
+        m_fresult = allocate<fft_float_type>(2 * m_size);
+        m_fplanf = sfft_init(m_size, SFFT_FORWARD | FLAG_SFFT_FLOAT);
+        m_fplani = sfft_init(m_size, SFFT_BACKWARD | FLAG_SFFT_FLOAT);
+        if (!m_fplanf || !m_fplani) {
+            if (!m_fplanf) {
+                std::cerr << "D_SFFT: Failed to construct forward float transform for size " << m_size << " (check SFFT library's target configuration)" << std::endl;
+            } else {
+                std::cerr << "D_SFFT: Failed to construct inverse float transform for size " << m_size << " (check SFFT library's target configuration)" << std::endl;
+            }
+#ifndef NO_EXCEPTIONS
+            throw FFT::InternalError;
+#else
+            abort();
+#endif
+        }
+    }
+
+    void initDouble() {
+        if (m_dplanf) return;
+        m_dbuf = allocate<fft_double_type>(2 * m_size);
+        m_dresult = allocate<fft_double_type>(2 * m_size);
+        m_dplanf = sfft_init(m_size, SFFT_FORWARD | FLAG_SFFT_DOUBLE);
+        m_dplani = sfft_init(m_size, SFFT_BACKWARD | FLAG_SFFT_DOUBLE);
+        if (!m_dplanf || !m_dplani) {
+            if (!m_dplanf) {
+                std::cerr << "D_SFFT: Failed to construct forward double transform for size " << m_size << " (check SFFT library's target configuration)" << std::endl;
+            } else {
+                std::cerr << "D_SFFT: Failed to construct inverse double transform for size " << m_size << " (check SFFT library's target configuration)" << std::endl;
+            }
+#ifndef NO_EXCEPTIONS
+            throw FFT::InternalError;
+#else
+            abort();
+#endif
+        }
+    }
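+
+    // Unlike the half-spectrum backends above, SFFT computes
+    // full-length complex transforms: the work buffers allocated in
+    // the init functions hold m_size complex values (2 * m_size
+    // scalars), and the conjugate upper half of the spectrum is
+    // synthesised explicitly with mirror() before each inverse
+    // transform.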
+
+    void packFloat(const float *BQ_R__ re, const float *BQ_R__ im, fft_float_type *target, int n) {
+        for (int i = 0; i < n; ++i) target[i*2] = re[i];
+        if (im) {
+            for (int i = 0; i < n; ++i) target[i*2+1] = im[i]; 
+        } else {
+            for (int i = 0; i < n; ++i) target[i*2+1] = 0.f;
+        }                
+    }
+
+    void packDouble(const double *BQ_R__ re, const double *BQ_R__ im, fft_double_type *target, int n) {
+        for (int i = 0; i < n; ++i) target[i*2] = re[i];
+        if (im) {
+            for (int i = 0; i < n; ++i) target[i*2+1] = im[i];
+        } else {
+            for (int i = 0; i < n; ++i) target[i*2+1] = 0.0;
+        }                
+    }
+
+    void unpackFloat(const fft_float_type *source, float *BQ_R__ re, float *BQ_R__ im, int n) {
+        for (int i = 0; i < n; ++i) re[i] = source[i*2];
+        if (im) {
+            for (int i = 0; i < n; ++i) im[i] = source[i*2+1];
+        }
+    }        
+
+    void unpackDouble(const fft_double_type *source, double *BQ_R__ re, double *BQ_R__ im, int n) {
+        for (int i = 0; i < n; ++i) re[i] = source[i*2];
+        if (im) {
+            for (int i = 0; i < n; ++i) im[i] = source[i*2+1];
+        }
+    }        
+
+    template<typename T>
+    void mirror(T *BQ_R__ cplx, int n) {
+        for (int i = 1; i <= n/2; ++i) {
+            int j = n-i;
+            cplx[j*2] = cplx[i*2];
+            cplx[j*2+1] = -cplx[i*2+1];
+        }
+    }
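+
+    // For example, with n = 8 the caller supplies bins 0..4 (DC up to
+    // Nyquist); mirror() fills bins 5..7 with the conjugates of bins
+    // 3..1, so the inverse transform of the completed spectrum is
+    // purely real.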
+
+    void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) {
+        if (!m_dplanf) initDouble();
+        packDouble(realIn, 0, m_dbuf, m_size);
+        sfft_execute(m_dplanf, m_dbuf, m_dresult);
+        unpackDouble(m_dresult, realOut, imagOut, m_size/2+1);
+    }
+
+    void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) {
+        if (!m_dplanf) initDouble();
+        packDouble(realIn, 0, m_dbuf, m_size);
+        sfft_execute(m_dplanf, m_dbuf, m_dresult);
+        v_convert(complexOut, m_dresult, m_size+2); // i.e. m_size/2+1 complex
+    }
+
+    void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) {
+        if (!m_dplanf) initDouble();
+        packDouble(realIn, 0, m_dbuf, m_size);
+        sfft_execute(m_dplanf, m_dbuf, m_dresult);
+        v_cartesian_interleaved_to_polar(magOut, phaseOut,
+                                         m_dresult, m_size/2+1);
+    }
+
+    void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) {
+        if (!m_dplanf) initDouble();
+        packDouble(realIn, 0, m_dbuf, m_size);
+        sfft_execute(m_dplanf, m_dbuf, m_dresult);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(m_dresult[i*2] * m_dresult[i*2] +
+                             m_dresult[i*2+1] * m_dresult[i*2+1]);
+        }
+    }
+
+    void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) {
+        if (!m_fplanf) initFloat();
+        packFloat(realIn, 0, m_fbuf, m_size);
+        sfft_execute(m_fplanf, m_fbuf, m_fresult);
+        unpackFloat(m_fresult, realOut, imagOut, m_size/2+1);
+    }
+
+    void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) {
+        if (!m_fplanf) initFloat();
+        packFloat(realIn, 0, m_fbuf, m_size);
+        sfft_execute(m_fplanf, m_fbuf, m_fresult);
+        v_convert(complexOut, m_fresult, m_size+2); // i.e. m_size/2+1 complex
+    }
+
+    void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) {
+        if (!m_fplanf) initFloat();
+        packFloat(realIn, 0, m_fbuf, m_size);
+        sfft_execute(m_fplanf, m_fbuf, m_fresult);
+        v_cartesian_interleaved_to_polar(magOut, phaseOut,
+                                         m_fresult, m_size/2+1);
+    }
+
+    void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) {
+        if (!m_fplanf) initFloat();
+        packFloat(realIn, 0, m_fbuf, m_size);
+        sfft_execute(m_fplanf, m_fbuf, m_fresult);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrtf(m_fresult[i*2] * m_fresult[i*2] +
+                              m_fresult[i*2+1] * m_fresult[i*2+1]);
+        }
+    }
+
+    void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) {
+        if (!m_dplanf) initDouble();
+        packDouble(realIn, imagIn, m_dbuf, m_size/2+1);
+        mirror(m_dbuf, m_size);
+        sfft_execute(m_dplani, m_dbuf, m_dresult);
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_dresult[i*2];
+        }
+    }
+
+    void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) {
+        if (!m_dplanf) initDouble();
+        v_convert(m_dbuf, complexIn, m_size + 2);
+        mirror(m_dbuf, m_size);
+        sfft_execute(m_dplani, m_dbuf, m_dresult);
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_dresult[i*2];
+        }
+    }
+
+    void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) {
+        if (!m_dplanf) initDouble();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_dbuf[i*2] = magIn[i] * cos(phaseIn[i]);
+            m_dbuf[i*2+1] = magIn[i] * sin(phaseIn[i]);
+        }
+        mirror(m_dbuf, m_size);
+        sfft_execute(m_dplani, m_dbuf, m_dresult);
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_dresult[i*2];
+        }
+    }
+
+    void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) {
+        if (!m_dplanf) initDouble();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_dbuf[i*2] = log(magIn[i] + 0.000001);
+            m_dbuf[i*2+1] = 0.0;
+        }
+        mirror(m_dbuf, m_size);
+        sfft_execute(m_dplani, m_dbuf, m_dresult);
+        for (int i = 0; i < m_size; ++i) {
+            cepOut[i] = m_dresult[i*2];
+        }
+    }
+
+    void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) {
+        if (!m_fplanf) initFloat();
+        packFloat(realIn, imagIn, m_fbuf, m_size/2+1);
+        mirror(m_fbuf, m_size);
+        sfft_execute(m_fplani, m_fbuf, m_fresult);
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_fresult[i*2];
+        }
+    }
+
+    void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) {
+        if (!m_fplanf) initFloat();
+        v_convert(m_fbuf, complexIn, m_size + 2);
+        mirror(m_fbuf, m_size);
+        sfft_execute(m_fplani, m_fbuf, m_fresult);
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_fresult[i*2];
+        }
+    }
+
+    void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) {
+        if (!m_fplanf) initFloat();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_fbuf[i*2] = magIn[i] * cosf(phaseIn[i]);
+            m_fbuf[i*2+1] = magIn[i] * sinf(phaseIn[i]);
+        }
+        mirror(m_fbuf, m_size);
+        sfft_execute(m_fplani, m_fbuf, m_fresult);
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_fresult[i*2];
+        }
+    }
+
+    void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) {
+        if (!m_fplanf) initFloat();
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_fbuf[i*2] = logf(magIn[i] + 0.000001f);
+            m_fbuf[i*2+1] = 0.0f;
+        }
+        mirror(m_fbuf, m_size);
+        sfft_execute(m_fplani, m_fbuf, m_fresult);
+        for (int i = 0; i < m_size; ++i) {
+            cepOut[i] = m_fresult[i*2];
+        }
+    }
+
+private:
+    sfft_plan_t *m_fplanf;
+    sfft_plan_t *m_fplani;
+    fft_float_type *m_fbuf;
+    fft_float_type *m_fresult;
+
+    sfft_plan_t *m_dplanf;
+    sfft_plan_t *m_dplani;
+    fft_double_type *m_dbuf;
+    fft_double_type *m_dresult;
+
+    const int m_size;
+};
+
+#endif /* HAVE_SFFT */
+
+#ifdef HAVE_KISSFFT
+
+class D_KISSFFT : public FFTImpl
+{
+public:
+    D_KISSFFT(int size) :
+        m_size(size),
+        m_fplanf(0),  
+        m_fplani(0)
+    {
+#ifdef FIXED_POINT
+#error KISSFFT is not configured for float values
+#endif
+        if (sizeof(kiss_fft_scalar) != sizeof(float)) {
+            std::cerr << "ERROR: KISSFFT is not configured for float values"
+                      << std::endl;
+        }
+
+        m_fbuf = new kiss_fft_scalar[m_size + 2];
+        m_fpacked = new kiss_fft_cpx[m_size + 2];
+        m_fplanf = kiss_fftr_alloc(m_size, 0, NULL, NULL);
+        m_fplani = kiss_fftr_alloc(m_size, 1, NULL, NULL);
+    }
+
+    ~D_KISSFFT() {
+        kiss_fftr_free(m_fplanf);
+        kiss_fftr_free(m_fplani);
+        kiss_fft_cleanup();
+
+        delete[] m_fbuf;
+        delete[] m_fpacked;
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+        return FFT::SinglePrecision;
+    }
+
+    void initFloat() { }
+    void initDouble() { }
+
+    void packFloat(const float *BQ_R__ re, const float *BQ_R__ im) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[i].r = re[i];
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                m_fpacked[i].i = im[i];
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                m_fpacked[i].i = 0.f;
+            }
+        }
+    }
+
+    void unpackFloat(float *BQ_R__ re, float *BQ_R__ im) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = m_fpacked[i].r;
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                im[i] = m_fpacked[i].i;
+            }
+        }
+    }        
+
+    void packDouble(const double *BQ_R__ re, const double *BQ_R__ im) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[i].r = float(re[i]);
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                m_fpacked[i].i = float(im[i]);
+            }
+        } else {
+            for (int i = 0; i <= hs; ++i) {
+                m_fpacked[i].i = 0.f;
+            }
+        }
+    }
+
+    void unpackDouble(double *BQ_R__ re, double *BQ_R__ im) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            re[i] = double(m_fpacked[i].r);
+        }
+        if (im) {
+            for (int i = 0; i <= hs; ++i) {
+                im[i] = double(m_fpacked[i].i);
+            }
+        }
+    }        
+
+    void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) {
+
+        v_convert(m_fbuf, realIn, m_size);
+        kiss_fftr(m_fplanf, m_fbuf, m_fpacked);
+        unpackDouble(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) {
+
+        v_convert(m_fbuf, realIn, m_size);
+        kiss_fftr(m_fplanf, m_fbuf, m_fpacked);
+        v_convert(complexOut, (float *)m_fpacked, m_size + 2);
+    }
+
+    void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) {
+
+        for (int i = 0; i < m_size; ++i) {
+            m_fbuf[i] = float(realIn[i]);
+        }
+
+        kiss_fftr(m_fplanf, m_fbuf, m_fpacked);
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(double(m_fpacked[i].r) * double(m_fpacked[i].r) +
+                             double(m_fpacked[i].i) * double(m_fpacked[i].i));
+        }
+
+        for (int i = 0; i <= hs; ++i) {
+            phaseOut[i] = atan2(double(m_fpacked[i].i), double(m_fpacked[i].r));
+        }
+    }
+
+    void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) {
+
+        for (int i = 0; i < m_size; ++i) {
+            m_fbuf[i] = float(realIn[i]);
+        }
+
+        kiss_fftr(m_fplanf, m_fbuf, m_fpacked);
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(double(m_fpacked[i].r) * double(m_fpacked[i].r) +
+                             double(m_fpacked[i].i) * double(m_fpacked[i].i));
+        }
+    }
+
+    void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) {
+
+        kiss_fftr(m_fplanf, realIn, m_fpacked);
+        unpackFloat(realOut, imagOut);
+    }
+
+    void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) {
+
+        kiss_fftr(m_fplanf, realIn, (kiss_fft_cpx *)complexOut);
+    }
+
+    void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) {
+
+        kiss_fftr(m_fplanf, realIn, m_fpacked);
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrtf(m_fpacked[i].r * m_fpacked[i].r +
+                              m_fpacked[i].i * m_fpacked[i].i);
+        }
+
+        for (int i = 0; i <= hs; ++i) {
+            phaseOut[i] = atan2f(m_fpacked[i].i, m_fpacked[i].r);
+        }
+    }
+
+    void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) {
+
+        kiss_fftr(m_fplanf, realIn, m_fpacked);
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrtf(m_fpacked[i].r * m_fpacked[i].r +
+                              m_fpacked[i].i * m_fpacked[i].i);
+        }
+    }
+
+    void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) {
+
+        packDouble(realIn, imagIn);
+
+        kiss_fftri(m_fplani, m_fpacked, m_fbuf);
+
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_fbuf[i];
+        }
+    }
+
+    void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) {
+
+        v_convert((float *)m_fpacked, complexIn, m_size + 2);
+
+        kiss_fftri(m_fplani, m_fpacked, m_fbuf);
+
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_fbuf[i];
+        }
+    }
+
+    void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) {
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[i].r = float(magIn[i] * cos(phaseIn[i]));
+            m_fpacked[i].i = float(magIn[i] * sin(phaseIn[i]));
+        }
+
+        kiss_fftri(m_fplani, m_fpacked, m_fbuf);
+
+        for (int i = 0; i < m_size; ++i) {
+            realOut[i] = m_fbuf[i];
+        }
+    }
+
+    void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) {
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[i].r = float(log(magIn[i] + 0.000001));
+            m_fpacked[i].i = 0.0f;
+        }
+
+        kiss_fftri(m_fplani, m_fpacked, m_fbuf);
+
+        for (int i = 0; i < m_size; ++i) {
+            cepOut[i] = m_fbuf[i];
+        }
+    }
+    
+    void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) {
+
+        packFloat(realIn, imagIn);
+        kiss_fftri(m_fplani, m_fpacked, realOut);
+    }
+
+    void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) {
+
+        v_copy((float *)m_fpacked, complexIn, m_size + 2);
+        kiss_fftri(m_fplani, m_fpacked, realOut);
+    }
+
+    void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) {
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[i].r = magIn[i] * cosf(phaseIn[i]);
+            m_fpacked[i].i = magIn[i] * sinf(phaseIn[i]);
+        }
+
+        kiss_fftri(m_fplani, m_fpacked, realOut);
+    }
+
+    void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) {
+
+        const int hs = m_size/2;
+
+        for (int i = 0; i <= hs; ++i) {
+            m_fpacked[i].r = logf(magIn[i] + 0.000001f);
+            m_fpacked[i].i = 0.0f;
+        }
+
+        kiss_fftri(m_fplani, m_fpacked, cepOut);
+    }
+
+private:
+    const int m_size;
+    kiss_fftr_cfg m_fplanf;
+    kiss_fftr_cfg m_fplani;
+    kiss_fft_scalar *m_fbuf;
+    kiss_fft_cpx *m_fpacked;
+};
+
+#endif /* HAVE_KISSFFT */
+
+#ifdef USE_BUILTIN_FFT
+
+class D_Cross : public FFTImpl
+{
+public:
+    D_Cross(int size) : m_size(size), m_table(0) {
+        
+        m_a = new double[size];
+        m_b = new double[size];
+        m_c = new double[size];
+        m_d = new double[size];
+
+        m_table = new int[m_size];
+    
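+        // Build the bit-reversal permutation table. The size is known
+        // to be a power of two, so "bits" (found below as the index of
+        // the single set bit) is log2(size), and m_table[i] is i with
+        // its low "bits" bits reversed.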
+        int bits;
+        int i, j, k, m;
+
+        for (i = 0; ; ++i) {
+            if (m_size & (1 << i)) {
+                bits = i;
+                break;
+            }
+        }
+        
+        for (i = 0; i < m_size; ++i) {
+            
+            m = i;
+            
+            for (j = k = 0; j < bits; ++j) {
+                k = (k << 1) | (m & 1);
+                m >>= 1;
+            }
+            
+            m_table[i] = k;
+        }
+    }
+
+    ~D_Cross() {
+        delete[] m_table;
+        delete[] m_a;
+        delete[] m_b;
+        delete[] m_c;
+        delete[] m_d;
+    }
+
+    FFT::Precisions
+    getSupportedPrecisions() const {
+        return FFT::DoublePrecision;
+    }
+
+    void initFloat() { }
+    void initDouble() { }
+
+    void forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut) {
+        basefft(false, realIn, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) realOut[i] = m_c[i];
+        if (imagOut) {
+            for (int i = 0; i <= hs; ++i) imagOut[i] = m_d[i];
+        }
+    }
+
+    void forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut) {
+        basefft(false, realIn, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) complexOut[i*2] = m_c[i];
+        for (int i = 0; i <= hs; ++i) complexOut[i*2+1] = m_d[i];
+    }
+
+    void forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut) {
+        basefft(false, realIn, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(m_c[i] * m_c[i] + m_d[i] * m_d[i]);
+            phaseOut[i] = atan2(m_d[i], m_c[i]);
+        }
+    }
+
+    void forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut) {
+        basefft(false, realIn, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(m_c[i] * m_c[i] + m_d[i] * m_d[i]);
+        }
+    }
+
+    void forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut) {
+        for (int i = 0; i < m_size; ++i) m_a[i] = realIn[i];
+        basefft(false, m_a, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) realOut[i] = m_c[i];
+        if (imagOut) {
+            for (int i = 0; i <= hs; ++i) imagOut[i] = m_d[i];
+        }
+    }
+
+    void forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut) {
+        for (int i = 0; i < m_size; ++i) m_a[i] = realIn[i];
+        basefft(false, m_a, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) complexOut[i*2] = m_c[i];
+        for (int i = 0; i <= hs; ++i) complexOut[i*2+1] = m_d[i];
+    }
+
+    void forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut) {
+        for (int i = 0; i < m_size; ++i) m_a[i] = realIn[i];
+        basefft(false, m_a, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(m_c[i] * m_c[i] + m_d[i] * m_d[i]);
+            phaseOut[i] = atan2(m_d[i], m_c[i]);
+        }
+    }
+
+    void forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut) {
+        for (int i = 0; i < m_size; ++i) m_a[i] = realIn[i];
+        basefft(false, m_a, 0, m_c, m_d);
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            magOut[i] = sqrt(m_c[i] * m_c[i] + m_d[i] * m_d[i]);
+        }
+    }
+
+    void inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real = realIn[i];
+            double imag = imagIn[i];
+            m_a[i] = real;
+            m_b[i] = imag;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = -imag;
+            }
+        }
+        basefft(true, m_a, m_b, realOut, m_d);
+    }
+
+    void inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real = complexIn[i*2];
+            double imag = complexIn[i*2+1];
+            m_a[i] = real;
+            m_b[i] = imag;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = -imag;
+            }
+        }
+        basefft(true, m_a, m_b, realOut, m_d);
+    }
+
+    void inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real = magIn[i] * cos(phaseIn[i]);
+            double imag = magIn[i] * sin(phaseIn[i]);
+            m_a[i] = real;
+            m_b[i] = imag;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = -imag;
+            }
+        }
+        basefft(true, m_a, m_b, realOut, m_d);
+    }
+
+    void inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            double real = log(magIn[i] + 0.000001);
+            m_a[i] = real;
+            m_b[i] = 0.0;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = 0.0;
+            }
+        }
+        basefft(true, m_a, m_b, cepOut, m_d);
+    }
+
+    void inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            float real = realIn[i];
+            float imag = imagIn[i];
+            m_a[i] = real;
+            m_b[i] = imag;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = -imag;
+            }
+        }
+        basefft(true, m_a, m_b, m_c, m_d);
+        for (int i = 0; i < m_size; ++i) realOut[i] = m_c[i];
+    }
+
+    void inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            float real = complexIn[i*2];
+            float imag = complexIn[i*2+1];
+            m_a[i] = real;
+            m_b[i] = imag;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = -imag;
+            }
+        }
+        basefft(true, m_a, m_b, m_c, m_d);
+        for (int i = 0; i < m_size; ++i) realOut[i] = m_c[i];
+    }
+
+    void inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            float real = magIn[i] * cosf(phaseIn[i]);
+            float imag = magIn[i] * sinf(phaseIn[i]);
+            m_a[i] = real;
+            m_b[i] = imag;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = -imag;
+            }
+        }
+        basefft(true, m_a, m_b, m_c, m_d);
+        for (int i = 0; i < m_size; ++i) realOut[i] = m_c[i];
+    }
+
+    void inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut) {
+        const int hs = m_size/2;
+        for (int i = 0; i <= hs; ++i) {
+            float real = logf(magIn[i] + 0.000001f);
+            m_a[i] = real;
+            m_b[i] = 0.0;
+            if (i > 0) {
+                m_a[m_size-i] = real;
+                m_b[m_size-i] = 0.0;
+            }
+        }
+        basefft(true, m_a, m_b, m_c, m_d);
+        for (int i = 0; i < m_size; ++i) cepOut[i] = m_c[i];
+    }
+
+private:
+    const int m_size;
+    int *m_table;
+    double *m_a;
+    double *m_b;
+    double *m_c;
+    double *m_d;
+    void basefft(bool inverse, const double *BQ_R__ ri, const double *BQ_R__ ii, double *BQ_R__ ro, double *BQ_R__ io);
+};
+
+void
+D_Cross::basefft(bool inverse, const double *BQ_R__ ri, const double *BQ_R__ ii, double *BQ_R__ ro, double *BQ_R__ io)
+{
+    if (!ri || !ro || !io) return;
+
+    int i, j, k, m;
+    int blockSize, blockEnd;
+
+    double tr, ti;
+
+    double angle = 2.0 * M_PI;
+    if (inverse) angle = -angle;
+
+    const int n = m_size;
+
+    if (ii) {
+	for (i = 0; i < n; ++i) {
+	    ro[m_table[i]] = ri[i];
+        }
+	for (i = 0; i < n; ++i) {
+	    io[m_table[i]] = ii[i];
+	}
+    } else {
+	for (i = 0; i < n; ++i) {
+	    ro[m_table[i]] = ri[i];
+        }
+	for (i = 0; i < n; ++i) {
+	    io[m_table[i]] = 0.0;
+	}
+    }
+
+    blockEnd = 1;
+
+    for (blockSize = 2; blockSize <= n; blockSize <<= 1) {
+
+	double delta = angle / (double)blockSize;
+	double sm2 = -sin(-2 * delta);
+	double sm1 = -sin(-delta);
+	double cm2 = cos(-2 * delta);
+	double cm1 = cos(-delta);
+	double w = 2 * cm1;
+	double ar[3], ai[3];
+
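+	// The twiddle factors cos(m * delta) and sin(m * delta) are
+	// generated incrementally below using the recurrence
+	// x[m] = 2 * cos(delta) * x[m-1] - x[m-2] (w = 2 * cos(delta)),
+	// which holds for both cosine and sine and avoids a trig call
+	// per butterfly.
+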
+	for (i = 0; i < n; i += blockSize) {
+
+	    ar[2] = cm2;
+	    ar[1] = cm1;
+
+	    ai[2] = sm2;
+	    ai[1] = sm1;
+
+	    for (j = i, m = 0; m < blockEnd; j++, m++) {
+
+		ar[0] = w * ar[1] - ar[2];
+		ar[2] = ar[1];
+		ar[1] = ar[0];
+
+		ai[0] = w * ai[1] - ai[2];
+		ai[2] = ai[1];
+		ai[1] = ai[0];
+
+		k = j + blockEnd;
+		tr = ar[0] * ro[k] - ai[0] * io[k];
+		ti = ar[0] * io[k] + ai[0] * ro[k];
+
+		ro[k] = ro[j] - tr;
+		io[k] = io[j] - ti;
+
+		ro[j] += tr;
+		io[j] += ti;
+	    }
+	}
+
+	blockEnd = blockSize;
+    }
+
+/* fftw doesn't rescale, so nor will we
+
+    if (inverse) {
+
+	double denom = (double)n;
+
+	for (i = 0; i < n; i++) {
+	    ro[i] /= denom;
+	    io[i] /= denom;
+	}
+    }
+*/
+}
+
+#endif /* USE_BUILTIN_FFT */
+
+} /* end namespace FFTs */
+
+std::string
+FFT::m_implementation;
+
+std::set<std::string>
+FFT::getImplementations()
+{
+    std::set<std::string> impls;
+#ifdef HAVE_IPP
+    impls.insert("ipp");
+#endif
+#ifdef HAVE_FFTW3
+    impls.insert("fftw");
+#endif
+#ifdef HAVE_KISSFFT
+    impls.insert("kissfft");
+#endif
+#ifdef HAVE_VDSP
+    impls.insert("vdsp");
+#endif
+#ifdef HAVE_MEDIALIB
+    impls.insert("medialib");
+#endif
+#ifdef HAVE_OPENMAX
+    impls.insert("openmax");
+#endif
+#ifdef HAVE_SFFT
+    impls.insert("sfft");
+#endif
+#ifdef USE_BUILTIN_FFT
+    impls.insert("cross");
+#endif
+    return impls;
+}
+
+void
+FFT::pickDefaultImplementation()
+{
+    if (m_implementation != "") return;
+
+    std::set<std::string> impls = getImplementations();
+
+    std::string best = "cross";
+    if (impls.find("kissfft") != impls.end()) best = "kissfft";
+    if (impls.find("medialib") != impls.end()) best = "medialib";
+    if (impls.find("openmax") != impls.end()) best = "openmax";
+    if (impls.find("sfft") != impls.end()) best = "sfft";
+    if (impls.find("fftw") != impls.end()) best = "fftw";
+    if (impls.find("vdsp") != impls.end()) best = "vdsp";
+    if (impls.find("ipp") != impls.end()) best = "ipp";
+    
+    m_implementation = best;
+}
+
+std::string
+FFT::getDefaultImplementation()
+{
+    return m_implementation;
+}
+
+void
+FFT::setDefaultImplementation(std::string i)
+{
+    m_implementation = i;
+}
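+
+// Example usage (a sketch, not part of this file): a caller can pin a
+// backend before constructing any FFT objects; otherwise the first
+// constructor call picks the best compiled-in implementation via
+// pickDefaultImplementation().
+//
+//   FFT::setDefaultImplementation("fftw"); // must be compiled in
+//   FFT fft(1024);                         // power-of-two sizes only
+//   fft.forward(timeDomain, realOut, imagOut);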
+
+FFT::FFT(int size, int debugLevel) :
+    d(0)
+{
+    if ((size < 2) ||
+        (size & (size-1))) {
+        std::cerr << "FFT::FFT(" << size << "): power-of-two sizes only supported, minimum size 2" << std::endl;
+#ifndef NO_EXCEPTIONS
+        throw InvalidSize;
+#else
+        abort();
+#endif
+    }
+
+    if (m_implementation == "") pickDefaultImplementation();
+    std::string impl = m_implementation;
+
+    if (debugLevel > 0) {
+        std::cerr << "FFT::FFT(" << size << "): using implementation: "
+                  << impl << std::endl;
+    }
+
+    if (impl == "ipp") {
+#ifdef HAVE_IPP
+        d = new FFTs::D_IPP(size);
+#endif
+    } else if (impl == "fftw") {
+#ifdef HAVE_FFTW3
+        d = new FFTs::D_FFTW(size);
+#endif
+    } else if (impl == "kissfft") {        
+#ifdef HAVE_KISSFFT
+        d = new FFTs::D_KISSFFT(size);
+#endif
+    } else if (impl == "vdsp") {
+#ifdef HAVE_VDSP
+        d = new FFTs::D_VDSP(size);
+#endif
+    } else if (impl == "medialib") {
+#ifdef HAVE_MEDIALIB
+        d = new FFTs::D_MEDIALIB(size);
+#endif
+    } else if (impl == "openmax") {
+#ifdef HAVE_OPENMAX
+        d = new FFTs::D_OPENMAX(size);
+#endif
+    } else if (impl == "sfft") {
+#ifdef HAVE_SFFT
+        d = new FFTs::D_SFFT(size);
+#endif
+    } else if (impl == "cross") {
+#ifdef USE_BUILTIN_FFT
+        d = new FFTs::D_Cross(size);
+#endif
+    }
+
+    if (!d) {
+        std::cerr << "FFT::FFT(" << size << "): ERROR: implementation "
+                  << impl << " is not compiled in" << std::endl;
+#ifndef NO_EXCEPTIONS
+        throw InvalidImplementation;
+#else
+        abort();
+#endif
+    }
+}
+
+FFT::~FFT()
+{
+    delete d;
+}
+
+#ifndef NO_EXCEPTIONS
+#define CHECK_NOT_NULL(x) \
+    if (!(x)) { \
+        std::cerr << "FFT: ERROR: Null argument " #x << std::endl;  \
+        throw NullArgument; \
+    }
+#else
+#define CHECK_NOT_NULL(x) \
+    if (!(x)) { \
+        std::cerr << "FFT: ERROR: Null argument " #x << std::endl;  \
+        std::cerr << "FFT: Would be throwing NullArgument here, if exceptions were not disabled" << std::endl;  \
+        return; \
+    }
+#endif
+
+void
+FFT::forward(const double *BQ_R__ realIn, double *BQ_R__ realOut, double *BQ_R__ imagOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(realOut);
+    CHECK_NOT_NULL(imagOut);
+    d->forward(realIn, realOut, imagOut);
+}
+
+void
+FFT::forwardInterleaved(const double *BQ_R__ realIn, double *BQ_R__ complexOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(complexOut);
+    d->forwardInterleaved(realIn, complexOut);
+}
+
+void
+FFT::forwardPolar(const double *BQ_R__ realIn, double *BQ_R__ magOut, double *BQ_R__ phaseOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(magOut);
+    CHECK_NOT_NULL(phaseOut);
+    d->forwardPolar(realIn, magOut, phaseOut);
+}
+
+void
+FFT::forwardMagnitude(const double *BQ_R__ realIn, double *BQ_R__ magOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(magOut);
+    d->forwardMagnitude(realIn, magOut);
+}
+
+void
+FFT::forward(const float *BQ_R__ realIn, float *BQ_R__ realOut, float *BQ_R__ imagOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(realOut);
+    CHECK_NOT_NULL(imagOut);
+    d->forward(realIn, realOut, imagOut);
+}
+
+void
+FFT::forwardInterleaved(const float *BQ_R__ realIn, float *BQ_R__ complexOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(complexOut);
+    d->forwardInterleaved(realIn, complexOut);
+}
+
+void
+FFT::forwardPolar(const float *BQ_R__ realIn, float *BQ_R__ magOut, float *BQ_R__ phaseOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(magOut);
+    CHECK_NOT_NULL(phaseOut);
+    d->forwardPolar(realIn, magOut, phaseOut);
+}
+
+void
+FFT::forwardMagnitude(const float *BQ_R__ realIn, float *BQ_R__ magOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(magOut);
+    d->forwardMagnitude(realIn, magOut);
+}
+
+void
+FFT::inverse(const double *BQ_R__ realIn, const double *BQ_R__ imagIn, double *BQ_R__ realOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(imagIn);
+    CHECK_NOT_NULL(realOut);
+    d->inverse(realIn, imagIn, realOut);
+}
+
+void
+FFT::inverseInterleaved(const double *BQ_R__ complexIn, double *BQ_R__ realOut)
+{
+    CHECK_NOT_NULL(complexIn);
+    CHECK_NOT_NULL(realOut);
+    d->inverseInterleaved(complexIn, realOut);
+}
+
+void
+FFT::inversePolar(const double *BQ_R__ magIn, const double *BQ_R__ phaseIn, double *BQ_R__ realOut)
+{
+    CHECK_NOT_NULL(magIn);
+    CHECK_NOT_NULL(phaseIn);
+    CHECK_NOT_NULL(realOut);
+    d->inversePolar(magIn, phaseIn, realOut);
+}
+
+void
+FFT::inverseCepstral(const double *BQ_R__ magIn, double *BQ_R__ cepOut)
+{
+    CHECK_NOT_NULL(magIn);
+    CHECK_NOT_NULL(cepOut);
+    d->inverseCepstral(magIn, cepOut);
+}
+
+void
+FFT::inverse(const float *BQ_R__ realIn, const float *BQ_R__ imagIn, float *BQ_R__ realOut)
+{
+    CHECK_NOT_NULL(realIn);
+    CHECK_NOT_NULL(imagIn);
+    CHECK_NOT_NULL(realOut);
+    d->inverse(realIn, imagIn, realOut);
+}
+
+void
+FFT::inverseInterleaved(const float *BQ_R__ complexIn, float *BQ_R__ realOut)
+{
+    CHECK_NOT_NULL(complexIn);
+    CHECK_NOT_NULL(realOut);
+    d->inverseInterleaved(complexIn, realOut);
+}
+
+void
+FFT::inversePolar(const float *BQ_R__ magIn, const float *BQ_R__ phaseIn, float *BQ_R__ realOut)
+{
+    CHECK_NOT_NULL(magIn);
+    CHECK_NOT_NULL(phaseIn);
+    CHECK_NOT_NULL(realOut);
+    d->inversePolar(magIn, phaseIn, realOut);
+}
+
+void
+FFT::inverseCepstral(const float *BQ_R__ magIn, float *BQ_R__ cepOut)
+{
+    CHECK_NOT_NULL(magIn);
+    CHECK_NOT_NULL(cepOut);
+    d->inverseCepstral(magIn, cepOut);
+}
+
+void
+FFT::initFloat() 
+{
+    d->initFloat();
+}
+
+void
+FFT::initDouble() 
+{
+    d->initDouble();
+}
+
+FFT::Precisions
+FFT::getSupportedPrecisions() const
+{
+    return d->getSupportedPrecisions();
+}
+
+#ifdef FFT_MEASUREMENT
+
+std::string
+FFT::tune()
+{
+    std::ostringstream os;
+    os << "FFT::tune()..." << std::endl;
+
+    std::vector<int> sizes;
+    std::map<FFTImpl *, int> candidates;
+    std::map<int, int> wins;
+
+    sizes.push_back(512);
+    sizes.push_back(1024);
+    sizes.push_back(4096);
+    
+    for (unsigned int si = 0; si < sizes.size(); ++si) {
+
+        int size = sizes[si];
+
+        while (!candidates.empty()) {
+            delete candidates.begin()->first;
+            candidates.erase(candidates.begin());
+        }
+
+        FFTImpl *d;
+        
+#ifdef HAVE_IPP
+        std::cerr << "Constructing new IPP FFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_IPP(size);
+        d->initFloat();
+        d->initDouble();
+        candidates[d] = 0;
+#endif
+        
+#ifdef HAVE_FFTW3
+        os << "Constructing new FFTW3 FFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_FFTW(size);
+        d->initFloat();
+        d->initDouble();
+        candidates[d] = 1;
+#endif
+
+#ifdef HAVE_KISSFFT
+        os << "Constructing new KISSFFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_KISSFFT(size);
+        d->initFloat();
+        d->initDouble();
+        candidates[d] = 2;
+#endif        
+
+#ifdef USE_BUILTIN_FFT
+        os << "Constructing new Cross FFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_Cross(size);
+        d->initFloat();
+        d->initDouble();
+        candidates[d] = 3;
+#endif
+        
+#ifdef HAVE_VDSP
+        os << "Constructing new vDSP FFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_VDSP(size);
+        d->initFloat();
+        d->initDouble();
+        candidates[d] = 4;
+#endif
+        
+#ifdef HAVE_MEDIALIB
+        std::cerr << "Constructing new MediaLib FFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_MEDIALIB(size);
+        d->initFloat();
+        d->initDouble();
+        candidates[d] = 5;
+#endif
+        
+#ifdef HAVE_OPENMAX
+        os << "Constructing new OpenMAX FFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_OPENMAX(size);
+        d->initFloat();
+        d->initDouble();
+        candidates[d] = 6;
+#endif
+        
+#ifdef HAVE_SFFT
+        os << "Constructing new SFFT FFT object for size " << size << "..." << std::endl;
+        d = new FFTs::D_SFFT(size);
+//        d->initFloat();
+        d->initDouble();
+        candidates[d] = 7;
+#endif
+
+        os << "CLOCKS_PER_SEC = " << CLOCKS_PER_SEC << std::endl;
+        float divisor = float(CLOCKS_PER_SEC) / 1000.f;
+        
+        os << "Timing order is: ";
+        for (std::map<FFTImpl *, int>::iterator ci = candidates.begin();
+             ci != candidates.end(); ++ci) {
+            os << ci->second << " ";
+        }
+        os << std::endl;
+
+        int iterations = 500;
+        os << "Iterations: " << iterations << std::endl;
+
+        double *da = new double[size];
+        double *db = new double[size];
+        double *dc = new double[size];
+        double *dd = new double[size];
+        double *di = new double[size + 2];
+        double *dj = new double[size + 2];
+
+        float *fa = new float[size];
+        float *fb = new float[size];
+        float *fc = new float[size];
+        float *fd = new float[size];
+        float *fi = new float[size + 2];
+        float *fj = new float[size + 2];
+
+        for (int type = 0; type < 16; ++type) {
+    
+            // skip the single-precision variants for now
+            if ((type > 3 && type < 8) ||
+                (type > 11)) {
+                continue;
+            }
+
+            if (type > 7) {
+                // inverse transform: bigger inputs, to simulate the
+                // fact that the forward transform is unscaled
+                for (int i = 0; i < size; ++i) {
+                    da[i] = drand48() * size;
+                    fa[i] = da[i];
+                    db[i] = drand48() * size;
+                    fb[i] = db[i];
+                }
+            } else {    
+                for (int i = 0; i < size; ++i) {
+                    da[i] = drand48();
+                    fa[i] = da[i];
+                    db[i] = drand48();
+                    fb[i] = db[i];
+                }
+            }
+                
+            for (int i = 0; i < size + 2; ++i) {
+                di[i] = drand48();
+                fi[i] = di[i];
+            }
+
+            int low = -1;
+            int lowscore = 0;
+
+            const char *names[] = {
+
+                "Forward Cartesian Double",
+                "Forward Interleaved Double",
+                "Forward Polar Double",
+                "Forward Magnitude Double",
+                "Forward Cartesian Float",
+                "Forward Interleaved Float",
+                "Forward Polar Float",
+                "Forward Magnitude Float",
+
+                "Inverse Cartesian Double",
+                "Inverse Interleaved Double",
+                "Inverse Polar Double",
+                "Inverse Cepstral Double",
+                "Inverse Cartesian Float",
+                "Inverse Interleaved Float",
+                "Inverse Polar Float",
+                "Inverse Cepstral Float"
+            };
+            os << names[type] << " :: ";
+
+            for (std::map<FFTImpl *, int>::iterator ci = candidates.begin();
+                 ci != candidates.end(); ++ci) {
+
+                FFTImpl *d = ci->first;
+
+                double mean = 0;
+
+                clock_t start = clock();
+                
+                for (int i = 0; i < iterations; ++i) {
+
+                    if (i == 0) {
+                        for (int j = 0; j < size; ++j) {
+                            dc[j] = 0;
+                            dd[j] = 0;
+                            fc[j] = 0;
+                            fd[j] = 0;
+                            fj[j] = 0;
+                            dj[j] = 0;
+                        }
+                    }
+
+                    switch (type) {
+                    case 0: d->forward(da, dc, dd); break;
+                    case 1: d->forwardInterleaved(da, dj); break;
+                    case 2: d->forwardPolar(da, dc, dd); break;
+                    case 3: d->forwardMagnitude(da, dc); break;
+                    case 4: d->forward(fa, fc, fd); break;
+                    case 5: d->forwardInterleaved(fa, fj); break;
+                    case 6: d->forwardPolar(fa, fc, fd); break;
+                    case 7: d->forwardMagnitude(fa, fc); break;
+                    case 8: d->inverse(da, db, dc); break;
+                    case 9: d->inverseInterleaved(di, dc); break;
+                    case 10: d->inversePolar(da, db, dc); break;
+                    case 11: d->inverseCepstral(da, dc); break;
+                    case 12: d->inverse(fa, fb, fc); break;
+                    case 13: d->inverseInterleaved(fi, fc); break;
+                    case 14: d->inversePolar(fa, fb, fc); break;
+                    case 15: d->inverseCepstral(fa, fc); break;
+                    }
+
+                    if (i == 0) {
+                        mean = 0;
+                        for (int j = 0; j < size; ++j) {
+                            mean += dc[j];
+                            mean += dd[j];
+                            mean += fc[j];
+                            mean += fd[j];
+                            mean += fj[j];
+                            mean += dj[j];
+                        }
+                        mean /= size * 6;
+                    }
+                }
+
+                clock_t end = clock();
+
+                os << float(end - start)/divisor << " (" << mean << ") ";
+
+                if (low == -1 || (end - start) < lowscore) {
+                    low = ci->second;
+                    lowscore = end - start;
+                }
+            }
+
+            os << std::endl;
+
+            os << "  size " << size << ", type " << type << ": fastest is " << low << " (time " << float(lowscore)/divisor << ")" << std::endl;
+
+            wins[low]++;
+        }
+        
+        delete[] fa;
+        delete[] fb;
+        delete[] fc;
+        delete[] fd;
+        delete[] fi;
+        delete[] fj;
+        delete[] da;
+        delete[] db;
+        delete[] dc;
+        delete[] dd;
+        delete[] di;
+        delete[] dj;
+    }
+
+    while (!candidates.empty()) {
+        delete candidates.begin()->first;
+        candidates.erase(candidates.begin());
+    }
+
+    int bestscore = 0;
+    int best = -1;
+
+    for (std::map<int, int>::iterator wi = wins.begin(); wi != wins.end(); ++wi) {
+        if (best == -1 || wi->second > bestscore) {
+            best = wi->first;
+            bestscore = wi->second;
+        }
+    }
+
+    os << "overall winner is " << best << " with " << bestscore << " wins" << std::endl;
+
+    return os.str();
+}
+
+#endif
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqfft/test/Compares.h	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,77 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqfft
+
+    A small library wrapping various FFT implementations for some
+    common audio processing use cases.
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#ifndef TEST_COMPARES_H
+#define TEST_COMPARES_H
+
+// These macros are used for comparing generated results, which will
+// not always be bit-exact. Adding 0.1 to each value gives us a
+// little more fuzz in qFuzzyCompare (which ultimately does the
+// comparison); in particular it makes comparisons against zero
+// meaningful, since qFuzzyCompare's tolerance is relative and so
+// cannot accommodate an expected value of exactly zero.
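+//
+// For example (illustrative): COMPARE_ZERO(1e-14) expands to
+// QCOMPARE(1e-14 + 0.1, 0.1), which passes because the difference is
+// negligible relative to 0.1, whereas QCOMPARE(1e-14, 0.0) would
+// fail outright.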
+
+#define COMPARE_ZERO(a) \
+    QCOMPARE(a + 0.1, 0.1)
+
+#define COMPARE_ZERO_F(a) \
+    QCOMPARE(a + 0.1f, 0.1f)
+
+#define COMPARE_FUZZIER(a, b) \
+    QCOMPARE(a + 0.1, b + 0.1)
+
+#define COMPARE_FUZZIER_F(a, b) \
+    QCOMPARE(a + 0.1f, b + 0.1f)
+
+#define COMPARE_ALL(a, n) \
+    for (int cmp_i = 0; cmp_i < (int)(sizeof(a)/sizeof(a[0])); ++cmp_i) { \
+        COMPARE_FUZZIER(a[cmp_i], n); \
+    }
+
+#define COMPARE_SCALED(a, b, s)						\
+    for (int cmp_i = 0; cmp_i < (int)(sizeof(a)/sizeof(a[0])); ++cmp_i) { \
+        COMPARE_FUZZIER(a[cmp_i] / s, b[cmp_i]); \
+    }
+
+#define COMPARE_ALL_F(a, n) \
+    for (int cmp_i = 0; cmp_i < (int)(sizeof(a)/sizeof(a[0])); ++cmp_i) { \
+        COMPARE_FUZZIER_F(a[cmp_i], n); \
+    }
+
+#define COMPARE_SCALED_F(a, b, s)						\
+    for (int cmp_i = 0; cmp_i < (int)(sizeof(a)/sizeof(a[0])); ++cmp_i) { \
+        COMPARE_FUZZIER_F(a[cmp_i] / s, b[cmp_i]); \
+    }
+
+#endif
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqfft/test/TestFFT.h	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,510 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqfft
+
+    A small library wrapping various FFT implementations for some
+    common audio processing use cases.
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#ifndef TEST_FFT_H
+#define TEST_FFT_H
+
+#include "bqfft/FFT.h"
+
+#include <QObject>
+#include <QtTest>
+
+#include <cstdio>
+
+#include "Compares.h"
+
+namespace breakfastquay {
+
+class TestFFT : public QObject
+{
+    Q_OBJECT
+
+private:
+    void idat() {
+        QTest::addColumn<QString>("implementation");
+        std::set<std::string> impls = FFT::getImplementations();
+        foreach (std::string i, impls) {
+            QTest::newRow(i.c_str()) << i.c_str();
+        }
+    }
+    QString ifetch() {
+        QFETCH(QString, implementation);
+        FFT::setDefaultImplementation(implementation.toLocal8Bit().data());
+        return implementation;
+    }
+
+    bool lackSingle() {
+        return !(FFT(4).getSupportedPrecisions() & FFT::SinglePrecision);
+    }
+    bool lackDouble() {
+        return !(FFT(4).getSupportedPrecisions() & FFT::DoublePrecision);
+    }
+
+private slots:
+
+    void checkD() {
+        QString impl = ifetch();
+    }
+
+    void dc() {
+        ifetch();
+	// DC-only signal. The DC bin is purely real
+	double in[] = { 1, 1, 1, 1 };
+	double re[3], im[3];
+	FFT(4).forward(in, re, im);
+	QCOMPARE(re[0], 4.0);
+	COMPARE_ZERO(re[1]);
+	COMPARE_ZERO(re[2]);
+	COMPARE_ALL(im, 0.0);
+	double back[4];
+        FFT(4).inverse(re, im, back);
+	COMPARE_SCALED(back, in, 4);
+    }
+
+    void sine() {
+        ifetch();
+	// Sine. Output is purely imaginary
+	double in[] = { 0, 1, 0, -1 };
+	double re[3], im[3];
+	FFT(4).forward(in, re, im);
+	COMPARE_ALL(re, 0.0);
+	COMPARE_ZERO(im[0]);
+	QCOMPARE(im[1], -2.0);
+	COMPARE_ZERO(im[2]);
+	double back[4];
+        FFT(4).inverse(re, im, back);
+	COMPARE_SCALED(back, in, 4);
+    }
+
+    void cosine() {
+        ifetch();
+	// Cosine. Output is purely real
+	double in[] = { 1, 0, -1, 0 };
+	double re[3], im[3];
+	FFT(4).forward(in, re, im);
+	COMPARE_ZERO(re[0]);
+	QCOMPARE(re[1], 2.0);
+	COMPARE_ZERO(re[2]);
+	COMPARE_ALL(im, 0.0);
+	double back[4];
+        FFT(4).inverse(re, im, back);
+	COMPARE_SCALED(back, in, 4);
+    }
+	
+    void sineCosine() {
+        ifetch();
+	// Sine and cosine mixed
+	double in[] = { 0.5, 1, -0.5, -1 };
+	double re[3], im[3];
+	FFT(4).forward(in, re, im);
+	COMPARE_ZERO(re[0]);
+	QCOMPARE(re[1], 1.0);
+	COMPARE_ZERO(re[2]);
+	COMPARE_ZERO(im[0]);
+	QCOMPARE(im[1], -2.0);
+	COMPARE_ZERO(im[2]);
+	double back[4];
+        FFT(4).inverse(re, im, back);
+	COMPARE_SCALED(back, in, 4);
+    }
+
+    void nyquist() {
+        ifetch();
+	double in[] = { 1, -1, 1, -1 };
+	double re[3], im[3];
+	FFT(4).forward(in, re, im);
+	COMPARE_ZERO(re[0]);
+	COMPARE_ZERO(re[1]);
+	QCOMPARE(re[2], 4.0);
+	COMPARE_ALL(im, 0.0);
+	double back[4];
+        FFT(4).inverse(re, im, back);
+	COMPARE_SCALED(back, in, 4);
+    }
+	
+    void interleaved() {
+        ifetch();
+	// Sine and cosine mixed, test output format
+	double in[] = { 0.5, 1, -0.5, -1 };
+	double out[6];
+	FFT(4).forwardInterleaved(in, out);
+	COMPARE_ZERO(out[0]);
+	COMPARE_ZERO(out[1]);
+	QCOMPARE(out[2], 1.0);
+	QCOMPARE(out[3], -2.0);
+	COMPARE_ZERO(out[4]);
+	COMPARE_ZERO(out[5]);
+	double back[4];
+        FFT(4).inverseInterleaved(out, back);
+	COMPARE_SCALED(back, in, 4);
+    }
+
+    void cosinePolar() {
+        ifetch();
+	double in[] = { 1, 0, -1, 0 };
+	double mag[3], phase[3];
+	FFT(4).forwardPolar(in, mag, phase);
+	COMPARE_ZERO(mag[0]);
+	QCOMPARE(mag[1], 2.0);
+	COMPARE_ZERO(mag[2]);
+        // No meaningful tests for phase[i] where mag[i]==0 (phase
+        // could legitimately be anything)
+	COMPARE_ZERO(phase[1]);
+	double back[4];
+        FFT(4).inversePolar(mag, phase, back);
+	COMPARE_SCALED(back, in, 4);
+    }
+
+    void sinePolar() {
+        ifetch();
+	double in[] = { 0, 1, 0, -1 };
+	double mag[3], phase[3];
+	FFT(4).forwardPolar(in, mag, phase);
+	COMPARE_ZERO(mag[0]);
+	QCOMPARE(mag[1], 2.0);
+	COMPARE_ZERO(mag[2]);
+        // No meaningful tests for phase[i] where mag[i]==0 (phase
+        // could legitimately be anything)
+	QCOMPARE(phase[1], -M_PI/2.0);
+	double back[4];
+        FFT(4).inversePolar(mag, phase, back);
+	COMPARE_SCALED(back, in, 4);
+    }
+
+    void magnitude() {
+        ifetch();
+	// Sine and cosine mixed
+	double in[] = { 0.5, 1, -0.5, -1 };
+	double out[3];
+	FFT(4).forwardMagnitude(in, out);
+	COMPARE_ZERO(out[0]);
+	QCOMPARE(float(out[1]), sqrtf(5.0));
+	COMPARE_ZERO(out[2]);
+    }
+
+    void dirac() {
+        ifetch();
+	double in[] = { 1, 0, 0, 0 };
+	double re[3], im[3];
+	FFT(4).forward(in, re, im);
+	QCOMPARE(re[0], 1.0);
+	QCOMPARE(re[1], 1.0);
+	QCOMPARE(re[2], 1.0);
+	COMPARE_ALL(im, 0.0);
+	double back[4];
+        FFT(4).inverse(re, im, back);
+	COMPARE_SCALED(back, in, 4);
+    }
+
+    void cepstrum() {
+        ifetch();
+	double in[] = { 1, 0, 0, 0, 1, 0, 0, 0 };
+	double mag[5];
+	FFT(8).forwardMagnitude(in, mag);
+	double cep[8];
+	FFT(8).inverseCepstral(mag, cep);
+	COMPARE_ZERO(cep[1]);
+	COMPARE_ZERO(cep[2]);
+	COMPARE_ZERO(cep[3]);
+	COMPARE_ZERO(cep[5]);
+	COMPARE_ZERO(cep[6]);
+	COMPARE_ZERO(cep[7]);
+	QVERIFY(fabs(-6.561181 - cep[0]/8) < 0.000001);
+	QVERIFY(fabs( 7.254329 - cep[4]/8) < 0.000001);
+    }
+
+    void forwardArrayBounds() {
+        ifetch();
+	// initialise bins to something recognisable, so we can tell
+	// if they haven't been written
+	double in[] = { 1, 1, -1, -1 };
+	double re[] = { 999, 999, 999, 999, 999 };
+	double im[] = { 999, 999, 999, 999, 999 };
+	FFT(4).forward(in, re+1, im+1);
+	// And check we haven't overrun the arrays
+	QCOMPARE(re[0], 999.0);
+	QCOMPARE(im[0], 999.0);
+	QCOMPARE(re[4], 999.0);
+	QCOMPARE(im[4], 999.0);
+    }
+
+    void inverseArrayBounds() {
+        ifetch();
+	// initialise bins to something recognisable, so we can tell
+	// if they haven't been written
+	double re[] = { 0, 1, 0 };
+	double im[] = { 0, -2, 0 };
+	double out[] = { 999, 999, 999, 999, 999, 999 };
+	FFT(4).inverse(re, im, out+1);
+	// And check we haven't overrun the arrays
+	QCOMPARE(out[0], 999.0);
+	QCOMPARE(out[5], 999.0);
+    }
+
+    void checkF() {
+        QString impl = ifetch();
+    }
+
+    void dcF() {
+        ifetch();
+	// DC-only signal. The DC bin is purely real
+	float in[] = { 1, 1, 1, 1 };
+	float re[3], im[3];
+	FFT(4).forward(in, re, im);
+	QCOMPARE(re[0], 4.0f);
+	COMPARE_ZERO_F(re[1]);
+	COMPARE_ZERO_F(re[2]);
+	COMPARE_ALL_F(im, 0.0f);
+	float back[4];
+        FFT(4).inverse(re, im, back);
+	COMPARE_SCALED_F(back, in, 4);
+    }
+
+    void sineF() {
+        ifetch();
+	// Sine. Output is purely imaginary
+	float in[] = { 0, 1, 0, -1 };
+	float re[3], im[3];
+	FFT(4).forward(in, re, im);
+	COMPARE_ALL_F(re, 0.0f);
+	COMPARE_ZERO_F(im[0]);
+	QCOMPARE(im[1], -2.0f);
+	COMPARE_ZERO_F(im[2]);
+	float back[4];
+        FFT(4).inverse(re, im, back);
+	COMPARE_SCALED_F(back, in, 4);
+    }
+
+    void cosineF() {
+        ifetch();
+	// Cosine. Output is purely real
+	float in[] = { 1, 0, -1, 0 };
+	float re[3], im[3];
+	FFT(4).forward(in, re, im);
+	COMPARE_ZERO_F(re[0]);
+	QCOMPARE(re[1], 2.0f);
+	COMPARE_ZERO_F(re[2]);
+	COMPARE_ALL_F(im, 0.0f);
+	float back[4];
+        FFT(4).inverse(re, im, back);
+	COMPARE_SCALED_F(back, in, 4);
+    }
+	
+    void sineCosineF() {
+        ifetch();
+	// Sine and cosine mixed
+	float in[] = { 0.5, 1, -0.5, -1 };
+	float re[3], im[3];
+	FFT(4).forward(in, re, im);
+	COMPARE_ZERO_F(re[0]);
+	QCOMPARE(re[1], 1.0f);
+	COMPARE_ZERO_F(re[2]);
+	COMPARE_ZERO_F(im[0]);
+	QCOMPARE(im[1], -2.0f);
+	COMPARE_ZERO_F(im[2]);
+	float back[4];
+        FFT(4).inverse(re, im, back);
+	COMPARE_SCALED_F(back, in, 4);
+    }
+
+    void nyquistF() {
+        ifetch();
+	float in[] = { 1, -1, 1, -1 };
+	float re[3], im[3];
+	FFT(4).forward(in, re, im);
+	COMPARE_ZERO_F(re[0]);
+	COMPARE_ZERO_F(re[1]);
+	QCOMPARE(re[2], 4.0f);
+	COMPARE_ALL_F(im, 0.0f);
+	float back[4];
+        FFT(4).inverse(re, im, back);
+	COMPARE_SCALED_F(back, in, 4);
+    }
+	
+    void interleavedF() {
+        ifetch();
+	// Sine and cosine mixed, test output format
+	float in[] = { 0.5, 1, -0.5, -1 };
+	float out[6];
+	FFT(4).forwardInterleaved(in, out);
+	COMPARE_ZERO_F(out[0]);
+	COMPARE_ZERO_F(out[1]);
+	QCOMPARE(out[2], 1.0f);
+	QCOMPARE(out[3], -2.0f);
+	COMPARE_ZERO_F(out[4]);
+	COMPARE_ZERO_F(out[5]);
+	float back[4];
+        FFT(4).inverseInterleaved(out, back);
+	COMPARE_SCALED_F(back, in, 4);
+    }
+
+    void cosinePolarF() {
+        ifetch();
+	float in[] = { 1, 0, -1, 0 };
+	float mag[3], phase[3];
+	FFT(4).forwardPolar(in, mag, phase);
+	COMPARE_ZERO_F(mag[0]);
+	QCOMPARE(mag[1], 2.0f);
+	COMPARE_ZERO_F(mag[2]);
+        // No meaningful tests for phase[i] where mag[i]==0 (phase
+        // could legitimately be anything)
+	COMPARE_ZERO_F(phase[1]);
+	float back[4];
+        FFT(4).inversePolar(mag, phase, back);
+	COMPARE_SCALED_F(back, in, 4);
+    }
+
+    void sinePolarF() {
+        ifetch();
+	float in[] = { 0, 1, 0, -1 };
+	float mag[3], phase[3];
+	FFT(4).forwardPolar(in, mag, phase);
+	COMPARE_ZERO_F(mag[0]);
+	QCOMPARE(mag[1], 2.0f);
+	COMPARE_ZERO_F(mag[2]);
+        // No meaningful tests for phase[i] where mag[i]==0 (phase
+        // could legitimately be anything)
+	QCOMPARE(phase[1], -float(M_PI)/2.0f);
+	float back[4];
+        FFT(4).inversePolar(mag, phase, back);
+	COMPARE_SCALED_F(back, in, 4);
+    }
+
+    void magnitudeF() {
+        ifetch();
+	// Sine and cosine mixed
+	float in[] = { 0.5, 1, -0.5, -1 };
+	float out[3];
+	FFT(4).forwardMagnitude(in, out);
+	COMPARE_ZERO_F(out[0]);
+	QCOMPARE(float(out[1]), sqrtf(5.0f));
+	COMPARE_ZERO_F(out[2]);
+    }
+
+    void diracF() {
+        ifetch();
+	float in[] = { 1, 0, 0, 0 };
+	float re[3], im[3];
+	FFT(4).forward(in, re, im);
+	QCOMPARE(re[0], 1.0f);
+	QCOMPARE(re[1], 1.0f);
+	QCOMPARE(re[2], 1.0f);
+	COMPARE_ALL_F(im, 0.0f);
+	float back[4];
+        FFT(4).inverse(re, im, back);
+	COMPARE_SCALED_F(back, in, 4);
+    }
+
+    void cepstrumF() {
+        ifetch();
+	float in[] = { 1, 0, 0, 0, 1, 0, 0, 0 };
+	float mag[5];
+	FFT(8).forwardMagnitude(in, mag);
+	float cep[8];
+	FFT(8).inverseCepstral(mag, cep);
+	COMPARE_ZERO_F(cep[1]);
+	COMPARE_ZERO_F(cep[2]);
+	COMPARE_ZERO_F(cep[3]);
+	COMPARE_ZERO_F(cep[5]);
+	COMPARE_ZERO_F(cep[6]);
+	COMPARE_ZERO_F(cep[7]);
+	QVERIFY(fabsf(-6.561181 - cep[0]/8) < 0.000001);
+	QVERIFY(fabsf( 7.254329 - cep[4]/8) < 0.000001);
+    }
+
+    void forwardArrayBoundsF() {
+        ifetch();
+	// initialise bins to something recognisable, so we can tell
+	// if they haven't been written
+	float in[] = { 1, 1, -1, -1 };
+	float re[] = { 999, 999, 999, 999, 999 };
+	float im[] = { 999, 999, 999, 999, 999 };
+	FFT(4).forward(in, re+1, im+1);
+	// And check we haven't overrun the arrays
+	QCOMPARE(re[0], 999.0f);
+	QCOMPARE(im[0], 999.0f);
+	QCOMPARE(re[4], 999.0f);
+	QCOMPARE(im[4], 999.0f);
+    }
+
+    void inverseArrayBoundsF() {
+        ifetch();
+	// initialise bins to something recognisable, so we can tell
+	// if they haven't been written
+	float re[] = { 0, 1, 0 };
+	float im[] = { 0, -2, 0 };
+	float out[] = { 999, 999, 999, 999, 999, 999 };
+	FFT(4).inverse(re, im, out+1);
+	// And check we haven't overrun the arrays
+	QCOMPARE(out[0], 999.0f);
+	QCOMPARE(out[5], 999.0f);
+    }
+
+    void checkD_data() { idat(); }
+    void dc_data() { idat(); }
+    void sine_data() { idat(); }
+    void cosine_data() { idat(); }
+    void sineCosine_data() { idat(); }
+    void nyquist_data() { idat(); }
+    void interleaved_data() { idat(); }
+    void cosinePolar_data() { idat(); }
+    void sinePolar_data() { idat(); }
+    void magnitude_data() { idat(); }
+    void dirac_data() { idat(); }
+    void cepstrum_data() { idat(); }
+    void forwardArrayBounds_data() { idat(); }
+    void inverseArrayBounds_data() { idat(); }
+
+    void checkF_data() { idat(); }
+    void dcF_data() { idat(); }
+    void sineF_data() { idat(); }
+    void cosineF_data() { idat(); }
+    void sineCosineF_data() { idat(); }
+    void nyquistF_data() { idat(); }
+    void interleavedF_data() { idat(); }
+    void cosinePolarF_data() { idat(); }
+    void sinePolarF_data() { idat(); }
+    void magnitudeF_data() { idat(); }
+    void diracF_data() { idat(); }
+    void cepstrumF_data() { idat(); }
+    void forwardArrayBoundsF_data() { idat(); }
+    void inverseArrayBoundsF_data() { idat(); }
+};
+
+}
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqfft/test/main.cpp	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,27 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+/* Copyright Chris Cannam - All Rights Reserved */
+
+#include "TestFFT.h"
+#include <QtTest>
+
+#include <iostream>
+
+int main(int argc, char *argv[])
+{
+    int good = 0, bad = 0;
+
+    QCoreApplication app(argc, argv);
+
+    breakfastquay::TestFFT tf;
+    if (QTest::qExec(&tf, argc, argv) == 0) ++good;
+    else ++bad;
+
+    if (bad > 0) {
+	std::cerr << "\n********* " << bad << " test suite(s) failed!\n" << std::endl;
+	return 1;
+    } else {
+        std::cerr << "All tests passed" << std::endl;
+        return 0;
+    }
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqfft/test/test.pro	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,30 @@
+
+TEMPLATE = app
+TARGET = test-fft
+win*: TARGET = "TestFFT"
+
+QT += testlib
+QT -= gui
+
+DESTDIR = .
+QMAKE_LIBDIR += . ..
+
+LIBS += -lbqfft -lfftw3 -lfftw3f
+
+INCLUDEPATH += . .. ../../bqvec
+DEPENDPATH += . .. ../../bqvec
+
+HEADERS += TestFFT.h
+SOURCES += main.cpp
+
+!win32 {
+    !macx* {
+        QMAKE_POST_LINK=valgrind $${DESTDIR}/$${TARGET}
+    }
+    macx* {
+        QMAKE_POST_LINK=$${DESTDIR}/$${TARGET}.app/Contents/MacOS/$${TARGET}
+    }
+}
+
+win32-g++:QMAKE_POST_LINK=$${DESTDIR}/$${TARGET}.exe
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/COPYING	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,50 @@
+
+The following terms apply to the code in the src/ directory:
+
+    Copyright 2007-2014 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+
+
+The following terms apply to the code in the pommier/ directory:
+
+    Copyright (C) 2011  Julien Pommier
+
+    This software is provided 'as-is', without any express or implied
+    warranty.  In no event will the authors be held liable for any damages
+    arising from the use of this software.
+
+    Permission is granted to anyone to use this software for any purpose,
+    including commercial applications, and to alter it and redistribute it
+    freely, subject to the following restrictions:
+
+    1. The origin of this software must not be misrepresented; you must not
+       claim that you wrote the original software. If you use this software
+       in a product, an acknowledgment in the product documentation would be
+       appreciated but is not required.
+    2. Altered source versions must be plainly marked as such, and must not be
+       misrepresented as being the original software.
+    3. This notice may not be removed or altered from any source distribution.
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/Makefile	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,90 @@
+
+# Add to VECTOR_DEFINES the relevant options for your desired
+# third-party library support.
+#
+# Available options are
+#
+#  -DHAVE_IPP    Intel's Integrated Performance Primitives are available
+#  -DHAVE_VDSP   Apple's Accelerate framework is available
+#
+# These are optional (they affect performance, not function) and you
+# may define more than one of them.
+# 
+# Add any relevant -I flags for include paths as well.
+#
+# Note that you must supply the same flags when including bqvec
+# headers later as you are using now when compiling the library. (You
+# may find it simplest to just add the bqvec source files to your
+# application's build system and not build a bqvec library at all.)
+
+VECTOR_DEFINES	:= -DHAVE_VDSP
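+
+# For example (illustrative; the include path is an assumption and
+# will vary with the install), an IPP build might instead use:
+#
+# VECTOR_DEFINES := -DHAVE_IPP -I/opt/intel/ipp/include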
+
+
+# Add to ALLOCATOR_DEFINES options relating to aligned malloc.
+#
+# Available options are
+#
+#  -DHAVE_POSIX_MEMALIGN     The posix_memalign call is available in sys/mman.h
+#  -DLACK_POSIX_MEMALIGN     The posix_memalign call is not available
+#
+#  -DMALLOC_IS_ALIGNED       The malloc call already returns aligned memory
+#  -DMALLOC_IS_NOT_ALIGNED   The malloc call does not return aligned memory
+#
+#  -DUSE_OWN_ALIGNED_MALLOC  No aligned malloc is available, roll your own
+#
+#  -DLACK_BAD_ALLOC          The C++ library lacks the std::bad_alloc exception
+#
+# Here "aligned" is assumed to mean "aligned enough for whatever
+# vector stuff the space will be used for" which most likely means
+# 16-byte alignment.
+#
+# The default is to use _aligned_malloc when building with Visual C++,
+# system malloc when building on OS/X, and posix_memalign otherwise.
+#
+# Note that you must supply the same flags when including bqvec
+# headers later as you are using now when compiling the library. (You
+# may find it simplest to just add the bqvec source files to your
+# application's build system and not build a bqvec library at all.)
+
+ALLOCATOR_DEFINES := -DMALLOC_IS_ALIGNED
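+
+# For example (illustrative), a typical Linux/glibc build would use:
+#
+# ALLOCATOR_DEFINES := -DHAVE_POSIX_MEMALIGN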
+
+
+SRC_DIR	:= src
+HEADER_DIR := bqvec
+
+SOURCES	:= $(wildcard $(SRC_DIR)/*.cpp)
+HEADERS	:= $(wildcard $(HEADER_DIR)/*.h) $(wildcard $(SRC_DIR)/*.h)
+
+OBJECTS	:= $(SOURCES:.cpp=.o)
+OBJECTS	:= $(OBJECTS:.c=.o)
+
+CXXFLAGS := $(VECTOR_DEFINES) $(ALLOCATOR_DEFINES) -I$(HEADER_DIR) -O3 -ffast-math -Wall -Werror -fpic
+
+LIBRARY	:= libbqvec.a
+
+all:	$(LIBRARY)
+
+$(LIBRARY):	$(OBJECTS)
+	$(AR) rc $@ $^
+
+clean:		
+	rm -f $(OBJECTS)
+
+distclean:	clean
+	rm -f $(LIBRARY)
+
+depend:
+	makedepend -Y -fMakefile $(SOURCES) $(HEADERS)
+
+
+# DO NOT DELETE
+
+src/VectorOpsComplex.o: bqvec/VectorOpsComplex.h bqvec/VectorOps.h
+src/VectorOpsComplex.o: bqvec/Restrict.h bqvec/ComplexTypes.h
+src/Allocators.o: bqvec/Allocators.h bqvec/VectorOps.h bqvec/Restrict.h
+bqvec/RingBuffer.o: bqvec/Barrier.h bqvec/Allocators.h bqvec/VectorOps.h
+bqvec/RingBuffer.o: bqvec/Restrict.h
+bqvec/VectorOpsComplex.o: bqvec/VectorOps.h bqvec/Restrict.h
+bqvec/VectorOpsComplex.o: bqvec/ComplexTypes.h
+bqvec/VectorOps.o: bqvec/Restrict.h
+bqvec/Allocators.o: bqvec/VectorOps.h bqvec/Restrict.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/README.txt	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,17 @@
+
+bqvec
+=====
+
+A small library for efficient vector arithmetic and allocation in C++
+using raw C pointer arrays.
+
+C++ standard required: C++98 (does not use C++11)
+
+Copyright 2007-2015 Particular Programs Ltd.
+
+This code originated as part of the Rubber Band Library written by the
+same authors (see https://bitbucket.org/breakfastquay/rubberband/).
+It has been pulled out into a separate library and relicensed under a
+more permissive licence: a BSD/MIT-style licence, as opposed to the
+GPL used for Rubber Band.  See the file COPYING for details.
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/bqvec/Allocators.h	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,286 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqvec
+
+    A small library for vector arithmetic and allocation in C++ using
+    raw C pointer arrays.
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#ifndef BQVEC_ALLOCATORS_H
+#define BQVEC_ALLOCATORS_H
+
+/*
+ * Aligned and per-channel allocators and deallocators for raw C array
+ * buffers.
+ */
+
+#include <new> // for std::bad_alloc
+#include <stdlib.h>
+
+#ifndef HAVE_POSIX_MEMALIGN
+#ifndef _WIN32
+#ifndef __APPLE__
+#ifndef LACK_POSIX_MEMALIGN
+#define HAVE_POSIX_MEMALIGN
+#endif
+#endif
+#endif
+#endif
+
+#ifndef MALLOC_IS_NOT_ALIGNED
+#ifdef __APPLE__
+#ifndef MALLOC_IS_ALIGNED
+#define MALLOC_IS_ALIGNED
+#endif
+#endif
+#endif
+
+#ifdef HAVE_POSIX_MEMALIGN
+#include <sys/mman.h>
+#endif
+
+#ifdef LACK_BAD_ALLOC
+namespace std { struct bad_alloc { }; }
+#endif
+
+namespace breakfastquay {
+
+template <typename T>
+T *allocate(size_t count)
+{
+    void *ptr = 0;
+    // 32-byte alignment is required for at least OpenMAX
+    static const int alignment = 32;
+#ifdef USE_OWN_ALIGNED_MALLOC
+    // Alignment must be a power of two, bigger than the pointer
+    // size. Stuff the actual malloc'd pointer in just before the
+    // returned value.  This is the least desirable way to do this --
+    // the other options below are all better
+    size_t allocd = count * sizeof(T) + alignment;
+    void *buf = malloc(allocd);
+    if (buf) {
+        char *adj = (char *)buf;
+        while ((unsigned long long)adj & (alignment-1)) --adj;
+        ptr = ((char *)adj) + alignment;
+        ((void **)ptr)[-1] = buf;
+    }
+#else /* !USE_OWN_ALIGNED_MALLOC */
+#ifdef HAVE_POSIX_MEMALIGN
+    if (posix_memalign(&ptr, alignment, count * sizeof(T))) {
+        ptr = malloc(count * sizeof(T));
+    }
+#else /* !HAVE_POSIX_MEMALIGN */
+#ifdef __MSVC__
+    ptr = _aligned_malloc(count * sizeof(T), alignment);
+#else /* !__MSVC__ */
+#ifndef MALLOC_IS_ALIGNED
+#error "No aligned malloc available: define MALLOC_IS_ALIGNED to stick with system malloc, HAVE_POSIX_MEMALIGN if posix_memalign is available, or USE_OWN_ALIGNED_MALLOC to roll our own"
+#endif
+    // Note that malloc always aligns to 16 byte boundaries on OS/X
+    ptr = malloc(count * sizeof(T));
+    (void)alignment; // avoid compiler warning for unused variable
+#endif /* !__MSVC__ */
+#endif /* !HAVE_POSIX_MEMALIGN */
+#endif /* !USE_OWN_ALIGNED_MALLOC */
+    if (!ptr) {
+#ifndef NO_EXCEPTIONS
+        throw(std::bad_alloc());
+#else
+        abort();
+#endif
+    }
+    return (T *)ptr;
+}
+
+#ifdef HAVE_IPP
+
+template <>
+float *allocate(size_t count);
+
+template <>
+double *allocate(size_t count);
+
+#endif
+	
+template <typename T>
+T *allocate_and_zero(size_t count)
+{
+    T *ptr = allocate<T>(count);
+    for (size_t i = 0; i < count; ++i) {
+        ptr[i] = T();
+    }
+    return ptr;
+}
+
+template <typename T>
+void deallocate(T *ptr)
+{
+#ifdef USE_OWN_ALIGNED_MALLOC
+    if (ptr) free(((void **)ptr)[-1]);
+#else /* !USE_OWN_ALIGNED_MALLOC */
+#ifdef __MSVC__
+    if (ptr) _aligned_free((void *)ptr);
+#else /* !__MSVC__ */
+    if (ptr) free((void *)ptr);
+#endif /* !__MSVC__ */
+#endif /* !USE_OWN_ALIGNED_MALLOC */
+}
+
+#ifdef HAVE_IPP
+
+template <>
+void deallocate(float *);
+
+template <>
+void deallocate(double *);
+
+#endif
+
+/// Reallocate preserving contents but leaving additional memory uninitialised	
+template <typename T>
+T *reallocate(T *ptr, size_t oldcount, size_t count)
+{
+    T *newptr = allocate<T>(count);
+    if (oldcount && ptr) {
+        for (size_t i = 0; i < oldcount && i < count; ++i) {
+            newptr[i] = ptr[i];
+        }
+    }
+    if (ptr) deallocate<T>(ptr);
+    return newptr;
+}
+
+/// Reallocate, zeroing all contents
+template <typename T>
+T *reallocate_and_zero(T *ptr, size_t oldcount, size_t count)
+{
+    ptr = reallocate(ptr, oldcount, count);
+    for (size_t i = 0; i < count; ++i) {
+        ptr[i] = T();
+    }
+    return ptr;
+}
+	
+/// Reallocate preserving contents and zeroing any additional memory	
+template <typename T>
+T *reallocate_and_zero_extension(T *ptr, size_t oldcount, size_t count)
+{
+    ptr = reallocate(ptr, oldcount, count);
+    if (count > oldcount) {
+        for (size_t i = oldcount; i < count; ++i) {
+            ptr[i] = T();
+        }
+    }
+    return ptr;
+}
+
+template <typename T>
+T **allocate_channels(size_t channels, size_t count)
+{
+    // We don't want to use the aligned allocate for the channel
+    // pointers, it might even make things slower
+    T **ptr = new T *[channels];
+    for (size_t c = 0; c < channels; ++c) {
+        ptr[c] = allocate<T>(count);
+    }
+    return ptr;
+}
+	
+template <typename T>
+T **allocate_and_zero_channels(size_t channels, size_t count)
+{
+    // We don't want to use the aligned allocate for the channel
+    // pointers, it might even make things slower
+    T **ptr = new T *[channels];
+    for (size_t c = 0; c < channels; ++c) {
+        ptr[c] = allocate_and_zero<T>(count);
+    }
+    return ptr;
+}
+
+template <typename T>
+void deallocate_channels(T **ptr, size_t channels)
+{
+    if (!ptr) return;
+    for (size_t c = 0; c < channels; ++c) {
+        deallocate<T>(ptr[c]);
+    }
+    delete[] ptr;
+}
+	
+template <typename T>
+T **reallocate_channels(T **ptr,
+                        size_t oldchannels, size_t oldcount,
+                        size_t channels, size_t count)
+{
+    T **newptr = allocate_channels<T>(channels, count);
+    if (oldcount && ptr) {
+        for (size_t c = 0; c < channels; ++c) {
+            for (size_t i = 0; i < oldcount && i < count; ++i) {
+                newptr[c][i] = ptr[c][i];
+            }
+        }
+    } 
+    if (ptr) deallocate_channels<T>(ptr, oldchannels);
+    return newptr;
+}
+	
+template <typename T>
+T **reallocate_and_zero_extend_channels(T **ptr,
+                                        size_t oldchannels, size_t oldcount,
+                                        size_t channels, size_t count)
+{
+    T **newptr = allocate_and_zero_channels<T>(channels, count);
+    if (oldcount && ptr) {
+        for (size_t c = 0; c < channels; ++c) {
+            for (size_t i = 0; i < oldcount && i < count; ++i) {
+                newptr[c][i] = ptr[c][i];
+            }
+        }
+    } 
+    if (ptr) deallocate_channels<T>(ptr, oldchannels);
+    return newptr;
+}
+
+/// RAII class to call deallocate() on destruction
+template <typename T>
+class Deallocator
+{
+public:
+    Deallocator(T *t) : m_t(t) { }
+    ~Deallocator() { deallocate<T>(m_t); }
+private:
+    T *m_t;
+};
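+
+// A minimal usage sketch (illustrative):
+//
+//     double *buf = allocate_and_zero<double>(1024); // 32-byte aligned
+//     Deallocator<double> guard(buf); // deallocate() runs at scope exit
+//     // ... use buf as an ordinary C array ...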
+
+}
+
+#endif
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/bqvec/Barrier.h	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,62 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqvec
+
+    A small library for vector arithmetic and allocation in C++ using
+    raw C pointer arrays.
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#ifndef BQVEC_BARRIER_H
+#define BQVEC_BARRIER_H
+
+namespace breakfastquay {
+    extern void system_memorybarrier();
+}
+
+#if defined _WIN32
+
+#define BQ_MBARRIER() ::breakfastquay::system_memorybarrier()
+
+#elif defined __APPLE__
+
+#include <libkern/OSAtomic.h>
+#define BQ_MBARRIER() OSMemoryBarrier()
+
+#elif (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+
+#define BQ_MBARRIER() __sync_synchronize()
+
+#else
+
+#define BQ_MBARRIER() ::breakfastquay::system_memorybarrier()
+
+#endif
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/bqvec/ComplexTypes.h	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,52 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqvec
+
+    A small library for vector arithmetic and allocation in C++ using
+    raw C pointer arrays.
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#ifndef BQVEC_COMPLEX_TYPES_H
+#define BQVEC_COMPLEX_TYPES_H
+
+namespace breakfastquay {
+
+#ifndef NO_COMPLEX_TYPES
+// Convertible with other complex types that store re+im consecutively
+typedef double bq_complex_element_t;
+struct bq_complex_t {
+    bq_complex_element_t re;
+    bq_complex_element_t im;
+};
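+
+// For example (illustrative), interleaved data can typically be
+// reinterpreted between this and other re+im-consecutive types:
+//
+//     bq_complex_t *bc = ...;
+//     std::complex<double> *sc =
+//         reinterpret_cast<std::complex<double> *>(bc);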
+#endif
+
+}
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/bqvec/Range.h	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,57 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqvec
+
+    A small library for vector arithmetic and allocation in C++ using
+    raw C pointer arrays.
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#ifndef BQVEC_RANGE_H
+#define BQVEC_RANGE_H
+
+namespace breakfastquay {
+
+/** Check whether an integer index is in range for a container,
+    avoiding overflows and signed/unsigned comparison warnings.
+*/
+template<typename T, typename C>
+bool in_range_for(const C &container, T i)
+{
+    if (i < 0) return false;
+    if (sizeof(T) > sizeof(typename C::size_type)) {
+	return i < static_cast<T>(container.size());
+    } else {
+	return static_cast<typename C::size_type>(i) < container.size();
+    }
+}
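+
+// Example (illustrative; getIndex() is a hypothetical source of an
+// index of some signed or differently-sized integer type):
+//
+//     std::vector<float> v(16);
+//     long i = getIndex();
+//     if (in_range_for(v, i)) { /* v[i] is safe to access */ }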
+
+}
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/bqvec/Restrict.h	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,51 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqvec
+
+    A small library for vector arithmetic and allocation in C++ using
+    raw C pointer arrays.
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#ifndef BQVEC_RESTRICT_H
+#define BQVEC_RESTRICT_H
+
+#ifdef __MSVC__
+#define BQ_R__ __restrict
+#endif
+
+#ifdef __GNUC__
+#define BQ_R__ __restrict__
+#endif
+
+#ifndef BQ_R__
+#define BQ_R__
+#endif
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/bqvec/RingBuffer.h	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,516 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqvec
+
+    A small library for vector arithmetic and allocation in C++ using
+    raw C pointer arrays.
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#ifndef BQVEC_RINGBUFFER_H
+#define BQVEC_RINGBUFFER_H
+
+#include <sys/types.h>
+
+//#define DEBUG_RINGBUFFER 1
+
+#include "Barrier.h"
+#include "Allocators.h"
+
+#include <iostream>
+
+namespace breakfastquay {
+
+/**
+ * RingBuffer implements a lock-free ring buffer for one writer and
+ * one reader, used to store samples of an arbitrary type T.
+ *
+ * RingBuffer is thread-safe provided only one thread writes and only
+ * one thread reads.
+ */
+template <typename T>
+class RingBuffer
+{
+public:
+    /**
+     * Create a ring buffer with room to write n samples.
+     *
+     * Note that the internal storage size will actually be n+1
+     * samples, as one element is unavailable for administrative
+     * reasons.  Since the ring buffer performs best if its size is a
+     * power of two, this means n should ideally be some power of two
+     * minus one.
+     */
+    RingBuffer(int n);
+
+    virtual ~RingBuffer();
+
+    /**
+     * Return the total capacity of the ring buffer in samples.
+     * (This is the argument n passed to the constructor.)
+     */
+    int getSize() const;
+
+    /**
+     * Return a new ring buffer (allocated with "new" -- caller must
+     * delete when no longer needed) of the given size, containing the
+     * same data as this one.  If another thread reads from or writes
+     * to this buffer during the call, the results may be incomplete
+     * or inconsistent.  If this buffer's data will not fit in the new
+     * size, the contents are undefined.
+     */
+    RingBuffer<T> *resized(int newSize) const;
+
+    /**
+     * Reset read and write pointers, thus emptying the buffer.
+     * Should be called from the write thread.
+     */
+    void reset();
+
+    /**
+     * Return the amount of data available for reading, in samples.
+     */
+    int getReadSpace() const;
+
+    /**
+     * Return the amount of space available for writing, in samples.
+     */
+    int getWriteSpace() const;
+
+    /**
+     * Read n samples from the buffer.  If fewer than n are available,
+     * the remainder will be zeroed out.  Returns the number of
+     * samples actually read.
+     *
+     * This is a template function, taking an argument S for the target
+     * sample type, which is permitted to differ from T if the two
+     * types are compatible for arithmetic operations.
+     */
+    template <typename S>
+    int read(S *const BQ_R__ destination, int n);
+
+    /**
+     * Read n samples from the buffer, adding them to the destination.
+     * If fewer than n are available, the remainder will be left
+     * alone.  Returns the number of samples actually read.
+     *
+     * This is a template function, taking an argument S for the target
+     * sample type, which is permitted to differ from T if the two
+     * types are compatible for arithmetic operations.
+     */
+    template <typename S>
+    int readAdding(S *const BQ_R__ destination, int n);
+
+    /**
+     * Read one sample from the buffer.  If no sample is available,
+     * this will silently return zero.  Calling this repeatedly is
+     * obviously slower than calling read once, but it may be good
+     * enough if you don't want to allocate a buffer to read into.
+     */
+    T readOne();
+
+    /**
+     * Read n samples from the buffer, if available, without advancing
+     * the read pointer -- i.e. a subsequent read() or skip() will be
+     * necessary to empty the buffer.  If fewer than n are available,
+     * the remainder will be zeroed out.  Returns the number of
+     * samples actually read.
+     */
+    int peek(T *const BQ_R__ destination, int n) const;
+
+    /**
+     * Read one sample from the buffer, if available, without
+     * advancing the read pointer -- i.e. a subsequent read() or
+     * skip() will be necessary to empty the buffer.  Returns zero if
+     * no sample was available.
+     */
+    T peekOne() const;
+
+    /**
+     * Pretend to read n samples from the buffer, without actually
+     * returning them (i.e. discard the next n samples).  Returns the
+     * number of samples actually available for discarding.
+     */
+    int skip(int n);
+
+    /**
+     * Write n samples to the buffer.  If insufficient space is
+     * available, not all samples may actually be written.  Returns
+     * the number of samples actually written.
+     *
+     * This is a template function, taking an argument S for the source
+     * sample type, which is permitted to differ from T if the two
+     * types are compatible for assignment.
+     */
+    template <typename S>
+    int write(const S *const BQ_R__ source, int n);
+
+    /**
+     * Write n zero-value samples to the buffer.  If insufficient
+     * space is available, not all zeros may actually be written.
+     * Returns the number of zeroes actually written.
+     */
+    int zero(int n);
+
+protected:
+    T *const BQ_R__ m_buffer;
+    int             m_writer;
+    int             m_reader;
+    const int       m_size;
+
+    int readSpaceFor(int w, int r) const {
+        int space;
+        if (w > r) space = w - r;
+        else if (w < r) space = (w + m_size) - r;
+        else space = 0;
+        return space;
+    }
+
+    int writeSpaceFor(int w, int r) const {
+        int space = (r + m_size - w - 1);
+        if (space >= m_size) space -= m_size;
+        return space;
+    }
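+
+    // Worked example (illustrative only): with m_size = 8, w = 2 and
+    // r = 5, readSpaceFor gives (2 + 8) - 5 = 5 samples readable and
+    // writeSpaceFor gives (5 + 8 - 2 - 1) - 8 = 2 samples writable;
+    // 5 + 2 = 7 = m_size - 1, reflecting the one-slot administrative gap.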
+
+private:
+    RingBuffer(const RingBuffer &); // not provided
+    RingBuffer &operator=(const RingBuffer &); // not provided
+};
+
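+// Usage sketch (illustrative only; the producer/consumer helpers are
+// hypothetical). One thread owns the writing side, another the
+// reading side:
+//
+//     RingBuffer<float> rb(1023);   // capacity 1023, storage 1024
+//     // writer thread:
+//     float in[256];
+//     produce(in, 256);
+//     rb.write(in, 256);
+//     // reader thread:
+//     float out[256];
+//     int got = rb.read(out, 256);
+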
+template <typename T>
+RingBuffer<T>::RingBuffer(int n) :
+    m_buffer(allocate<T>(n + 1)),
+    m_writer(0),
+    m_size(n + 1)
+{
+#ifdef DEBUG_RINGBUFFER
+    std::cerr << "RingBuffer<T>[" << this << "]::RingBuffer(" << n << ")" << std::endl;
+#endif
+
+    m_reader = 0;
+}
+
+template <typename T>
+RingBuffer<T>::~RingBuffer()
+{
+#ifdef DEBUG_RINGBUFFER
+    std::cerr << "RingBuffer<T>[" << this << "]::~RingBuffer" << std::endl;
+#endif
+
+    deallocate(m_buffer);
+}
+
+template <typename T>
+int
+RingBuffer<T>::getSize() const
+{
+#ifdef DEBUG_RINGBUFFER
+    std::cerr << "RingBuffer<T>[" << this << "]::getSize(): " << m_size-1 << std::endl;
+#endif
+
+    return m_size - 1;
+}
+
+template <typename T>
+RingBuffer<T> *
+RingBuffer<T>::resized(int newSize) const
+{
+    RingBuffer<T> *newBuffer = new RingBuffer<T>(newSize);
+
+    int w = m_writer;
+    int r = m_reader;
+
+    while (r != w) {
+        T value = m_buffer[r];
+        newBuffer->write(&value, 1);
+        if (++r == m_size) r = 0;
+    }
+
+    return newBuffer;
+}
+
+template <typename T>
+void
+RingBuffer<T>::reset()
+{
+#ifdef DEBUG_RINGBUFFER
+    std::cerr << "RingBuffer<T>[" << this << "]::reset" << std::endl;
+#endif
+
+    m_reader = m_writer;
+}
+
+template <typename T>
+int
+RingBuffer<T>::getReadSpace() const
+{
+    return readSpaceFor(m_writer, m_reader);
+}
+
+template <typename T>
+int
+RingBuffer<T>::getWriteSpace() const
+{
+    return writeSpaceFor(m_writer, m_reader);
+}
+
+template <typename T>
+template <typename S>
+int
+RingBuffer<T>::read(S *const BQ_R__ destination, int n)
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    int available = readSpaceFor(w, r);
+    if (n > available) {
+	std::cerr << "WARNING: RingBuffer::read: " << n << " requested, only "
+                  << available << " available" << std::endl;
+	// Zero the unfilled remainder, as documented above
+	v_zero(destination + available, n - available);
+	n = available;
+    }
+    if (n == 0) return n;
+
+    int here = m_size - r;
+    T *const BQ_R__ bufbase = m_buffer + r;
+
+    if (here >= n) {
+        v_convert(destination, bufbase, n);
+    } else {
+        v_convert(destination, bufbase, here);
+        v_convert(destination + here, m_buffer, n - here);
+    }
+
+    r += n;
+    while (r >= m_size) r -= m_size;
+
+    BQ_MBARRIER();
+    m_reader = r;
+
+    return n;
+}
+
+template <typename T>
+template <typename S>
+int
+RingBuffer<T>::readAdding(S *const BQ_R__ destination, int n)
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    int available = readSpaceFor(w, r);
+    if (n > available) {
+	std::cerr << "WARNING: RingBuffer::readAdding: " << n << " requested, only "
+                  << available << " available" << std::endl;
+	n = available;
+    }
+    if (n == 0) return n;
+
+    int here = m_size - r;
+    T *const BQ_R__ bufbase = m_buffer + r;
+
+    if (here >= n) {
+        v_add(destination, bufbase, n);
+    } else {
+        v_add(destination, bufbase, here);
+        v_add(destination + here, m_buffer, n - here);
+    }
+
+    r += n;
+    while (r >= m_size) r -= m_size;
+
+    BQ_MBARRIER();
+    m_reader = r;
+
+    return n;
+}
+
+template <typename T>
+T
+RingBuffer<T>::readOne()
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    if (w == r) {
+	std::cerr << "WARNING: RingBuffer::readOne: no sample available"
+		  << std::endl;
+	return T();
+    }
+
+    T value = m_buffer[r];
+    if (++r == m_size) r = 0;
+
+    BQ_MBARRIER();
+    m_reader = r;
+
+    return value;
+}
+
+template <typename T>
+int
+RingBuffer<T>::peek(T *const BQ_R__ destination, int n) const
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    int available = readSpaceFor(w, r);
+    if (n > available) {
+	std::cerr << "WARNING: RingBuffer::peek: " << n << " requested, only "
+                  << available << " available" << std::endl;
+	memset(destination + available, 0, (n - available) * sizeof(T));
+	n = available;
+    }
+    if (n == 0) return n;
+
+    int here = m_size - r;
+    const T *const BQ_R__ bufbase = m_buffer + r;
+
+    if (here >= n) {
+        v_copy(destination, bufbase, n);
+    } else {
+        v_copy(destination, bufbase, here);
+        v_copy(destination + here, m_buffer, n - here);
+    }
+
+    return n;
+}
+
+template <typename T>
+T
+RingBuffer<T>::peekOne() const
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    if (w == r) {
+	std::cerr << "WARNING: RingBuffer::peekOne: no sample available"
+		  << std::endl;
+	return T();
+    }
+
+    T value = m_buffer[r];
+    return value;
+}
+
+template <typename T>
+int
+RingBuffer<T>::skip(int n)
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    int available = readSpaceFor(w, r);
+    if (n > available) {
+	std::cerr << "WARNING: RingBuffer::skip: " << n << " requested, only "
+                  << available << " available" << std::endl;
+	n = available;
+    }
+    if (n == 0) return n;
+
+    r += n;
+    while (r >= m_size) r -= m_size;
+
+    // No memory barrier required, because we didn't read any data
+    m_reader = r;
+
+    return n;
+}
+
+template <typename T>
+template <typename S>
+int
+RingBuffer<T>::write(const S *const BQ_R__ source, int n)
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    int available = writeSpaceFor(w, r);
+    if (n > available) {
+	std::cerr << "WARNING: RingBuffer::write: " << n
+                  << " requested, only room for " << available << std::endl;
+	n = available;
+    }
+    if (n == 0) return n;
+
+    int here = m_size - w;
+    T *const BQ_R__ bufbase = m_buffer + w;
+
+    if (here >= n) {
+        v_convert<S, T>(bufbase, source, n);
+    } else {
+        v_convert<S, T>(bufbase, source, here);
+        v_convert<S, T>(m_buffer, source + here, n - here);
+    }
+
+    w += n;
+    while (w >= m_size) w -= m_size;
+
+    BQ_MBARRIER();
+    m_writer = w;
+
+    return n;
+}
+
+template <typename T>
+int
+RingBuffer<T>::zero(int n)
+{
+    int w = m_writer;
+    int r = m_reader;
+
+    int available = writeSpaceFor(w, r);
+    if (n > available) {
+	std::cerr << "WARNING: RingBuffer::zero: " << n
+                  << " requested, only room for " << available << std::endl;
+	n = available;
+    }
+    if (n == 0) return n;
+
+    int here = m_size - w;
+    T *const BQ_R__ bufbase = m_buffer + w;
+
+    if (here >= n) {
+        v_zero(bufbase, n);
+    } else {
+        v_zero(bufbase, here);
+        v_zero(m_buffer, n - here);
+    }
+
+    w += n;
+    while (w >= m_size) w -= m_size;
+
+    BQ_MBARRIER();
+    m_writer = w;
+
+    return n;
+}
+
+}
+
+#endif // BQVEC_RINGBUFFER_H
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/bqvec/VectorOps.h	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,1110 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqvec
+
+    A small library for vector arithmetic and allocation in C++ using
+    raw C pointer arrays.
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#ifndef BQVEC_VECTOR_OPS_H
+#define BQVEC_VECTOR_OPS_H
+
+#ifdef HAVE_IPP
+#ifndef _MSC_VER
+#include <inttypes.h>
+#endif
+#include <ipps.h>
+#include <ippac.h>
+#endif
+
+#ifdef HAVE_VDSP
+#include <Accelerate/Accelerate.h>
+#endif
+
+#include <cstring>
+#include <cmath>
+
+#include "Restrict.h"
+
+namespace breakfastquay {
+
+/**
+ * General principle:
+ * 
+ * Write basic vector-manipulation loops in such a way as to promote
+ * the likelihood that a good current C++ compiler can auto-vectorize
+ * them (e.g. gcc-4.x with -ftree-vectorize). Provide calls out to
+ * supported vector libraries (e.g. IPP, Accelerate) where useful.
+ * No intrinsics or assembly.
+ *
+ * Note that all size and index arguments are plain machine ints, to
+ * facilitate compiler optimization and vectorization. In general
+ * these functions should be used only with buffers whose sizes are
+ * calculated from known processing parameters and that are known to
+ * be much smaller than 32-bit int range. For security reasons you
+ * should not use these functions with buffers whose sizes may be
+ * under control of the user or external input.
+ *
+ * Argument ordering:
+ *
+ * If a function operates on one or more vector sequences in memory,
+ * they appear as pointers at the start of the argument list. If one
+ * vector is the "destination", it appears first, with "source" second
+ * if present (i.e. argument ordering follows memcpy).
+ * 
+ * The final argument is always a count of the number of elements in
+ * each vector. 
+ *
+ * Some functions operate on a set of vectors at once: their names all
+ * contain the text _channels, and the number of channels (i.e. number
+ * of vectors) is the argument before last.
+ *
+ * Any additional arguments, e.g. scale factors, appear between the
+ * vector pointers at the start and the counts at the end.
+ *
+ * The caller takes responsibility for ensuring that vector pointer
+ * arguments are not aliased, i.e. that vectors provided as separate
+ * arguments to the same function are distinct and do not overlap,
+ * except where documented.
+ */
+
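+// For instance (illustrative only), following the conventions above, a
+// call that adds "src" scaled by 0.5 into "dst", both of 1024 elements,
+// puts the destination first and the count last:
+//
+//     v_add_with_gain(dst, src, 0.5, 1024);
+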
+/**
+ * v_zero
+ *
+ * Zero the elements in the given vector, of length \arg count.
+ */
+template<typename T>
+inline void v_zero(T *const BQ_R__ vec,
+                   const int count)
+{
+    const T value = T(0);
+    for (int i = 0; i < count; ++i) {
+        vec[i] = value;
+    }
+}
+
+#if defined HAVE_IPP
+template<> 
+inline void v_zero(float *const BQ_R__ vec, 
+                   const int count)
+{
+    ippsZero_32f(vec, count);
+}
+template<> 
+inline void v_zero(double *const BQ_R__ vec,
+                   const int count)
+{
+    ippsZero_64f(vec, count);
+}
+#elif defined HAVE_VDSP
+template<> 
+inline void v_zero(float *const BQ_R__ vec, 
+                   const int count)
+{
+    vDSP_vclr(vec, 1, count);
+}
+template<> 
+inline void v_zero(double *const BQ_R__ vec,
+                   const int count)
+{
+    vDSP_vclrD(vec, 1, count);
+}
+#endif
+
+/**
+ * v_zero_channels
+ *
+ * Zero the elements in the given set of \arg channels vectors, each
+ * of length \arg count.
+ */
+template<typename T>
+inline void v_zero_channels(T *const BQ_R__ *const BQ_R__ vec,
+                            const int channels,
+                            const int count)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_zero(vec[c], count);
+    }
+}
+
+/**
+ * v_set
+ *
+ * Set all of the elements in the given vector, of length \arg count,
+ * to \arg value.
+ */
+template<typename T>
+inline void v_set(T *const BQ_R__ vec,
+                  const T value,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        vec[i] = value;
+    }
+}
+
+/**
+ * v_copy
+ *
+ * Copy the contents of the vector \arg src to the vector \arg dst,
+ * both of length \arg count.
+ *
+ * Caller guarantees that \arg src and \arg dst are non-overlapping.
+ */
+template<typename T>
+inline void v_copy(T *const BQ_R__ dst,
+                   const T *const BQ_R__ src,
+                   const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = src[i];
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_copy(float *const BQ_R__ dst,
+                   const float *const BQ_R__ src,
+                   const int count)
+{
+    ippsCopy_32f(src, dst, count);
+}
+template<>
+inline void v_copy(double *const BQ_R__ dst,
+                   const double *const BQ_R__ src,
+                   const int count)
+{
+    ippsCopy_64f(src, dst, count);
+}
+#endif
+
+/**
+ * v_copy_channels
+ *
+ * Copy the contents of the individual vectors in the set \arg src to
+ * the corresponding vectors in the set \arg dst. All vectors have
+ * length \arg count and there are \arg channels vectors in each set.
+ *
+ * Caller guarantees that all of the \arg src and \arg dst vectors are
+ * non-overlapping with each other.
+ */
+template<typename T>
+inline void v_copy_channels(T *const BQ_R__ *const BQ_R__ dst,
+                            const T *const BQ_R__ *const BQ_R__ src,
+                            const int channels,
+                            const int count)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_copy(dst[c], src[c], count);
+    }
+}
+
+/**
+ * v_move
+ *
+ * Copy the contents of vector \arg src to the vector \arg dst, both
+ * of length \arg count. The two vectors may overlap. (If you know
+ * that they cannot overlap, use v_copy instead.)
+ */
+template<typename T>
+inline void v_move(T *const dst,       // not BQ_R__ (aliased)
+                   const T *const src, // not BQ_R__ (aliased)
+                   const int count)
+{
+    memmove(dst, src, count * sizeof(T));
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_move(float *const dst,
+                   const float *const src,
+                   const int count)
+{
+    ippsMove_32f(src, dst, count);
+}
+template<>
+inline void v_move(double *const dst,
+                   const double *const src,
+                   const int count)
+{
+    ippsMove_64f(src, dst, count);
+}
+#endif
+
+/**
+ * v_convert
+ *
+ * Copy the contents of vector \arg src to the vector \arg dst, both
+ * of length \arg count. The two vectors may have different value
+ * types, e.g. double and float. If they have the same type, this is
+ * equivalent to v_copy.
+ *
+ * Caller guarantees that \arg src and \arg dst are non-overlapping.
+ */
+template<typename T, typename U>
+inline void v_convert(U *const BQ_R__ dst,
+                      const T *const BQ_R__ src,
+                      const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = U(src[i]);
+    }
+}
+
+template<>
+inline void v_convert(float *const BQ_R__ dst,
+                      const float *const BQ_R__ src,
+                      const int count)
+{
+    v_copy(dst, src, count);
+}
+template<>
+inline void v_convert(double *const BQ_R__ dst,
+                      const double *const BQ_R__ src,
+                      const int count)
+{
+    v_copy(dst, src, count);
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_convert(double *const BQ_R__ dst,
+                      const float *const BQ_R__ src,
+                      const int count)
+{
+    ippsConvert_32f64f(src, dst, count);
+}
+template<>
+inline void v_convert(float *const BQ_R__ dst,
+                      const double *const BQ_R__ src,
+                      const int count)
+{
+    ippsConvert_64f32f(src, dst, count);
+}
+#elif defined HAVE_VDSP
+template<>
+inline void v_convert(double *const BQ_R__ dst,
+                      const float *const BQ_R__ src,
+                      const int count)
+{
+    vDSP_vspdp((float *)src, 1, dst, 1, count);
+}
+template<>
+inline void v_convert(float *const BQ_R__ dst,
+                      const double *const BQ_R__ src,
+                      const int count)
+{
+    vDSP_vdpsp((double *)src, 1, dst, 1, count);
+}
+#endif
+
+/**
+ * v_convert_channels
+ *
+ * Copy the contents of the individual vectors in the set \arg src to
+ * the corresponding vectors in the set \arg dst. All vectors have
+ * length \arg count, and there are \arg channels vectors in each
+ * set. The source and destination vectors may have different value
+ * types, e.g. double and float. If they have the same type, this is
+ * equivalent to v_copy_channels.
+ *
+ * Caller guarantees that all of the \arg src and \arg dst vectors are
+ * non-overlapping with each other.
+ */
+template<typename T, typename U>
+inline void v_convert_channels(U *const BQ_R__ *const BQ_R__ dst,
+                               const T *const BQ_R__ *const BQ_R__ src,
+                               const int channels,
+                               const int count)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_convert(dst[c], src[c], count);
+    }
+}
+
+/**
+ * v_add
+ *
+ * Add the elements in the vector \arg src to the corresponding
+ * elements in the vector \arg dst, both of length \arg count, leaving
+ * the result in \arg dst.
+ *
+ * Caller guarantees that \arg src and \arg dst are non-overlapping.
+ */
+template<typename T>
+inline void v_add(T *const BQ_R__ dst,
+                  const T *const BQ_R__ src,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] += src[i];
+    }
+}
+
+/**
+ * v_add
+ *
+ * Add the constant \arg value to every element of the vector \arg
+ * dst, of length \arg count, leaving the result in \arg dst.
+ */
+template<typename T>
+inline void v_add(T *const BQ_R__ dst,
+                  const T value,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] += value;
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_add(float *const BQ_R__ dst,
+                  const float *const BQ_R__ src,
+                  const int count)
+{
+    ippsAdd_32f_I(src, dst, count);
+}
+template<>
+inline void v_add(double *const BQ_R__ dst,
+                  const double *const BQ_R__ src,
+                  const int count)
+{
+    ippsAdd_64f_I(src, dst, count);
+}
+#endif
+
+/**
+ * v_add_channels
+ *
+ * Add the elements in the individual vectors in the set \arg src to
+ * the corresponding elements of the corresponding vectors in \arg
+ * dst, leaving the results in \arg dst. All vectors have length \arg
+ * count, and there are \arg channels vectors in each set.
+ *
+ * Caller guarantees that all of the \arg src and \arg dst vectors are
+ * non-overlapping with each other.
+ */
+template<typename T>
+inline void v_add_channels(T *const BQ_R__ *const BQ_R__ dst,
+                           const T *const BQ_R__ *const BQ_R__ src,
+                           const int channels, const int count)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_add(dst[c], src[c], count);
+    }
+}
+
+/**
+ * v_add_with_gain
+ *
+ * Add the elements in the vector \arg src, each multiplied by the
+ * constant factor \arg gain, to the corresponding elements in the
+ * vector \arg dst, both of length \arg count, leaving the result in
+ * \arg dst.
+ *
+ * Caller guarantees that \arg src and \arg dst are non-overlapping.
+ */
+template<typename T, typename G>
+inline void v_add_with_gain(T *const BQ_R__ dst,
+                            const T *const BQ_R__ src,
+                            const G gain,
+                            const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] += src[i] * gain;
+    }
+}
+
+/**
+ * v_add_channels_with_gain
+ *
+ * Add the elements in the individual vectors in the set \arg src,
+ * each multiplied by the constant factor \arg gain, to the
+ * corresponding elements of the corresponding vectors in \arg dst,
+ * leaving the results in \arg dst. All vectors have length \arg
+ * count, and there are \arg channels vectors in each set.
+ *
+ * Caller guarantees that all of the \arg src and \arg dst vectors are
+ * non-overlapping with each other.
+ */
+template<typename T, typename G>
+inline void v_add_channels_with_gain(T *const BQ_R__ *const BQ_R__ dst,
+                                     const T *const BQ_R__ *const BQ_R__ src,
+                                     const G gain,
+                                     const int channels,
+                                     const int count)
+{
+    for (int c = 0; c < channels; ++c) {
+        v_add_with_gain(dst[c], src[c], gain, count);
+    }
+}
+
+/**
+ * v_subtract
+ *
+ * Subtract the elements in the vector \arg src from the corresponding
+ * elements in the vector \arg dst, both of length \arg count, leaving
+ * the result in \arg dst.
+ *
+ * Caller guarantees that \arg src and \arg dst are non-overlapping.
+ */
+template<typename T>
+inline void v_subtract(T *const BQ_R__ dst,
+                       const T *const BQ_R__ src,
+                       const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] -= src[i];
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_subtract(float *const BQ_R__ dst,
+                       const float *const BQ_R__ src,
+                       const int count)
+{
+    ippsSub_32f_I(src, dst, count);
+}
+template<>
+inline void v_subtract(double *const BQ_R__ dst,
+                       const double *const BQ_R__ src,
+                       const int count)
+{
+    ippsSub_64f_I(src, dst, count);
+}
+#endif
+
+/**
+ * v_scale
+ *
+ * Scale the elements in the vector \arg dst, of length \arg count, by
+ * the factor \arg gain.
+ */
+template<typename T, typename G>
+inline void v_scale(T *const BQ_R__ dst,
+                    const G gain,
+                    const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] *= gain;
+    }
+}
+
+#if defined HAVE_IPP 
+template<>
+inline void v_scale(float *const BQ_R__ dst,
+                    const float gain,
+                    const int count)
+{
+    ippsMulC_32f_I(gain, dst, count);
+}
+template<>
+inline void v_scale(double *const BQ_R__ dst,
+                    const double gain,
+                    const int count)
+{
+    ippsMulC_64f_I(gain, dst, count);
+}
+#endif
+
+/**
+ * v_multiply
+ *
+ * Multiply the elements in the vector \arg dst by the corresponding
+ * elements in the vector \arg src, both of length arg \count, leaving
+ * the result in \arg dst.
+ *
+ * Caller guarantees that \arg src and \arg dst are non-overlapping.
+ */
+template<typename T>
+inline void v_multiply(T *const BQ_R__ dst,
+                       const T *const BQ_R__ src,
+                       const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] *= src[i];
+    }
+}
+
+#if defined HAVE_IPP 
+template<>
+inline void v_multiply(float *const BQ_R__ dst,
+                       const float *const BQ_R__ src,
+                       const int count)
+{
+    ippsMul_32f_I(src, dst, count);
+}
+template<>
+inline void v_multiply(double *const BQ_R__ dst,
+                       const double *const BQ_R__ src,
+                       const int count)
+{
+    ippsMul_64f_I(src, dst, count);
+}
+#endif
+
+/**
+ * v_multiply
+ *
+ * Multiply the corresponding elements of the vectors \arg src1 and
+ * \arg src2, both of length \arg count, and write the results into
+ * \arg dst.
+ *
+ * Caller guarantees that \arg src1, \arg src2 and \arg dst are
+ * non-overlapping.
+ */
+template<typename T>
+inline void v_multiply(T *const BQ_R__ dst,
+                       const T *const BQ_R__ src1,
+                       const T *const BQ_R__ src2,
+                       const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = src1[i] * src2[i];
+    }
+}
+
+#if defined HAVE_IPP 
+template<>
+inline void v_multiply(float *const BQ_R__ dst,
+                       const float *const BQ_R__ src1,
+                       const float *const BQ_R__ src2,
+                       const int count)
+{
+    ippsMul_32f(src1, src2, dst, count);
+}    
+template<>
+inline void v_multiply(double *const BQ_R__ dst,
+                       const double *const BQ_R__ src1,
+                       const double *const BQ_R__ src2,
+                       const int count)
+{
+    ippsMul_64f(src1, src2, dst, count);
+}
+#endif
+
+/**
+ * v_divide
+ *
+ * Divide the elements in the vector \arg dst by the corresponding
+ * elements in the vector \arg src, both of length \arg count, leaving
+ * the result in \arg dst.
+ *
+ * Caller guarantees that \arg src and \arg dst are non-overlapping.
+ */
+template<typename T>
+inline void v_divide(T *const BQ_R__ dst,
+                     const T *const BQ_R__ src,
+                     const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] /= src[i];
+    }
+}
+
+#if defined HAVE_IPP 
+template<>
+inline void v_divide(float *const BQ_R__ dst,
+                     const float *const BQ_R__ src,
+                     const int count)
+{
+    ippsDiv_32f_I(src, dst, count);
+}
+template<>
+inline void v_divide(double *const BQ_R__ dst,
+                     const double *const BQ_R__ src,
+                     const int count)
+{
+    ippsDiv_64f_I(src, dst, count);
+}
+#endif
+
+/**
+ * v_multiply_and_add
+ *
+ * Multiply the corresponding elements of the vectors \arg src1 and
+ * \arg src2, both of length \arg count, and add the results to the
+ * corresponding elements of vector \arg dst.
+ *
+ * Caller guarantees that \arg src1, \arg src2 and \arg dst are
+ * non-overlapping.
+ */
+template<typename T>
+inline void v_multiply_and_add(T *const BQ_R__ dst,
+                               const T *const BQ_R__ src1,
+                               const T *const BQ_R__ src2,
+                               const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] += src1[i] * src2[i];
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_multiply_and_add(float *const BQ_R__ dst,
+                               const float *const BQ_R__ src1,
+                               const float *const BQ_R__ src2,
+                               const int count)
+{
+    ippsAddProduct_32f(src1, src2, dst, count);
+}
+template<>
+inline void v_multiply_and_add(double *const BQ_R__ dst,
+                               const double *const BQ_R__ src1,
+                               const double *const BQ_R__ src2,
+                               const int count)
+{
+    ippsAddProduct_64f(src1, src2, dst, count);
+}
+#endif
+
+/**
+ * v_sum
+ *
+ * Return the sum of the elements in vector \arg src, of length \arg
+ * count.
+ */
+template<typename T>
+inline T v_sum(const T *const BQ_R__ src,
+               const int count)
+{
+    T result = T();
+    for (int i = 0; i < count; ++i) {
+        result += src[i];
+    }
+    return result;
+}
+
+/**
+ * v_multiply_and_sum
+ *
+ * Multiply the corresponding elements of the vectors \arg src1 and
+ * \arg src2, both of length \arg count, sum the results, and return
+ * the sum as a scalar value.
+ *
+ * Caller guarantees that \arg src1 and \arg src2 are non-overlapping.
+ */
+template<typename T>
+inline T v_multiply_and_sum(const T *const BQ_R__ src1,
+                            const T *const BQ_R__ src2,
+                            const int count)
+{
+    T result = T();
+    for (int i = 0; i < count; ++i) {
+        result += src1[i] * src2[i];
+    }
+    return result;
+}
+
+/**
+ * v_log
+ *
+ * Replace each element in vector \arg dst, of length \arg count, with
+ * its natural logarithm.
+ */
+template<typename T>
+inline void v_log(T *const BQ_R__ dst,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = log(dst[i]);
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_log(float *const BQ_R__ dst,
+                  const int count)
+{
+    ippsLn_32f_I(dst, count);
+}
+template<>
+inline void v_log(double *const BQ_R__ dst,
+                  const int count)
+{
+    ippsLn_64f_I(dst, count);
+}
+#elif defined HAVE_VDSP
+// no in-place vForce functions for these -- can we use the
+// out-of-place functions with equal input and output vectors? can we
+// use an out-of-place one with temporary buffer and still be faster
+// than doing it any other way?
+template<>
+inline void v_log(float *const BQ_R__ dst,
+                  const int count)
+{
+    // Note: C-style VLA (a GCC/Clang extension, not standard C++),
+    // used here and in the other vForce wrappers below
+    float tmp[count];
+    vvlogf(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+template<>
+inline void v_log(double *const BQ_R__ dst,
+                  const int count)
+{
+    double tmp[count];
+    vvlog(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+#endif
+
+/**
+ * v_exp
+ *
+ * Replace each element in vector \arg dst, of length \arg count, with
+ * its base-e exponential.
+ */
+template<typename T>
+inline void v_exp(T *const BQ_R__ dst,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = exp(dst[i]);
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_exp(float *const BQ_R__ dst,
+                  const int count)
+{
+    ippsExp_32f_I(dst, count);
+}
+template<>
+inline void v_exp(double *const BQ_R__ dst,
+                  const int count)
+{
+    ippsExp_64f_I(dst, count);
+}
+#elif defined HAVE_VDSP
+// no in-place vForce functions for these -- can we use the
+// out-of-place functions with equal input and output vectors? can we
+// use an out-of-place one with temporary buffer and still be faster
+// than doing it any other way?
+template<>
+inline void v_exp(float *const BQ_R__ dst,
+                  const int count)
+{
+    float tmp[count];
+    vvexpf(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+template<>
+inline void v_exp(double *const BQ_R__ dst,
+                  const int count)
+{
+    double tmp[count];
+    vvexp(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+#endif
+
+/**
+ * v_sqrt
+ *
+ * Replace each element in vector \arg dst, of length \arg count, with
+ * its square root.
+ */
+template<typename T>
+inline void v_sqrt(T *const BQ_R__ dst,
+                   const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = sqrt(dst[i]);
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_sqrt(float *const BQ_R__ dst,
+                   const int count)
+{
+    ippsSqrt_32f_I(dst, count);
+}
+template<>
+inline void v_sqrt(double *const BQ_R__ dst,
+                   const int count)
+{
+    ippsSqrt_64f_I(dst, count);
+}
+#elif defined HAVE_VDSP
+// no in-place vForce functions for these -- can we use the
+// out-of-place functions with equal input and output vectors? can we
+// use an out-of-place one with temporary buffer and still be faster
+// than doing it any other way?
+template<>
+inline void v_sqrt(float *const BQ_R__ dst,
+                   const int count)
+{
+    float tmp[count];
+    vvsqrtf(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+template<>
+inline void v_sqrt(double *const BQ_R__ dst,
+                   const int count)
+{
+    double tmp[count];
+    vvsqrt(tmp, dst, &count);
+    v_copy(dst, tmp, count);
+}
+#endif
+
+/**
+ * v_square
+ *
+ * Replace each element in vector \arg dst, of length \arg count, with
+ * its square.
+ */
+template<typename T>
+inline void v_square(T *const BQ_R__ dst,
+                     const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = dst[i] * dst[i];
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_square(float *const BQ_R__ dst,
+                     const int count)
+{
+    ippsSqr_32f_I(dst, count);
+}
+template<>
+inline void v_square(double *const BQ_R__ dst,
+                     const int count)
+{
+    ippsSqr_64f_I(dst, count);
+}
+#endif
+
+/**
+ * v_abs
+ *
+ * Replace each element in vector \arg dst, of length \arg count, with
+ * its absolute value.
+ */
+template<typename T>
+inline void v_abs(T *const BQ_R__ dst,
+                  const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i] = fabs(dst[i]);
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_abs(float *const BQ_R__ dst,
+                  const int count)
+{
+    ippsAbs_32f_I(dst, count);
+}
+template<>
+inline void v_abs(double *const BQ_R__ dst,
+                  const int count)
+{
+    ippsAbs_64f_I(dst, count);
+}
+#elif defined HAVE_VDSP
+template<>
+inline void v_abs(float *const BQ_R__ dst,
+                  const int count)
+{
+    float tmp[count];
+#if (defined(MACOSX_DEPLOYMENT_TARGET) && MACOSX_DEPLOYMENT_TARGET <= 1070 && MAC_OS_X_VERSION_MIN_REQUIRED <= 1070)
+    vvfabf(tmp, dst, &count);
+#else
+    vvfabsf(tmp, dst, &count);
+#endif
+    v_copy(dst, tmp, count);
+}
+#endif
+
+/**
+ * v_interleave
+ *
+ * Interleave (zip) the \arg channels vectors in \arg src, each of
+ * length \arg count, into the single vector \arg dst of length \arg
+ * channels * \arg count.
+ *
+ * Caller guarantees that the \arg src and \arg dst vectors are
+ * non-overlapping.
+ */
+template<typename T>
+inline void v_interleave(T *const BQ_R__ dst,
+                         const T *const BQ_R__ *const BQ_R__ src,
+                         const int channels, 
+                         const int count)
+{
+    int idx = 0;
+    switch (channels) {
+    case 2:
+        // common case, may be vectorized by compiler if hardcoded
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < 2; ++j) {
+                dst[idx++] = src[j][i];
+            }
+        }
+        return;
+    case 1:
+        v_copy(dst, src[0], count);
+        return;
+    default:
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < channels; ++j) {
+                dst[idx++] = src[j][i];
+            }
+        }
+    }
+}
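+
+// For example (illustrative only), with channels = 2 and count = 3,
+// src[0] = {L0, L1, L2} and src[1] = {R0, R1, R2} interleave to
+// dst = {L0, R0, L1, R1, L2, R2}; v_deinterleave below reverses this.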
+
+#if defined HAVE_IPP 
+template<>
+inline void v_interleave(float *const BQ_R__ dst,
+                         const float *const BQ_R__ *const BQ_R__ src,
+                         const int channels, 
+                         const int count)
+{
+    ippsInterleave_32f((const Ipp32f **)src, channels, count, dst);
+}
+// IPP does not (currently?) provide double-precision interleave
+#endif
+
+/**
+ * v_deinterleave
+ *
+ * Deinterleave (unzip) the single vector \arg src, of length \arg
+ * channels * \arg count, into the \arg channels vectors in \arg dst,
+ * each of length \arg count.
+ *
+ * Caller guarantees that the \arg src and \arg dst vectors are
+ * non-overlapping.
+ */
+template<typename T>
+inline void v_deinterleave(T *const BQ_R__ *const BQ_R__ dst,
+                           const T *const BQ_R__ src,
+                           const int channels, 
+                           const int count)
+{
+    int idx = 0;
+    switch (channels) {
+    case 2:
+        // common case, may be vectorized by compiler if hardcoded
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < 2; ++j) {
+                dst[j][i] = src[idx++];
+            }
+        }
+        return;
+    case 1:
+        v_copy(dst[0], src, count);
+        return;
+    default:
+        for (int i = 0; i < count; ++i) {
+            for (int j = 0; j < channels; ++j) {
+                dst[j][i] = src[idx++];
+            }
+        }
+    }
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_deinterleave(float *const BQ_R__ *const BQ_R__ dst,
+                           const float *const BQ_R__ src,
+                           const int channels, 
+                           const int count)
+{
+    ippsDeinterleave_32f((const Ipp32f *)src, channels, count, (Ipp32f **)dst);
+}
+// IPP does not (currently?) provide double-precision deinterleave
+#endif
+
+/**
+ * v_fftshift
+ *
+ * Perform an in-place left-right shift of the vector \arg vec of
+ * length \arg count, swapping the first and second halves of the
+ * vector.
+ */
+template<typename T>
+inline void v_fftshift(T *const BQ_R__ vec,
+                       const int count)
+{
+    const int hs = count/2;
+    for (int i = 0; i < hs; ++i) {
+        T t = vec[i];
+        vec[i] = vec[i + hs];
+        vec[i + hs] = t;
+    }
+}
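+
+// For example (illustrative only), v_fftshift on {a, b, c, d} yields
+// {c, d, a, b}. With an odd count the final element is left in place,
+// as only count/2 pairs are swapped.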
+
+/**
+ * v_mean
+ *
+ * Return the mean of the values in the vector \arg vec, of length
+ * \arg count.
+ */
+template<typename T>
+inline T v_mean(const T *const BQ_R__ vec, const int count)
+{
+    T t = T(0);
+    for (int i = 0; i < count; ++i) {
+        t += vec[i];
+    }
+    t /= T(count);
+    return t;
+}
+
+/**
+ * v_mean_channels
+ *
+ * Return the mean of all the values in the set of \arg channels
+ * vectors in \arg vec, each of length \arg count. (This is the single
+ * mean of all values across all channels, not one mean per channel.)
+ */
+template<typename T>
+inline T v_mean_channels(const T *const BQ_R__ *const BQ_R__ vec,
+                         const int channels,
+                         const int count)
+{
+    T t = T(0);
+    for (int c = 0; c < channels; ++c) {
+        t += v_mean(vec[c], count);
+    }
+    t /= T(channels);
+    return t;
+}
+
+}
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/bqvec/VectorOpsComplex.h	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,548 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqvec
+
+    A small library for vector arithmetic and allocation in C++ using
+    raw C pointer arrays.
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#ifndef BQVEC_VECTOR_OPS_COMPLEX_H
+#define BQVEC_VECTOR_OPS_COMPLEX_H
+
+#include "VectorOps.h"
+#include "ComplexTypes.h"
+
+#include <cmath>
+
+namespace breakfastquay {
+
+#ifndef NO_COMPLEX_TYPES
+
+template<> 
+inline void v_zero(bq_complex_t *const BQ_R__ ptr, 
+                   const int count)
+{
+#if defined HAVE_IPP
+    if (sizeof(bq_complex_element_t) == sizeof(float)) {
+        ippsZero_32fc((Ipp32fc *)ptr, count);
+    } else {
+        ippsZero_64fc((Ipp64fc *)ptr, count);
+    }
+#elif defined HAVE_VDSP
+    if (sizeof(bq_complex_element_t) == sizeof(float)) {
+        vDSP_vclr((float *)ptr, 1, count * 2);
+    } else {
+        vDSP_vclrD((double *)ptr, 1, count * 2);
+    }
+#else
+    const bq_complex_element_t value = 0.0;
+    for (int i = 0; i < count; ++i) {
+        ptr[i].re = value;
+        ptr[i].im = value;
+    }
+#endif
+}
+
+#if defined HAVE_IPP
+template<>
+inline void v_copy(bq_complex_t *const BQ_R__ dst,
+                   const bq_complex_t *const BQ_R__ src,
+                   const int count)
+{
+    if (sizeof(bq_complex_element_t) == sizeof(float)) {
+        ippsCopy_32fc((const Ipp32fc *)src, (Ipp32fc *)dst, count);
+    } else {
+        ippsCopy_64fc((const Ipp64fc *)src, (Ipp64fc *)dst, count);
+    }
+}
+template<>
+inline void v_move(bq_complex_t *const BQ_R__ dst,
+                   const bq_complex_t *const BQ_R__ src,
+                   const int count)
+{
+    if (sizeof(bq_complex_element_t) == sizeof(float)) {
+        ippsMove_32fc((const Ipp32fc *)src, (Ipp32fc *)dst, count);
+    } else {
+        ippsMove_64fc((const Ipp64fc *)src, (Ipp64fc *)dst, count);
+    }
+}
+#endif
+
+template<>
+inline void v_convert(bq_complex_t *const BQ_R__ dst,
+                      const bq_complex_element_t *const BQ_R__ src,
+                      const int srccount)
+{
+    const int targetcount = srccount / 2;
+    int srcidx = 0;
+    for (int i = 0; i < targetcount; ++i) {
+        dst[i].re = src[srcidx++];
+        dst[i].im = src[srcidx++];
+    }
+}
+
+template<>
+inline void v_convert(bq_complex_element_t *const BQ_R__ dst,
+                      const bq_complex_t *const BQ_R__ src,
+                      const int srccount)
+{
+    int targetidx = 0;
+    for (int i = 0; i < srccount; ++i) {
+        dst[targetidx++] = src[i].re;
+        dst[targetidx++] = src[i].im;
+    }
+}
+
+inline void c_add(bq_complex_t &dst,
+                  const bq_complex_t &src)
+{
+    dst.re += src.re;
+    dst.im += src.im;
+}
+
+inline void c_add_with_gain(bq_complex_t &dst,
+                            const bq_complex_t &src,
+                            const bq_complex_element_t gain)
+{
+    dst.re += src.re * gain;
+    dst.im += src.im * gain;
+}
+
+inline void c_multiply(bq_complex_t &dst,
+                       const bq_complex_t &src1,
+                       const bq_complex_t &src2)
+{
+    // Note dst may alias src1 or src2.
+
+    // The usual formula -- four multiplies, one add and one subtract
+    // 
+    // (x1 + y1i)(x2 + y2i) = (x1x2 - y1y2) + (x1y2 + y1x2)i
+    //
+    // Alternative formula -- three multiplies, two adds, three
+    // subtracts
+    // 
+    // (x1 + y1i)(x2 + y2i) = (x1x2 - y1y2) + ((x1 + y1)(x2 + y2) - x1x2 - y1y2)i
+    //
+    // The first formulation tests marginally quicker here.
+
+    bq_complex_element_t real = src1.re * src2.re - src1.im * src2.im;
+    bq_complex_element_t imag = src1.re * src2.im + src1.im * src2.re;
+
+    dst.re = real;
+    dst.im = imag;
+}
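+
+// Quick numeric check (illustrative only): (1 + 2i)(3 + 4i) gives
+// re = 1*3 - 2*4 = -5 and im = 1*4 + 2*3 = 10, i.e. -5 + 10i.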
+
+inline void c_multiply(bq_complex_t &dst,
+                       const bq_complex_t &src)
+{
+    c_multiply(dst, dst, src);
+}
+
+inline void c_multiply_and_add(bq_complex_t &dst,
+                               const bq_complex_t &src1,
+                               const bq_complex_t &src2)
+{
+    bq_complex_t tmp;
+    c_multiply(tmp, src1, src2);
+    c_add(dst, tmp);
+}
+
+template<>
+inline void v_add(bq_complex_t *const BQ_R__ dst,
+                  const bq_complex_t *const BQ_R__ src,
+                  const int count)
+{
+#if defined HAVE_IPP
+    if (sizeof(bq_complex_element_t) == sizeof(float)) {
+        ippsAdd_32fc_I((Ipp32fc *)src, (Ipp32fc *)dst, count);
+    } else {
+        ippsAdd_64fc_I((Ipp64fc *)src, (Ipp64fc *)dst, count);
+    }
+#else
+    for (int i = 0; i < count; ++i) {
+        dst[i].re += src[i].re;
+        dst[i].im += src[i].im;
+    }
+#endif
+}    
+
+template<>
+inline void v_add_with_gain(bq_complex_t *const BQ_R__ dst,
+                            const bq_complex_t *const BQ_R__ src,
+                            const bq_complex_element_t gain,
+                            const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        dst[i].re += src[i].re * gain;
+        dst[i].im += src[i].im * gain;
+    }
+}
+
+template<>
+inline void v_multiply(bq_complex_t *const BQ_R__ dst,
+                       const bq_complex_t *const BQ_R__ src,
+                       const int count)
+{
+#ifdef HAVE_IPP
+    if (sizeof(bq_complex_element_t) == sizeof(float)) {
+        ippsMul_32fc_I((const Ipp32fc *)src, (Ipp32fc *)dst, count);
+    } else {
+        ippsMul_64fc_I((const Ipp64fc *)src, (Ipp64fc *)dst, count);
+    }
+#else
+    for (int i = 0; i < count; ++i) {
+        c_multiply(dst[i], src[i]);
+    }
+#endif
+}
+
+template<>
+inline void v_multiply(bq_complex_t *const BQ_R__ dst,
+                       const bq_complex_t *const BQ_R__ src1,
+                       const bq_complex_t *const BQ_R__ src2,
+                       const int count)
+{
+#ifdef HAVE_IPP
+    if (sizeof(bq_complex_element_t) == sizeof(float)) {
+        ippsMul_32fc((const Ipp32fc *)src1, (const Ipp32fc *)src2,
+                     (Ipp32fc *)dst, count);
+    } else {
+        ippsMul_64fc((const Ipp64fc *)src1, (const Ipp64fc *)src2,
+                     (Ipp64fc *)dst, count);
+    }
+#else
+    for (int i = 0; i < count; ++i) {
+        c_multiply(dst[i], src1[i], src2[i]);
+    }
+#endif
+}
+
+template<>
+inline void v_multiply_and_add(bq_complex_t *const BQ_R__ dst,
+                               const bq_complex_t *const BQ_R__ src1,
+                               const bq_complex_t *const BQ_R__ src2,
+                               const int count)
+{
+#ifdef HAVE_IPP
+    if (sizeof(bq_complex_element_t) == sizeof(float)) {
+        ippsAddProduct_32fc((const Ipp32fc *)src1, (const Ipp32fc *)src2,
+                            (Ipp32fc *)dst, count);
+    } else {
+        ippsAddProduct_64fc((const Ipp64fc *)src1, (const Ipp64fc *)src2,
+                            (Ipp64fc *)dst, count);
+    }
+#else
+    for (int i = 0; i < count; ++i) {
+        c_multiply_and_add(dst[i], src1[i], src2[i]);
+    }
+#endif
+}
+
+#if defined( __GNUC__ ) && defined( _WIN32 )
+// MinGW doesn't appear to have sincos, so define it -- it's
+// a single x87 instruction anyway
+static inline void sincos(double x, double *sin, double *cos) {
+    __asm__ ("fsincos;" : "=t" (*cos), "=u" (*sin) : "0" (x) : "st(7)");
+}
+static inline void sincosf(float fx, float *fsin, float *fcos) {
+    double sin, cos;
+    sincos(fx, &sin, &cos);
+    *fsin = sin;
+    *fcos = cos;
+}
+#endif
+
+#endif /* !NO_COMPLEX_TYPES */
+
+template<typename T>
+inline void c_phasor(T *real, T *imag, T phase)
+{
+    //!!! IPP contains ippsSinCos_xxx in ippvm.h -- these are
+    //!!! fixed-accuracy, test and compare
+#if defined HAVE_VDSP
+    int one = 1;
+    if (sizeof(T) == sizeof(float)) {
+        vvsincosf((float *)imag, (float *)real, (const float *)&phase, &one);
+    } else {
+        vvsincos((double *)imag, (double *)real, (const double *)&phase, &one);
+    }
+#elif defined LACK_SINCOS
+    if (sizeof(T) == sizeof(float)) {
+        *real = cosf(phase);
+        *imag = sinf(phase);
+    } else {
+        *real = cos(phase);
+        *imag = sin(phase);
+    }
+#elif defined __GNUC__
+    if (sizeof(T) == sizeof(float)) {
+        sincosf(float(phase), (float *)imag, (float *)real);
+    } else {
+        sincos(phase, (double *)imag, (double *)real);
+    }
+#else
+    if (sizeof(T) == sizeof(float)) {
+        *real = cosf(phase);
+        *imag = sinf(phase);
+    } else {
+        *real = cos(phase);
+        *imag = sin(phase);
+    }
+#endif
+}
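+
+// For example (illustrative only), c_phasor(&re, &im, T(0)) yields
+// re = 1, im = 0; a phase of pi/2 yields re ~= 0, im ~= 1, since the
+// function computes re = cos(phase), im = sin(phase).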
+
+template<typename T>
+inline void c_magphase(T *mag, T *phase, T real, T imag)
+{
+    *mag = sqrt(real * real + imag * imag);
+    *phase = atan2(imag, real);
+}
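+
+// For example (illustrative only), c_magphase with real = 3 and
+// imag = 4 gives mag = 5 and phase = atan2(4, 3) ~= 0.9273 radians.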
+
+#ifdef USE_APPROXIMATE_ATAN2
+// NB arguments in opposite order from usual for atan2f
+extern float approximate_atan2f(float real, float imag);
+template<>
+inline void c_magphase(float *mag, float *phase, float real, float imag)
+{
+    float atan = approximate_atan2f(real, imag);
+    *phase = atan;
+    *mag = sqrtf(real * real + imag * imag);
+}
+#else
+template<>
+inline void c_magphase(float *mag, float *phase, float real, float imag)
+{
+    *mag = sqrtf(real * real + imag * imag);
+    *phase = atan2f(imag, real);
+}
+#endif
+
+#ifndef NO_COMPLEX_TYPES
+
+inline bq_complex_t c_phasor(bq_complex_element_t phase)
+{
+    bq_complex_t c;
+    c_phasor<bq_complex_element_t>(&c.re, &c.im, phase);
+    return c;
+}
+
+inline void c_magphase(bq_complex_element_t *mag, bq_complex_element_t *phase,
+                       bq_complex_t c)
+{
+    c_magphase<bq_complex_element_t>(mag, phase, c.re, c.im);
+}
+
+void v_polar_to_cartesian(bq_complex_t *const BQ_R__ dst,
+                          const bq_complex_element_t *const BQ_R__ mag,
+                          const bq_complex_element_t *const BQ_R__ phase,
+                          const int count);
+
+void v_polar_interleaved_to_cartesian(bq_complex_t *const BQ_R__ dst,
+                                      const bq_complex_element_t *const BQ_R__ src,
+                                      const int count);
+
+inline void v_cartesian_to_polar(bq_complex_element_t *const BQ_R__ mag,
+                                 bq_complex_element_t *const BQ_R__ phase,
+                                 const bq_complex_t *const BQ_R__ src,
+                                 const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        c_magphase<bq_complex_element_t>(mag + i, phase + i, src[i].re, src[i].im);
+    }
+}
+
+inline void v_cartesian_to_polar_interleaved(bq_complex_element_t *const BQ_R__ dst,
+                                             const bq_complex_t *const BQ_R__ src,
+                                             const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        c_magphase<bq_complex_element_t>(&dst[i*2], &dst[i*2+1],
+                                    src[i].re, src[i].im);
+    }
+}
+
+#endif /* !NO_COMPLEX_TYPES */
+
+template<typename S, typename T> // S source, T target
+void v_polar_to_cartesian(T *const BQ_R__ real,
+                          T *const BQ_R__ imag,
+                          const S *const BQ_R__ mag,
+                          const S *const BQ_R__ phase,
+                          const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        c_phasor<T>(real + i, imag + i, phase[i]);
+    }
+    // Scale by magnitude with an explicit loop: mag has element type
+    // S, which may differ from T, so the single-type v_multiply
+    // template cannot be applied here
+    for (int i = 0; i < count; ++i) {
+        real[i] *= mag[i];
+        imag[i] *= mag[i];
+    }
+}
+
+template<typename T>
+void v_polar_interleaved_to_cartesian_inplace(T *const BQ_R__ srcdst,
+                                              const int count)
+{
+    T real, imag;
+    for (int i = 0; i < count*2; i += 2) {
+        c_phasor(&real, &imag, srcdst[i+1]);
+        real *= srcdst[i];
+        imag *= srcdst[i];
+        srcdst[i] = real;
+        srcdst[i+1] = imag;
+    }
+}
+
+template<typename S, typename T> // S source, T target
+void v_polar_to_cartesian_interleaved(T *const BQ_R__ dst,
+                                      const S *const BQ_R__ mag,
+                                      const S *const BQ_R__ phase,
+                                      const int count)
+{
+    T real, imag;
+    for (int i = 0; i < count; ++i) {
+        c_phasor<T>(&real, &imag, phase[i]);
+        real *= mag[i];
+        imag *= mag[i];
+        dst[i*2] = real;
+        dst[i*2+1] = imag;
+    }
+}    
+
+#if defined USE_POMMIER_MATHFUN
+void v_polar_to_cartesian_pommier(float *const BQ_R__ real,
+                                  float *const BQ_R__ imag,
+                                  const float *const BQ_R__ mag,
+                                  const float *const BQ_R__ phase,
+                                  const int count);
+void v_polar_interleaved_to_cartesian_inplace_pommier(float *const BQ_R__ srcdst,
+                                                      const int count);
+void v_polar_to_cartesian_interleaved_pommier(float *const BQ_R__ dst,
+                                              const float *const BQ_R__ mag,
+                                              const float *const BQ_R__ phase,
+                                              const int count);
+
+template<>
+inline void v_polar_to_cartesian(float *const BQ_R__ real,
+                                 float *const BQ_R__ imag,
+                                 const float *const BQ_R__ mag,
+                                 const float *const BQ_R__ phase,
+                                 const int count)
+{
+    v_polar_to_cartesian_pommier(real, imag, mag, phase, count);
+}
+
+template<>
+inline void v_polar_interleaved_to_cartesian_inplace(float *const BQ_R__ srcdst,
+                                                     const int count)
+{
+    v_polar_interleaved_to_cartesian_inplace_pommier(srcdst, count);
+}
+
+template<>
+inline void v_polar_to_cartesian_interleaved(float *const BQ_R__ dst,
+                                             const float *const BQ_R__ mag,
+                                             const float *const BQ_R__ phase,
+                                             const int count)
+{
+    v_polar_to_cartesian_interleaved_pommier(dst, mag, phase, count);
+}
+
+#endif
+
+template<typename S, typename T> // S source, T target
+void v_cartesian_to_polar(T *const BQ_R__ mag,
+                          T *const BQ_R__ phase,
+                          const S *const BQ_R__ real,
+                          const S *const BQ_R__ imag,
+                          const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        c_magphase<T>(mag + i, phase + i, real[i], imag[i]);
+    }
+}
+
+template<typename S, typename T> // S source, T target
+void v_cartesian_interleaved_to_polar(T *const BQ_R__ mag,
+                                      T *const BQ_R__ phase,
+                                      const S *const BQ_R__ src,
+                                      const int count)
+{
+    for (int i = 0; i < count; ++i) {
+        c_magphase<T>(mag + i, phase + i, src[i*2], src[i*2+1]);
+    }
+}
+
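+// Editor's sketch (not in the original header): the reverse conversion
+// using the generic template above, with illustrative buffer names:
+//
+//     float re[256], im[256], mag[256], phase[256];
+//     // ... fill re and im ...
+//     v_cartesian_to_polar(mag, phase, re, im, 256);
+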
+#ifdef HAVE_VDSP
+template<>
+inline void v_cartesian_to_polar(float *const BQ_R__ mag,
+                                 float *const BQ_R__ phase,
+                                 const float *const BQ_R__ real,
+                                 const float *const BQ_R__ imag,
+                                 const int count)
+{
+    DSPSplitComplex c;
+    c.realp = const_cast<float *>(real);
+    c.imagp = const_cast<float *>(imag);
+    vDSP_zvmags(&c, 1, phase, 1, count); // using phase as a temporary dest
+    vvsqrtf(mag, phase, &count); // using phase as the source
+    vvatan2f(phase, imag, real, &count);
+}
+template<>
+inline void v_cartesian_to_polar(double *const BQ_R__ mag,
+                                 double *const BQ_R__ phase,
+                                 const double *const BQ_R__ real,
+                                 const double *const BQ_R__ imag,
+                                 const int count)
+{
+    // in double precision this is significantly faster than using vDSP_polar
+    DSPDoubleSplitComplex c;
+    c.realp = const_cast<double *>(real);
+    c.imagp = const_cast<double *>(imag);
+    vDSP_zvmagsD(&c, 1, phase, 1, count); // using phase as a temporary dest
+    vvsqrt(mag, phase, &count); // using phase as the source
+    vvatan2(phase, imag, real, &count);
+}
+#endif
+
+template<typename T>
+void v_cartesian_to_polar_interleaved_inplace(T *const BQ_R__ srcdst,
+                                              const int count)
+{
+    T mag, phase;
+    for (int i = 0; i < count * 2; i += 2) {
+        c_magphase(&mag, &phase, srcdst[i], srcdst[i+1]);
+        srcdst[i] = mag;
+        srcdst[i+1] = phase;
+    }
+}
+
+}
+
+#endif
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/pommier/neon_mathfun.h	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,301 @@
+/* NEON implementation of sin, cos, exp and log
+
+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+*/
+
+/* Copyright (C) 2011  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#include <arm_neon.h>
+
+typedef float32x4_t v4sf;  // vector of 4 float
+typedef uint32x4_t v4su;  // vector of 4 uint32
+typedef int32x4_t v4si;  // vector of 4 int32
+
+#define c_inv_mant_mask ~0x7f800000u
+#define c_cephes_SQRTHF 0.707106781186547524
+#define c_cephes_log_p0 7.0376836292E-2
+#define c_cephes_log_p1 - 1.1514610310E-1
+#define c_cephes_log_p2 1.1676998740E-1
+#define c_cephes_log_p3 - 1.2420140846E-1
+#define c_cephes_log_p4 + 1.4249322787E-1
+#define c_cephes_log_p5 - 1.6668057665E-1
+#define c_cephes_log_p6 + 2.0000714765E-1
+#define c_cephes_log_p7 - 2.4999993993E-1
+#define c_cephes_log_p8 + 3.3333331174E-1
+#define c_cephes_log_q1 -2.12194440e-4
+#define c_cephes_log_q2 0.693359375
+
+/* natural logarithm computed for 4 simultaneous floats;
+   returns NaN for x <= 0
+*/
+v4sf log_ps(v4sf x) {
+  v4sf one = vdupq_n_f32(1);
+
+  x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
+  v4su invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
+
+  v4si ux = vreinterpretq_s32_f32(x);
+  
+  v4si emm0 = vshrq_n_s32(ux, 23);
+
+  /* keep only the fractional part */
+  ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask));
+  ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
+  x = vreinterpretq_f32_s32(ux);
+
+  emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f));
+  v4sf e = vcvtq_f32_s32(emm0);
+
+  e = vaddq_f32(e, one);
+
+  /* part2: 
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  v4su mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
+  v4sf tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
+  x = vsubq_f32(x, one);
+  e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask)));
+  x = vaddq_f32(x, tmp);
+
+  v4sf z = vmulq_f32(x,x);
+
+  v4sf y = vdupq_n_f32(c_cephes_log_p0);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8));
+  y = vmulq_f32(y, x);
+
+  y = vmulq_f32(y, z);
+  
+
+  tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1));
+  y = vaddq_f32(y, tmp);
+
+
+  tmp = vmulq_f32(z, vdupq_n_f32(0.5f));
+  y = vsubq_f32(y, tmp);
+
+  tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
+  x = vaddq_f32(x, y);
+  x = vaddq_f32(x, tmp);
+  x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
+  return x;
+}
+
+#define c_exp_hi 88.3762626647949f
+#define c_exp_lo -88.3762626647949f
+
+#define c_cephes_LOG2EF 1.44269504088896341
+#define c_cephes_exp_C1 0.693359375
+#define c_cephes_exp_C2 -2.12194440e-4
+
+#define c_cephes_exp_p0 1.9875691500E-4
+#define c_cephes_exp_p1 1.3981999507E-3
+#define c_cephes_exp_p2 8.3334519073E-3
+#define c_cephes_exp_p3 4.1665795894E-2
+#define c_cephes_exp_p4 1.6666665459E-1
+#define c_cephes_exp_p5 5.0000001201E-1
+
+/* exp() computed for 4 floats at once */
+v4sf exp_ps(v4sf x) {
+  v4sf tmp, fx;
+
+  v4sf one = vdupq_n_f32(1);
+  x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
+  x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
+
+  /* perform a floorf */
+  tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
+
+  /* if greater, subtract 1 */
+  v4su mask = vcgtq_f32(tmp, fx);    
+  mask = vandq_u32(mask, vreinterpretq_u32_f32(one));
+
+
+  fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+
+  tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
+  v4sf z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
+  x = vsubq_f32(x, tmp);
+  x = vsubq_f32(x, z);
+
+  static const float32_t cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 };
+  v4sf y = vld1q_dup_f32(cephes_exp_p+0);
+  v4sf c1 = vld1q_dup_f32(cephes_exp_p+1); 
+  v4sf c2 = vld1q_dup_f32(cephes_exp_p+2); 
+  v4sf c3 = vld1q_dup_f32(cephes_exp_p+3); 
+  v4sf c4 = vld1q_dup_f32(cephes_exp_p+4); 
+  v4sf c5 = vld1q_dup_f32(cephes_exp_p+5);
+
+  y = vmulq_f32(y, x);
+  z = vmulq_f32(x,x);
+  y = vaddq_f32(y, c1);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c2);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c3);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c4);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c5);
+  
+  y = vmulq_f32(y, z);
+  y = vaddq_f32(y, x);
+  y = vaddq_f32(y, one);
+
+  /* build 2^n */
+  int32x4_t mm;
+  mm = vcvtq_s32_f32(fx);
+  mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
+  mm = vshlq_n_s32(mm, 23);
+  v4sf pow2n = vreinterpretq_f32_s32(mm);
+
+  y = vmulq_f32(y, pow2n);
+  return y;
+}
+
+#define c_minus_cephes_DP1 -0.78515625
+#define c_minus_cephes_DP2 -2.4187564849853515625e-4
+#define c_minus_cephes_DP3 -3.77489497744594108e-8
+#define c_sincof_p0 -1.9515295891E-4
+#define c_sincof_p1  8.3321608736E-3
+#define c_sincof_p2 -1.6666654611E-1
+#define c_coscof_p0  2.443315711809948E-005
+#define c_coscof_p1 -1.388731625493765E-003
+#define c_coscof_p2  4.166664568298827E-002
+#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI
+
+/* evaluation of 4 sines & cosines at once.
+
+   The code is an exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, but the
+   extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+   Note also that when you compute sin(x), cos(x) is available at
+   almost no extra cost, so both sin_ps and cos_ps make use of
+   sincos_ps.
+  */
+void sincos_ps(v4sf x, v4sf *ysin, v4sf *ycos) { // any x
+  v4sf xmm1, xmm2, xmm3, y;
+
+  v4su emm2;
+  
+  v4su sign_mask_sin, sign_mask_cos;
+  sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
+  x = vabsq_f32(x);
+
+  /* scale by 4/Pi */
+  y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));
+
+  /* store the integer part of y in mm0 */
+  emm2 = vcvtq_u32_f32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
+  emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
+  y = vcvtq_f32_u32(emm2);
+
+  /* get the polynomial selection mask:
+     there is one polynomial for 0 <= x <= Pi/4
+     and another one for Pi/4 < x <= Pi/2.
+
+     Both branches will be computed.
+  */
+  v4su poly_mask = vtstq_u32(emm2, vdupq_n_u32(2));
+  
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1);
+  xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2);
+  xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3);
+  x = vaddq_f32(x, xmm1);
+  x = vaddq_f32(x, xmm2);
+  x = vaddq_f32(x, xmm3);
+
+  sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
+  sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));
+
+  /* Evaluate the first polynomial  (0 <= x <= Pi/4) in y1,
+     and the second polynomial      (Pi/4 <= x <= Pi/2) in y2 */
+  v4sf z = vmulq_f32(x,x);
+  v4sf y1, y2;
+
+  y1 = vmulq_n_f32(z, c_coscof_p0);
+  y2 = vmulq_n_f32(z, c_sincof_p0);
+  y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1));
+  y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1));
+  y1 = vmulq_f32(y1, z);
+  y2 = vmulq_f32(y2, z);
+  y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2));
+  y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2));
+  y1 = vmulq_f32(y1, z);
+  y2 = vmulq_f32(y2, z);
+  y1 = vmulq_f32(y1, z);
+  y2 = vmulq_f32(y2, x);
+  y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f)));
+  y2 = vaddq_f32(y2, x);
+  y1 = vaddq_f32(y1, vdupq_n_f32(1));
+
+  /* select the correct result from the two polynomials */
+  v4sf ys = vbslq_f32(poly_mask, y1, y2);
+  v4sf yc = vbslq_f32(poly_mask, y2, y1);
+  *ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
+  *ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
+}
+
+v4sf sin_ps(v4sf x) {
+  v4sf ysin, ycos; 
+  sincos_ps(x, &ysin, &ycos); 
+  return ysin;
+}
+
+v4sf cos_ps(v4sf x) {
+  v4sf ysin, ycos; 
+  sincos_ps(x, &ysin, &ycos); 
+  return ycos;
+}
+
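+/* Editor's sketch (not part of the original file): a minimal use of
+   sincos_ps. The function name is illustrative only. */
+static inline void example_sincos_ps()
+{
+  float in[4] = { 0.f, 0.5f, 1.f, 2.f };
+  v4sf x = vld1q_f32(in);
+  v4sf s, c;
+  sincos_ps(x, &s, &c);
+  float sine[4], cosine[4];
+  vst1q_f32(sine, s);   /* sine[i] ~= sinf(in[i]) */
+  vst1q_f32(cosine, c); /* cosine[i] ~= cosf(in[i]) */
+}
+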
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/pommier/sse_mathfun.h	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,766 @@
+
+#ifndef _POMMIER_SSE_MATHFUN_H_
+#define _POMMIER_SSE_MATHFUN_H_
+
+/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
+
+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+
+   The default is to use the SSE1 version. If you define USE_SSE2, the
+   SSE2 intrinsics will be used in place of the MMX intrinsics. Do
+   not expect any significant performance improvement with SSE2.
+*/
+
+/* Copyright (C) 2007  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#include <xmmintrin.h>
+
+/* yes I know, the top of this file is quite ugly */
+
+#ifdef _MSC_VER /* visual c++ */
+# define ALIGN16_BEG __declspec(align(16))
+# define ALIGN16_END 
+#else /* gcc or icc */
+# define ALIGN16_BEG
+# define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+/* __m128 is ugly to write */
+typedef __m128 v4sf;  // vector of 4 float (sse1)
+
+#ifdef USE_SSE2
+# include <emmintrin.h>
+typedef __m128i v4si; // vector of 4 int (sse2)
+#else
+typedef __m64 v2si;   // vector of 2 int (mmx)
+#endif
+
+/* declare some SSE constants -- why can't I figure a better way to do that? */
+#define _PS_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PI32_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val)                                 \
+  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+
+_PS_CONST(1  , 1.0f);
+_PS_CONST(0p5, 0.5f);
+/* the smallest non denormalized float number */
+_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
+_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
+_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
+
+_PS_CONST_TYPE(sign_mask, int, 0x80000000);
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
+
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+_PI32_CONST(0x7f, 0x7f);
+
+_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
+_PS_CONST(cephes_log_p0, 7.0376836292E-2);
+_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS_CONST(cephes_log_p2, 1.1676998740E-1);
+_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
+_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
+_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
+_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
+_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
+_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS_CONST(cephes_log_q1, -2.12194440e-4);
+_PS_CONST(cephes_log_q2, 0.693359375);
+
+#if defined (__MINGW32__)
+
+/* the ugly part below: many versions of gcc used to be completely buggy
+   with respect to some intrinsics. The movehl_ps bug is fixed in mingw
+   3.4.5, but I found out that all the _mm_cmp* intrinsics were completely
+   broken on my mingw gcc 3.4.5 ...
+
+   Note that the bug on _mm_cmp* occurs only at the -O0 optimization level
+*/
+
+inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
+	asm (
+			"movhlps %2,%0\n\t"
+			: "=x" (a)
+			: "0" (a), "x"(b)
+	    );
+	return a;
+}
+#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
+#define _mm_movehl_ps my_movehl_ps
+
+inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
+	asm (
+			"cmpltps %2,%0\n\t"
+			: "=x" (a)
+			: "0" (a), "x"(b)
+	    );
+	return a;
+}
+inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
+	asm (
+			"cmpnleps %2,%0\n\t"
+			: "=x" (a)
+			: "0" (a), "x"(b)
+	    );
+	return a;               
+}
+inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
+	asm (
+			"cmpeqps %2,%0\n\t"
+			: "=x" (a)
+			: "0" (a), "x"(b)
+	    );
+	return a;               
+}
+#warning "redefined _mm_cmpxx_ps functions..."
+#define _mm_cmplt_ps my_cmplt_ps
+#define _mm_cmpgt_ps my_cmpgt_ps
+#define _mm_cmpeq_ps my_cmpeq_ps
+#endif
+
+#ifndef USE_SSE2
+typedef union xmm_mm_union {
+  __m128 xmm;
+  __m64 mm[2];
+} xmm_mm_union;
+
+#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
+    xmm_mm_union u; u.xmm = xmm_;                   \
+    mm0_ = u.mm[0];                                 \
+    mm1_ = u.mm[1];                                 \
+}
+
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
+    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
+  }
+
+#endif // USE_SSE2
+
+/* natural logarithm computed for 4 simultaneous floats;
+   returns NaN for x <= 0
+*/
+v4sf log_ps(v4sf x) {
+#ifdef USE_SSE2
+  v4si emm0;
+#else
+  v2si mm0, mm1;
+#endif
+  v4sf one = *(v4sf*)_ps_1;
+
+  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
+
+  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */
+
+#ifndef USE_SSE2
+  /* part 1: x = frexpf(x, &e); */
+  COPY_XMM_TO_MM(x, mm0, mm1);
+  mm0 = _mm_srli_pi32(mm0, 23);
+  mm1 = _mm_srli_pi32(mm1, 23);
+#else
+  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
+#endif
+  /* keep only the fractional part */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
+  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
+
+#ifndef USE_SSE2
+  /* now e=mm0:mm1 contains the real base-2 exponent */
+  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
+  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
+  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
+  _mm_empty(); /* bye bye mmx */
+#else
+  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
+  v4sf e = _mm_cvtepi32_ps(emm0);
+#endif
+
+  e = _mm_add_ps(e, one);
+
+  /* part2: 
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
+  v4sf tmp = _mm_and_ps(x, mask);
+  x = _mm_sub_ps(x, one);
+  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
+  x = _mm_add_ps(x, tmp);
+
+
+  v4sf z = _mm_mul_ps(x,x);
+
+  v4sf y = *(v4sf*)_ps_cephes_log_p0;
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
+  y = _mm_mul_ps(y, x);
+
+  y = _mm_mul_ps(y, z);
+  
+
+  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
+  y = _mm_add_ps(y, tmp);
+
+
+  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+
+  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
+  x = _mm_add_ps(x, y);
+  x = _mm_add_ps(x, tmp);
+  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
+  return x;
+}
+
+_PS_CONST(exp_hi,	88.3762626647949f);
+_PS_CONST(exp_lo,	-88.3762626647949f);
+
+_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
+_PS_CONST(cephes_exp_C1, 0.693359375);
+_PS_CONST(cephes_exp_C2, -2.12194440e-4);
+
+_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
+_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
+_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
+_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
+_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
+_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
+
+v4sf exp_ps(v4sf x) {
+  v4sf tmp = _mm_setzero_ps(), fx;
+#ifdef USE_SSE2
+  v4si emm0;
+#else
+  v2si mm0, mm1;
+#endif
+  v4sf one = *(v4sf*)_ps_1;
+
+  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
+  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
+  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
+
+  /* how to perform a floorf with SSE: just below */
+#ifndef USE_SSE2
+  /* step 1 : cast to int */
+  tmp = _mm_movehl_ps(tmp, fx);
+  mm0 = _mm_cvttps_pi32(fx);
+  mm1 = _mm_cvttps_pi32(tmp);
+  /* step 2 : cast back to float */
+  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  tmp  = _mm_cvtepi32_ps(emm0);
+#endif
+  /* if greater, subtract 1 */
+  v4sf mask = _mm_cmpgt_ps(tmp, fx);    
+  mask = _mm_and_ps(mask, one);
+  fx = _mm_sub_ps(tmp, mask);
+
+  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
+  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
+  x = _mm_sub_ps(x, tmp);
+  x = _mm_sub_ps(x, z);
+
+  z = _mm_mul_ps(x,x);
+  
+  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, x);
+  y = _mm_add_ps(y, one);
+
+  /* build 2^n */
+#ifndef USE_SSE2
+  z = _mm_movehl_ps(z, fx);
+  mm0 = _mm_cvttps_pi32(fx);
+  mm1 = _mm_cvttps_pi32(z);
+  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
+  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
+  mm0 = _mm_slli_pi32(mm0, 23); 
+  mm1 = _mm_slli_pi32(mm1, 23);
+  
+  v4sf pow2n; 
+  COPY_MM_TO_XMM(mm0, mm1, pow2n);
+  _mm_empty();
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
+  emm0 = _mm_slli_epi32(emm0, 23);
+  v4sf pow2n = _mm_castsi128_ps(emm0);
+#endif
+  y = _mm_mul_ps(y, pow2n);
+  return y;
+}
+
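+/* Editor's sketch (not part of the original file): log_ps and exp_ps
+   are approximate inverses for positive inputs. The function name is
+   illustrative only. */
+static inline v4sf example_log_exp_roundtrip(v4sf x)
+{
+  /* for x > 0, exp_ps(log_ps(x)) ~= x to single precision */
+  return exp_ps(log_ps(x));
+}
+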
+_PS_CONST(minus_cephes_DP1, -0.78515625);
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS_CONST(sincof_p0, -1.9515295891E-4);
+_PS_CONST(sincof_p1,  8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0,  2.443315711809948E-005);
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2,  4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
+
+/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
+   it also runs on old Athlon XPs and your grandmother's Pentium III.
+
+   The code is an exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, but the
+   extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+   Performance is also surprisingly good, 1.33 times faster than the
+   macOS vsinf SSE2 function, and 1.5 times faster than the
+   __vrs4_sinf of AMD's ACML (which is only available in 64-bit). Not
+   too bad for an SSE1 function (with no special tuning)!
+   However, the latter libraries probably have much better handling of
+   NaN, Inf, denormalized and other special arguments.
+
+   On my Core Duo, the execution of this function takes approximately
+   95 cycles.
+
+   From what I have observed in experiments with the Intel AMath lib,
+   switching to an SSE2 version would improve the performance by only
+   10%.
+
+   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
+   deliver full speed.
+*/
+v4sf sin_ps(v4sf x) { // any x
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+
+#ifdef USE_SSE2
+  v4si emm0, emm2;
+#else
+  v2si mm0, mm1, mm2, mm3;
+#endif
+  sign_bit = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+
+  //printf("plop:"); print4(y); 
+#ifdef USE_SSE2
+  /* store the integer part of y in mm0 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+  /* get the swap sign flag */
+  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynomial selection mask:
+     there is one polynomial for 0 <= x <= Pi/4
+     and another one for Pi/4 < x <= Pi/2.
+
+     Both branches will be computed.
+  */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  
+  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+#else
+  /* store the integer part of y in mm0:mm1 */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+  /* get the swap sign flag */
+  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+  /* get the polynomial selection mask */
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+  v4sf swap_sign_bit, poly_mask;
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+  _mm_empty(); /* good-bye mmx */
+#endif
+  
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+  /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
+  y = *(v4sf*)_ps_coscof_p0;
+  v4sf z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynomials */
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* almost the same as sin_ps */
+v4sf cos_ps(v4sf x) { // any x
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+#ifdef USE_SSE2
+  v4si emm0, emm2;
+#else
+  v2si mm0, mm1, mm2, mm3;
+#endif
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+  
+#ifdef USE_SSE2
+  /* store the integer part of y in mm0 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
+  
+  /* get the swap sign flag */
+  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynomial selection mask */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  
+  v4sf sign_bit = _mm_castsi128_ps(emm0);
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+#else
+  /* store the integer part of y in mm0:mm1 */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+
+  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+
+  /* get the swap sign flag in mm0:mm1 and the
+     polynomial selection mask in mm2:mm3 */
+
+  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+  v4sf sign_bit, poly_mask;
+  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  _mm_empty(); /* good-bye mmx */
+#endif
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+  
+  /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
+  y = *(v4sf*)_ps_coscof_p0;
+  v4sf z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynomials */
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* since sin_ps and cos_ps are almost identical, sincos_ps could replace
+   both of them. It is almost as fast, and gives you a free cosine with
+   your sine */
+void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
+  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+#ifdef USE_SSE2
+  v4si emm0, emm2, emm4;
+#else
+  v2si mm0, mm1, mm2, mm3, mm4, mm5;
+#endif
+  sign_bit_sin = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+    
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm4 = emm2;
+
+  /* get the swap sign flag for the sine */
+  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+
+  /* get the polynomial selection mask for the sine */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+#else
+  /* store the integer part of y in mm2:mm3 */
+  xmm3 = _mm_movehl_ps(xmm3, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm3);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+  mm4 = mm2;
+  mm5 = mm3;
+
+  /* get the swap sign flag for the sine */
+  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+  v4sf swap_sign_bit_sin;
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+
+  /* get the polynomial selection mask for the sine */
+
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+  v4sf poly_mask;
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+#endif
+
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+#ifdef USE_SSE2
+  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
+  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
+  emm4 = _mm_slli_epi32(emm4, 29);
+  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
+#else
+  /* get the sign flag for the cosine */
+  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+  mm4 = _mm_slli_pi32(mm4, 29);
+  mm5 = _mm_slli_pi32(mm5, 29);
+  v4sf sign_bit_cos;
+  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+  _mm_empty(); /* good-bye mmx */
+#endif
+
+  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+  
+  /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
+  v4sf z = _mm_mul_ps(x,x);
+  y = *(v4sf*)_ps_coscof_p0;
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynomials */
+  xmm3 = poly_mask;
+  v4sf ysin2 = _mm_and_ps(xmm3, y2);
+  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+  y2 = _mm_sub_ps(y2,ysin2);
+  y = _mm_sub_ps(y, ysin1);
+
+  xmm1 = _mm_add_ps(ysin1,ysin2);
+  xmm2 = _mm_add_ps(y,y2);
+ 
+  /* update the sign */
+  *s = _mm_xor_ps(xmm1, sign_bit_sin);
+  *c = _mm_xor_ps(xmm2, sign_bit_cos);
+}
+
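+/* Editor's sketch (not part of the original file): obtaining both sine
+   and cosine from one sincos_ps call, as the comment above suggests.
+   The function name is illustrative only. */
+static inline void example_sincos_usage(const float *in4, float *sin4, float *cos4)
+{
+  v4sf s, c;
+  sincos_ps(_mm_loadu_ps(in4), &s, &c);
+  _mm_storeu_ps(sin4, s);
+  _mm_storeu_ps(cos4, c);
+}
+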
+#endif
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/src/Allocators.cpp	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,81 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqvec
+
+    A small library for vector arithmetic and allocation in C++ using
+    raw C pointer arrays.
+
+    Copyright 2007-2014 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#include "Allocators.h"
+
+#ifdef HAVE_IPP
+#include <ipps.h>
+#endif
+
+#include <iostream>
+using std::cerr;
+using std::endl;
+
+namespace breakfastquay {
+
+#ifdef HAVE_IPP
+
+template <>
+float *allocate(size_t count)
+{
+    float *ptr = ippsMalloc_32f(count);
+    if (!ptr) throw (std::bad_alloc());
+    return ptr;
+}
+
+template <>
+double *allocate(size_t count)
+{
+    double *ptr = ippsMalloc_64f(count);
+    if (!ptr) throw (std::bad_alloc());
+    return ptr;
+}
+
+template <>
+void deallocate(float *ptr)
+{
+    if (ptr) ippsFree((void *)ptr);
+}
+
+template <>
+void deallocate(double *ptr)
+{
+    if (ptr) ippsFree((void *)ptr);
+}
+
+#endif
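+
+// Editor's note (not in the original file): typical usage of the
+// allocate/deallocate templates declared in Allocators.h, which these
+// IPP specialisations back when HAVE_IPP is defined:
+//
+//     float *buf = allocate<float>(1024);   // aligned; throws on failure
+//     // ... use buf ...
+//     deallocate(buf);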
+
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/src/Barrier.cpp	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,67 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqvec
+
+    A small library for vector arithmetic and allocation in C++ using
+    raw C pointer arrays.
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#include "Barrier.h"
+
+namespace breakfastquay {
+
+void system_memorybarrier()
+{
+#if defined __APPLE__
+
+    OSMemoryBarrier();
+    
+#elif (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+
+    __sync_synchronize();
+
+#elif defined _WIN32 
+
+#if defined __MSVC__    
+    MemoryBarrier();
+#else /* (mingw) */
+    LONG Barrier = 0;
+    __asm__ __volatile__("xchgl %%eax,%0 "
+                         : "=r" (Barrier));
+#endif
+
+#else
+#warning "No memory barrier defined"
+#endif
+    
+}
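+
+// Editor's sketch (not in the original file): the classic use of a full
+// memory barrier is to order a data write before the flag write that
+// publishes it, e.g.
+//
+//     data = prepare();          // write the payload
+//     system_memorybarrier();    // ensure the payload is visible first
+//     ready = true;              // then publish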
+
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/src/VectorOpsComplex.cpp	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,373 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+/*
+    bqvec
+
+    A small library for vector arithmetic and allocation in C++ using
+    raw C pointer arrays.
+
+    Copyright 2007-2015 Particular Programs Ltd.
+
+    Permission is hereby granted, free of charge, to any person
+    obtaining a copy of this software and associated documentation
+    files (the "Software"), to deal in the Software without
+    restriction, including without limitation the rights to use, copy,
+    modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be
+    included in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
+    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+    Except as contained in this notice, the names of Chris Cannam and
+    Particular Programs Ltd shall not be used in advertising or
+    otherwise to promote the sale, use or other dealings in this
+    Software without prior written authorization.
+*/
+
+#include "VectorOpsComplex.h"
+
+#include <cassert>
+
+#ifdef __MSVC__
+#include <malloc.h>
+#define alloca _alloca
+#endif
+
+#if defined USE_POMMIER_MATHFUN
+#if defined __ARMEL__
+#include "pommier/neon_mathfun.h"
+#else
+#include "pommier/sse_mathfun.h"
+#endif
+#endif
+
+namespace breakfastquay {
+
+#ifdef USE_APPROXIMATE_ATAN2
+float approximate_atan2f(float real, float imag)
+{
+    static const float pi = M_PI;
+    static const float pi2 = M_PI / 2;
+
+    float atan;
+
+    if (real == 0.f) {
+
+        if (imag > 0.0f) atan = pi2;
+        else if (imag == 0.0f) atan = 0.0f;
+        else atan = -pi2;
+
+    } else {
+
+        float z = imag/real;
+
+        if (fabsf(z) < 1.f) {
+            atan = z / (1.f + 0.28f * z * z);
+            if (real < 0.f) {
+                if (imag < 0.f) atan -= pi;
+                else atan += pi;
+            }
+        } else {
+            atan = pi2 - z / (z * z + 0.28f);
+            if (imag < 0.f) atan -= pi;
+        }
+    }
+
+    return atan;
+}
+#endif
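+
+// Editor's note (not in the original file): as the declaration in
+// VectorOpsComplex.h warns, the argument order above is reversed
+// relative to the standard library, i.e.
+//
+//     approximate_atan2f(real, imag) ~= atan2f(imag, real)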
+
+#if defined USE_POMMIER_MATHFUN
+
+#ifdef __ARMEL__
+typedef union {
+  float f[4];
+  int i[4];
+  v4sf  v;
+} V4SF;
+#else
+typedef ALIGN16_BEG union {
+  float f[4];
+  int i[4];
+  v4sf  v;
+} ALIGN16_END V4SF;
+#endif
+
+void
+v_polar_to_cartesian_pommier(float *const BQ_R__ real,
+                             float *const BQ_R__ imag,
+                             const float *const BQ_R__ mag,
+                             const float *const BQ_R__ phase,
+                             const int count)
+{
+    int idx = 0, tidx = 0;
+    int i = 0;
+
+    for (; i + 4 < count; i += 4) {
+
+	V4SF fmag, fphase, fre, fim;
+
+        for (int j = 0; j < 4; ++j) {
+            fmag.f[j] = mag[idx];
+            fphase.f[j] = phase[idx++];
+        }
+
+	sincos_ps(fphase.v, &fim.v, &fre.v);
+
+        for (int j = 0; j < 4; ++j) {
+            real[tidx] = fre.f[j] * fmag.f[j];
+            imag[tidx++] = fim.f[j] * fmag.f[j];
+        }
+    }
+
+    while (i < count) {
+        float re, im;
+        c_phasor(&re, &im, phase[i]);
+        real[tidx] = re * mag[i];
+        imag[tidx++] = im * mag[i];
+        ++i;
+    }
+}    
+
+void
+v_polar_interleaved_to_cartesian_inplace_pommier(float *const BQ_R__ srcdst,
+                                                 const int count)
+{
+    int i;
+    int idx = 0, tidx = 0;
+
+    for (i = 0; i + 4 < count; i += 4) {
+
+	V4SF fmag, fphase, fre, fim;
+
+        for (int j = 0; j < 4; ++j) {
+            fmag.f[j] = srcdst[idx++];
+            fphase.f[j] = srcdst[idx++];
+        }
+
+	sincos_ps(fphase.v, &fim.v, &fre.v);
+
+        for (int j = 0; j < 4; ++j) {
+            srcdst[tidx++] = fre.f[j] * fmag.f[j];
+            srcdst[tidx++] = fim.f[j] * fmag.f[j];
+        }
+    }
+
+    while (i < count) {
+        float real, imag;
+        float mag = srcdst[idx++];
+        float phase = srcdst[idx++];
+        c_phasor(&real, &imag, phase);
+        srcdst[tidx++] = real * mag;
+        srcdst[tidx++] = imag * mag;
+        ++i;
+    }
+}    
+
+void
+v_polar_to_cartesian_interleaved_pommier(float *const BQ_R__ dst,
+                                         const float *const BQ_R__ mag,
+                                         const float *const BQ_R__ phase,
+                                         const int count)
+{
+    int i;
+    int idx = 0, tidx = 0;
+
+    for (i = 0; i + 4 <= count; i += 4) {
+
+	V4SF fmag, fphase, fre, fim;
+
+        for (int j = 0; j < 4; ++j) {
+            fmag.f[j] = mag[idx];
+            fphase.f[j] = phase[idx];
+            ++idx;
+        }
+
+	sincos_ps(fphase.v, &fim.v, &fre.v);
+
+        for (int j = 0; j < 4; ++j) {
+            dst[tidx++] = fre.f[j] * fmag.f[j];
+            dst[tidx++] = fim.f[j] * fmag.f[j];
+        }
+    }
+
+    while (i < count) {
+        float real, imag;
+        c_phasor(&real, &imag, phase[i]);
+        dst[tidx++] = real * mag[i];
+        dst[tidx++] = imag * mag[i];
+        ++i;
+    }
+}    
+
+#endif
+
+#ifndef NO_COMPLEX_TYPES
+
+#if defined HAVE_IPP
+
+void
+v_polar_to_cartesian(bq_complex_t *const BQ_R__ dst,
+		     const bq_complex_element_t *const BQ_R__ mag,
+		     const bq_complex_element_t *const BQ_R__ phase,
+		     const int count)
+{
+    if (sizeof(bq_complex_element_t) == sizeof(float)) {
+	ippsPolarToCart_32fc((const float *)mag, (const float *)phase,
+                             (Ipp32fc *)dst, count);
+    } else {
+	ippsPolarToCart_64fc((const double *)mag, (const double *)phase,
+                             (Ipp64fc *)dst, count);
+    }
+}
+
+#elif defined HAVE_VDSP
+
+void
+v_polar_to_cartesian(bq_complex_t *const BQ_R__ dst,
+		     const bq_complex_element_t *const BQ_R__ mag,
+		     const bq_complex_element_t *const BQ_R__ phase,
+		     const int count)
+{
+    bq_complex_element_t *sc = (bq_complex_element_t *)
+	alloca(count * 2 * sizeof(bq_complex_element_t));
+
+    if (sizeof(bq_complex_element_t) == sizeof(float)) {
+        vvsincosf((float *)sc, (float *)(sc + count), (float *)phase, &count);
+    } else {
+        vvsincos((double *)sc, (double *)(sc + count), (double *)phase, &count);
+    }
+
+    int sini = 0;
+    int cosi = count;
+
+    for (int i = 0; i < count; ++i) {
+	dst[i].re = mag[i] * sc[cosi++];
+	dst[i].im = mag[i] * sc[sini++];
+    }
+}    
+
+#else
+
+void
+v_polar_to_cartesian(bq_complex_t *const BQ_R__ dst,
+		     const bq_complex_element_t *const BQ_R__ mag,
+		     const bq_complex_element_t *const BQ_R__ phase,
+		     const int count)
+{
+    for (int i = 0; i < count; ++i) {
+	dst[i] = c_phasor(phase[i]);
+    }
+    for (int i = 0; i < count; ++i) {
+        dst[i].re *= mag[i];
+        dst[i].im *= mag[i];
+    }
+}    
+
+#endif
+
+#if defined USE_POMMIER_MATHFUN
+
+//!!! further tests reqd.  This is only single precision but it seems
+//!!! to be much faster than normal math library sincos.  The comments
+//!!! note that precision suffers for high arguments to sincos though,
+//!!! and that is probably a common case for us
+
+void
+v_polar_interleaved_to_cartesian(bq_complex_t *const BQ_R__ dst,
+				 const bq_complex_element_t *const BQ_R__ src,
+				 const int count)
+{
+    int idx = 0, tidx = 0;
+    int i;
+
+    for (i = 0; i + 4 <= count; i += 4) {
+
+	V4SF fmag, fphase, fre, fim;
+
+        for (int j = 0; j < 4; ++j) {
+            fmag.f[j] = src[idx++];
+            fphase.f[j] = src[idx++];
+        }
+
+	sincos_ps(fphase.v, &fim.v, &fre.v);
+
+        for (int j = 0; j < 4; ++j) {
+            dst[tidx].re = fre.f[j] * fmag.f[j];
+            dst[tidx++].im = fim.f[j] * fmag.f[j];
+        }
+    }
+
+    while (i < count) {
+        bq_complex_element_t mag = src[idx++];
+        bq_complex_element_t phase = src[idx++];
+        dst[tidx] = c_phasor(phase);
+        dst[tidx].re *= mag;
+        dst[tidx].im *= mag;
+        ++tidx;
+        ++i;
+    }
+}    
+
+#elif (defined HAVE_IPP || defined HAVE_VDSP)
+
+// with a vector library, it should be faster to deinterleave and call
+// the basic fn
+
+void
+v_polar_interleaved_to_cartesian(bq_complex_t *const BQ_R__ dst,
+				 const bq_complex_element_t *const BQ_R__ src,
+				 const int count)
+{
+    bq_complex_element_t *mag = (bq_complex_element_t *)
+	alloca(count * sizeof(bq_complex_element_t));
+    bq_complex_element_t *phase = (bq_complex_element_t *)
+	alloca(count * sizeof(bq_complex_element_t));
+    bq_complex_element_t *magphase[] = { mag, phase };
+
+    v_deinterleave(magphase, src, 2, count);
+    v_polar_to_cartesian(dst, mag, phase, count);
+}
+
+#else
+
+// without a vector library, better avoid the deinterleave step
+
+void
+v_polar_interleaved_to_cartesian(bq_complex_t *const BQ_R__ dst,
+				 const bq_complex_element_t *const BQ_R__ src,
+				 const int count)
+{
+    bq_complex_element_t mag, phase;
+    int idx = 0;
+    for (int i = 0; i < count; ++i) {
+        mag = src[idx++];
+        phase = src[idx++];
+        dst[i] = c_phasor(phase);
+        dst[i].re *= mag;
+        dst[i].im *= mag;
+    }
+}
+
+#endif
+
+void
+v_polar_interleaved_to_cartesian_inplace(bq_complex_element_t *const BQ_R__ srcdst,
+                                         const int count)
+{
+    // Not ideal
+    bq_complex_element_t mag, phase;
+    int ii = 0, io = 0;
+    for (int i = 0; i < count; ++i) {
+        mag = srcdst[ii++];
+        phase = srcdst[ii++];
+        bq_complex_t p = c_phasor(phase);
+        srcdst[io++] = mag * p.re;
+        srcdst[io++] = mag * p.im;
+    }
+}
+
+#endif
+
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/test/TestVectorOps.cpp	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,254 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+#include "bqvec/VectorOpsComplex.h"
+
+#include <iostream>
+#include <cstdlib>
+
+#include <time.h>
+
+using namespace std;
+
+namespace breakfastquay {
+
+namespace Test {
+
+#ifdef _WIN32
+#define drand48() (-1+2*((float)rand())/RAND_MAX)
+#endif
+
+bool
+testMultiply()
+{
+    cerr << "testVectorOps: testing v_multiply complex" << endl;
+
+    const int N = 1024;
+    bq_complex_t target[N];
+    bq_complex_t src1[N];
+    bq_complex_t src2[N];
+
+    for (int i = 0; i < N; ++i) {
+	src1[i].re = drand48();
+	src1[i].im = drand48();
+	src2[i].re = drand48();
+	src2[i].im = drand48();
+    }
+
+    double mean, first = 0, last = 0, total = 0;
+    for (int i = 0; i < N; ++i) {
+        bq_complex_t result;
+        c_multiply(result, src1[i], src2[i]);
+	if (i == 0) first = result.re;
+	if (i == N-1) last = result.im;
+	total += result.re;
+	total += result.im;
+    }
+    mean = total / (N*2);
+    cerr << "Naive method: mean = " << mean << ", first = " << first
+	 << ", last = " << last << endl;
+
+    v_multiply(target, src1, src2, N);
+    total = 0;
+
+    for (int i = 0; i < N; ++i) {
+	if (i == 0) first = target[i].re;
+	if (i == N-1) last = target[i].im;
+	total += target[i].re;
+	total += target[i].im;
+    }
+    mean = total / (N*2);
+    cerr << "v_multiply: mean = " << mean << ", first = " << first
+	 << ", last = " << last << endl;
+
+    int iterations = 50000;
+    cerr << "Iterations: " << iterations << endl;
+	
+    cerr << "CLOCKS_PER_SEC = " << CLOCKS_PER_SEC << endl;
+    float divisor = float(CLOCKS_PER_SEC) / 1000.f;
+
+    clock_t start = clock();
+
+    for (int j = 0; j < iterations; ++j) {
+	for (int i = 0; i < N; ++i) {
+            c_multiply(target[i], src1[i], src2[i]);
+	}
+    }
+    
+    clock_t end = clock();
+
+    cerr << "Time for naive method: " << float(end - start)/divisor << endl;
+
+    start = clock();
+
+    for (int j = 0; j < iterations; ++j) {
+        v_multiply(target, src1, src2, N);
+    }
+    
+    end = clock();
+
+    cerr << "Time for v_multiply: " << float(end - start)/divisor << endl;
+
+    return true;
+}
+
+bool
+testPolarToCart()
+{
+    cerr << "testVectorOps: testing v_polar_to_cartesian" << endl;
+
+    const int N = 1024;
+    bq_complex_t target[N];
+    double mag[N];
+    double phase[N];
+
+    for (int i = 0; i < N; ++i) {
+	mag[i] = drand48();
+	phase[i] = (drand48() * M_PI * 2) - M_PI;
+    }
+
+    double mean, first = 0, last = 0, total = 0;
+    for (int i = 0; i < N; ++i) {
+	double real = mag[i] * cos(phase[i]);
+	double imag = mag[i] * sin(phase[i]);
+	if (i == 0) first = real;
+	if (i == N-1) last = imag;
+	total += real;
+	total += imag;
+    }
+    mean = total / (N*2);
+    cerr << "Naive method: mean = " << mean << ", first = " << first
+	 << ", last = " << last << endl;
+
+    v_polar_to_cartesian(target, mag, phase, N);
+
+    total = 0;
+
+    for (int i = 0; i < N; ++i) {
+        if (i == 0) first = target[i].re;
+        if (i == N-1) last = target[i].im;
+        total += target[i].re;
+        total += target[i].im;
+    }
+    mean = total / (N*2);
+    cerr << "v_polar_to_cartesian: mean = " << mean << ", first = " << first
+         << ", last = " << last << endl;
+
+    int iterations = 10000;
+    cerr << "Iterations: " << iterations << endl;
+
+    cerr << "CLOCKS_PER_SEC = " << CLOCKS_PER_SEC << endl;
+    // Divisor to convert clock() ticks to milliseconds
+    float divisor = float(CLOCKS_PER_SEC) / 1000.f;
+
+    clock_t start = clock();
+
+    for (int j = 0; j < iterations; ++j) {
+        for (int i = 0; i < N; ++i) {
+            target[i].re = mag[i] * cos(phase[i]);
+            target[i].im = mag[i] * sin(phase[i]);
+        }
+    }
+
+    clock_t end = clock();
+
+    cerr << "Time for naive method: " << float(end - start)/divisor << " ms" << endl;
+
+    start = clock();
+
+    for (int j = 0; j < iterations; ++j) {
+        v_polar_to_cartesian(target, mag, phase, N);
+    }
+
+    end = clock();
+
+    cerr << "Time for v_polar_to_cartesian: " << float(end - start)/divisor << " ms" << endl;
+
+    return true;
+}
+
+bool
+testPolarToCartInterleaved()
+{
+    cerr << "testVectorOps: testing v_polar_interleaved_to_cartesian" << endl;
+
+    const int N = 1024;
+    bq_complex_t target[N];
+    double source[N*2];
+
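+    // source holds interleaved (magnitude, phase) pairs: magnitudes at even
+    // indices, phases at odd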
+    for (int i = 0; i < N; ++i) {
+        source[i*2] = drand48();
+        source[i*2+1] = (drand48() * M_PI * 2) - M_PI;
+    }
+
+    double mean = 0, first = 0, last = 0, total = 0;
+    for (int i = 0; i < N; ++i) {
+        double real = source[i*2] * cos(source[i*2+1]);
+        double imag = source[i*2] * sin(source[i*2+1]);
+        if (i == 0) first = real;
+        if (i == N-1) last = imag;
+        total += real;
+        total += imag;
+    }
+    mean = total / (N*2);
+    cerr << "Naive method: mean = " << mean << ", first = " << first
+         << ", last = " << last << endl;
+
+    v_polar_interleaved_to_cartesian(target, source, N);
+
+    total = 0;
+
+    for (int i = 0; i < N; ++i) {
+        if (i == 0) first = target[i].re;
+        if (i == N-1) last = target[i].im;
+        total += target[i].re;
+        total += target[i].im;
+    }
+    mean = total / (N*2);
+    cerr << "v_polar_interleaved_to_cartesian: mean = " << mean << ", first = " << first
+         << ", last = " << last << endl;
+
+    int iterations = 10000;
+    cerr << "Iterations: " << iterations << endl;
+
+    cerr << "CLOCKS_PER_SEC = " << CLOCKS_PER_SEC << endl;
+    // Divisor to convert clock() ticks to milliseconds
+    float divisor = float(CLOCKS_PER_SEC) / 1000.f;
+
+    clock_t start = clock();
+
+    for (int j = 0; j < iterations; ++j) {
+        for (int i = 0; i < N; ++i) {
+            target[i].re = source[i*2] * cos(source[i*2+1]);
+            target[i].im = source[i*2] * sin(source[i*2+1]);
+        }
+    }
+
+    clock_t end = clock();
+
+    cerr << "Time for naive method: " << float(end - start)/divisor << " ms" << endl;
+
+    start = clock();
+
+    for (int j = 0; j < iterations; ++j) {
+        v_polar_interleaved_to_cartesian(target, source, N);
+    }
+
+    end = clock();
+
+    cerr << "Time for v_polar_interleaved_to_cartesian: " << float(end - start)/divisor << " ms" << endl;
+
+    return true;
+}
+
+bool
+testVectorOps()
+{
+    if (!testMultiply()) return false;
+    if (!testPolarToCart()) return false;
+    if (!testPolarToCartInterleaved()) return false;
+
+    return true;
+}
+
+}
+
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/bqvec/test/TestVectorOps.h	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,13 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
+
+namespace breakfastquay {
+
+namespace Test {
+
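+// Run all the vector-op tests, returning true only if every test passes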
+bool testVectorOps();
+
+}
+
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft/native/native.cpp	Sat Oct 17 15:00:08 2015 +0100
@@ -0,0 +1,63 @@
+
+#include <bqfft/FFT.h>
+
+#include <vector>
+#include <iostream>
+#include <cmath>
+#include <chrono>
+
+using namespace std;
+using namespace breakfastquay;
+
+int main(int argc, char **argv)
+{
+    vector<int> sizes { 512, 2048 };
+
+    int iterations = 2000;
+    int times = 100;
+
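+    // Each of the `times` runs performs `iterations` forward transforms;
+    // the reported time is the average over the runs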
+    for (auto size: sizes) {
+
+	FFT fft(size);
+	double total = 0.0;
+
+	auto start = chrono::high_resolution_clock::now();
+
+	for (int ti = 0; ti < times; ++ti) {
+
+	    total = 0.0;
+
+	    for (int i = 0; i < iterations; ++i) {
+
+		vector<double> ri(size), ro(size/2+1), io(size/2+1);
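+		// Simple deterministic input: alternating 0 and 0.25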
+		for (int j = 0; j < size; ++j) {
+		    ri[j] = (j % 2) / 4.0;
+		}
+
+		fft.forward(ri.data(), ro.data(), io.data());
+
+		for (int j = 0; j <= size/2; ++j) {
+		    total += sqrt(ro[j] * ro[j] + io[j] * io[j]);
+		}
+
+		// the conjugate half has the same magnitudes as bins
+		// 1 to size/2-1, so accumulate those again
+		for (int j = 1; j < size/2; ++j) {
+		    total += sqrt(ro[j] * ro[j] + io[j] * io[j]);
+		}
+	    }
+	}
+
+	auto end = chrono::high_resolution_clock::now();
+
+	double ms = chrono::duration<double, milli>(end - start).count() / times;
+
+	cerr << "for " << iterations << " * size " << size << ": total = "
+	     << total << ", time = " << ms
+	     << " ms (" << (iterations / (ms / 1000.0)) << " itr/sec)" << endl;
+    }
+}
+