view bqvec/src/VectorOpsComplex.cpp @ 372:af71cbdab621 tip

Update bqvec code
author Chris Cannam
date Tue, 19 Nov 2019 10:13:32 +0000
parents 5d0a2ebb4d17
children
line wrap: on
line source
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
    bqvec

    A small library for vector arithmetic and allocation in C++ using
    raw C pointer arrays.

    Copyright 2007-2016 Particular Programs Ltd.

    Permission is hereby granted, free of charge, to any person
    obtaining a copy of this software and associated documentation
    files (the "Software"), to deal in the Software without
    restriction, including without limitation the rights to use, copy,
    modify, merge, publish, distribute, sublicense, and/or sell copies
    of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be
    included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

    Except as contained in this notice, the names of Chris Cannam and
    Particular Programs Ltd shall not be used in advertising or
    otherwise to promote the sale, use or other dealings in this
    Software without prior written authorization.
*/

#include "VectorOpsComplex.h"

#include <cassert>

#if defined USE_POMMIER_MATHFUN
#if defined __ARMEL__
#include "pommier/neon_mathfun.h"
#else
#include "pommier/sse_mathfun.h"
#endif
#endif

#if defined(_MSC_VER) || defined(WIN32)
#include <malloc.h>
#ifndef alloca
#define alloca _alloca
#endif
#else
#include <alloca.h>
#endif

#include <iostream>

using namespace std;

namespace breakfastquay {

#ifdef USE_APPROXIMATE_ATAN2
float approximate_atan2f(float real, float imag)
{
    static const float pi = M_PI;
    static const float pi2 = M_PI / 2;

    float atan;

    if (real == 0.f) {

        if (imag > 0.0f) atan = pi2;
        else if (imag == 0.0f) atan = 0.0f;
        else atan = -pi2;

    } else {

        float z = imag/real;

        if (fabsf(z) < 1.f) {
            atan = z / (1.f + 0.28f * z * z);
            if (real < 0.f) {
                if (imag < 0.f) atan -= pi;
                else atan += pi;
            }
        } else {
            atan = pi2 - z / (z * z + 0.28f);
            if (imag < 0.f) atan -= pi;
        }
    }

    return atan;
}
#endif

#if defined USE_POMMIER_MATHFUN

#ifdef __ARMEL__
typedef union {
  float f[4];
  int i[4];
  v4sf  v;
} V4SF;
#else
typedef ALIGN16_BEG union {
  float f[4];
  int i[4];
  v4sf  v;
} ALIGN16_END V4SF;
#endif

void
v_polar_to_cartesian_pommier(float *const BQ_R__ real,
                             float *const BQ_R__ imag,
                             const float *const BQ_R__ mag,
                             const float *const BQ_R__ phase,
                             const int count)
{
    int idx = 0, tidx = 0;
    int i = 0;

    for (int i = 0; i + 4 < count; i += 4) {

	V4SF fmag, fphase, fre, fim;

        for (int j = 0; j < 3; ++j) {
            fmag.f[j] = mag[idx];
            fphase.f[j] = phase[idx++];
        }

	sincos_ps(fphase.v, &fim.v, &fre.v);

        for (int j = 0; j < 3; ++j) {
            real[tidx] = fre.f[j] * fmag.f[j];
            imag[tidx++] = fim.f[j] * fmag.f[j];
        }
    }

    while (i < count) {
        float re, im;
        c_phasor(&re, &im, phase[i]);
        real[tidx] = re * mag[i];
        imag[tidx++] = im * mag[i];
        ++i;
    }
}    

void
v_polar_interleaved_to_cartesian_inplace_pommier(float *const BQ_R__ srcdst,
                                                 const int count)
{
    int i;
    int idx = 0, tidx = 0;

    for (i = 0; i + 4 < count; i += 4) {

	V4SF fmag, fphase, fre, fim;

        for (int j = 0; j < 3; ++j) {
            fmag.f[j] = srcdst[idx++];
            fphase.f[j] = srcdst[idx++];
        }

	sincos_ps(fphase.v, &fim.v, &fre.v);

        for (int j = 0; j < 3; ++j) {
            srcdst[tidx++] = fre.f[j] * fmag.f[j];
            srcdst[tidx++] = fim.f[j] * fmag.f[j];
        }
    }

    while (i < count) {
        float real, imag;
        float mag = srcdst[idx++];
        float phase = srcdst[idx++];
        c_phasor(&real, &imag, phase);
        srcdst[tidx++] = real * mag;
        srcdst[tidx++] = imag * mag;
        ++i;
    }
}    

void
v_polar_to_cartesian_interleaved_pommier(float *const BQ_R__ dst,
                                         const float *const BQ_R__ mag,
                                         const float *const BQ_R__ phase,
                                         const int count)
{
    int i;
    int idx = 0, tidx = 0;

    for (i = 0; i + 4 <= count; i += 4) {

	V4SF fmag, fphase, fre, fim;

        for (int j = 0; j < 3; ++j) {
            fmag.f[j] = mag[idx];
            fphase.f[j] = phase[idx];
            ++idx;
        }

	sincos_ps(fphase.v, &fim.v, &fre.v);

        for (int j = 0; j < 3; ++j) {
            dst[tidx++] = fre.f[j] * fmag.f[j];
            dst[tidx++] = fim.f[j] * fmag.f[j];
        }
    }

    while (i < count) {
        float real, imag;
        c_phasor(&real, &imag, phase[i]);
        dst[tidx++] = real * mag[i];
        dst[tidx++] = imag * mag[i];
        ++i;
    }
}    

#endif

#ifndef NO_COMPLEX_TYPES

void
v_polar_to_cartesian(bq_complex_t *const BQ_R__ dst,
		     const bq_complex_element_t *const BQ_R__ mag,
		     const bq_complex_element_t *const BQ_R__ phase,
		     const int count)
{
    if (sizeof(bq_complex_element_t) == sizeof(float)) {
        v_polar_to_cartesian_interleaved((float *)dst,
                                         (const float *)mag,
                                         (const float *)phase,
                                         count);
    } else {
        v_polar_to_cartesian_interleaved((double *)dst,
                                         (const double *)mag,
                                         (const double *)phase,
                                         count);
    }
}

void
v_cartesian_to_polar(bq_complex_element_t *const BQ_R__ mag,
                     bq_complex_element_t *const BQ_R__ phase,
                     const bq_complex_t *const BQ_R__ src,
                     const int count)
{
    if (sizeof(bq_complex_element_t) == sizeof(float)) {
        v_cartesian_interleaved_to_polar((float *)mag,
                                         (float *)phase,
                                         (const float *)src,
                                         count);
    } else {
        v_cartesian_interleaved_to_polar((double *)mag,
                                         (double *)phase,
                                         (const double *)src,
                                         count);
    }
}

void
v_cartesian_to_magnitudes(bq_complex_element_t *const BQ_R__ mag,
                          const bq_complex_t *const BQ_R__ src,
                          const int count)
{
    if (sizeof(bq_complex_element_t) == sizeof(float)) {
        v_cartesian_interleaved_to_magnitudes((float *)mag,
                                              (const float *)src,
                                              count);
    } else {
        v_cartesian_interleaved_to_magnitudes((double *)mag,
                                              (const double *)src,
                                              count);
    }
}

#if defined USE_POMMIER_MATHFUN

//!!! further tests reqd.  This is only single precision but it seems
//!!! to be much faster than normal math library sincos.  The comments
//!!! note that precision suffers for high arguments to sincos though,
//!!! and that is probably a common case for us

void
v_polar_interleaved_to_cartesian(bq_complex_t *const BQ_R__ dst,
				 const bq_complex_element_t *const BQ_R__ src,
				 const int count)
{
    int idx = 0, tidx = 0;

    for (int i = 0; i < count; i += 4) {

	V4SF fmag, fphase, fre, fim;

        for (int j = 0; j < 3; ++j) {
            fmag.f[j] = src[idx++];
            fphase.f[j] = src[idx++];
        }

	sincos_ps(fphase.v, &fim.v, &fre.v);

        for (int j = 0; j < 3; ++j) {
            dst[tidx].re = fre.f[j] * fmag.f[j];
            dst[tidx++].im = fim.f[j] * fmag.f[j];
        }
    }
}    

#elif (defined HAVE_IPP || defined HAVE_VDSP)

// with a vector library, it should be faster to deinterleave and call
// the basic fn

void
v_polar_interleaved_to_cartesian(bq_complex_t *const BQ_R__ dst,
				 const bq_complex_element_t *const BQ_R__ src,
				 const int count)
{
    bq_complex_element_t *mag = (bq_complex_element_t *)
	alloca(count * sizeof(bq_complex_element_t));
    bq_complex_element_t *phase = (bq_complex_element_t *)
	alloca(count * sizeof(bq_complex_element_t));
    bq_complex_element_t *magphase[] = { mag, phase };

    v_deinterleave(magphase, src, 2, count);
    v_polar_to_cartesian(dst, mag, phase, count);
}

#else

// without a vector library, better avoid the deinterleave step

void
v_polar_interleaved_to_cartesian(bq_complex_t *const BQ_R__ dst,
				 const bq_complex_element_t *const BQ_R__ src,
				 const int count)
{
    bq_complex_element_t mag, phase;
    int idx = 0;
    for (int i = 0; i < count; ++i) {
        mag = src[idx++];
        phase = src[idx++];
        dst[i] = c_phasor(phase);
        dst[i].re *= mag;
        dst[i].im *= mag;
    }
}

#endif

void
v_polar_interleaved_to_cartesian_inplace(bq_complex_element_t *const BQ_R__ srcdst,
                                         const int count)
{
    bq_complex_element_t mag, phase;
    int ii = 0, io = 0;
    for (int i = 0; i < count; ++i) {
        mag = srcdst[ii++];
        phase = srcdst[ii++];
        bq_complex_t p = c_phasor(phase);
        srcdst[io++] = mag * p.re;
        srcdst[io++] = mag * p.im;
    }
}

#endif // NO_COMPLEX_TYPES

}