/* simd-neon.h: NEON SIMD support (FFTW 3.3.3, src/fftw-3.3.3/simd-support/simd-neon.h) */

/*
 * Copyright (c) 2003, 2007-11 Matteo Frigo
 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#ifndef FFTW_SINGLE
#error "NEON only works in single precision"
#endif

/* define these unconditionally, because they are used by
   taint.c, which is compiled without NEON */
#define SIMD_SUFFIX _neon  /* for renaming */
#define VL 2               /* SIMD complex vector length */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK
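
/* Layout note (illustrative, assuming the little-endian lane order
   used throughout this file): with VL == 2, one vector V holds two
   complex numbers stored interleaved, so four consecutive floats
   {r0, i0, r1, i1} load as lanes {r0, i0, r1, i1}.  SIMD_VSTRIDE_OKA
   accepts only vector stride 2: the real and imaginary parts of each
   complex element must be adjacent in memory. */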

#if defined(__GNUC__) && !defined(__ARM_NEON__)
#error "compiling simd-neon.h requires -mfpu=neon or equivalent"
#endif

#include <arm_neon.h>

/* FIXME: I am not sure whether this code assumes little-endian
   ordering.  VLIT may or may not be wrong for big-endian systems. */
typedef float32x4_t V;

#define VLIT(x0, x1, x2, x3) {x0, x1, x2, x3}
#define LDK(x) x
#define DVK(var, val) const V var = VLIT(val, val, val, val)

/* NEON has FMA, but a three-operand FMA is not too useful
   for FFT purposes.  We normally compute

      t0 = a + b * c
      t1 = a - b * c

   In a three-operand instruction set this translates into

      t0 = a
      t0 += b * c
      t1 = a
      t1 -= b * c

   At least one register move must be issued, negating the advantage
   of the FMA in the first place.  At least some versions of gcc
   generate both moves.  So we are better off generating
   t = b * c; t0 = a + t; t1 = a - t. */
#if HAVE_FMA
#warning "--enable-fma on NEON is probably a bad idea (see source code)"
#endif

#define VADD(a, b) vaddq_f32(a, b)
#define VSUB(a, b) vsubq_f32(a, b)
#define VMUL(a, b) vmulq_f32(a, b)
#define VFMA(a, b, c) vmlaq_f32(c, a, b)  /* a * b + c */
#define VFNMS(a, b, c) vmlsq_f32(c, a, b) /* FNMS = -(a * b - c) in PowerPC
                                             terminology; MLS = c - a * b
                                             in ARM terminology */
#define VFMS(a, b, c) VSUB(VMUL(a, b), c) /* FMS = a * b - c in PowerPC
                                             terminology; no equivalent
                                             ARM instruction (?) */
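
/* Illustrative sketch (not part of the original header): the butterfly
   pattern recommended in the comment above HAVE_FMA, written with the
   macros just defined.  The function name is hypothetical. */
#if 0
static inline void butterfly_example(V a, V b, V c, V *t0, V *t1)
{
     V t = VMUL(b, c);  /* one shared multiply ...            */
     *t0 = VADD(a, t);  /* ... feeds both the add ...         */
     *t1 = VSUB(a, t);  /* ... and the subtract, with no
                           register move needed               */
}
#endif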

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     return vld1q_f32((const float32_t *)x);
}

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     return vcombine_f32(vld1_f32((const float32_t *)x),
                         vld1_f32((const float32_t *)(x + ivs)));
}
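
/* Worked example (illustrative): for ivs == 6, LD(x, 6, 0) gathers two
   complex numbers six floats apart, returning {x[0], x[1], x[6], x[7]},
   i.e. the complex values at x and x + ivs packed into one vector. */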

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     vst1q_f32((float32_t *)x, v);
}

static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon store-low occurring
        after store-high */
     vst1_f32((float32_t *)(x + ovs), vget_high_f32(v));
     vst1_f32((float32_t *)x, vget_low_f32(v));
}
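
/* Worked example (illustrative): ST(x, {r0, i0, r1, i1}, ovs, 0) writes
   {r1, i1} to x[ovs], x[ovs + 1] first, then {r0, i0} to x[0], x[1].
   Per the warning above, the high half must go first; presumably so
   that when the extra_iter hack makes the two destinations overlap,
   the low-half value is the one that survives. */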

/* 2x2 complex transpose and store */
#define STM2 ST
#define STN2(x, v0, v1, ovs) /* nop */

/* store and 4x4 real transpose */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void) aligned_like; /* UNUSED */
     vst1_lane_f32((float32_t *)(x),           vget_low_f32(v),  0);
     vst1_lane_f32((float32_t *)(x + ovs),     vget_low_f32(v),  1);
     vst1_lane_f32((float32_t *)(x + 2 * ovs), vget_high_f32(v), 0);
     vst1_lane_f32((float32_t *)(x + 3 * ovs), vget_high_f32(v), 1);
}
#define STN4(x, v0, v1, v2, v3, ovs) /* use STM4 */
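
/* Worked example (illustrative): STM4(x, {a, b, c, d}, ovs, 0)
   scatters one lane per output row: a to x[0], b to x[ovs], c to
   x[2 * ovs], d to x[3 * ovs].  Four such calls, one per column of a
   4x4 block, effect the transpose; STN4 is a no-op because STM4 has
   already done the work. */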

#define FLIP_RI(x) vrev64q_f32(x)

static inline V VCONJ(V x)
{
#if 1
     static const uint32x4_t pm = {0, 0x80000000u, 0, 0x80000000u};
     return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), pm));
#else
     const V pm = VLIT(1.0, -1.0, 1.0, -1.0);
     return VMUL(x, pm);
#endif
}

static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}
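
/* Derivation (illustrative): vrev64q_f32 swaps the two lanes within
   each 64-bit half, so FLIP_RI({r0, i0, r1, i1}) = {i0, r0, i1, r1}.
   VCONJ flips the sign bit of each imaginary lane via XOR, giving
   {r0, -i0, r1, -i1}.  Composing the two, VBYI(x) = {-i0, r0, -i1, r1},
   i.e. each complex element multiplied by the imaginary unit:
   (r, i) -> (-i, r). */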

static inline V VFMAI(V b, V c)
{
     const V mp = VLIT(-1.0, 1.0, -1.0, 1.0);
     return VFMA(FLIP_RI(b), mp, c);
}

static inline V VFNMSI(V b, V c)
{
     const V mp = VLIT(-1.0, 1.0, -1.0, 1.0);
     return VFNMS(FLIP_RI(b), mp, c);
}

static inline V VFMACONJ(V b, V c)
{
     const V pm = VLIT(1.0, -1.0, 1.0, -1.0);
     return VFMA(b, pm, c);
}

static inline V VFNMSCONJ(V b, V c)
{
     const V pm = VLIT(1.0, -1.0, 1.0, -1.0);
     return VFNMS(b, pm, c);
}

static inline V VFMSCONJ(V b, V c)
{
     return VSUB(VCONJ(b), c);
}
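
/* Summary of the helpers above (illustrative), per complex element:
   VFMAI(b, c) = c + i*b, VFNMSI(b, c) = c - i*b,
   VFMACONJ(b, c) = conj(b) + c, VFNMSCONJ(b, c) = c - conj(b),
   VFMSCONJ(b, c) = conj(b) - c. */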

#if 1
#define VEXTRACT_REIM(tr, ti, tx)                                 \
    {                                                             \
         tr = vcombine_f32(vdup_lane_f32(vget_low_f32(tx), 0),    \
                           vdup_lane_f32(vget_high_f32(tx), 0));  \
         ti = vcombine_f32(vdup_lane_f32(vget_low_f32(tx), 1),    \
                           vdup_lane_f32(vget_high_f32(tx), 1));  \
    }
#else
/* this alternative might be faster in an ideal world, but gcc likes
   to spill VVV onto the stack */
#define VEXTRACT_REIM(tr, ti, tx)                                 \
    {                                                             \
         float32x4x2_t vvv = vtrnq_f32(tx, tx);                   \
         tr = vvv.val[0];                                         \
         ti = vvv.val[1];                                         \
    }
#endif
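
/* Worked example (illustrative): for tx = {r0, i0, r1, i1},
   VEXTRACT_REIM(tr, ti, tx) produces tr = {r0, r0, r1, r1} and
   ti = {i0, i0, i1, i1}, i.e. the real and imaginary parts of each
   complex element broadcast across that element's pair of lanes. */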

static inline V VZMUL(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

static inline V VZMULI(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr, ti;
     VEXTRACT_REIM(tr, ti, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}
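
/* Derivation (illustrative): writing t = tr + i*ti for each complex
   element of tx, and using VBYI(sr) = i*sr, the four routines above
   compute, per element:

      VZMUL(tx, sr)   = tr*sr + ti*(i*sr) = t * sr
      VZMULJ(tx, sr)  = tr*sr - ti*(i*sr) = conj(t) * sr
      VZMULI(tx, sr)  = tr*(i*sr) - ti*sr = i * (t * sr)
      VZMULIJ(tx, sr) = tr*(i*sr) + ti*sr = i * (conj(t) * sr)

   i.e. complex multiplication by a twiddle factor, by its conjugate,
   and the same two products rotated by i. */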

/* twiddle storage #1: compact, slower */
#define VTW1(v, x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#define TWVL1 VL
static inline V BYTW1(const R *t, V sr)
{
     V tx = LDA(t, 2, 0);
     return VZMUL(tx, sr);
}

static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LDA(t, 2, 0);
     return VZMULJ(tx, sr);
}

/* twiddle storage #2: twice the space, faster (when in cache) */
#define VTW2(v, x)                                                          \
    {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},     \
    {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#define TWVL2 (2 * VL)

static inline V BYTW2(const R *t, V sr)
{
     V si = FLIP_RI(sr);
     V tr = LDA(t, 2, 0), ti = LDA(t + 2 * VL, 2, 0);
     return VFMA(ti, si, VMUL(tr, sr));
}

static inline V BYTWJ2(const R *t, V sr)
{
     V si = FLIP_RI(sr);
     V tr = LDA(t, 2, 0), ti = LDA(t + 2 * VL, 2, 0);
     return VFNMS(ti, si, VMUL(tr, sr));
}
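
/* Derivation (illustrative): with the VTW2 layout, tr holds each
   cosine duplicated ({c0, c0, c1, c1}) and ti holds the sines with
   alternating sign ({-s0, s0, -s1, s1}).  For sr = {r0, i0, r1, i1},
   si = {i0, r0, i1, r1}, so lane 0 of BYTW2 is c0*r0 - s0*i0 and
   lane 1 is c0*i0 + s0*r0: the complex product w * s for the twiddle
   w = c + i*s.  BYTWJ2 negates the ti term and computes conj(w) * s. */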

/* twiddle storage #3 */
#define VTW3(v, x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#define TWVL3 (VL)

/* twiddle storage for split arrays */
#define VTWS(v, x)                                                          \
    {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x},   \
    {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#define TWVLS (2 * VL)

#define VLEAVE() /* nothing */

#include "simd-common.h"