batch-feature-extraction-tool: Lib/fftw-3.2.1/simd/simd-mips_ps.h @ 1:e86e9c111b29
Updates that potentially fix the memory leak and also make it work on Windows and Linux (needs testing). Still have to fix the FFTW include for Linux in the Jucer.
author    David Ronan <d.m.ronan@qmul.ac.uk>
date      Thu, 09 Jul 2015 15:01:32 +0100
parents   25bf17994ef1
children  (none)
/*
 * Copyright (c) 2007 Codesourcery
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */

#ifndef _SIMD_SIMD_MIPS_PS_H
#define _SIMD_SIMD_MIPS_PS_H

#ifndef FFTW_SINGLE
#error "MIPS PS only works in single precision"
#endif

#define VL 1            /* SIMD complex vector length */
#define ALIGNMENT 8     /* alignment for LD/ST */
#define ALIGNMENTA 8    /* alignment for LDA/STA */
#define SIMD_VSTRIDE_OKA(x) 1
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OKA

/* Codelets allow stores to be performed in one of two ways:

   - Values can be written back as they are generated.  This
     potentially lowers register pressure by freeing the register
     holding the value for reuse.

   - Values can be written back in groups to consecutive addresses.
     This potentially lowers the cost of address generation. */
#define USE_MULTIPLE_STORES 0

#define RIGHT_CPU X(have_mips_ps)
extern int RIGHT_CPU(void);

#if defined(__mips__)
#include <mips_ps.h>

typedef psingle V;
#define LDK(x) x
#define DVK(var, val) \
     const V var = _mips64_lc(val)

#define VADD _mips64_add
#define VSUB _mips64_sub
#define VMUL _mips64_mul
#define VFMS _mips64_msub
#define VFMA _mips64_madd
#define VFNMS _mips64_nmsub
#define UNPCKL _mips64_pl
#define UNPCKH _mips64_pu
#define STOREL _mips64_sl
#define STOREH _mips64_su
#define SHUFFLE _mips64_shuffle
#define FLIP_RI SHUFFLE
#define VCONJ _mips64_chs_i
#define VFMACONJ(b,c) VADD(VCONJ(b),c)
#define VFMSCONJ(b,c) VSUB(VCONJ(b),c)
#define VFNMSCONJ(b,c) VSUB(c, VCONJ(b))

static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like;
     (void)ivs;
     return _mips64_ld((V*)x);
}
#define LD LDA

static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like;
     (void)ovs;
     _mips64_st(x, v);
}
#define ST STA

#if USE_MULTIPLE_STORES
# define STM2(x,v,ovs,a) /* nop */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
     (void)ovs;
     STA(x, v0, 0, 0);
     STA(x+2, v1, 0, 0);
}
#else /* !USE_MULTIPLE_STORES */
# define STM2 STA
# define STN2(x,v0,v1,ovs) /* nop */
#endif

#if USE_MULTIPLE_STORES
# define STM4(x,v,ovs,a) /* nop */
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
     R *x_ptr = x;
     STA(x_ptr, UNPCKL(v0,v1), 0, 0);
     x_ptr += ovs;
     STA(x_ptr, UNPCKL(v2,v3), 0, 0);
     x_ptr += ovs;
     STA(x_ptr, UNPCKH(v0,v1), 0, 0);
     x_ptr += ovs;
     STA(x_ptr, UNPCKH(v2,v3), 0, 0);
}
#else /* !USE_MULTIPLE_STORES */
static inline void STM4(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     STOREL(x, v);
     STOREH(x + ovs, v);
}
# define STN4(x, v0, v1, v2, v3, ovs) /* nop */
#endif

static inline V VBYI(V x)
{
     x = VCONJ(x);
     x = FLIP_RI(x);
     return x;
}

#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))

static inline V VZMUL(V tx, V sr)
{
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFMA(ti, sr, tr);
}

static inline V VZMULJ(V tx, V sr)
{
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     tr = VMUL(sr, tr);
     sr = VBYI(sr);
     return VFNMS(ti, sr, tr);
}

/* twiddle storage #1: compact, slower */
#define VTW1(v,x) {TW_CEXP, v, x}
#define TWVL1 1

static inline V BYTW1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMUL(tx, sr);
}

static inline V BYTWJ1(const R *t, V sr)
{
     V tx = LD(t, 1, t);
     return VZMULJ(tx, sr);
}

/* twiddle storage #2: twice the space, faster (when in cache) */
#define VTW2(v,x) \
     {TW_COS, v, x}, {TW_COS, v, x}, {TW_SIN, v, -x}, {TW_SIN, v, x}
#define TWVL2 2

static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFMA(tr, sr, VMUL(ti, si));
}

static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VFNMS(ti, si, VMUL(tr, sr));
}

static inline V VZMULI(V tx, V sr)
{
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMS(tr, sr, ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VFMA(tr, sr, ti);
}

/* twiddle storage #3 */
#define VTW3(v,x) VTW1(v,x)
#define TWVL3 TWVL1

/* twiddle storage for split arrays */
#define VTWS(v,x) \
     {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#define TWVLS (2 * VL)

#endif /* defined(__mips__) */

#endif /* _SIMD_SIMD_MIPS_PS_H */
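
Editor's note on the single-store path: with USE_MULTIPLE_STORES disabled, STM4 writes each value back as it is produced, scattering the two halves of a paired-single vector to addresses ovs floats apart. The sketch below is a minimal scalar model of that path, not the real _mips64_* intrinsics: the file name, the storel/storeh helpers, and the low-element-first layout are assumptions made for illustration.

/* stm4_model.c -- hypothetical scalar model of the single-store STM4 path */
#include <stdio.h>

typedef struct { float lo, hi; } V;              /* one paired-single vector (assumed layout) */

static void storel(float *x, V v) { *x = v.lo; } /* assumed meaning of STOREL */
static void storeh(float *x, V v) { *x = v.hi; } /* assumed meaning of STOREH */

/* Mirrors the !USE_MULTIPLE_STORES STM4: the halves land ovs floats apart */
static void stm4(float *x, V v, int ovs)
{
     storel(x, v);
     storeh(x + ovs, v);
}

int main(void)
{
     float buf[8] = { 0 };
     int ovs = 4;
     /* four vectors written back one at a time, as they might be generated */
     V v[4] = { {1, 5}, {2, 6}, {3, 7}, {4, 8} };
     for (int i = 0; i < 4; i++)
          stm4(buf + i, v[i], ovs);
     /* low halves land in buf[0..3], high halves in buf[4..7] */
     for (int i = 0; i < 8; i++)
          printf("%g ", buf[i]);          /* prints: 1 2 3 4 5 6 7 8 */
     printf("\n");
     return 0;
}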
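VBYI composes VCONJ (negate the imaginary element) with FLIP_RI (swap the elements), turning (re, im) into (-im, re), i.e. multiplication by i; VZMUL then builds a full complex multiply from one elementwise multiply plus one fused multiply-add. The following is a minimal scalar check of that algebra. It assumes a V holds one complex value with the real part in the low element (consistent with VL = 1 above); the struct layout and helper names are hypothetical stand-ins for the mips_ps.h intrinsics.

/* vzmul_model.c -- hypothetical scalar model of VBYI/VZMUL */
#include <stdio.h>

typedef struct { float lo, hi; } V;   /* lo = real, hi = imaginary (assumed) */

static V vadd(V a, V b) { return (V){ a.lo + b.lo, a.hi + b.hi }; }
static V vmul(V a, V b) { return (V){ a.lo * b.lo, a.hi * b.hi }; }
static V vfma(V a, V b, V c) { return vadd(vmul(a, b), c); } /* VFMA(a,b,c) = a*b + c */
static V unpckl(V a, V b) { return (V){ a.lo, b.lo }; }      /* pair the low elements */
static V unpckh(V a, V b) { return (V){ a.hi, b.hi }; }      /* pair the high elements */
static V vconj(V a)   { return (V){ a.lo, -a.hi }; }         /* negate imaginary part */
static V flip_ri(V a) { return (V){ a.hi, a.lo }; }          /* swap real/imaginary */

/* VBYI(x) = FLIP_RI(VCONJ(x)) = (-im, re), i.e. i*x */
static V vbyi(V x) { return flip_ri(vconj(x)); }

/* Mirrors VZMUL above: the complex product tx * sr */
static V vzmul(V tx, V sr)
{
     V tr = unpckl(tx, tx);    /* (tx.re, tx.re) */
     V ti = unpckh(tx, tx);    /* (tx.im, tx.im) */
     tr = vmul(sr, tr);        /* (sr.re*tx.re, sr.im*tx.re) */
     sr = vbyi(sr);            /* (-sr.im, sr.re) */
     return vfma(ti, sr, tr);  /* adds (-tx.im*sr.im, tx.im*sr.re) */
}

int main(void)
{
     V tx = { 1.0f, 2.0f };    /* 1 + 2i */
     V sr = { 3.0f, 4.0f };    /* 3 + 4i */
     V p = vzmul(tx, sr);      /* expect (1+2i)(3+4i) = -5 + 10i */
     printf("%g + %gi\n", p.lo, p.hi);
     return 0;
}

VZMULJ is the same construction with VFNMS in place of VFMA, giving the conjugate product conj(tx) * sr.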
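Twiddle storage #2 trades space for speed (TWVL2 = 2, twice TWVL1): VTW2 stores each twiddle as two full vectors, (cos, cos) and (-sin, sin), so BYTW2 needs only a FLIP_RI and a fused multiply-add instead of the unpacking done in VZMUL. Below is a scalar arithmetic check of that sign pattern, again with an assumed element order and a hypothetical file name rather than the real API.

/* bytw2_check.c -- scalar check of the VTW2 sign pattern used by BYTW2 */
#include <stdio.h>
#include <math.h>

int main(void)
{
     double x = 0.3;                  /* twiddle angle */
     double re = 1.5, im = -0.7;      /* input sr = re + i*im */

     /* VTW2(v,x) stores {cos x, cos x, -sin x, sin x}: tr = twp[0], ti = twp[1] */
     double tr_lo = cos(x), tr_hi = cos(x);
     double ti_lo = -sin(x), ti_hi = sin(x);

     /* BYTW2: si = FLIP_RI(sr); result = VFMA(tr, sr, VMUL(ti, si)) */
     double si_lo = im, si_hi = re;
     double out_re = tr_lo * re + ti_lo * si_lo;   /* cos*re - sin*im */
     double out_im = tr_hi * im + ti_hi * si_hi;   /* cos*im + sin*re */

     /* reference: (cos x + i sin x) * (re + i*im) */
     printf("BYTW2:     %+f %+fi\n", out_re, out_im);
     printf("reference: %+f %+fi\n",
            cos(x)*re - sin(x)*im, cos(x)*im + sin(x)*re);
     return 0;
}

BYTWJ2 flips the sign of the ti*si term via VFNMS, multiplying by the conjugate twiddle instead.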