chris@163
|
1 /**
|
chris@163
|
2 * Copyright (c) 2014, 2015, Enzien Audio Ltd.
|
chris@163
|
3 *
|
chris@163
|
4 * Permission to use, copy, modify, and/or distribute this software for any
|
chris@163
|
5 * purpose with or without fee is hereby granted, provided that the above
|
chris@163
|
6 * copyright notice and this permission notice appear in all copies.
|
chris@163
|
7 *
|
chris@163
|
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
|
chris@163
|
9 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
|
chris@163
|
10 * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
|
chris@163
|
11 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
|
chris@163
|
12 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
|
chris@163
|
13 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
chris@163
|
14 * PERFORMANCE OF THIS SOFTWARE.
|
chris@163
|
15 */
|
chris@163
|
16
|
chris@163
|
17 #ifndef _HEAVY_MATH_H_
|
chris@163
|
18 #define _HEAVY_MATH_H_
|
chris@163
|
19
|
chris@163
|
20 #include "Utils.h"
|
chris@163
|
21
|
chris@163
|
22 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/
|
chris@163
|
23 // https://gcc.gnu.org/onlinedocs/gcc-4.8.1/gcc/ARM-NEON-Intrinsics.html
|
chris@163
|
24 // http://codesuppository.blogspot.co.uk/2015/02/sse2neonh-porting-guide-and-header-file.html
|
chris@163
|
25
|
chris@163
|
26 static inline void __hv_zero_f(hv_bOutf_t bOut) {
|
chris@163
|
27 #if HV_SIMD_AVX
|
chris@163
|
28 *bOut = _mm256_setzero_ps();
|
chris@163
|
29 #elif HV_SIMD_SSE
|
chris@163
|
30 *bOut = _mm_setzero_ps();
|
chris@163
|
31 #elif HV_SIMD_NEON
|
chris@163
|
32 *bOut = vdupq_n_f32(0.0f);
|
chris@163
|
33 #else // HV_SIMD_NONE
|
chris@163
|
34 *bOut = 0.0f;
|
chris@163
|
35 #endif
|
chris@163
|
36 }
|
chris@163
|
37
|
chris@163
|
38 static inline void __hv_load_f(float *bIn, hv_bOutf_t bOut) {
|
chris@163
|
39 #if HV_SIMD_AVX
|
chris@163
|
40 *bOut = _mm256_load_ps(bIn);
|
chris@163
|
41 #elif HV_SIMD_SSE
|
chris@163
|
42 *bOut = _mm_load_ps(bIn);
|
chris@163
|
43 #elif HV_SIMD_NEON
|
chris@163
|
44 *bOut = vld1q_f32(bIn);
|
chris@163
|
45 #else // HV_SIMD_NONE
|
chris@163
|
46 *bOut = *bIn;
|
chris@163
|
47 #endif
|
chris@163
|
48 }
|
chris@163
|
49
|
chris@163
|
50 static inline void __hv_store_f(float *bOut, hv_bInf_t bIn) {
|
chris@163
|
51 #if HV_SIMD_AVX
|
chris@163
|
52 _mm256_store_ps(bOut, bIn);
|
chris@163
|
53 #elif HV_SIMD_SSE
|
chris@163
|
54 _mm_store_ps(bOut, bIn);
|
chris@163
|
55 #elif HV_SIMD_NEON
|
chris@163
|
56 vst1q_f32(bOut, bIn);
|
chris@163
|
57 #else // HV_SIMD_NONE
|
chris@163
|
58 *bOut = bIn;
|
chris@163
|
59 #endif
|
chris@163
|
60 }
|
chris@163
|
61
|
chris@163
|
62 static inline void __hv_log_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
63 #if HV_SIMD_AVX
|
chris@163
|
64 #warning __hv_log_f() not implemented
|
chris@163
|
65 #elif HV_SIMD_SSE
|
chris@163
|
66 #warning __hv_log_f() not implemented
|
chris@163
|
67 #elif HV_SIMD_NEON
|
chris@163
|
68 #warning __hv_log_f() not implemented
|
chris@163
|
69 #else // HV_SIMD_NONE
|
chris@163
|
70 *bOut = (bIn > 0.0f) ? hv_log_f(bIn) : 0.0f;
|
chris@163
|
71 #endif
|
chris@163
|
72 }
|
chris@163
|
73
|
chris@163
|
74 static inline void __hv_log10_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
75 #if HV_SIMD_AVX
|
chris@163
|
76 #warning __hv_log10_f() not implemented
|
chris@163
|
77 #elif HV_SIMD_SSE
|
chris@163
|
78 #warning __hv_log10_f() not implemented
|
chris@163
|
79 #elif HV_SIMD_NEON
|
chris@163
|
80 #warning __hv_log10_f() not implemented
|
chris@163
|
81 #else // HV_SIMD_NONE
|
chris@163
|
82 *bOut = (bIn > 0.0f) ? hv_log10_f(bIn) : 0.0f;
|
chris@163
|
83 #endif
|
chris@163
|
84 }
|
chris@163
|
85
|
chris@163
|
86 static inline void __hv_log2_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
87 #if HV_SIMD_AVX
|
chris@163
|
88 #warning __hv_log2_f() not implemented
|
chris@163
|
89 #elif HV_SIMD_SSE
|
chris@163
|
90 #warning __hv_log2_f() not implemented
|
chris@163
|
91 #elif HV_SIMD_NEON
|
chris@163
|
92 #warning __hv_log2_f() not implemented
|
chris@163
|
93 #else // HV_SIMD_NONE
|
chris@163
|
94 *bOut = (bIn > 0.0f) ? hv_log2_f(bIn) : 0.0f;
|
chris@163
|
95 #endif
|
chris@163
|
96 }
|
chris@163
|
97
|
chris@163
|
98 // NOTE(mhroth): this is a pretty ghetto implementation
|
chris@163
|
99 static inline void __hv_cos_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
100 #if HV_SIMD_AVX
|
chris@163
|
101 *bOut = _mm256_set_ps(
|
chris@163
|
102 hv_cos_f(bIn[7]), hv_cos_f(bIn[6]), hv_cos_f(bIn[5]), hv_cos_f(bIn[4]),
|
chris@163
|
103 hv_cos_f(bIn[3]), hv_cos_f(bIn[2]), hv_cos_f(bIn[1]), hv_cos_f(bIn[0]));
|
chris@163
|
104 #elif HV_SIMD_SSE
|
chris@163
|
105 *bOut = _mm_set_ps(hv_cos_f(bIn[3]), hv_cos_f(bIn[2]), hv_cos_f(bIn[1]), hv_cos_f(bIn[0]));
|
chris@163
|
106 #elif HV_SIMD_NEON
|
chris@163
|
107 *bOut = (float32x4_t) {hv_cos_f(bIn[0]), hv_cos_f(bIn[1]), hv_cos_f(bIn[2]), hv_cos_f(bIn[3])};
|
chris@163
|
108 #else // HV_SIMD_NONE
|
chris@163
|
109 *bOut = hv_cos_f(bIn);
|
chris@163
|
110 #endif
|
chris@163
|
111 }
|
chris@163
|
112
|
chris@163
|
113 static inline void __hv_acos_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
114 #if HV_SIMD_AVX
|
chris@163
|
115 #warning __hv_acos_f() not implemented
|
chris@163
|
116 #elif HV_SIMD_SSE
|
chris@163
|
117 #warning __hv_acos_f() not implemented
|
chris@163
|
118 #elif HV_SIMD_NEON
|
chris@163
|
119 #warning __hv_acos_f() not implemented
|
chris@163
|
120 #else // HV_SIMD_NONE
|
chris@163
|
121 *bOut = hv_acos_f(bIn);
|
chris@163
|
122 #endif
|
chris@163
|
123 }
|
chris@163
|
124
|
chris@163
|
125 static inline void __hv_cosh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
126 #if HV_SIMD_AVX
|
chris@163
|
127 #warning __hv_cosh_f() not implemented
|
chris@163
|
128 #elif HV_SIMD_SSE
|
chris@163
|
129 #warning __hv_cosh_f() not implemented
|
chris@163
|
130 #elif HV_SIMD_NEON
|
chris@163
|
131 #warning __hv_cosh_f() not implemented
|
chris@163
|
132 #else // HV_SIMD_NONE
|
chris@163
|
133 *bOut = hv_cosh_f(bIn);
|
chris@163
|
134 #endif
|
chris@163
|
135 }
|
chris@163
|
136
|
chris@163
|
137 static inline void __hv_acosh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
138 #if HV_SIMD_AVX
|
chris@163
|
139 #warning __hv_acosh_f() not implemented
|
chris@163
|
140 #elif HV_SIMD_SSE
|
chris@163
|
141 #warning __hv_acosh_f() not implemented
|
chris@163
|
142 #elif HV_SIMD_NEON
|
chris@163
|
143 #warning __hv_acosh_f() not implemented
|
chris@163
|
144 #else // HV_SIMD_NONE
|
chris@163
|
145 *bOut = hv_acosh_f(bIn);
|
chris@163
|
146 #endif
|
chris@163
|
147 }
|
chris@163
|
148
|
chris@163
|
149 static inline void __hv_sin_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
150 #if HV_SIMD_AVX
|
chris@163
|
151 #warning __hv_sin_f() not implemented
|
chris@163
|
152 #elif HV_SIMD_SSE
|
chris@163
|
153 #warning __hv_sin_f() not implemented
|
chris@163
|
154 #elif HV_SIMD_NEON
|
chris@163
|
155 #warning __hv_sin_f() not implemented
|
chris@163
|
156 #else // HV_SIMD_NONE
|
chris@163
|
157 *bOut = hv_sin_f(bIn);
|
chris@163
|
158 #endif
|
chris@163
|
159 }
|
chris@163
|
160
|
chris@163
|
161 static inline void __hv_asin_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
162 #if HV_SIMD_AVX
|
chris@163
|
163 #warning __hv_asin_f() not implemented
|
chris@163
|
164 #elif HV_SIMD_SSE
|
chris@163
|
165 #warning __hv_asin_f() not implemented
|
chris@163
|
166 #elif HV_SIMD_NEON
|
chris@163
|
167 #warning __hv_asin_f() not implemented
|
chris@163
|
168 #else // HV_SIMD_NONE
|
chris@163
|
169 *bOut = hv_asin_f(bIn);
|
chris@163
|
170 #endif
|
chris@163
|
171 }
|
chris@163
|
172
|
chris@163
|
173 static inline void __hv_sinh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
174 #if HV_SIMD_AVX
|
chris@163
|
175 #warning __hv_sinh_f() not implemented
|
chris@163
|
176 #elif HV_SIMD_SSE
|
chris@163
|
177 #warning __hv_sinh_f() not implemented
|
chris@163
|
178 #elif HV_SIMD_NEON
|
chris@163
|
179 #warning __hv_sinh_f() not implemented
|
chris@163
|
180 #else // HV_SIMD_NONE
|
chris@163
|
181 *bOut = hv_sinh_f(bIn);
|
chris@163
|
182 #endif
|
chris@163
|
183 }
|
chris@163
|
184
|
chris@163
|
185 static inline void __hv_asinh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
186 #if HV_SIMD_AVX
|
chris@163
|
187 #warning __hv_asinh_f() not implemented
|
chris@163
|
188 #elif HV_SIMD_SSE
|
chris@163
|
189 #warning __hv_asinh_f() not implemented
|
chris@163
|
190 #elif HV_SIMD_NEON
|
chris@163
|
191 #warning __hv_asinh_f() not implemented
|
chris@163
|
192 #else // HV_SIMD_NONE
|
chris@163
|
193 *bOut = hv_asinh_f(bIn);
|
chris@163
|
194 #endif
|
chris@163
|
195 }
|
chris@163
|
196
|
chris@163
|
197 static inline void __hv_tan_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
198 #if HV_SIMD_AVX
|
chris@163
|
199 #warning __hv_tan_f() not implemented
|
chris@163
|
200 #elif HV_SIMD_SSE
|
chris@163
|
201 #warning __hv_tan_f() not implemented
|
chris@163
|
202 #elif HV_SIMD_NEON
|
chris@163
|
203 #warning __hv_tan_f() not implemented
|
chris@163
|
204 #else // HV_SIMD_NONE
|
chris@163
|
205 *bOut = hv_tan_f(bIn);
|
chris@163
|
206 #endif
|
chris@163
|
207 }
|
chris@163
|
208
|
chris@163
|
209 static inline void __hv_atan_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
210 #if HV_SIMD_AVX
|
chris@163
|
211 #warning __hv_atan_f() not implemented
|
chris@163
|
212 #elif HV_SIMD_SSE
|
chris@163
|
213 #warning __hv_atan_f() not implemented
|
chris@163
|
214 #elif HV_SIMD_NEON
|
chris@163
|
215 #warning __hv_atan_f() not implemented
|
chris@163
|
216 #else // HV_SIMD_NONE
|
chris@163
|
217 *bOut = hv_atan_f(bIn);
|
chris@163
|
218 #endif
|
chris@163
|
219 }
|
chris@163
|
220
|
chris@163
|
221 static inline void __hv_atan2_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
222 #if HV_SIMD_AVX
|
chris@163
|
223 #warning __hv_atan2_f() not implemented
|
chris@163
|
224 #elif HV_SIMD_SSE
|
chris@163
|
225 #warning __hv_atan2_f() not implemented
|
chris@163
|
226 #elif HV_SIMD_NEON
|
chris@163
|
227 #warning __hv_atan2_f() not implemented
|
chris@163
|
228 #else // HV_SIMD_NONE
|
chris@163
|
229 *bOut = hv_atan2_f(bIn0, bIn1);
|
chris@163
|
230 #endif
|
chris@163
|
231 }
|
chris@163
|
232
|
chris@163
|
233 static inline void __hv_tanh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
234 #if HV_SIMD_AVX
|
chris@163
|
235 #warning __hv_tanh_f() not implemented
|
chris@163
|
236 #elif HV_SIMD_SSE
|
chris@163
|
237 #warning __hv_tanh_f() not implemented
|
chris@163
|
238 #elif HV_SIMD_NEON
|
chris@163
|
239 #warning __hv_tanh_f() not implemented
|
chris@163
|
240 #else // HV_SIMD_NONE
|
chris@163
|
241 *bOut = hv_tanh_f(bIn);
|
chris@163
|
242 #endif
|
chris@163
|
243 }
|
chris@163
|
244
|
chris@163
|
245 static inline void __hv_atanh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
246 #if HV_SIMD_AVX
|
chris@163
|
247 #warning __hv_atanh_f() not implemented
|
chris@163
|
248 #elif HV_SIMD_SSE
|
chris@163
|
249 #warning __hv_atanh_f() not implemented
|
chris@163
|
250 #elif HV_SIMD_NEON
|
chris@163
|
251 #warning __hv_atanh_f() not implemented
|
chris@163
|
252 #else // HV_SIMD_NONE
|
chris@163
|
253 *bOut = hv_atanh_f(bIn);
|
chris@163
|
254 #endif
|
chris@163
|
255 }
|
chris@163
|
256
|
chris@163
|
257 // NOTE(mhroth): use of sqrt is absolute and total MURDER. Make do with recipocal sqrt if possible!!
|
chris@163
|
258 static inline void __hv_sqrt_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
259 #if HV_SIMD_AVX
|
chris@163
|
260 *bOut = _mm256_sqrt_ps(bIn);
|
chris@163
|
261 #elif HV_SIMD_SSE
|
chris@163
|
262 *bOut = _mm_sqrt_ps(bIn);
|
chris@163
|
263 #elif HV_SIMD_NEON
|
chris@163
|
264 #warning __hv_sqrt_f() numerical results may be inexact
|
chris@163
|
265 *bOut = vrecpeq_f32(vrsqrteq_f32(bIn));
|
chris@163
|
266 #else // HV_SIMD_NONE
|
chris@163
|
267 *bOut = hv_sqrt_f(bIn);
|
chris@163
|
268 #endif
|
chris@163
|
269 }
|
chris@163
|
270
|
chris@163
|
271 static inline void __hv_rsqrt_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
272 #if HV_SIMD_AVX
|
chris@163
|
273 *bOut = _mm256_rsqrt_ps(bIn);
|
chris@163
|
274 #elif HV_SIMD_SSE
|
chris@163
|
275 *bOut = _mm_rsqrt_ps(bIn);
|
chris@163
|
276 #elif HV_SIMD_NEON
|
chris@163
|
277 #warning __hv_rsqrt_f() numerical results may be inexact
|
chris@163
|
278 *bOut = vrsqrteq_f32(bIn);
|
chris@163
|
279 #else // HV_SIMD_NONE
|
chris@163
|
280 *bOut = 1.0f/hv_sqrt_f(bIn);
|
chris@163
|
281 #endif
|
chris@163
|
282 }
|
chris@163
|
283
|
chris@163
|
284 static inline void __hv_abs_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
285 #if HV_SIMD_AVX
|
chris@163
|
286 *bOut = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), bIn);
|
chris@163
|
287 #elif HV_SIMD_SSE
|
chris@163
|
288 *bOut = _mm_andnot_ps(_mm_set1_ps(-0.0f), bIn); // == 1 << 31
|
chris@163
|
289 #elif HV_SIMD_NEON
|
chris@163
|
290 *bOut = vabsq_f32(bIn);
|
chris@163
|
291 #else // HV_SIMD_NONE
|
chris@163
|
292 *bOut = hv_abs_f(bIn);
|
chris@163
|
293 #endif
|
chris@163
|
294 }
|
chris@163
|
295
|
chris@163
|
296 static inline void __hv_exp_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
297 #if HV_SIMD_AVX
|
chris@163
|
298 #warning __hv_exp_f() not implemented
|
chris@163
|
299 #elif HV_SIMD_SSE
|
chris@163
|
300 #warning __hv_exp_f() not implemented
|
chris@163
|
301 #elif HV_SIMD_NEON
|
chris@163
|
302 #warning __hv_exp_f() not implemented
|
chris@163
|
303 #else // HV_SIMD_NONE
|
chris@163
|
304 *bOut = hv_exp_f(bIn);
|
chris@163
|
305 #endif
|
chris@163
|
306 }
|
chris@163
|
307
|
chris@163
|
308 static inline void __hv_ceil_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
309 #if HV_SIMD_AVX
|
chris@163
|
310 *bOut = _mm256_ceil_ps(bIn);
|
chris@163
|
311 #elif HV_SIMD_SSE
|
chris@163
|
312 *bOut = _mm_ceil_ps(bIn);
|
chris@163
|
313 #elif HV_SIMD_NEON
|
chris@163
|
314 #if __ARM_ARCH >= 8
|
chris@163
|
315 *bOut = vrndpq_f32(bIn);
|
chris@163
|
316 #else
|
chris@163
|
317 #warning A slow NEON implementation of __hv_ceil_f() is being used because the necessary intrinsic cannot be found. It is only available in ARMv8.
|
chris@163
|
318 *bOut = (float32x4_t) {hv_ceil_f(bIn[0]), hv_ceil_f(bIn[1]), hv_ceil_f(bIn[2]), hv_ceil_f(bIn[3])};
|
chris@163
|
319 #endif // vrndpq_f32
|
chris@163
|
320 #else // HV_SIMD_NONE
|
chris@163
|
321 *bOut = hv_ceil_f(bIn);
|
chris@163
|
322 #endif
|
chris@163
|
323 }
|
chris@163
|
324
|
chris@163
|
325 static inline void __hv_floor_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
326 #if HV_SIMD_AVX
|
chris@163
|
327 *bOut = _mm256_floor_ps(bIn);
|
chris@163
|
328 #elif HV_SIMD_SSE
|
chris@163
|
329 *bOut = _mm_floor_ps(bIn);
|
chris@163
|
330 #elif HV_SIMD_NEON
|
chris@163
|
331 #if __ARM_ARCH >= 8
|
chris@163
|
332 *bOut = vrndmq_f32(bIn);
|
chris@163
|
333 #else
|
chris@163
|
334 #warning A slow NEON implementation of __hv_floor_f() is being used because the necessary intrinsic cannot be found. It is only available in ARMv8.
|
chris@163
|
335 *bOut = (float32x4_t) {hv_floor_f(bIn[0]), hv_floor_f(bIn[1]), hv_floor_f(bIn[2]), hv_floor_f(bIn[3])};
|
chris@163
|
336 #endif // vrndmq_f32
|
chris@163
|
337 #else // HV_SIMD_NONE
|
chris@163
|
338 *bOut = hv_floor_f(bIn);
|
chris@163
|
339 #endif
|
chris@163
|
340 }
|
chris@163
|
341
|
chris@163
|
342 // __add~f
|
chris@163
|
343 static inline void __hv_add_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
344 #if HV_SIMD_AVX
|
chris@163
|
345 *bOut = _mm256_add_ps(bIn0, bIn1);
|
chris@163
|
346 #elif HV_SIMD_SSE
|
chris@163
|
347 *bOut = _mm_add_ps(bIn0, bIn1);
|
chris@163
|
348 #elif HV_SIMD_NEON
|
chris@163
|
349 *bOut = vaddq_f32(bIn0, bIn1);
|
chris@163
|
350 #else // HV_SIMD_NONE
|
chris@163
|
351 *bOut = bIn0 + bIn1;
|
chris@163
|
352 #endif
|
chris@163
|
353 }
|
chris@163
|
354
|
chris@163
|
355 // __add~i
|
chris@163
|
356 static inline void __hv_add_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
|
chris@163
|
357 #if HV_SIMD_AVX
|
chris@163
|
358 __m128i x = _mm_add_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
|
chris@163
|
359 __m128i y = _mm_add_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
|
chris@163
|
360 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
|
chris@163
|
361 #elif HV_SIMD_SSE
|
chris@163
|
362 *bOut = _mm_add_epi32(bIn0, bIn1);
|
chris@163
|
363 #elif HV_SIMD_NEON
|
chris@163
|
364 *bOut = vaddq_s32(bIn0, bIn1);
|
chris@163
|
365 #else // HV_SIMD_NONE
|
chris@163
|
366 *bOut = bIn0 + bIn1;
|
chris@163
|
367 #endif
|
chris@163
|
368 }
|
chris@163
|
369
|
chris@163
|
370 // __sub~f
|
chris@163
|
371 static inline void __hv_sub_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
372 #if HV_SIMD_AVX
|
chris@163
|
373 *bOut = _mm256_sub_ps(bIn0, bIn1);
|
chris@163
|
374 #elif HV_SIMD_SSE
|
chris@163
|
375 *bOut = _mm_sub_ps(bIn0, bIn1);
|
chris@163
|
376 #elif HV_SIMD_NEON
|
chris@163
|
377 *bOut = vsubq_f32(bIn0, bIn1);
|
chris@163
|
378 #else // HV_SIMD_NONE
|
chris@163
|
379 *bOut = bIn0 - bIn1;
|
chris@163
|
380 #endif
|
chris@163
|
381 }
|
chris@163
|
382
|
chris@163
|
383 // __mul~f
|
chris@163
|
384 static inline void __hv_mul_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
385 #if HV_SIMD_AVX
|
chris@163
|
386 *bOut = _mm256_mul_ps(bIn0, bIn1);
|
chris@163
|
387 #elif HV_SIMD_SSE
|
chris@163
|
388 *bOut = _mm_mul_ps(bIn0, bIn1);
|
chris@163
|
389 #elif HV_SIMD_NEON
|
chris@163
|
390 *bOut = vmulq_f32(bIn0, bIn1);
|
chris@163
|
391 #else // HV_SIMD_NONE
|
chris@163
|
392 *bOut = bIn0 * bIn1;
|
chris@163
|
393 #endif
|
chris@163
|
394 }
|
chris@163
|
395
|
chris@163
|
396 // __*~i
|
chris@163
|
397 static inline void __hv_mul_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
|
chris@163
|
398 #if HV_SIMD_AVX
|
chris@163
|
399 __m128i x = _mm_mullo_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
|
chris@163
|
400 __m128i y = _mm_mullo_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
|
chris@163
|
401 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
|
chris@163
|
402 #elif HV_SIMD_SSE
|
chris@163
|
403 *bOut = _mm_mullo_epi32(bIn0, bIn1);
|
chris@163
|
404 #elif HV_SIMD_NEON
|
chris@163
|
405 *bOut = vmulq_s32(bIn0, bIn1);
|
chris@163
|
406 #else // HV_SIMD_NONE
|
chris@163
|
407 *bOut = bIn0 * bIn1;
|
chris@163
|
408 #endif
|
chris@163
|
409 }
|
chris@163
|
410
|
chris@163
|
411 // __cast~if
|
chris@163
|
412 static inline void __hv_cast_if(hv_bIni_t bIn, hv_bOutf_t bOut) {
|
chris@163
|
413 #if HV_SIMD_AVX
|
chris@163
|
414 *bOut = _mm256_cvtepi32_ps(bIn);
|
chris@163
|
415 #elif HV_SIMD_SSE
|
chris@163
|
416 *bOut = _mm_cvtepi32_ps(bIn);
|
chris@163
|
417 #elif HV_SIMD_NEON
|
chris@163
|
418 *bOut = vcvtq_f32_s32(bIn);
|
chris@163
|
419 #else // HV_SIMD_NONE
|
chris@163
|
420 *bOut = (float) bIn;
|
chris@163
|
421 #endif
|
chris@163
|
422 }
|
chris@163
|
423
|
chris@163
|
424 // __cast~fi
|
chris@163
|
425 static inline void __hv_cast_fi(hv_bInf_t bIn, hv_bOuti_t bOut) {
|
chris@163
|
426 #if HV_SIMD_AVX
|
chris@163
|
427 *bOut = _mm256_cvtps_epi32(bIn);
|
chris@163
|
428 #elif HV_SIMD_SSE
|
chris@163
|
429 *bOut = _mm_cvtps_epi32(bIn);
|
chris@163
|
430 #elif HV_SIMD_NEON
|
chris@163
|
431 *bOut = vcvtq_s32_f32(bIn);
|
chris@163
|
432 #else // HV_SIMD_NONE
|
chris@163
|
433 *bOut = (int) bIn;
|
chris@163
|
434 #endif
|
chris@163
|
435 }
|
chris@163
|
436
|
chris@163
|
437 static inline void __hv_div_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
438 #if HV_SIMD_AVX
|
chris@163
|
439 *bOut = _mm256_div_ps(bIn0, bIn1);
|
chris@163
|
440 #elif HV_SIMD_SSE
|
chris@163
|
441 *bOut = _mm_div_ps(bIn0, bIn1);
|
chris@163
|
442 #elif HV_SIMD_NEON
|
chris@163
|
443 #warning __hv_div_f() numerical results may be inexact
|
chris@163
|
444 *bOut = vmulq_f32(bIn0, vrecpeq_f32(bIn1));
|
chris@163
|
445 #else // HV_SIMD_NONE
|
chris@163
|
446 *bOut = (bIn1 != 0.0f) ? (bIn0 / bIn1) : 0.0f;
|
chris@163
|
447 #endif
|
chris@163
|
448 }
|
chris@163
|
449
|
chris@163
|
450 static inline void __hv_min_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
451 #if HV_SIMD_AVX
|
chris@163
|
452 *bOut = _mm256_min_ps(bIn0, bIn1);
|
chris@163
|
453 #elif HV_SIMD_SSE
|
chris@163
|
454 *bOut = _mm_min_ps(bIn0, bIn1);
|
chris@163
|
455 #elif HV_SIMD_NEON
|
chris@163
|
456 *bOut = vminq_f32(bIn0, bIn1);
|
chris@163
|
457 #else // HV_SIMD_NONE
|
chris@163
|
458 *bOut = hv_min_f(bIn0, bIn1);
|
chris@163
|
459 #endif
|
chris@163
|
460 }
|
chris@163
|
461
|
chris@163
|
462 static inline void __hv_min_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
|
chris@163
|
463 #if HV_SIMD_AVX
|
chris@163
|
464 __m128i x = _mm_min_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
|
chris@163
|
465 __m128i y = _mm_min_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
|
chris@163
|
466 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
|
chris@163
|
467 #elif HV_SIMD_SSE
|
chris@163
|
468 *bOut = _mm_min_epi32(bIn0, bIn1);
|
chris@163
|
469 #elif HV_SIMD_NEON
|
chris@163
|
470 *bOut = vminq_s32(bIn0, bIn1);
|
chris@163
|
471 #else // HV_SIMD_NONE
|
chris@163
|
472 *bOut = hv_min_i(bIn0, bIn1);
|
chris@163
|
473 #endif
|
chris@163
|
474 }
|
chris@163
|
475
|
chris@163
|
476 static inline void __hv_max_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
477 #if HV_SIMD_AVX
|
chris@163
|
478 *bOut = _mm256_max_ps(bIn0, bIn1);
|
chris@163
|
479 #elif HV_SIMD_SSE
|
chris@163
|
480 *bOut = _mm_max_ps(bIn0, bIn1);
|
chris@163
|
481 #elif HV_SIMD_NEON
|
chris@163
|
482 *bOut = vmaxq_f32(bIn0, bIn1);
|
chris@163
|
483 #else // HV_SIMD_NONE
|
chris@163
|
484 *bOut = hv_max_f(bIn0, bIn1);
|
chris@163
|
485 #endif
|
chris@163
|
486 }
|
chris@163
|
487
|
chris@163
|
488 static inline void __hv_max_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
|
chris@163
|
489 #if HV_SIMD_AVX
|
chris@163
|
490 __m128i x = _mm_max_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
|
chris@163
|
491 __m128i y = _mm_max_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
|
chris@163
|
492 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
|
chris@163
|
493 #elif HV_SIMD_SSE
|
chris@163
|
494 *bOut = _mm_max_epi32(bIn0, bIn1);
|
chris@163
|
495 #elif HV_SIMD_NEON
|
chris@163
|
496 *bOut = vmaxq_s32(bIn0, bIn1);
|
chris@163
|
497 #else // HV_SIMD_NONE
|
chris@163
|
498 *bOut = hv_max_i(bIn0, bIn1);
|
chris@163
|
499 #endif
|
chris@163
|
500 }
|
chris@163
|
501
|
chris@163
|
502 static inline void __hv_pow_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
503 #if HV_SIMD_AVX
|
chris@163
|
504 *bOut = _mm256_set_ps(
|
chris@163
|
505 hv_pow_f(bIn0[7], bIn1[7]),
|
chris@163
|
506 hv_pow_f(bIn0[6], bIn1[6]),
|
chris@163
|
507 hv_pow_f(bIn0[5], bIn1[5]),
|
chris@163
|
508 hv_pow_f(bIn0[4], bIn1[4]),
|
chris@163
|
509 hv_pow_f(bIn0[3], bIn1[3]),
|
chris@163
|
510 hv_pow_f(bIn0[2], bIn1[2]),
|
chris@163
|
511 hv_pow_f(bIn0[1], bIn1[1]),
|
chris@163
|
512 hv_pow_f(bIn0[0], bIn1[0]));
|
chris@163
|
513 #elif HV_SIMD_SSE
|
chris@163
|
514 *bOut = _mm_set_ps(
|
chris@163
|
515 hv_pow_f(bIn0[3], bIn1[3]),
|
chris@163
|
516 hv_pow_f(bIn0[2], bIn1[2]),
|
chris@163
|
517 hv_pow_f(bIn0[1], bIn1[1]),
|
chris@163
|
518 hv_pow_f(bIn0[0], bIn1[0]));
|
chris@163
|
519 #elif HV_SIMD_NEON
|
chris@163
|
520 *bOut = (float32x4_t) {
|
chris@163
|
521 hv_pow_f(bIn0[0], bIn1[0]),
|
chris@163
|
522 hv_pow_f(bIn0[1], bIn1[1]),
|
chris@163
|
523 hv_pow_f(bIn0[2], bIn1[2]),
|
chris@163
|
524 hv_pow_f(bIn0[3], bIn1[3])};
|
chris@163
|
525 #else // HV_SIMD_NONE
|
chris@163
|
526 *bOut = hv_pow_f(bIn0, bIn1);
|
chris@163
|
527 #endif
|
chris@163
|
528 }
|
chris@163
|
529
|
chris@163
|
530 static inline void __hv_gt_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
531 #if HV_SIMD_AVX
|
chris@163
|
532 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_GT_OQ);
|
chris@163
|
533 #elif HV_SIMD_SSE
|
chris@163
|
534 *bOut = _mm_cmpgt_ps(bIn0, bIn1);
|
chris@163
|
535 #elif HV_SIMD_NEON
|
chris@163
|
536 *bOut = vreinterpretq_f32_u32(vcgtq_f32(bIn0, bIn1));
|
chris@163
|
537 #else // HV_SIMD_NONE
|
chris@163
|
538 *bOut = (bIn0 > bIn1) ? 1.0f : 0.0f;
|
chris@163
|
539 #endif
|
chris@163
|
540 }
|
chris@163
|
541
|
chris@163
|
542 static inline void __hv_gte_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
543 #if HV_SIMD_AVX
|
chris@163
|
544 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_GE_OQ);
|
chris@163
|
545 #elif HV_SIMD_SSE
|
chris@163
|
546 *bOut = _mm_cmpge_ps(bIn0, bIn1);
|
chris@163
|
547 #elif HV_SIMD_NEON
|
chris@163
|
548 *bOut = vreinterpretq_f32_u32(vcgeq_f32(bIn0, bIn1));
|
chris@163
|
549 #else // HV_SIMD_NONE
|
chris@163
|
550 *bOut = (bIn0 >= bIn1) ? 1.0f : 0.0f;
|
chris@163
|
551 #endif
|
chris@163
|
552 }
|
chris@163
|
553
|
chris@163
|
554 static inline void __hv_lt_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
555 #if HV_SIMD_AVX
|
chris@163
|
556 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_LT_OQ);
|
chris@163
|
557 #elif HV_SIMD_SSE
|
chris@163
|
558 *bOut = _mm_cmplt_ps(bIn0, bIn1);
|
chris@163
|
559 #elif HV_SIMD_NEON
|
chris@163
|
560 *bOut = vreinterpretq_f32_u32(vcltq_f32(bIn0, bIn1));
|
chris@163
|
561 #else // HV_SIMD_NONE
|
chris@163
|
562 *bOut = (bIn0 < bIn1) ? 1.0f : 0.0f;
|
chris@163
|
563 #endif
|
chris@163
|
564 }
|
chris@163
|
565
|
chris@163
|
566 static inline void __hv_lte_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
567 #if HV_SIMD_AVX
|
chris@163
|
568 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_LE_OQ);
|
chris@163
|
569 #elif HV_SIMD_SSE
|
chris@163
|
570 *bOut = _mm_cmple_ps(bIn0, bIn1);
|
chris@163
|
571 #elif HV_SIMD_NEON
|
chris@163
|
572 *bOut = vreinterpretq_f32_u32(vcleq_f32(bIn0, bIn1));
|
chris@163
|
573 #else // HV_SIMD_NONE
|
chris@163
|
574 *bOut = (bIn0 <= bIn1) ? 1.0f : 0.0f;
|
chris@163
|
575 #endif
|
chris@163
|
576 }
|
chris@163
|
577
|
chris@163
|
578 static inline void __hv_neq_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
579 #if HV_SIMD_AVX
|
chris@163
|
580 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_NEQ_OQ);
|
chris@163
|
581 #elif HV_SIMD_SSE
|
chris@163
|
582 *bOut = _mm_cmpneq_ps(bIn0, bIn1);
|
chris@163
|
583 #elif HV_SIMD_NEON
|
chris@163
|
584 *bOut = vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(bIn0, bIn1)));
|
chris@163
|
585 #else // HV_SIMD_NONE
|
chris@163
|
586 *bOut = (bIn0 != bIn1) ? 1.0f : 0.0f;
|
chris@163
|
587 #endif
|
chris@163
|
588 }
|
chris@163
|
589
|
chris@163
|
590 static inline void __hv_xor_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
591 #if HV_SIMD_AVX
|
chris@163
|
592 #warning __hv_xor_f() not implemented
|
chris@163
|
593 #elif HV_SIMD_SSE
|
chris@163
|
594 #warning __hv_xor_f() not implemented
|
chris@163
|
595 #elif HV_SIMD_NEON
|
chris@163
|
596 #warning __hv_xor_f() not implemented
|
chris@163
|
597 #else // HV_SIMD_NONE
|
chris@163
|
598 *bOut = (float) (((int) bIn0) ^ ((int) bIn1));
|
chris@163
|
599 #endif
|
chris@163
|
600 }
|
chris@163
|
601
|
chris@163
|
602 static inline void __hv_and_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
|
chris@163
|
603 #if HV_SIMD_AVX
|
chris@163
|
604 *bOut = _mm256_and_ps(bIn1, bIn0);
|
chris@163
|
605 #elif HV_SIMD_SSE
|
chris@163
|
606 *bOut = _mm_and_ps(bIn1, bIn0);
|
chris@163
|
607 #elif HV_SIMD_NEON
|
chris@163
|
608 *bOut = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(bIn1), vreinterpretq_u32_f32(bIn0)));
|
chris@163
|
609 #else // HV_SIMD_NONE
|
chris@163
|
610 if (bIn0 == 0.0f || bIn1 == 0.0f) *bOut = 0.0f;
|
chris@163
|
611 else if (bIn0 == 1.0f) *bOut = bIn1;
|
chris@163
|
612 else if (bIn1 == 1.0f) *bOut = bIn0;
|
chris@163
|
613 else hv_assert(0); // NOTE(mhroth): floating point & is pretty much a bad idea, only used for if~
|
chris@163
|
614 #endif
|
chris@163
|
615 }
|
chris@163
|
616
|
chris@163
|
617 // bOut = (bIn0 * bIn1) + bIn2
|
chris@163
|
618 static inline void __hv_fma_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bInf_t bIn2, hv_bOutf_t bOut) {
|
chris@163
|
619 #if HV_SIMD_AVX
|
chris@163
|
620 #if HV_SIMD_FMA
|
chris@163
|
621 *bOut = _mm256_fmadd_ps(bIn0, bIn1, bIn2);
|
chris@163
|
622 #else
|
chris@163
|
623 *bOut = _mm256_add_ps(_mm256_mul_ps(bIn0, bIn1), bIn2);
|
chris@163
|
624 #endif // HV_SIMD_FMA
|
chris@163
|
625 #elif HV_SIMD_SSE
|
chris@163
|
626 #if HV_SIMD_FMA
|
chris@163
|
627 *bOut = _mm_fmadd_ps(bIn0, bIn1, bIn2);
|
chris@163
|
628 #else
|
chris@163
|
629 *bOut = _mm_add_ps(_mm_mul_ps(bIn0, bIn1), bIn2);
|
chris@163
|
630 #endif // HV_SIMD_FMA
|
chris@163
|
631 #elif HV_SIMD_NEON
|
chris@163
|
632 #if __ARM_ARCH >= 8
|
chris@163
|
633 *bOut = vfmaq_f32(bIn2, bIn0, bIn1);
|
chris@163
|
634 #else
|
chris@163
|
635 // NOTE(mhroth): it turns out, fma SUUUUCKS on lesser ARM architectures
|
chris@163
|
636 // But in fact ideally fma would be disabled in ir2c for ARM architectures.
|
chris@163
|
637 // LLVM does a much better job handling fma than we do.
|
chris@163
|
638 *bOut = vaddq_f32(vmulq_f32(bIn0, bIn1), bIn2);
|
chris@163
|
639 #endif
|
chris@163
|
640 #else // HV_SIMD_NONE
|
chris@163
|
641 *bOut = hv_fma_f(bIn0, bIn1, bIn2);
|
chris@163
|
642 #endif
|
chris@163
|
643 }
|
chris@163
|
644
|
chris@163
|
645 #endif // _HEAVY_MATH_H_
|