comparison projects/heavy/envelopeTrigger/HeavyMath.h @ 162:c3e8226a5651 heavy-updated

- added additional flags to C rules (-DNDEBUG, -mfpu=neon) - sample-accurate envelope triggering pd/heavy example
author chnrx <chris.heinrichs@gmail.com>
date Thu, 12 Nov 2015 14:59:46 +0000
parents
children
comparison
equal deleted inserted replaced
161:07735c9d95c8 162:c3e8226a5651
1 /**
2 * Copyright (c) 2014, 2015, Enzien Audio Ltd.
3 *
4 * Permission to use, copy, modify, and/or distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
9 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10 * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
11 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
13 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14 * PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #ifndef _HEAVY_MATH_H_
18 #define _HEAVY_MATH_H_
19
20 #include "Utils.h"
21
22 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/
23 // https://gcc.gnu.org/onlinedocs/gcc-4.8.1/gcc/ARM-NEON-Intrinsics.html
24 // http://codesuppository.blogspot.co.uk/2015/02/sse2neonh-porting-guide-and-header-file.html
25
26 static inline void __hv_zero_f(hv_bOutf_t bOut) {
27 #if HV_SIMD_AVX
28 *bOut = _mm256_setzero_ps();
29 #elif HV_SIMD_SSE
30 *bOut = _mm_setzero_ps();
31 #elif HV_SIMD_NEON
32 *bOut = vdupq_n_f32(0.0f);
33 #else // HV_SIMD_NONE
34 *bOut = 0.0f;
35 #endif
36 }
37
38 static inline void __hv_load_f(float *bIn, hv_bOutf_t bOut) {
39 #if HV_SIMD_AVX
40 *bOut = _mm256_load_ps(bIn);
41 #elif HV_SIMD_SSE
42 *bOut = _mm_load_ps(bIn);
43 #elif HV_SIMD_NEON
44 *bOut = vld1q_f32(bIn);
45 #else // HV_SIMD_NONE
46 *bOut = *bIn;
47 #endif
48 }
49
50 static inline void __hv_store_f(float *bOut, hv_bInf_t bIn) {
51 #if HV_SIMD_AVX
52 _mm256_store_ps(bOut, bIn);
53 #elif HV_SIMD_SSE
54 _mm_store_ps(bOut, bIn);
55 #elif HV_SIMD_NEON
56 vst1q_f32(bOut, bIn);
57 #else // HV_SIMD_NONE
58 *bOut = bIn;
59 #endif
60 }
61
62 static inline void __hv_log_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
63 #if HV_SIMD_AVX
64 #warning __hv_log_f() not implemented
65 #elif HV_SIMD_SSE
66 #warning __hv_log_f() not implemented
67 #elif HV_SIMD_NEON
68 #warning __hv_log_f() not implemented
69 #else // HV_SIMD_NONE
70 *bOut = (bIn > 0.0f) ? hv_log_f(bIn) : 0.0f;
71 #endif
72 }
73
74 static inline void __hv_log10_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
75 #if HV_SIMD_AVX
76 #warning __hv_log10_f() not implemented
77 #elif HV_SIMD_SSE
78 #warning __hv_log10_f() not implemented
79 #elif HV_SIMD_NEON
80 #warning __hv_log10_f() not implemented
81 #else // HV_SIMD_NONE
82 *bOut = (bIn > 0.0f) ? hv_log10_f(bIn) : 0.0f;
83 #endif
84 }
85
86 static inline void __hv_log2_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
87 #if HV_SIMD_AVX
88 #warning __hv_log2_f() not implemented
89 #elif HV_SIMD_SSE
90 #warning __hv_log2_f() not implemented
91 #elif HV_SIMD_NEON
92 #warning __hv_log2_f() not implemented
93 #else // HV_SIMD_NONE
94 *bOut = (bIn > 0.0f) ? hv_log2_f(bIn) : 0.0f;
95 #endif
96 }
97
98 // NOTE(mhroth): this is a pretty ghetto implementation
99 static inline void __hv_cos_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
100 #if HV_SIMD_AVX
101 *bOut = _mm256_set_ps(
102 hv_cos_f(bIn[7]), hv_cos_f(bIn[6]), hv_cos_f(bIn[5]), hv_cos_f(bIn[4]),
103 hv_cos_f(bIn[3]), hv_cos_f(bIn[2]), hv_cos_f(bIn[1]), hv_cos_f(bIn[0]));
104 #elif HV_SIMD_SSE
105 *bOut = _mm_set_ps(hv_cos_f(bIn[3]), hv_cos_f(bIn[2]), hv_cos_f(bIn[1]), hv_cos_f(bIn[0]));
106 #elif HV_SIMD_NEON
107 *bOut = (float32x4_t) {hv_cos_f(bIn[0]), hv_cos_f(bIn[1]), hv_cos_f(bIn[2]), hv_cos_f(bIn[3])};
108 #else // HV_SIMD_NONE
109 *bOut = hv_cos_f(bIn);
110 #endif
111 }
112
113 static inline void __hv_acos_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
114 #if HV_SIMD_AVX
115 #warning __hv_acos_f() not implemented
116 #elif HV_SIMD_SSE
117 #warning __hv_acos_f() not implemented
118 #elif HV_SIMD_NEON
119 #warning __hv_acos_f() not implemented
120 #else // HV_SIMD_NONE
121 *bOut = hv_acos_f(bIn);
122 #endif
123 }
124
125 static inline void __hv_cosh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
126 #if HV_SIMD_AVX
127 #warning __hv_cosh_f() not implemented
128 #elif HV_SIMD_SSE
129 #warning __hv_cosh_f() not implemented
130 #elif HV_SIMD_NEON
131 #warning __hv_cosh_f() not implemented
132 #else // HV_SIMD_NONE
133 *bOut = hv_cosh_f(bIn);
134 #endif
135 }
136
137 static inline void __hv_acosh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
138 #if HV_SIMD_AVX
139 #warning __hv_acosh_f() not implemented
140 #elif HV_SIMD_SSE
141 #warning __hv_acosh_f() not implemented
142 #elif HV_SIMD_NEON
143 #warning __hv_acosh_f() not implemented
144 #else // HV_SIMD_NONE
145 *bOut = hv_acosh_f(bIn);
146 #endif
147 }
148
149 static inline void __hv_sin_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
150 #if HV_SIMD_AVX
151 #warning __hv_sin_f() not implemented
152 #elif HV_SIMD_SSE
153 #warning __hv_sin_f() not implemented
154 #elif HV_SIMD_NEON
155 #warning __hv_sin_f() not implemented
156 #else // HV_SIMD_NONE
157 *bOut = hv_sin_f(bIn);
158 #endif
159 }
160
161 static inline void __hv_asin_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
162 #if HV_SIMD_AVX
163 #warning __hv_asin_f() not implemented
164 #elif HV_SIMD_SSE
165 #warning __hv_asin_f() not implemented
166 #elif HV_SIMD_NEON
167 #warning __hv_asin_f() not implemented
168 #else // HV_SIMD_NONE
169 *bOut = hv_asin_f(bIn);
170 #endif
171 }
172
173 static inline void __hv_sinh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
174 #if HV_SIMD_AVX
175 #warning __hv_sinh_f() not implemented
176 #elif HV_SIMD_SSE
177 #warning __hv_sinh_f() not implemented
178 #elif HV_SIMD_NEON
179 #warning __hv_sinh_f() not implemented
180 #else // HV_SIMD_NONE
181 *bOut = hv_sinh_f(bIn);
182 #endif
183 }
184
185 static inline void __hv_asinh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
186 #if HV_SIMD_AVX
187 #warning __hv_asinh_f() not implemented
188 #elif HV_SIMD_SSE
189 #warning __hv_asinh_f() not implemented
190 #elif HV_SIMD_NEON
191 #warning __hv_asinh_f() not implemented
192 #else // HV_SIMD_NONE
193 *bOut = hv_asinh_f(bIn);
194 #endif
195 }
196
197 static inline void __hv_tan_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
198 #if HV_SIMD_AVX
199 #warning __hv_tan_f() not implemented
200 #elif HV_SIMD_SSE
201 #warning __hv_tan_f() not implemented
202 #elif HV_SIMD_NEON
203 #warning __hv_tan_f() not implemented
204 #else // HV_SIMD_NONE
205 *bOut = hv_tan_f(bIn);
206 #endif
207 }
208
209 static inline void __hv_atan_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
210 #if HV_SIMD_AVX
211 #warning __hv_atan_f() not implemented
212 #elif HV_SIMD_SSE
213 #warning __hv_atan_f() not implemented
214 #elif HV_SIMD_NEON
215 #warning __hv_atan_f() not implemented
216 #else // HV_SIMD_NONE
217 *bOut = hv_atan_f(bIn);
218 #endif
219 }
220
221 static inline void __hv_atan2_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
222 #if HV_SIMD_AVX
223 #warning __hv_atan2_f() not implemented
224 #elif HV_SIMD_SSE
225 #warning __hv_atan2_f() not implemented
226 #elif HV_SIMD_NEON
227 #warning __hv_atan2_f() not implemented
228 #else // HV_SIMD_NONE
229 *bOut = hv_atan2_f(bIn0, bIn1);
230 #endif
231 }
232
233 static inline void __hv_tanh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
234 #if HV_SIMD_AVX
235 #warning __hv_tanh_f() not implemented
236 #elif HV_SIMD_SSE
237 #warning __hv_tanh_f() not implemented
238 #elif HV_SIMD_NEON
239 #warning __hv_tanh_f() not implemented
240 #else // HV_SIMD_NONE
241 *bOut = hv_tanh_f(bIn);
242 #endif
243 }
244
245 static inline void __hv_atanh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
246 #if HV_SIMD_AVX
247 #warning __hv_atanh_f() not implemented
248 #elif HV_SIMD_SSE
249 #warning __hv_atanh_f() not implemented
250 #elif HV_SIMD_NEON
251 #warning __hv_atanh_f() not implemented
252 #else // HV_SIMD_NONE
253 *bOut = hv_atanh_f(bIn);
254 #endif
255 }
256
257 // NOTE(mhroth): use of sqrt is absolute and total MURDER. Make do with recipocal sqrt if possible!!
258 static inline void __hv_sqrt_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
259 #if HV_SIMD_AVX
260 *bOut = _mm256_sqrt_ps(bIn);
261 #elif HV_SIMD_SSE
262 *bOut = _mm_sqrt_ps(bIn);
263 #elif HV_SIMD_NEON
264 #warning __hv_sqrt_f() numerical results may be inexact
265 *bOut = vrecpeq_f32(vrsqrteq_f32(bIn));
266 #else // HV_SIMD_NONE
267 *bOut = hv_sqrt_f(bIn);
268 #endif
269 }
270
271 static inline void __hv_rsqrt_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
272 #if HV_SIMD_AVX
273 *bOut = _mm256_rsqrt_ps(bIn);
274 #elif HV_SIMD_SSE
275 *bOut = _mm_rsqrt_ps(bIn);
276 #elif HV_SIMD_NEON
277 #warning __hv_rsqrt_f() numerical results may be inexact
278 *bOut = vrsqrteq_f32(bIn);
279 #else // HV_SIMD_NONE
280 *bOut = 1.0f/hv_sqrt_f(bIn);
281 #endif
282 }
283
284 static inline void __hv_abs_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
285 #if HV_SIMD_AVX
286 *bOut = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), bIn);
287 #elif HV_SIMD_SSE
288 *bOut = _mm_andnot_ps(_mm_set1_ps(-0.0f), bIn); // == 1 << 31
289 #elif HV_SIMD_NEON
290 *bOut = vabsq_f32(bIn);
291 #else // HV_SIMD_NONE
292 *bOut = hv_abs_f(bIn);
293 #endif
294 }
295
296 static inline void __hv_exp_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
297 #if HV_SIMD_AVX
298 #warning __hv_exp_f() not implemented
299 #elif HV_SIMD_SSE
300 #warning __hv_exp_f() not implemented
301 #elif HV_SIMD_NEON
302 #warning __hv_exp_f() not implemented
303 #else // HV_SIMD_NONE
304 *bOut = hv_exp_f(bIn);
305 #endif
306 }
307
308 static inline void __hv_ceil_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
309 #if HV_SIMD_AVX
310 *bOut = _mm256_ceil_ps(bIn);
311 #elif HV_SIMD_SSE
312 *bOut = _mm_ceil_ps(bIn);
313 #elif HV_SIMD_NEON
314 #if __ARM_ARCH >= 8
315 *bOut = vrndpq_f32(bIn);
316 #else
317 #warning A slow NEON implementation of __hv_ceil_f() is being used because the necessary intrinsic cannot be found. It is only available in ARMv8.
318 *bOut = (float32x4_t) {hv_ceil_f(bIn[0]), hv_ceil_f(bIn[1]), hv_ceil_f(bIn[2]), hv_ceil_f(bIn[3])};
319 #endif // vrndpq_f32
320 #else // HV_SIMD_NONE
321 *bOut = hv_ceil_f(bIn);
322 #endif
323 }
324
325 static inline void __hv_floor_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
326 #if HV_SIMD_AVX
327 *bOut = _mm256_floor_ps(bIn);
328 #elif HV_SIMD_SSE
329 *bOut = _mm_floor_ps(bIn);
330 #elif HV_SIMD_NEON
331 #if __ARM_ARCH >= 8
332 *bOut = vrndmq_f32(bIn);
333 #else
334 #warning A slow NEON implementation of __hv_floor_f() is being used because the necessary intrinsic cannot be found. It is only available in ARMv8.
335 *bOut = (float32x4_t) {hv_floor_f(bIn[0]), hv_floor_f(bIn[1]), hv_floor_f(bIn[2]), hv_floor_f(bIn[3])};
336 #endif // vrndmq_f32
337 #else // HV_SIMD_NONE
338 *bOut = hv_floor_f(bIn);
339 #endif
340 }
341
342 // __add~f
343 static inline void __hv_add_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
344 #if HV_SIMD_AVX
345 *bOut = _mm256_add_ps(bIn0, bIn1);
346 #elif HV_SIMD_SSE
347 *bOut = _mm_add_ps(bIn0, bIn1);
348 #elif HV_SIMD_NEON
349 *bOut = vaddq_f32(bIn0, bIn1);
350 #else // HV_SIMD_NONE
351 *bOut = bIn0 + bIn1;
352 #endif
353 }
354
355 // __add~i
356 static inline void __hv_add_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
357 #if HV_SIMD_AVX
358 __m128i x = _mm_add_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
359 __m128i y = _mm_add_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
360 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
361 #elif HV_SIMD_SSE
362 *bOut = _mm_add_epi32(bIn0, bIn1);
363 #elif HV_SIMD_NEON
364 *bOut = vaddq_s32(bIn0, bIn1);
365 #else // HV_SIMD_NONE
366 *bOut = bIn0 + bIn1;
367 #endif
368 }
369
370 // __sub~f
371 static inline void __hv_sub_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
372 #if HV_SIMD_AVX
373 *bOut = _mm256_sub_ps(bIn0, bIn1);
374 #elif HV_SIMD_SSE
375 *bOut = _mm_sub_ps(bIn0, bIn1);
376 #elif HV_SIMD_NEON
377 *bOut = vsubq_f32(bIn0, bIn1);
378 #else // HV_SIMD_NONE
379 *bOut = bIn0 - bIn1;
380 #endif
381 }
382
383 // __mul~f
384 static inline void __hv_mul_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
385 #if HV_SIMD_AVX
386 *bOut = _mm256_mul_ps(bIn0, bIn1);
387 #elif HV_SIMD_SSE
388 *bOut = _mm_mul_ps(bIn0, bIn1);
389 #elif HV_SIMD_NEON
390 *bOut = vmulq_f32(bIn0, bIn1);
391 #else // HV_SIMD_NONE
392 *bOut = bIn0 * bIn1;
393 #endif
394 }
395
396 // __*~i
397 static inline void __hv_mul_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
398 #if HV_SIMD_AVX
399 __m128i x = _mm_mullo_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
400 __m128i y = _mm_mullo_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
401 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
402 #elif HV_SIMD_SSE
403 *bOut = _mm_mullo_epi32(bIn0, bIn1);
404 #elif HV_SIMD_NEON
405 *bOut = vmulq_s32(bIn0, bIn1);
406 #else // HV_SIMD_NONE
407 *bOut = bIn0 * bIn1;
408 #endif
409 }
410
411 // __cast~if
412 static inline void __hv_cast_if(hv_bIni_t bIn, hv_bOutf_t bOut) {
413 #if HV_SIMD_AVX
414 *bOut = _mm256_cvtepi32_ps(bIn);
415 #elif HV_SIMD_SSE
416 *bOut = _mm_cvtepi32_ps(bIn);
417 #elif HV_SIMD_NEON
418 *bOut = vcvtq_f32_s32(bIn);
419 #else // HV_SIMD_NONE
420 *bOut = (float) bIn;
421 #endif
422 }
423
424 // __cast~fi
425 static inline void __hv_cast_fi(hv_bInf_t bIn, hv_bOuti_t bOut) {
426 #if HV_SIMD_AVX
427 *bOut = _mm256_cvtps_epi32(bIn);
428 #elif HV_SIMD_SSE
429 *bOut = _mm_cvtps_epi32(bIn);
430 #elif HV_SIMD_NEON
431 *bOut = vcvtq_s32_f32(bIn);
432 #else // HV_SIMD_NONE
433 *bOut = (int) bIn;
434 #endif
435 }
436
437 static inline void __hv_div_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
438 #if HV_SIMD_AVX
439 *bOut = _mm256_div_ps(bIn0, bIn1);
440 #elif HV_SIMD_SSE
441 *bOut = _mm_div_ps(bIn0, bIn1);
442 #elif HV_SIMD_NEON
443 #warning __hv_div_f() numerical results may be inexact
444 *bOut = vmulq_f32(bIn0, vrecpeq_f32(bIn1));
445 #else // HV_SIMD_NONE
446 *bOut = (bIn1 != 0.0f) ? (bIn0 / bIn1) : 0.0f;
447 #endif
448 }
449
450 static inline void __hv_min_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
451 #if HV_SIMD_AVX
452 *bOut = _mm256_min_ps(bIn0, bIn1);
453 #elif HV_SIMD_SSE
454 *bOut = _mm_min_ps(bIn0, bIn1);
455 #elif HV_SIMD_NEON
456 *bOut = vminq_f32(bIn0, bIn1);
457 #else // HV_SIMD_NONE
458 *bOut = hv_min_f(bIn0, bIn1);
459 #endif
460 }
461
462 static inline void __hv_min_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
463 #if HV_SIMD_AVX
464 __m128i x = _mm_min_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
465 __m128i y = _mm_min_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
466 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
467 #elif HV_SIMD_SSE
468 *bOut = _mm_min_epi32(bIn0, bIn1);
469 #elif HV_SIMD_NEON
470 *bOut = vminq_s32(bIn0, bIn1);
471 #else // HV_SIMD_NONE
472 *bOut = hv_min_i(bIn0, bIn1);
473 #endif
474 }
475
476 static inline void __hv_max_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
477 #if HV_SIMD_AVX
478 *bOut = _mm256_max_ps(bIn0, bIn1);
479 #elif HV_SIMD_SSE
480 *bOut = _mm_max_ps(bIn0, bIn1);
481 #elif HV_SIMD_NEON
482 *bOut = vmaxq_f32(bIn0, bIn1);
483 #else // HV_SIMD_NONE
484 *bOut = hv_max_f(bIn0, bIn1);
485 #endif
486 }
487
488 static inline void __hv_max_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
489 #if HV_SIMD_AVX
490 __m128i x = _mm_max_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
491 __m128i y = _mm_max_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
492 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
493 #elif HV_SIMD_SSE
494 *bOut = _mm_max_epi32(bIn0, bIn1);
495 #elif HV_SIMD_NEON
496 *bOut = vmaxq_s32(bIn0, bIn1);
497 #else // HV_SIMD_NONE
498 *bOut = hv_max_i(bIn0, bIn1);
499 #endif
500 }
501
502 static inline void __hv_pow_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
503 #if HV_SIMD_AVX
504 *bOut = _mm256_set_ps(
505 hv_pow_f(bIn0[7], bIn1[7]),
506 hv_pow_f(bIn0[6], bIn1[6]),
507 hv_pow_f(bIn0[5], bIn1[5]),
508 hv_pow_f(bIn0[4], bIn1[4]),
509 hv_pow_f(bIn0[3], bIn1[3]),
510 hv_pow_f(bIn0[2], bIn1[2]),
511 hv_pow_f(bIn0[1], bIn1[1]),
512 hv_pow_f(bIn0[0], bIn1[0]));
513 #elif HV_SIMD_SSE
514 *bOut = _mm_set_ps(
515 hv_pow_f(bIn0[3], bIn1[3]),
516 hv_pow_f(bIn0[2], bIn1[2]),
517 hv_pow_f(bIn0[1], bIn1[1]),
518 hv_pow_f(bIn0[0], bIn1[0]));
519 #elif HV_SIMD_NEON
520 *bOut = (float32x4_t) {
521 hv_pow_f(bIn0[0], bIn1[0]),
522 hv_pow_f(bIn0[1], bIn1[1]),
523 hv_pow_f(bIn0[2], bIn1[2]),
524 hv_pow_f(bIn0[3], bIn1[3])};
525 #else // HV_SIMD_NONE
526 *bOut = hv_pow_f(bIn0, bIn1);
527 #endif
528 }
529
530 static inline void __hv_gt_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
531 #if HV_SIMD_AVX
532 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_GT_OQ);
533 #elif HV_SIMD_SSE
534 *bOut = _mm_cmpgt_ps(bIn0, bIn1);
535 #elif HV_SIMD_NEON
536 *bOut = vreinterpretq_f32_u32(vcgtq_f32(bIn0, bIn1));
537 #else // HV_SIMD_NONE
538 *bOut = (bIn0 > bIn1) ? 1.0f : 0.0f;
539 #endif
540 }
541
542 static inline void __hv_gte_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
543 #if HV_SIMD_AVX
544 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_GE_OQ);
545 #elif HV_SIMD_SSE
546 *bOut = _mm_cmpge_ps(bIn0, bIn1);
547 #elif HV_SIMD_NEON
548 *bOut = vreinterpretq_f32_u32(vcgeq_f32(bIn0, bIn1));
549 #else // HV_SIMD_NONE
550 *bOut = (bIn0 >= bIn1) ? 1.0f : 0.0f;
551 #endif
552 }
553
554 static inline void __hv_lt_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
555 #if HV_SIMD_AVX
556 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_LT_OQ);
557 #elif HV_SIMD_SSE
558 *bOut = _mm_cmplt_ps(bIn0, bIn1);
559 #elif HV_SIMD_NEON
560 *bOut = vreinterpretq_f32_u32(vcltq_f32(bIn0, bIn1));
561 #else // HV_SIMD_NONE
562 *bOut = (bIn0 < bIn1) ? 1.0f : 0.0f;
563 #endif
564 }
565
566 static inline void __hv_lte_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
567 #if HV_SIMD_AVX
568 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_LE_OQ);
569 #elif HV_SIMD_SSE
570 *bOut = _mm_cmple_ps(bIn0, bIn1);
571 #elif HV_SIMD_NEON
572 *bOut = vreinterpretq_f32_u32(vcleq_f32(bIn0, bIn1));
573 #else // HV_SIMD_NONE
574 *bOut = (bIn0 <= bIn1) ? 1.0f : 0.0f;
575 #endif
576 }
577
578 static inline void __hv_neq_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
579 #if HV_SIMD_AVX
580 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_NEQ_OQ);
581 #elif HV_SIMD_SSE
582 *bOut = _mm_cmpneq_ps(bIn0, bIn1);
583 #elif HV_SIMD_NEON
584 *bOut = vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(bIn0, bIn1)));
585 #else // HV_SIMD_NONE
586 *bOut = (bIn0 != bIn1) ? 1.0f : 0.0f;
587 #endif
588 }
589
590 static inline void __hv_xor_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
591 #if HV_SIMD_AVX
592 #warning __hv_xor_f() not implemented
593 #elif HV_SIMD_SSE
594 #warning __hv_xor_f() not implemented
595 #elif HV_SIMD_NEON
596 #warning __hv_xor_f() not implemented
597 #else // HV_SIMD_NONE
598 *bOut = (float) (((int) bIn0) ^ ((int) bIn1));
599 #endif
600 }
601
602 static inline void __hv_and_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
603 #if HV_SIMD_AVX
604 *bOut = _mm256_and_ps(bIn1, bIn0);
605 #elif HV_SIMD_SSE
606 *bOut = _mm_and_ps(bIn1, bIn0);
607 #elif HV_SIMD_NEON
608 *bOut = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(bIn1), vreinterpretq_u32_f32(bIn0)));
609 #else // HV_SIMD_NONE
610 if (bIn0 == 0.0f || bIn1 == 0.0f) *bOut = 0.0f;
611 else if (bIn0 == 1.0f) *bOut = bIn1;
612 else if (bIn1 == 1.0f) *bOut = bIn0;
613 else hv_assert(0); // NOTE(mhroth): floating point & is pretty much a bad idea, only used for if~
614 #endif
615 }
616
617 // bOut = (bIn0 * bIn1) + bIn2
618 static inline void __hv_fma_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bInf_t bIn2, hv_bOutf_t bOut) {
619 #if HV_SIMD_AVX
620 #if HV_SIMD_FMA
621 *bOut = _mm256_fmadd_ps(bIn0, bIn1, bIn2);
622 #else
623 *bOut = _mm256_add_ps(_mm256_mul_ps(bIn0, bIn1), bIn2);
624 #endif // HV_SIMD_FMA
625 #elif HV_SIMD_SSE
626 #if HV_SIMD_FMA
627 *bOut = _mm_fmadd_ps(bIn0, bIn1, bIn2);
628 #else
629 *bOut = _mm_add_ps(_mm_mul_ps(bIn0, bIn1), bIn2);
630 #endif // HV_SIMD_FMA
631 #elif HV_SIMD_NEON
632 #if __ARM_ARCH >= 8
633 *bOut = vfmaq_f32(bIn2, bIn0, bIn1);
634 #else
635 // NOTE(mhroth): it turns out, fma SUUUUCKS on lesser ARM architectures
636 // But in fact ideally fma would be disabled in ir2c for ARM architectures.
637 // LLVM does a much better job handling fma than we do.
638 *bOut = vaddq_f32(vmulq_f32(bIn0, bIn1), bIn2);
639 #endif
640 #else // HV_SIMD_NONE
641 *bOut = hv_fma_f(bIn0, bIn1, bIn2);
642 #endif
643 }
644
645 #endif // _HEAVY_MATH_H_