Mercurial > hg > beaglert
comparison projects/heavy/circularBuffer/HeavyMath.h @ 163:20b52283c7b4 heavy-updated
- added circular buffer pd/heavy example (works but process needs to be killed manually if launched via ssh?)
author | chnrx <chris.heinrichs@gmail.com> |
---|---|
date | Thu, 12 Nov 2015 15:55:30 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
162:c3e8226a5651 | 163:20b52283c7b4 |
---|---|
1 /** | |
2 * Copyright (c) 2014, 2015, Enzien Audio Ltd. | |
3 * | |
4 * Permission to use, copy, modify, and/or distribute this software for any | |
5 * purpose with or without fee is hereby granted, provided that the above | |
6 * copyright notice and this permission notice appear in all copies. | |
7 * | |
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH | |
9 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY | |
10 * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, | |
11 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM | |
12 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR | |
13 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR | |
14 * PERFORMANCE OF THIS SOFTWARE. | |
15 */ | |
16 | |
17 #ifndef _HEAVY_MATH_H_ | |
18 #define _HEAVY_MATH_H_ | |
19 | |
20 #include "Utils.h" | |
21 | |
22 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/ | |
23 // https://gcc.gnu.org/onlinedocs/gcc-4.8.1/gcc/ARM-NEON-Intrinsics.html | |
24 // http://codesuppository.blogspot.co.uk/2015/02/sse2neonh-porting-guide-and-header-file.html | |
25 | |
26 static inline void __hv_zero_f(hv_bOutf_t bOut) { | |
27 #if HV_SIMD_AVX | |
28 *bOut = _mm256_setzero_ps(); | |
29 #elif HV_SIMD_SSE | |
30 *bOut = _mm_setzero_ps(); | |
31 #elif HV_SIMD_NEON | |
32 *bOut = vdupq_n_f32(0.0f); | |
33 #else // HV_SIMD_NONE | |
34 *bOut = 0.0f; | |
35 #endif | |
36 } | |
37 | |
38 static inline void __hv_load_f(float *bIn, hv_bOutf_t bOut) { | |
39 #if HV_SIMD_AVX | |
40 *bOut = _mm256_load_ps(bIn); | |
41 #elif HV_SIMD_SSE | |
42 *bOut = _mm_load_ps(bIn); | |
43 #elif HV_SIMD_NEON | |
44 *bOut = vld1q_f32(bIn); | |
45 #else // HV_SIMD_NONE | |
46 *bOut = *bIn; | |
47 #endif | |
48 } | |
49 | |
50 static inline void __hv_store_f(float *bOut, hv_bInf_t bIn) { | |
51 #if HV_SIMD_AVX | |
52 _mm256_store_ps(bOut, bIn); | |
53 #elif HV_SIMD_SSE | |
54 _mm_store_ps(bOut, bIn); | |
55 #elif HV_SIMD_NEON | |
56 vst1q_f32(bOut, bIn); | |
57 #else // HV_SIMD_NONE | |
58 *bOut = bIn; | |
59 #endif | |
60 } | |
61 | |
62 static inline void __hv_log_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
63 #if HV_SIMD_AVX | |
64 #warning __hv_log_f() not implemented | |
65 #elif HV_SIMD_SSE | |
66 #warning __hv_log_f() not implemented | |
67 #elif HV_SIMD_NEON | |
68 #warning __hv_log_f() not implemented | |
69 #else // HV_SIMD_NONE | |
70 *bOut = (bIn > 0.0f) ? hv_log_f(bIn) : 0.0f; | |
71 #endif | |
72 } | |
73 | |
74 static inline void __hv_log10_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
75 #if HV_SIMD_AVX | |
76 #warning __hv_log10_f() not implemented | |
77 #elif HV_SIMD_SSE | |
78 #warning __hv_log10_f() not implemented | |
79 #elif HV_SIMD_NEON | |
80 #warning __hv_log10_f() not implemented | |
81 #else // HV_SIMD_NONE | |
82 *bOut = (bIn > 0.0f) ? hv_log10_f(bIn) : 0.0f; | |
83 #endif | |
84 } | |
85 | |
86 static inline void __hv_log2_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
87 #if HV_SIMD_AVX | |
88 #warning __hv_log2_f() not implemented | |
89 #elif HV_SIMD_SSE | |
90 #warning __hv_log2_f() not implemented | |
91 #elif HV_SIMD_NEON | |
92 #warning __hv_log2_f() not implemented | |
93 #else // HV_SIMD_NONE | |
94 *bOut = (bIn > 0.0f) ? hv_log2_f(bIn) : 0.0f; | |
95 #endif | |
96 } | |
97 | |
98 // NOTE(mhroth): this is a pretty ghetto implementation | |
99 static inline void __hv_cos_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
100 #if HV_SIMD_AVX | |
101 *bOut = _mm256_set_ps( | |
102 hv_cos_f(bIn[7]), hv_cos_f(bIn[6]), hv_cos_f(bIn[5]), hv_cos_f(bIn[4]), | |
103 hv_cos_f(bIn[3]), hv_cos_f(bIn[2]), hv_cos_f(bIn[1]), hv_cos_f(bIn[0])); | |
104 #elif HV_SIMD_SSE | |
105 *bOut = _mm_set_ps(hv_cos_f(bIn[3]), hv_cos_f(bIn[2]), hv_cos_f(bIn[1]), hv_cos_f(bIn[0])); | |
106 #elif HV_SIMD_NEON | |
107 *bOut = (float32x4_t) {hv_cos_f(bIn[0]), hv_cos_f(bIn[1]), hv_cos_f(bIn[2]), hv_cos_f(bIn[3])}; | |
108 #else // HV_SIMD_NONE | |
109 *bOut = hv_cos_f(bIn); | |
110 #endif | |
111 } | |
112 | |
113 static inline void __hv_acos_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
114 #if HV_SIMD_AVX | |
115 #warning __hv_acos_f() not implemented | |
116 #elif HV_SIMD_SSE | |
117 #warning __hv_acos_f() not implemented | |
118 #elif HV_SIMD_NEON | |
119 #warning __hv_acos_f() not implemented | |
120 #else // HV_SIMD_NONE | |
121 *bOut = hv_acos_f(bIn); | |
122 #endif | |
123 } | |
124 | |
125 static inline void __hv_cosh_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
126 #if HV_SIMD_AVX | |
127 #warning __hv_cosh_f() not implemented | |
128 #elif HV_SIMD_SSE | |
129 #warning __hv_cosh_f() not implemented | |
130 #elif HV_SIMD_NEON | |
131 #warning __hv_cosh_f() not implemented | |
132 #else // HV_SIMD_NONE | |
133 *bOut = hv_cosh_f(bIn); | |
134 #endif | |
135 } | |
136 | |
137 static inline void __hv_acosh_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
138 #if HV_SIMD_AVX | |
139 #warning __hv_acosh_f() not implemented | |
140 #elif HV_SIMD_SSE | |
141 #warning __hv_acosh_f() not implemented | |
142 #elif HV_SIMD_NEON | |
143 #warning __hv_acosh_f() not implemented | |
144 #else // HV_SIMD_NONE | |
145 *bOut = hv_acosh_f(bIn); | |
146 #endif | |
147 } | |
148 | |
149 static inline void __hv_sin_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
150 #if HV_SIMD_AVX | |
151 #warning __hv_sin_f() not implemented | |
152 #elif HV_SIMD_SSE | |
153 #warning __hv_sin_f() not implemented | |
154 #elif HV_SIMD_NEON | |
155 #warning __hv_sin_f() not implemented | |
156 #else // HV_SIMD_NONE | |
157 *bOut = hv_sin_f(bIn); | |
158 #endif | |
159 } | |
160 | |
161 static inline void __hv_asin_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
162 #if HV_SIMD_AVX | |
163 #warning __hv_asin_f() not implemented | |
164 #elif HV_SIMD_SSE | |
165 #warning __hv_asin_f() not implemented | |
166 #elif HV_SIMD_NEON | |
167 #warning __hv_asin_f() not implemented | |
168 #else // HV_SIMD_NONE | |
169 *bOut = hv_asin_f(bIn); | |
170 #endif | |
171 } | |
172 | |
173 static inline void __hv_sinh_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
174 #if HV_SIMD_AVX | |
175 #warning __hv_sinh_f() not implemented | |
176 #elif HV_SIMD_SSE | |
177 #warning __hv_sinh_f() not implemented | |
178 #elif HV_SIMD_NEON | |
179 #warning __hv_sinh_f() not implemented | |
180 #else // HV_SIMD_NONE | |
181 *bOut = hv_sinh_f(bIn); | |
182 #endif | |
183 } | |
184 | |
185 static inline void __hv_asinh_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
186 #if HV_SIMD_AVX | |
187 #warning __hv_asinh_f() not implemented | |
188 #elif HV_SIMD_SSE | |
189 #warning __hv_asinh_f() not implemented | |
190 #elif HV_SIMD_NEON | |
191 #warning __hv_asinh_f() not implemented | |
192 #else // HV_SIMD_NONE | |
193 *bOut = hv_asinh_f(bIn); | |
194 #endif | |
195 } | |
196 | |
197 static inline void __hv_tan_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
198 #if HV_SIMD_AVX | |
199 #warning __hv_tan_f() not implemented | |
200 #elif HV_SIMD_SSE | |
201 #warning __hv_tan_f() not implemented | |
202 #elif HV_SIMD_NEON | |
203 #warning __hv_tan_f() not implemented | |
204 #else // HV_SIMD_NONE | |
205 *bOut = hv_tan_f(bIn); | |
206 #endif | |
207 } | |
208 | |
209 static inline void __hv_atan_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
210 #if HV_SIMD_AVX | |
211 #warning __hv_atan_f() not implemented | |
212 #elif HV_SIMD_SSE | |
213 #warning __hv_atan_f() not implemented | |
214 #elif HV_SIMD_NEON | |
215 #warning __hv_atan_f() not implemented | |
216 #else // HV_SIMD_NONE | |
217 *bOut = hv_atan_f(bIn); | |
218 #endif | |
219 } | |
220 | |
221 static inline void __hv_atan2_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
222 #if HV_SIMD_AVX | |
223 #warning __hv_atan2_f() not implemented | |
224 #elif HV_SIMD_SSE | |
225 #warning __hv_atan2_f() not implemented | |
226 #elif HV_SIMD_NEON | |
227 #warning __hv_atan2_f() not implemented | |
228 #else // HV_SIMD_NONE | |
229 *bOut = hv_atan2_f(bIn0, bIn1); | |
230 #endif | |
231 } | |
232 | |
233 static inline void __hv_tanh_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
234 #if HV_SIMD_AVX | |
235 #warning __hv_tanh_f() not implemented | |
236 #elif HV_SIMD_SSE | |
237 #warning __hv_tanh_f() not implemented | |
238 #elif HV_SIMD_NEON | |
239 #warning __hv_tanh_f() not implemented | |
240 #else // HV_SIMD_NONE | |
241 *bOut = hv_tanh_f(bIn); | |
242 #endif | |
243 } | |
244 | |
245 static inline void __hv_atanh_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
246 #if HV_SIMD_AVX | |
247 #warning __hv_atanh_f() not implemented | |
248 #elif HV_SIMD_SSE | |
249 #warning __hv_atanh_f() not implemented | |
250 #elif HV_SIMD_NEON | |
251 #warning __hv_atanh_f() not implemented | |
252 #else // HV_SIMD_NONE | |
253 *bOut = hv_atanh_f(bIn); | |
254 #endif | |
255 } | |
256 | |
257 // NOTE(mhroth): use of sqrt is absolute and total MURDER. Make do with reciprocal sqrt if possible!! | |
258 static inline void __hv_sqrt_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
259 #if HV_SIMD_AVX | |
260 *bOut = _mm256_sqrt_ps(bIn); | |
261 #elif HV_SIMD_SSE | |
262 *bOut = _mm_sqrt_ps(bIn); | |
263 #elif HV_SIMD_NEON | |
264 #warning __hv_sqrt_f() numerical results may be inexact | |
265 *bOut = vrecpeq_f32(vrsqrteq_f32(bIn)); | |
266 #else // HV_SIMD_NONE | |
267 *bOut = hv_sqrt_f(bIn); | |
268 #endif | |
269 } | |
270 | |
271 static inline void __hv_rsqrt_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
272 #if HV_SIMD_AVX | |
273 *bOut = _mm256_rsqrt_ps(bIn); | |
274 #elif HV_SIMD_SSE | |
275 *bOut = _mm_rsqrt_ps(bIn); | |
276 #elif HV_SIMD_NEON | |
277 #warning __hv_rsqrt_f() numerical results may be inexact | |
278 *bOut = vrsqrteq_f32(bIn); | |
279 #else // HV_SIMD_NONE | |
280 *bOut = 1.0f/hv_sqrt_f(bIn); | |
281 #endif | |
282 } | |
283 | |
284 static inline void __hv_abs_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
285 #if HV_SIMD_AVX | |
286 *bOut = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), bIn); | |
287 #elif HV_SIMD_SSE | |
288 *bOut = _mm_andnot_ps(_mm_set1_ps(-0.0f), bIn); // == 1 << 31 | |
289 #elif HV_SIMD_NEON | |
290 *bOut = vabsq_f32(bIn); | |
291 #else // HV_SIMD_NONE | |
292 *bOut = hv_abs_f(bIn); | |
293 #endif | |
294 } | |
295 | |
296 static inline void __hv_exp_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
297 #if HV_SIMD_AVX | |
298 #warning __hv_exp_f() not implemented | |
299 #elif HV_SIMD_SSE | |
300 #warning __hv_exp_f() not implemented | |
301 #elif HV_SIMD_NEON | |
302 #warning __hv_exp_f() not implemented | |
303 #else // HV_SIMD_NONE | |
304 *bOut = hv_exp_f(bIn); | |
305 #endif | |
306 } | |
307 | |
308 static inline void __hv_ceil_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
309 #if HV_SIMD_AVX | |
310 *bOut = _mm256_ceil_ps(bIn); | |
311 #elif HV_SIMD_SSE | |
312 *bOut = _mm_ceil_ps(bIn); | |
313 #elif HV_SIMD_NEON | |
314 #if __ARM_ARCH >= 8 | |
315 *bOut = vrndpq_f32(bIn); | |
316 #else | |
317 #warning A slow NEON implementation of __hv_ceil_f() is being used because the necessary intrinsic cannot be found. It is only available in ARMv8. | |
318 *bOut = (float32x4_t) {hv_ceil_f(bIn[0]), hv_ceil_f(bIn[1]), hv_ceil_f(bIn[2]), hv_ceil_f(bIn[3])}; | |
319 #endif // vrndpq_f32 | |
320 #else // HV_SIMD_NONE | |
321 *bOut = hv_ceil_f(bIn); | |
322 #endif | |
323 } | |
324 | |
325 static inline void __hv_floor_f(hv_bInf_t bIn, hv_bOutf_t bOut) { | |
326 #if HV_SIMD_AVX | |
327 *bOut = _mm256_floor_ps(bIn); | |
328 #elif HV_SIMD_SSE | |
329 *bOut = _mm_floor_ps(bIn); | |
330 #elif HV_SIMD_NEON | |
331 #if __ARM_ARCH >= 8 | |
332 *bOut = vrndmq_f32(bIn); | |
333 #else | |
334 #warning A slow NEON implementation of __hv_floor_f() is being used because the necessary intrinsic cannot be found. It is only available in ARMv8. | |
335 *bOut = (float32x4_t) {hv_floor_f(bIn[0]), hv_floor_f(bIn[1]), hv_floor_f(bIn[2]), hv_floor_f(bIn[3])}; | |
336 #endif // vrndmq_f32 | |
337 #else // HV_SIMD_NONE | |
338 *bOut = hv_floor_f(bIn); | |
339 #endif | |
340 } | |
341 | |
342 // __add~f | |
343 static inline void __hv_add_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
344 #if HV_SIMD_AVX | |
345 *bOut = _mm256_add_ps(bIn0, bIn1); | |
346 #elif HV_SIMD_SSE | |
347 *bOut = _mm_add_ps(bIn0, bIn1); | |
348 #elif HV_SIMD_NEON | |
349 *bOut = vaddq_f32(bIn0, bIn1); | |
350 #else // HV_SIMD_NONE | |
351 *bOut = bIn0 + bIn1; | |
352 #endif | |
353 } | |
354 | |
355 // __add~i | |
356 static inline void __hv_add_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) { | |
357 #if HV_SIMD_AVX | |
358 __m128i x = _mm_add_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1)); | |
359 __m128i y = _mm_add_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1)); | |
360 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1); | |
361 #elif HV_SIMD_SSE | |
362 *bOut = _mm_add_epi32(bIn0, bIn1); | |
363 #elif HV_SIMD_NEON | |
364 *bOut = vaddq_s32(bIn0, bIn1); | |
365 #else // HV_SIMD_NONE | |
366 *bOut = bIn0 + bIn1; | |
367 #endif | |
368 } | |
369 | |
370 // __sub~f | |
371 static inline void __hv_sub_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
372 #if HV_SIMD_AVX | |
373 *bOut = _mm256_sub_ps(bIn0, bIn1); | |
374 #elif HV_SIMD_SSE | |
375 *bOut = _mm_sub_ps(bIn0, bIn1); | |
376 #elif HV_SIMD_NEON | |
377 *bOut = vsubq_f32(bIn0, bIn1); | |
378 #else // HV_SIMD_NONE | |
379 *bOut = bIn0 - bIn1; | |
380 #endif | |
381 } | |
382 | |
383 // __mul~f | |
384 static inline void __hv_mul_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
385 #if HV_SIMD_AVX | |
386 *bOut = _mm256_mul_ps(bIn0, bIn1); | |
387 #elif HV_SIMD_SSE | |
388 *bOut = _mm_mul_ps(bIn0, bIn1); | |
389 #elif HV_SIMD_NEON | |
390 *bOut = vmulq_f32(bIn0, bIn1); | |
391 #else // HV_SIMD_NONE | |
392 *bOut = bIn0 * bIn1; | |
393 #endif | |
394 } | |
395 | |
396 // __*~i | |
397 static inline void __hv_mul_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) { | |
398 #if HV_SIMD_AVX | |
399 __m128i x = _mm_mullo_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1)); | |
400 __m128i y = _mm_mullo_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1)); | |
401 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1); | |
402 #elif HV_SIMD_SSE | |
403 *bOut = _mm_mullo_epi32(bIn0, bIn1); | |
404 #elif HV_SIMD_NEON | |
405 *bOut = vmulq_s32(bIn0, bIn1); | |
406 #else // HV_SIMD_NONE | |
407 *bOut = bIn0 * bIn1; | |
408 #endif | |
409 } | |
410 | |
411 // __cast~if | |
412 static inline void __hv_cast_if(hv_bIni_t bIn, hv_bOutf_t bOut) { | |
413 #if HV_SIMD_AVX | |
414 *bOut = _mm256_cvtepi32_ps(bIn); | |
415 #elif HV_SIMD_SSE | |
416 *bOut = _mm_cvtepi32_ps(bIn); | |
417 #elif HV_SIMD_NEON | |
418 *bOut = vcvtq_f32_s32(bIn); | |
419 #else // HV_SIMD_NONE | |
420 *bOut = (float) bIn; | |
421 #endif | |
422 } | |
423 | |
424 // __cast~fi | |
425 static inline void __hv_cast_fi(hv_bInf_t bIn, hv_bOuti_t bOut) { | |
426 #if HV_SIMD_AVX | |
427 *bOut = _mm256_cvtps_epi32(bIn); | |
428 #elif HV_SIMD_SSE | |
429 *bOut = _mm_cvtps_epi32(bIn); | |
430 #elif HV_SIMD_NEON | |
431 *bOut = vcvtq_s32_f32(bIn); | |
432 #else // HV_SIMD_NONE | |
433 *bOut = (int) bIn; | |
434 #endif | |
435 } | |
436 | |
437 static inline void __hv_div_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
438 #if HV_SIMD_AVX | |
439 *bOut = _mm256_div_ps(bIn0, bIn1); | |
440 #elif HV_SIMD_SSE | |
441 *bOut = _mm_div_ps(bIn0, bIn1); | |
442 #elif HV_SIMD_NEON | |
443 #warning __hv_div_f() numerical results may be inexact | |
444 *bOut = vmulq_f32(bIn0, vrecpeq_f32(bIn1)); | |
445 #else // HV_SIMD_NONE | |
446 *bOut = (bIn1 != 0.0f) ? (bIn0 / bIn1) : 0.0f; | |
447 #endif | |
448 } | |
449 | |
450 static inline void __hv_min_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
451 #if HV_SIMD_AVX | |
452 *bOut = _mm256_min_ps(bIn0, bIn1); | |
453 #elif HV_SIMD_SSE | |
454 *bOut = _mm_min_ps(bIn0, bIn1); | |
455 #elif HV_SIMD_NEON | |
456 *bOut = vminq_f32(bIn0, bIn1); | |
457 #else // HV_SIMD_NONE | |
458 *bOut = hv_min_f(bIn0, bIn1); | |
459 #endif | |
460 } | |
461 | |
462 static inline void __hv_min_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) { | |
463 #if HV_SIMD_AVX | |
464 __m128i x = _mm_min_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1)); | |
465 __m128i y = _mm_min_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1)); | |
466 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1); | |
467 #elif HV_SIMD_SSE | |
468 *bOut = _mm_min_epi32(bIn0, bIn1); | |
469 #elif HV_SIMD_NEON | |
470 *bOut = vminq_s32(bIn0, bIn1); | |
471 #else // HV_SIMD_NONE | |
472 *bOut = hv_min_i(bIn0, bIn1); | |
473 #endif | |
474 } | |
475 | |
476 static inline void __hv_max_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
477 #if HV_SIMD_AVX | |
478 *bOut = _mm256_max_ps(bIn0, bIn1); | |
479 #elif HV_SIMD_SSE | |
480 *bOut = _mm_max_ps(bIn0, bIn1); | |
481 #elif HV_SIMD_NEON | |
482 *bOut = vmaxq_f32(bIn0, bIn1); | |
483 #else // HV_SIMD_NONE | |
484 *bOut = hv_max_f(bIn0, bIn1); | |
485 #endif | |
486 } | |
487 | |
488 static inline void __hv_max_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) { | |
489 #if HV_SIMD_AVX | |
490 __m128i x = _mm_max_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1)); | |
491 __m128i y = _mm_max_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1)); | |
492 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1); | |
493 #elif HV_SIMD_SSE | |
494 *bOut = _mm_max_epi32(bIn0, bIn1); | |
495 #elif HV_SIMD_NEON | |
496 *bOut = vmaxq_s32(bIn0, bIn1); | |
497 #else // HV_SIMD_NONE | |
498 *bOut = hv_max_i(bIn0, bIn1); | |
499 #endif | |
500 } | |
501 | |
502 static inline void __hv_pow_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
503 #if HV_SIMD_AVX | |
504 *bOut = _mm256_set_ps( | |
505 hv_pow_f(bIn0[7], bIn1[7]), | |
506 hv_pow_f(bIn0[6], bIn1[6]), | |
507 hv_pow_f(bIn0[5], bIn1[5]), | |
508 hv_pow_f(bIn0[4], bIn1[4]), | |
509 hv_pow_f(bIn0[3], bIn1[3]), | |
510 hv_pow_f(bIn0[2], bIn1[2]), | |
511 hv_pow_f(bIn0[1], bIn1[1]), | |
512 hv_pow_f(bIn0[0], bIn1[0])); | |
513 #elif HV_SIMD_SSE | |
514 *bOut = _mm_set_ps( | |
515 hv_pow_f(bIn0[3], bIn1[3]), | |
516 hv_pow_f(bIn0[2], bIn1[2]), | |
517 hv_pow_f(bIn0[1], bIn1[1]), | |
518 hv_pow_f(bIn0[0], bIn1[0])); | |
519 #elif HV_SIMD_NEON | |
520 *bOut = (float32x4_t) { | |
521 hv_pow_f(bIn0[0], bIn1[0]), | |
522 hv_pow_f(bIn0[1], bIn1[1]), | |
523 hv_pow_f(bIn0[2], bIn1[2]), | |
524 hv_pow_f(bIn0[3], bIn1[3])}; | |
525 #else // HV_SIMD_NONE | |
526 *bOut = hv_pow_f(bIn0, bIn1); | |
527 #endif | |
528 } | |
529 | |
530 static inline void __hv_gt_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
531 #if HV_SIMD_AVX | |
532 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_GT_OQ); | |
533 #elif HV_SIMD_SSE | |
534 *bOut = _mm_cmpgt_ps(bIn0, bIn1); | |
535 #elif HV_SIMD_NEON | |
536 *bOut = vreinterpretq_f32_u32(vcgtq_f32(bIn0, bIn1)); | |
537 #else // HV_SIMD_NONE | |
538 *bOut = (bIn0 > bIn1) ? 1.0f : 0.0f; | |
539 #endif | |
540 } | |
541 | |
542 static inline void __hv_gte_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
543 #if HV_SIMD_AVX | |
544 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_GE_OQ); | |
545 #elif HV_SIMD_SSE | |
546 *bOut = _mm_cmpge_ps(bIn0, bIn1); | |
547 #elif HV_SIMD_NEON | |
548 *bOut = vreinterpretq_f32_u32(vcgeq_f32(bIn0, bIn1)); | |
549 #else // HV_SIMD_NONE | |
550 *bOut = (bIn0 >= bIn1) ? 1.0f : 0.0f; | |
551 #endif | |
552 } | |
553 | |
554 static inline void __hv_lt_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
555 #if HV_SIMD_AVX | |
556 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_LT_OQ); | |
557 #elif HV_SIMD_SSE | |
558 *bOut = _mm_cmplt_ps(bIn0, bIn1); | |
559 #elif HV_SIMD_NEON | |
560 *bOut = vreinterpretq_f32_u32(vcltq_f32(bIn0, bIn1)); | |
561 #else // HV_SIMD_NONE | |
562 *bOut = (bIn0 < bIn1) ? 1.0f : 0.0f; | |
563 #endif | |
564 } | |
565 | |
566 static inline void __hv_lte_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
567 #if HV_SIMD_AVX | |
568 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_LE_OQ); | |
569 #elif HV_SIMD_SSE | |
570 *bOut = _mm_cmple_ps(bIn0, bIn1); | |
571 #elif HV_SIMD_NEON | |
572 *bOut = vreinterpretq_f32_u32(vcleq_f32(bIn0, bIn1)); | |
573 #else // HV_SIMD_NONE | |
574 *bOut = (bIn0 <= bIn1) ? 1.0f : 0.0f; | |
575 #endif | |
576 } | |
577 | |
578 static inline void __hv_neq_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
579 #if HV_SIMD_AVX | |
580 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_NEQ_OQ); | |
581 #elif HV_SIMD_SSE | |
582 *bOut = _mm_cmpneq_ps(bIn0, bIn1); | |
583 #elif HV_SIMD_NEON | |
584 *bOut = vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(bIn0, bIn1))); | |
585 #else // HV_SIMD_NONE | |
586 *bOut = (bIn0 != bIn1) ? 1.0f : 0.0f; | |
587 #endif | |
588 } | |
589 | |
590 static inline void __hv_xor_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
591 #if HV_SIMD_AVX | |
592 #warning __hv_xor_f() not implemented | |
593 #elif HV_SIMD_SSE | |
594 #warning __hv_xor_f() not implemented | |
595 #elif HV_SIMD_NEON | |
596 #warning __hv_xor_f() not implemented | |
597 #else // HV_SIMD_NONE | |
598 *bOut = (float) (((int) bIn0) ^ ((int) bIn1)); | |
599 #endif | |
600 } | |
601 | |
602 static inline void __hv_and_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) { | |
603 #if HV_SIMD_AVX | |
604 *bOut = _mm256_and_ps(bIn1, bIn0); | |
605 #elif HV_SIMD_SSE | |
606 *bOut = _mm_and_ps(bIn1, bIn0); | |
607 #elif HV_SIMD_NEON | |
608 *bOut = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(bIn1), vreinterpretq_u32_f32(bIn0))); | |
609 #else // HV_SIMD_NONE | |
610 if (bIn0 == 0.0f || bIn1 == 0.0f) *bOut = 0.0f; | |
611 else if (bIn0 == 1.0f) *bOut = bIn1; | |
612 else if (bIn1 == 1.0f) *bOut = bIn0; | |
613 else hv_assert(0); // NOTE(mhroth): floating point & is pretty much a bad idea, only used for if~ | |
614 #endif | |
615 } | |
616 | |
617 // bOut = (bIn0 * bIn1) + bIn2 | |
618 static inline void __hv_fma_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bInf_t bIn2, hv_bOutf_t bOut) { | |
619 #if HV_SIMD_AVX | |
620 #if HV_SIMD_FMA | |
621 *bOut = _mm256_fmadd_ps(bIn0, bIn1, bIn2); | |
622 #else | |
623 *bOut = _mm256_add_ps(_mm256_mul_ps(bIn0, bIn1), bIn2); | |
624 #endif // HV_SIMD_FMA | |
625 #elif HV_SIMD_SSE | |
626 #if HV_SIMD_FMA | |
627 *bOut = _mm_fmadd_ps(bIn0, bIn1, bIn2); | |
628 #else | |
629 *bOut = _mm_add_ps(_mm_mul_ps(bIn0, bIn1), bIn2); | |
630 #endif // HV_SIMD_FMA | |
631 #elif HV_SIMD_NEON | |
632 #if __ARM_ARCH >= 8 | |
633 *bOut = vfmaq_f32(bIn2, bIn0, bIn1); | |
634 #else | |
635 // NOTE(mhroth): it turns out, fma SUUUUCKS on lesser ARM architectures | |
636 // But in fact ideally fma would be disabled in ir2c for ARM architectures. | |
637 // LLVM does a much better job handling fma than we do. | |
638 *bOut = vaddq_f32(vmulq_f32(bIn0, bIn1), bIn2); | |
639 #endif | |
640 #else // HV_SIMD_NONE | |
641 *bOut = hv_fma_f(bIn0, bIn1, bIn2); | |
642 #endif | |
643 } | |
644 | |
645 #endif // _HEAVY_MATH_H_ |