annotate projects/heavy/envelopeTrigger/HeavyMath.h @ 162:c3e8226a5651 heavy-updated

- added additional flags to C rules (-DNDEBUG, -mfpu=neon) - sample-accurate envelope triggering pd/heavy example
author chnrx <chris.heinrichs@gmail.com>
date Thu, 12 Nov 2015 14:59:46 +0000
parents
children
rev   line source
chris@162 1 /**
chris@162 2 * Copyright (c) 2014, 2015, Enzien Audio Ltd.
chris@162 3 *
chris@162 4 * Permission to use, copy, modify, and/or distribute this software for any
chris@162 5 * purpose with or without fee is hereby granted, provided that the above
chris@162 6 * copyright notice and this permission notice appear in all copies.
chris@162 7 *
chris@162 8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
chris@162 9 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
chris@162 10 * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
chris@162 11 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
chris@162 12 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
chris@162 13 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
chris@162 14 * PERFORMANCE OF THIS SOFTWARE.
chris@162 15 */
chris@162 16
chris@162 17 #ifndef _HEAVY_MATH_H_
chris@162 18 #define _HEAVY_MATH_H_
chris@162 19
chris@162 20 #include "Utils.h"
chris@162 21
chris@162 22 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/
chris@162 23 // https://gcc.gnu.org/onlinedocs/gcc-4.8.1/gcc/ARM-NEON-Intrinsics.html
chris@162 24 // http://codesuppository.blogspot.co.uk/2015/02/sse2neonh-porting-guide-and-header-file.html
chris@162 25
chris@162 26 static inline void __hv_zero_f(hv_bOutf_t bOut) {
chris@162 27 #if HV_SIMD_AVX
chris@162 28 *bOut = _mm256_setzero_ps();
chris@162 29 #elif HV_SIMD_SSE
chris@162 30 *bOut = _mm_setzero_ps();
chris@162 31 #elif HV_SIMD_NEON
chris@162 32 *bOut = vdupq_n_f32(0.0f);
chris@162 33 #else // HV_SIMD_NONE
chris@162 34 *bOut = 0.0f;
chris@162 35 #endif
chris@162 36 }
chris@162 37
chris@162 38 static inline void __hv_load_f(float *bIn, hv_bOutf_t bOut) {
chris@162 39 #if HV_SIMD_AVX
chris@162 40 *bOut = _mm256_load_ps(bIn);
chris@162 41 #elif HV_SIMD_SSE
chris@162 42 *bOut = _mm_load_ps(bIn);
chris@162 43 #elif HV_SIMD_NEON
chris@162 44 *bOut = vld1q_f32(bIn);
chris@162 45 #else // HV_SIMD_NONE
chris@162 46 *bOut = *bIn;
chris@162 47 #endif
chris@162 48 }
chris@162 49
chris@162 50 static inline void __hv_store_f(float *bOut, hv_bInf_t bIn) {
chris@162 51 #if HV_SIMD_AVX
chris@162 52 _mm256_store_ps(bOut, bIn);
chris@162 53 #elif HV_SIMD_SSE
chris@162 54 _mm_store_ps(bOut, bIn);
chris@162 55 #elif HV_SIMD_NEON
chris@162 56 vst1q_f32(bOut, bIn);
chris@162 57 #else // HV_SIMD_NONE
chris@162 58 *bOut = bIn;
chris@162 59 #endif
chris@162 60 }
chris@162 61
chris@162 62 static inline void __hv_log_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 63 #if HV_SIMD_AVX
chris@162 64 #warning __hv_log_f() not implemented
chris@162 65 #elif HV_SIMD_SSE
chris@162 66 #warning __hv_log_f() not implemented
chris@162 67 #elif HV_SIMD_NEON
chris@162 68 #warning __hv_log_f() not implemented
chris@162 69 #else // HV_SIMD_NONE
chris@162 70 *bOut = (bIn > 0.0f) ? hv_log_f(bIn) : 0.0f;
chris@162 71 #endif
chris@162 72 }
chris@162 73
chris@162 74 static inline void __hv_log10_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 75 #if HV_SIMD_AVX
chris@162 76 #warning __hv_log10_f() not implemented
chris@162 77 #elif HV_SIMD_SSE
chris@162 78 #warning __hv_log10_f() not implemented
chris@162 79 #elif HV_SIMD_NEON
chris@162 80 #warning __hv_log10_f() not implemented
chris@162 81 #else // HV_SIMD_NONE
chris@162 82 *bOut = (bIn > 0.0f) ? hv_log10_f(bIn) : 0.0f;
chris@162 83 #endif
chris@162 84 }
chris@162 85
chris@162 86 static inline void __hv_log2_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 87 #if HV_SIMD_AVX
chris@162 88 #warning __hv_log2_f() not implemented
chris@162 89 #elif HV_SIMD_SSE
chris@162 90 #warning __hv_log2_f() not implemented
chris@162 91 #elif HV_SIMD_NEON
chris@162 92 #warning __hv_log2_f() not implemented
chris@162 93 #else // HV_SIMD_NONE
chris@162 94 *bOut = (bIn > 0.0f) ? hv_log2_f(bIn) : 0.0f;
chris@162 95 #endif
chris@162 96 }
chris@162 97
chris@162 98 // NOTE(mhroth): this is a pretty ghetto implementation
chris@162 99 static inline void __hv_cos_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 100 #if HV_SIMD_AVX
chris@162 101 *bOut = _mm256_set_ps(
chris@162 102 hv_cos_f(bIn[7]), hv_cos_f(bIn[6]), hv_cos_f(bIn[5]), hv_cos_f(bIn[4]),
chris@162 103 hv_cos_f(bIn[3]), hv_cos_f(bIn[2]), hv_cos_f(bIn[1]), hv_cos_f(bIn[0]));
chris@162 104 #elif HV_SIMD_SSE
chris@162 105 *bOut = _mm_set_ps(hv_cos_f(bIn[3]), hv_cos_f(bIn[2]), hv_cos_f(bIn[1]), hv_cos_f(bIn[0]));
chris@162 106 #elif HV_SIMD_NEON
chris@162 107 *bOut = (float32x4_t) {hv_cos_f(bIn[0]), hv_cos_f(bIn[1]), hv_cos_f(bIn[2]), hv_cos_f(bIn[3])};
chris@162 108 #else // HV_SIMD_NONE
chris@162 109 *bOut = hv_cos_f(bIn);
chris@162 110 #endif
chris@162 111 }
chris@162 112
chris@162 113 static inline void __hv_acos_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 114 #if HV_SIMD_AVX
chris@162 115 #warning __hv_acos_f() not implemented
chris@162 116 #elif HV_SIMD_SSE
chris@162 117 #warning __hv_acos_f() not implemented
chris@162 118 #elif HV_SIMD_NEON
chris@162 119 #warning __hv_acos_f() not implemented
chris@162 120 #else // HV_SIMD_NONE
chris@162 121 *bOut = hv_acos_f(bIn);
chris@162 122 #endif
chris@162 123 }
chris@162 124
chris@162 125 static inline void __hv_cosh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 126 #if HV_SIMD_AVX
chris@162 127 #warning __hv_cosh_f() not implemented
chris@162 128 #elif HV_SIMD_SSE
chris@162 129 #warning __hv_cosh_f() not implemented
chris@162 130 #elif HV_SIMD_NEON
chris@162 131 #warning __hv_cosh_f() not implemented
chris@162 132 #else // HV_SIMD_NONE
chris@162 133 *bOut = hv_cosh_f(bIn);
chris@162 134 #endif
chris@162 135 }
chris@162 136
chris@162 137 static inline void __hv_acosh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 138 #if HV_SIMD_AVX
chris@162 139 #warning __hv_acosh_f() not implemented
chris@162 140 #elif HV_SIMD_SSE
chris@162 141 #warning __hv_acosh_f() not implemented
chris@162 142 #elif HV_SIMD_NEON
chris@162 143 #warning __hv_acosh_f() not implemented
chris@162 144 #else // HV_SIMD_NONE
chris@162 145 *bOut = hv_acosh_f(bIn);
chris@162 146 #endif
chris@162 147 }
chris@162 148
chris@162 149 static inline void __hv_sin_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 150 #if HV_SIMD_AVX
chris@162 151 #warning __hv_sin_f() not implemented
chris@162 152 #elif HV_SIMD_SSE
chris@162 153 #warning __hv_sin_f() not implemented
chris@162 154 #elif HV_SIMD_NEON
chris@162 155 #warning __hv_sin_f() not implemented
chris@162 156 #else // HV_SIMD_NONE
chris@162 157 *bOut = hv_sin_f(bIn);
chris@162 158 #endif
chris@162 159 }
chris@162 160
chris@162 161 static inline void __hv_asin_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 162 #if HV_SIMD_AVX
chris@162 163 #warning __hv_asin_f() not implemented
chris@162 164 #elif HV_SIMD_SSE
chris@162 165 #warning __hv_asin_f() not implemented
chris@162 166 #elif HV_SIMD_NEON
chris@162 167 #warning __hv_asin_f() not implemented
chris@162 168 #else // HV_SIMD_NONE
chris@162 169 *bOut = hv_asin_f(bIn);
chris@162 170 #endif
chris@162 171 }
chris@162 172
chris@162 173 static inline void __hv_sinh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 174 #if HV_SIMD_AVX
chris@162 175 #warning __hv_sinh_f() not implemented
chris@162 176 #elif HV_SIMD_SSE
chris@162 177 #warning __hv_sinh_f() not implemented
chris@162 178 #elif HV_SIMD_NEON
chris@162 179 #warning __hv_sinh_f() not implemented
chris@162 180 #else // HV_SIMD_NONE
chris@162 181 *bOut = hv_sinh_f(bIn);
chris@162 182 #endif
chris@162 183 }
chris@162 184
chris@162 185 static inline void __hv_asinh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 186 #if HV_SIMD_AVX
chris@162 187 #warning __hv_asinh_f() not implemented
chris@162 188 #elif HV_SIMD_SSE
chris@162 189 #warning __hv_asinh_f() not implemented
chris@162 190 #elif HV_SIMD_NEON
chris@162 191 #warning __hv_asinh_f() not implemented
chris@162 192 #else // HV_SIMD_NONE
chris@162 193 *bOut = hv_asinh_f(bIn);
chris@162 194 #endif
chris@162 195 }
chris@162 196
chris@162 197 static inline void __hv_tan_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 198 #if HV_SIMD_AVX
chris@162 199 #warning __hv_tan_f() not implemented
chris@162 200 #elif HV_SIMD_SSE
chris@162 201 #warning __hv_tan_f() not implemented
chris@162 202 #elif HV_SIMD_NEON
chris@162 203 #warning __hv_tan_f() not implemented
chris@162 204 #else // HV_SIMD_NONE
chris@162 205 *bOut = hv_tan_f(bIn);
chris@162 206 #endif
chris@162 207 }
chris@162 208
chris@162 209 static inline void __hv_atan_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 210 #if HV_SIMD_AVX
chris@162 211 #warning __hv_atan_f() not implemented
chris@162 212 #elif HV_SIMD_SSE
chris@162 213 #warning __hv_atan_f() not implemented
chris@162 214 #elif HV_SIMD_NEON
chris@162 215 #warning __hv_atan_f() not implemented
chris@162 216 #else // HV_SIMD_NONE
chris@162 217 *bOut = hv_atan_f(bIn);
chris@162 218 #endif
chris@162 219 }
chris@162 220
chris@162 221 static inline void __hv_atan2_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 222 #if HV_SIMD_AVX
chris@162 223 #warning __hv_atan2_f() not implemented
chris@162 224 #elif HV_SIMD_SSE
chris@162 225 #warning __hv_atan2_f() not implemented
chris@162 226 #elif HV_SIMD_NEON
chris@162 227 #warning __hv_atan2_f() not implemented
chris@162 228 #else // HV_SIMD_NONE
chris@162 229 *bOut = hv_atan2_f(bIn0, bIn1);
chris@162 230 #endif
chris@162 231 }
chris@162 232
chris@162 233 static inline void __hv_tanh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 234 #if HV_SIMD_AVX
chris@162 235 #warning __hv_tanh_f() not implemented
chris@162 236 #elif HV_SIMD_SSE
chris@162 237 #warning __hv_tanh_f() not implemented
chris@162 238 #elif HV_SIMD_NEON
chris@162 239 #warning __hv_tanh_f() not implemented
chris@162 240 #else // HV_SIMD_NONE
chris@162 241 *bOut = hv_tanh_f(bIn);
chris@162 242 #endif
chris@162 243 }
chris@162 244
chris@162 245 static inline void __hv_atanh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 246 #if HV_SIMD_AVX
chris@162 247 #warning __hv_atanh_f() not implemented
chris@162 248 #elif HV_SIMD_SSE
chris@162 249 #warning __hv_atanh_f() not implemented
chris@162 250 #elif HV_SIMD_NEON
chris@162 251 #warning __hv_atanh_f() not implemented
chris@162 252 #else // HV_SIMD_NONE
chris@162 253 *bOut = hv_atanh_f(bIn);
chris@162 254 #endif
chris@162 255 }
chris@162 256
chris@162 257 // NOTE(mhroth): use of sqrt is absolute and total MURDER. Make do with recipocal sqrt if possible!!
chris@162 258 static inline void __hv_sqrt_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 259 #if HV_SIMD_AVX
chris@162 260 *bOut = _mm256_sqrt_ps(bIn);
chris@162 261 #elif HV_SIMD_SSE
chris@162 262 *bOut = _mm_sqrt_ps(bIn);
chris@162 263 #elif HV_SIMD_NEON
chris@162 264 #warning __hv_sqrt_f() numerical results may be inexact
chris@162 265 *bOut = vrecpeq_f32(vrsqrteq_f32(bIn));
chris@162 266 #else // HV_SIMD_NONE
chris@162 267 *bOut = hv_sqrt_f(bIn);
chris@162 268 #endif
chris@162 269 }
chris@162 270
chris@162 271 static inline void __hv_rsqrt_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 272 #if HV_SIMD_AVX
chris@162 273 *bOut = _mm256_rsqrt_ps(bIn);
chris@162 274 #elif HV_SIMD_SSE
chris@162 275 *bOut = _mm_rsqrt_ps(bIn);
chris@162 276 #elif HV_SIMD_NEON
chris@162 277 #warning __hv_rsqrt_f() numerical results may be inexact
chris@162 278 *bOut = vrsqrteq_f32(bIn);
chris@162 279 #else // HV_SIMD_NONE
chris@162 280 *bOut = 1.0f/hv_sqrt_f(bIn);
chris@162 281 #endif
chris@162 282 }
chris@162 283
chris@162 284 static inline void __hv_abs_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 285 #if HV_SIMD_AVX
chris@162 286 *bOut = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), bIn);
chris@162 287 #elif HV_SIMD_SSE
chris@162 288 *bOut = _mm_andnot_ps(_mm_set1_ps(-0.0f), bIn); // == 1 << 31
chris@162 289 #elif HV_SIMD_NEON
chris@162 290 *bOut = vabsq_f32(bIn);
chris@162 291 #else // HV_SIMD_NONE
chris@162 292 *bOut = hv_abs_f(bIn);
chris@162 293 #endif
chris@162 294 }
chris@162 295
chris@162 296 static inline void __hv_exp_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 297 #if HV_SIMD_AVX
chris@162 298 #warning __hv_exp_f() not implemented
chris@162 299 #elif HV_SIMD_SSE
chris@162 300 #warning __hv_exp_f() not implemented
chris@162 301 #elif HV_SIMD_NEON
chris@162 302 #warning __hv_exp_f() not implemented
chris@162 303 #else // HV_SIMD_NONE
chris@162 304 *bOut = hv_exp_f(bIn);
chris@162 305 #endif
chris@162 306 }
chris@162 307
chris@162 308 static inline void __hv_ceil_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 309 #if HV_SIMD_AVX
chris@162 310 *bOut = _mm256_ceil_ps(bIn);
chris@162 311 #elif HV_SIMD_SSE
chris@162 312 *bOut = _mm_ceil_ps(bIn);
chris@162 313 #elif HV_SIMD_NEON
chris@162 314 #if __ARM_ARCH >= 8
chris@162 315 *bOut = vrndpq_f32(bIn);
chris@162 316 #else
chris@162 317 #warning A slow NEON implementation of __hv_ceil_f() is being used because the necessary intrinsic cannot be found. It is only available in ARMv8.
chris@162 318 *bOut = (float32x4_t) {hv_ceil_f(bIn[0]), hv_ceil_f(bIn[1]), hv_ceil_f(bIn[2]), hv_ceil_f(bIn[3])};
chris@162 319 #endif // vrndpq_f32
chris@162 320 #else // HV_SIMD_NONE
chris@162 321 *bOut = hv_ceil_f(bIn);
chris@162 322 #endif
chris@162 323 }
chris@162 324
chris@162 325 static inline void __hv_floor_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@162 326 #if HV_SIMD_AVX
chris@162 327 *bOut = _mm256_floor_ps(bIn);
chris@162 328 #elif HV_SIMD_SSE
chris@162 329 *bOut = _mm_floor_ps(bIn);
chris@162 330 #elif HV_SIMD_NEON
chris@162 331 #if __ARM_ARCH >= 8
chris@162 332 *bOut = vrndmq_f32(bIn);
chris@162 333 #else
chris@162 334 #warning A slow NEON implementation of __hv_floor_f() is being used because the necessary intrinsic cannot be found. It is only available in ARMv8.
chris@162 335 *bOut = (float32x4_t) {hv_floor_f(bIn[0]), hv_floor_f(bIn[1]), hv_floor_f(bIn[2]), hv_floor_f(bIn[3])};
chris@162 336 #endif // vrndmq_f32
chris@162 337 #else // HV_SIMD_NONE
chris@162 338 *bOut = hv_floor_f(bIn);
chris@162 339 #endif
chris@162 340 }
chris@162 341
chris@162 342 // __add~f
chris@162 343 static inline void __hv_add_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 344 #if HV_SIMD_AVX
chris@162 345 *bOut = _mm256_add_ps(bIn0, bIn1);
chris@162 346 #elif HV_SIMD_SSE
chris@162 347 *bOut = _mm_add_ps(bIn0, bIn1);
chris@162 348 #elif HV_SIMD_NEON
chris@162 349 *bOut = vaddq_f32(bIn0, bIn1);
chris@162 350 #else // HV_SIMD_NONE
chris@162 351 *bOut = bIn0 + bIn1;
chris@162 352 #endif
chris@162 353 }
chris@162 354
chris@162 355 // __add~i
chris@162 356 static inline void __hv_add_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
chris@162 357 #if HV_SIMD_AVX
chris@162 358 __m128i x = _mm_add_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
chris@162 359 __m128i y = _mm_add_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
chris@162 360 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
chris@162 361 #elif HV_SIMD_SSE
chris@162 362 *bOut = _mm_add_epi32(bIn0, bIn1);
chris@162 363 #elif HV_SIMD_NEON
chris@162 364 *bOut = vaddq_s32(bIn0, bIn1);
chris@162 365 #else // HV_SIMD_NONE
chris@162 366 *bOut = bIn0 + bIn1;
chris@162 367 #endif
chris@162 368 }
chris@162 369
chris@162 370 // __sub~f
chris@162 371 static inline void __hv_sub_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 372 #if HV_SIMD_AVX
chris@162 373 *bOut = _mm256_sub_ps(bIn0, bIn1);
chris@162 374 #elif HV_SIMD_SSE
chris@162 375 *bOut = _mm_sub_ps(bIn0, bIn1);
chris@162 376 #elif HV_SIMD_NEON
chris@162 377 *bOut = vsubq_f32(bIn0, bIn1);
chris@162 378 #else // HV_SIMD_NONE
chris@162 379 *bOut = bIn0 - bIn1;
chris@162 380 #endif
chris@162 381 }
chris@162 382
chris@162 383 // __mul~f
chris@162 384 static inline void __hv_mul_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 385 #if HV_SIMD_AVX
chris@162 386 *bOut = _mm256_mul_ps(bIn0, bIn1);
chris@162 387 #elif HV_SIMD_SSE
chris@162 388 *bOut = _mm_mul_ps(bIn0, bIn1);
chris@162 389 #elif HV_SIMD_NEON
chris@162 390 *bOut = vmulq_f32(bIn0, bIn1);
chris@162 391 #else // HV_SIMD_NONE
chris@162 392 *bOut = bIn0 * bIn1;
chris@162 393 #endif
chris@162 394 }
chris@162 395
chris@162 396 // __*~i
chris@162 397 static inline void __hv_mul_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
chris@162 398 #if HV_SIMD_AVX
chris@162 399 __m128i x = _mm_mullo_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
chris@162 400 __m128i y = _mm_mullo_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
chris@162 401 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
chris@162 402 #elif HV_SIMD_SSE
chris@162 403 *bOut = _mm_mullo_epi32(bIn0, bIn1);
chris@162 404 #elif HV_SIMD_NEON
chris@162 405 *bOut = vmulq_s32(bIn0, bIn1);
chris@162 406 #else // HV_SIMD_NONE
chris@162 407 *bOut = bIn0 * bIn1;
chris@162 408 #endif
chris@162 409 }
chris@162 410
chris@162 411 // __cast~if
chris@162 412 static inline void __hv_cast_if(hv_bIni_t bIn, hv_bOutf_t bOut) {
chris@162 413 #if HV_SIMD_AVX
chris@162 414 *bOut = _mm256_cvtepi32_ps(bIn);
chris@162 415 #elif HV_SIMD_SSE
chris@162 416 *bOut = _mm_cvtepi32_ps(bIn);
chris@162 417 #elif HV_SIMD_NEON
chris@162 418 *bOut = vcvtq_f32_s32(bIn);
chris@162 419 #else // HV_SIMD_NONE
chris@162 420 *bOut = (float) bIn;
chris@162 421 #endif
chris@162 422 }
chris@162 423
chris@162 424 // __cast~fi
chris@162 425 static inline void __hv_cast_fi(hv_bInf_t bIn, hv_bOuti_t bOut) {
chris@162 426 #if HV_SIMD_AVX
chris@162 427 *bOut = _mm256_cvtps_epi32(bIn);
chris@162 428 #elif HV_SIMD_SSE
chris@162 429 *bOut = _mm_cvtps_epi32(bIn);
chris@162 430 #elif HV_SIMD_NEON
chris@162 431 *bOut = vcvtq_s32_f32(bIn);
chris@162 432 #else // HV_SIMD_NONE
chris@162 433 *bOut = (int) bIn;
chris@162 434 #endif
chris@162 435 }
chris@162 436
chris@162 437 static inline void __hv_div_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 438 #if HV_SIMD_AVX
chris@162 439 *bOut = _mm256_div_ps(bIn0, bIn1);
chris@162 440 #elif HV_SIMD_SSE
chris@162 441 *bOut = _mm_div_ps(bIn0, bIn1);
chris@162 442 #elif HV_SIMD_NEON
chris@162 443 #warning __hv_div_f() numerical results may be inexact
chris@162 444 *bOut = vmulq_f32(bIn0, vrecpeq_f32(bIn1));
chris@162 445 #else // HV_SIMD_NONE
chris@162 446 *bOut = (bIn1 != 0.0f) ? (bIn0 / bIn1) : 0.0f;
chris@162 447 #endif
chris@162 448 }
chris@162 449
chris@162 450 static inline void __hv_min_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 451 #if HV_SIMD_AVX
chris@162 452 *bOut = _mm256_min_ps(bIn0, bIn1);
chris@162 453 #elif HV_SIMD_SSE
chris@162 454 *bOut = _mm_min_ps(bIn0, bIn1);
chris@162 455 #elif HV_SIMD_NEON
chris@162 456 *bOut = vminq_f32(bIn0, bIn1);
chris@162 457 #else // HV_SIMD_NONE
chris@162 458 *bOut = hv_min_f(bIn0, bIn1);
chris@162 459 #endif
chris@162 460 }
chris@162 461
chris@162 462 static inline void __hv_min_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
chris@162 463 #if HV_SIMD_AVX
chris@162 464 __m128i x = _mm_min_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
chris@162 465 __m128i y = _mm_min_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
chris@162 466 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
chris@162 467 #elif HV_SIMD_SSE
chris@162 468 *bOut = _mm_min_epi32(bIn0, bIn1);
chris@162 469 #elif HV_SIMD_NEON
chris@162 470 *bOut = vminq_s32(bIn0, bIn1);
chris@162 471 #else // HV_SIMD_NONE
chris@162 472 *bOut = hv_min_i(bIn0, bIn1);
chris@162 473 #endif
chris@162 474 }
chris@162 475
chris@162 476 static inline void __hv_max_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 477 #if HV_SIMD_AVX
chris@162 478 *bOut = _mm256_max_ps(bIn0, bIn1);
chris@162 479 #elif HV_SIMD_SSE
chris@162 480 *bOut = _mm_max_ps(bIn0, bIn1);
chris@162 481 #elif HV_SIMD_NEON
chris@162 482 *bOut = vmaxq_f32(bIn0, bIn1);
chris@162 483 #else // HV_SIMD_NONE
chris@162 484 *bOut = hv_max_f(bIn0, bIn1);
chris@162 485 #endif
chris@162 486 }
chris@162 487
chris@162 488 static inline void __hv_max_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
chris@162 489 #if HV_SIMD_AVX
chris@162 490 __m128i x = _mm_max_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
chris@162 491 __m128i y = _mm_max_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
chris@162 492 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
chris@162 493 #elif HV_SIMD_SSE
chris@162 494 *bOut = _mm_max_epi32(bIn0, bIn1);
chris@162 495 #elif HV_SIMD_NEON
chris@162 496 *bOut = vmaxq_s32(bIn0, bIn1);
chris@162 497 #else // HV_SIMD_NONE
chris@162 498 *bOut = hv_max_i(bIn0, bIn1);
chris@162 499 #endif
chris@162 500 }
chris@162 501
chris@162 502 static inline void __hv_pow_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 503 #if HV_SIMD_AVX
chris@162 504 *bOut = _mm256_set_ps(
chris@162 505 hv_pow_f(bIn0[7], bIn1[7]),
chris@162 506 hv_pow_f(bIn0[6], bIn1[6]),
chris@162 507 hv_pow_f(bIn0[5], bIn1[5]),
chris@162 508 hv_pow_f(bIn0[4], bIn1[4]),
chris@162 509 hv_pow_f(bIn0[3], bIn1[3]),
chris@162 510 hv_pow_f(bIn0[2], bIn1[2]),
chris@162 511 hv_pow_f(bIn0[1], bIn1[1]),
chris@162 512 hv_pow_f(bIn0[0], bIn1[0]));
chris@162 513 #elif HV_SIMD_SSE
chris@162 514 *bOut = _mm_set_ps(
chris@162 515 hv_pow_f(bIn0[3], bIn1[3]),
chris@162 516 hv_pow_f(bIn0[2], bIn1[2]),
chris@162 517 hv_pow_f(bIn0[1], bIn1[1]),
chris@162 518 hv_pow_f(bIn0[0], bIn1[0]));
chris@162 519 #elif HV_SIMD_NEON
chris@162 520 *bOut = (float32x4_t) {
chris@162 521 hv_pow_f(bIn0[0], bIn1[0]),
chris@162 522 hv_pow_f(bIn0[1], bIn1[1]),
chris@162 523 hv_pow_f(bIn0[2], bIn1[2]),
chris@162 524 hv_pow_f(bIn0[3], bIn1[3])};
chris@162 525 #else // HV_SIMD_NONE
chris@162 526 *bOut = hv_pow_f(bIn0, bIn1);
chris@162 527 #endif
chris@162 528 }
chris@162 529
chris@162 530 static inline void __hv_gt_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 531 #if HV_SIMD_AVX
chris@162 532 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_GT_OQ);
chris@162 533 #elif HV_SIMD_SSE
chris@162 534 *bOut = _mm_cmpgt_ps(bIn0, bIn1);
chris@162 535 #elif HV_SIMD_NEON
chris@162 536 *bOut = vreinterpretq_f32_u32(vcgtq_f32(bIn0, bIn1));
chris@162 537 #else // HV_SIMD_NONE
chris@162 538 *bOut = (bIn0 > bIn1) ? 1.0f : 0.0f;
chris@162 539 #endif
chris@162 540 }
chris@162 541
chris@162 542 static inline void __hv_gte_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 543 #if HV_SIMD_AVX
chris@162 544 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_GE_OQ);
chris@162 545 #elif HV_SIMD_SSE
chris@162 546 *bOut = _mm_cmpge_ps(bIn0, bIn1);
chris@162 547 #elif HV_SIMD_NEON
chris@162 548 *bOut = vreinterpretq_f32_u32(vcgeq_f32(bIn0, bIn1));
chris@162 549 #else // HV_SIMD_NONE
chris@162 550 *bOut = (bIn0 >= bIn1) ? 1.0f : 0.0f;
chris@162 551 #endif
chris@162 552 }
chris@162 553
chris@162 554 static inline void __hv_lt_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 555 #if HV_SIMD_AVX
chris@162 556 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_LT_OQ);
chris@162 557 #elif HV_SIMD_SSE
chris@162 558 *bOut = _mm_cmplt_ps(bIn0, bIn1);
chris@162 559 #elif HV_SIMD_NEON
chris@162 560 *bOut = vreinterpretq_f32_u32(vcltq_f32(bIn0, bIn1));
chris@162 561 #else // HV_SIMD_NONE
chris@162 562 *bOut = (bIn0 < bIn1) ? 1.0f : 0.0f;
chris@162 563 #endif
chris@162 564 }
chris@162 565
chris@162 566 static inline void __hv_lte_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 567 #if HV_SIMD_AVX
chris@162 568 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_LE_OQ);
chris@162 569 #elif HV_SIMD_SSE
chris@162 570 *bOut = _mm_cmple_ps(bIn0, bIn1);
chris@162 571 #elif HV_SIMD_NEON
chris@162 572 *bOut = vreinterpretq_f32_u32(vcleq_f32(bIn0, bIn1));
chris@162 573 #else // HV_SIMD_NONE
chris@162 574 *bOut = (bIn0 <= bIn1) ? 1.0f : 0.0f;
chris@162 575 #endif
chris@162 576 }
chris@162 577
chris@162 578 static inline void __hv_neq_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 579 #if HV_SIMD_AVX
chris@162 580 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_NEQ_OQ);
chris@162 581 #elif HV_SIMD_SSE
chris@162 582 *bOut = _mm_cmpneq_ps(bIn0, bIn1);
chris@162 583 #elif HV_SIMD_NEON
chris@162 584 *bOut = vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(bIn0, bIn1)));
chris@162 585 #else // HV_SIMD_NONE
chris@162 586 *bOut = (bIn0 != bIn1) ? 1.0f : 0.0f;
chris@162 587 #endif
chris@162 588 }
chris@162 589
chris@162 590 static inline void __hv_xor_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 591 #if HV_SIMD_AVX
chris@162 592 #warning __hv_xor_f() not implemented
chris@162 593 #elif HV_SIMD_SSE
chris@162 594 #warning __hv_xor_f() not implemented
chris@162 595 #elif HV_SIMD_NEON
chris@162 596 #warning __hv_xor_f() not implemented
chris@162 597 #else // HV_SIMD_NONE
chris@162 598 *bOut = (float) (((int) bIn0) ^ ((int) bIn1));
chris@162 599 #endif
chris@162 600 }
chris@162 601
chris@162 602 static inline void __hv_and_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@162 603 #if HV_SIMD_AVX
chris@162 604 *bOut = _mm256_and_ps(bIn1, bIn0);
chris@162 605 #elif HV_SIMD_SSE
chris@162 606 *bOut = _mm_and_ps(bIn1, bIn0);
chris@162 607 #elif HV_SIMD_NEON
chris@162 608 *bOut = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(bIn1), vreinterpretq_u32_f32(bIn0)));
chris@162 609 #else // HV_SIMD_NONE
chris@162 610 if (bIn0 == 0.0f || bIn1 == 0.0f) *bOut = 0.0f;
chris@162 611 else if (bIn0 == 1.0f) *bOut = bIn1;
chris@162 612 else if (bIn1 == 1.0f) *bOut = bIn0;
chris@162 613 else hv_assert(0); // NOTE(mhroth): floating point & is pretty much a bad idea, only used for if~
chris@162 614 #endif
chris@162 615 }
chris@162 616
chris@162 617 // bOut = (bIn0 * bIn1) + bIn2
chris@162 618 static inline void __hv_fma_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bInf_t bIn2, hv_bOutf_t bOut) {
chris@162 619 #if HV_SIMD_AVX
chris@162 620 #if HV_SIMD_FMA
chris@162 621 *bOut = _mm256_fmadd_ps(bIn0, bIn1, bIn2);
chris@162 622 #else
chris@162 623 *bOut = _mm256_add_ps(_mm256_mul_ps(bIn0, bIn1), bIn2);
chris@162 624 #endif // HV_SIMD_FMA
chris@162 625 #elif HV_SIMD_SSE
chris@162 626 #if HV_SIMD_FMA
chris@162 627 *bOut = _mm_fmadd_ps(bIn0, bIn1, bIn2);
chris@162 628 #else
chris@162 629 *bOut = _mm_add_ps(_mm_mul_ps(bIn0, bIn1), bIn2);
chris@162 630 #endif // HV_SIMD_FMA
chris@162 631 #elif HV_SIMD_NEON
chris@162 632 #if __ARM_ARCH >= 8
chris@162 633 *bOut = vfmaq_f32(bIn2, bIn0, bIn1);
chris@162 634 #else
chris@162 635 // NOTE(mhroth): it turns out, fma SUUUUCKS on lesser ARM architectures
chris@162 636 // But in fact ideally fma would be disabled in ir2c for ARM architectures.
chris@162 637 // LLVM does a much better job handling fma than we do.
chris@162 638 *bOut = vaddq_f32(vmulq_f32(bIn0, bIn1), bIn2);
chris@162 639 #endif
chris@162 640 #else // HV_SIMD_NONE
chris@162 641 *bOut = hv_fma_f(bIn0, bIn1, bIn2);
chris@162 642 #endif
chris@162 643 }
chris@162 644
chris@162 645 #endif // _HEAVY_MATH_H_