annotate projects/heavy/circularBuffer/HeavyMath.h @ 163:20b52283c7b4 heavy-updated

- added circular buffer pd/heavy example (works but process needs to be killed manually if launched via ssh?)
author chnrx <chris.heinrichs@gmail.com>
date Thu, 12 Nov 2015 15:55:30 +0000
parents
children
rev   line source
chris@163 1 /**
chris@163 2 * Copyright (c) 2014, 2015, Enzien Audio Ltd.
chris@163 3 *
chris@163 4 * Permission to use, copy, modify, and/or distribute this software for any
chris@163 5 * purpose with or without fee is hereby granted, provided that the above
chris@163 6 * copyright notice and this permission notice appear in all copies.
chris@163 7 *
chris@163 8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
chris@163 9 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
chris@163 10 * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
chris@163 11 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
chris@163 12 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
chris@163 13 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
chris@163 14 * PERFORMANCE OF THIS SOFTWARE.
chris@163 15 */
chris@163 16
chris@163 17 #ifndef _HEAVY_MATH_H_
chris@163 18 #define _HEAVY_MATH_H_
chris@163 19
chris@163 20 #include "Utils.h"
chris@163 21
chris@163 22 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/
chris@163 23 // https://gcc.gnu.org/onlinedocs/gcc-4.8.1/gcc/ARM-NEON-Intrinsics.html
chris@163 24 // http://codesuppository.blogspot.co.uk/2015/02/sse2neonh-porting-guide-and-header-file.html
chris@163 25
chris@163 26 static inline void __hv_zero_f(hv_bOutf_t bOut) {
chris@163 27 #if HV_SIMD_AVX
chris@163 28 *bOut = _mm256_setzero_ps();
chris@163 29 #elif HV_SIMD_SSE
chris@163 30 *bOut = _mm_setzero_ps();
chris@163 31 #elif HV_SIMD_NEON
chris@163 32 *bOut = vdupq_n_f32(0.0f);
chris@163 33 #else // HV_SIMD_NONE
chris@163 34 *bOut = 0.0f;
chris@163 35 #endif
chris@163 36 }
chris@163 37
chris@163 38 static inline void __hv_load_f(float *bIn, hv_bOutf_t bOut) {
chris@163 39 #if HV_SIMD_AVX
chris@163 40 *bOut = _mm256_load_ps(bIn);
chris@163 41 #elif HV_SIMD_SSE
chris@163 42 *bOut = _mm_load_ps(bIn);
chris@163 43 #elif HV_SIMD_NEON
chris@163 44 *bOut = vld1q_f32(bIn);
chris@163 45 #else // HV_SIMD_NONE
chris@163 46 *bOut = *bIn;
chris@163 47 #endif
chris@163 48 }
chris@163 49
chris@163 50 static inline void __hv_store_f(float *bOut, hv_bInf_t bIn) {
chris@163 51 #if HV_SIMD_AVX
chris@163 52 _mm256_store_ps(bOut, bIn);
chris@163 53 #elif HV_SIMD_SSE
chris@163 54 _mm_store_ps(bOut, bIn);
chris@163 55 #elif HV_SIMD_NEON
chris@163 56 vst1q_f32(bOut, bIn);
chris@163 57 #else // HV_SIMD_NONE
chris@163 58 *bOut = bIn;
chris@163 59 #endif
chris@163 60 }
chris@163 61
chris@163 62 static inline void __hv_log_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 63 #if HV_SIMD_AVX
chris@163 64 #warning __hv_log_f() not implemented
chris@163 65 #elif HV_SIMD_SSE
chris@163 66 #warning __hv_log_f() not implemented
chris@163 67 #elif HV_SIMD_NEON
chris@163 68 #warning __hv_log_f() not implemented
chris@163 69 #else // HV_SIMD_NONE
chris@163 70 *bOut = (bIn > 0.0f) ? hv_log_f(bIn) : 0.0f;
chris@163 71 #endif
chris@163 72 }
chris@163 73
chris@163 74 static inline void __hv_log10_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 75 #if HV_SIMD_AVX
chris@163 76 #warning __hv_log10_f() not implemented
chris@163 77 #elif HV_SIMD_SSE
chris@163 78 #warning __hv_log10_f() not implemented
chris@163 79 #elif HV_SIMD_NEON
chris@163 80 #warning __hv_log10_f() not implemented
chris@163 81 #else // HV_SIMD_NONE
chris@163 82 *bOut = (bIn > 0.0f) ? hv_log10_f(bIn) : 0.0f;
chris@163 83 #endif
chris@163 84 }
chris@163 85
chris@163 86 static inline void __hv_log2_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 87 #if HV_SIMD_AVX
chris@163 88 #warning __hv_log2_f() not implemented
chris@163 89 #elif HV_SIMD_SSE
chris@163 90 #warning __hv_log2_f() not implemented
chris@163 91 #elif HV_SIMD_NEON
chris@163 92 #warning __hv_log2_f() not implemented
chris@163 93 #else // HV_SIMD_NONE
chris@163 94 *bOut = (bIn > 0.0f) ? hv_log2_f(bIn) : 0.0f;
chris@163 95 #endif
chris@163 96 }
chris@163 97
chris@163 98 // NOTE(mhroth): this is a pretty ghetto implementation
chris@163 99 static inline void __hv_cos_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 100 #if HV_SIMD_AVX
chris@163 101 *bOut = _mm256_set_ps(
chris@163 102 hv_cos_f(bIn[7]), hv_cos_f(bIn[6]), hv_cos_f(bIn[5]), hv_cos_f(bIn[4]),
chris@163 103 hv_cos_f(bIn[3]), hv_cos_f(bIn[2]), hv_cos_f(bIn[1]), hv_cos_f(bIn[0]));
chris@163 104 #elif HV_SIMD_SSE
chris@163 105 *bOut = _mm_set_ps(hv_cos_f(bIn[3]), hv_cos_f(bIn[2]), hv_cos_f(bIn[1]), hv_cos_f(bIn[0]));
chris@163 106 #elif HV_SIMD_NEON
chris@163 107 *bOut = (float32x4_t) {hv_cos_f(bIn[0]), hv_cos_f(bIn[1]), hv_cos_f(bIn[2]), hv_cos_f(bIn[3])};
chris@163 108 #else // HV_SIMD_NONE
chris@163 109 *bOut = hv_cos_f(bIn);
chris@163 110 #endif
chris@163 111 }
chris@163 112
chris@163 113 static inline void __hv_acos_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 114 #if HV_SIMD_AVX
chris@163 115 #warning __hv_acos_f() not implemented
chris@163 116 #elif HV_SIMD_SSE
chris@163 117 #warning __hv_acos_f() not implemented
chris@163 118 #elif HV_SIMD_NEON
chris@163 119 #warning __hv_acos_f() not implemented
chris@163 120 #else // HV_SIMD_NONE
chris@163 121 *bOut = hv_acos_f(bIn);
chris@163 122 #endif
chris@163 123 }
chris@163 124
chris@163 125 static inline void __hv_cosh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 126 #if HV_SIMD_AVX
chris@163 127 #warning __hv_cosh_f() not implemented
chris@163 128 #elif HV_SIMD_SSE
chris@163 129 #warning __hv_cosh_f() not implemented
chris@163 130 #elif HV_SIMD_NEON
chris@163 131 #warning __hv_cosh_f() not implemented
chris@163 132 #else // HV_SIMD_NONE
chris@163 133 *bOut = hv_cosh_f(bIn);
chris@163 134 #endif
chris@163 135 }
chris@163 136
chris@163 137 static inline void __hv_acosh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 138 #if HV_SIMD_AVX
chris@163 139 #warning __hv_acosh_f() not implemented
chris@163 140 #elif HV_SIMD_SSE
chris@163 141 #warning __hv_acosh_f() not implemented
chris@163 142 #elif HV_SIMD_NEON
chris@163 143 #warning __hv_acosh_f() not implemented
chris@163 144 #else // HV_SIMD_NONE
chris@163 145 *bOut = hv_acosh_f(bIn);
chris@163 146 #endif
chris@163 147 }
chris@163 148
chris@163 149 static inline void __hv_sin_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 150 #if HV_SIMD_AVX
chris@163 151 #warning __hv_sin_f() not implemented
chris@163 152 #elif HV_SIMD_SSE
chris@163 153 #warning __hv_sin_f() not implemented
chris@163 154 #elif HV_SIMD_NEON
chris@163 155 #warning __hv_sin_f() not implemented
chris@163 156 #else // HV_SIMD_NONE
chris@163 157 *bOut = hv_sin_f(bIn);
chris@163 158 #endif
chris@163 159 }
chris@163 160
chris@163 161 static inline void __hv_asin_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 162 #if HV_SIMD_AVX
chris@163 163 #warning __hv_asin_f() not implemented
chris@163 164 #elif HV_SIMD_SSE
chris@163 165 #warning __hv_asin_f() not implemented
chris@163 166 #elif HV_SIMD_NEON
chris@163 167 #warning __hv_asin_f() not implemented
chris@163 168 #else // HV_SIMD_NONE
chris@163 169 *bOut = hv_asin_f(bIn);
chris@163 170 #endif
chris@163 171 }
chris@163 172
chris@163 173 static inline void __hv_sinh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 174 #if HV_SIMD_AVX
chris@163 175 #warning __hv_sinh_f() not implemented
chris@163 176 #elif HV_SIMD_SSE
chris@163 177 #warning __hv_sinh_f() not implemented
chris@163 178 #elif HV_SIMD_NEON
chris@163 179 #warning __hv_sinh_f() not implemented
chris@163 180 #else // HV_SIMD_NONE
chris@163 181 *bOut = hv_sinh_f(bIn);
chris@163 182 #endif
chris@163 183 }
chris@163 184
chris@163 185 static inline void __hv_asinh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 186 #if HV_SIMD_AVX
chris@163 187 #warning __hv_asinh_f() not implemented
chris@163 188 #elif HV_SIMD_SSE
chris@163 189 #warning __hv_asinh_f() not implemented
chris@163 190 #elif HV_SIMD_NEON
chris@163 191 #warning __hv_asinh_f() not implemented
chris@163 192 #else // HV_SIMD_NONE
chris@163 193 *bOut = hv_asinh_f(bIn);
chris@163 194 #endif
chris@163 195 }
chris@163 196
chris@163 197 static inline void __hv_tan_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 198 #if HV_SIMD_AVX
chris@163 199 #warning __hv_tan_f() not implemented
chris@163 200 #elif HV_SIMD_SSE
chris@163 201 #warning __hv_tan_f() not implemented
chris@163 202 #elif HV_SIMD_NEON
chris@163 203 #warning __hv_tan_f() not implemented
chris@163 204 #else // HV_SIMD_NONE
chris@163 205 *bOut = hv_tan_f(bIn);
chris@163 206 #endif
chris@163 207 }
chris@163 208
chris@163 209 static inline void __hv_atan_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 210 #if HV_SIMD_AVX
chris@163 211 #warning __hv_atan_f() not implemented
chris@163 212 #elif HV_SIMD_SSE
chris@163 213 #warning __hv_atan_f() not implemented
chris@163 214 #elif HV_SIMD_NEON
chris@163 215 #warning __hv_atan_f() not implemented
chris@163 216 #else // HV_SIMD_NONE
chris@163 217 *bOut = hv_atan_f(bIn);
chris@163 218 #endif
chris@163 219 }
chris@163 220
chris@163 221 static inline void __hv_atan2_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 222 #if HV_SIMD_AVX
chris@163 223 #warning __hv_atan2_f() not implemented
chris@163 224 #elif HV_SIMD_SSE
chris@163 225 #warning __hv_atan2_f() not implemented
chris@163 226 #elif HV_SIMD_NEON
chris@163 227 #warning __hv_atan2_f() not implemented
chris@163 228 #else // HV_SIMD_NONE
chris@163 229 *bOut = hv_atan2_f(bIn0, bIn1);
chris@163 230 #endif
chris@163 231 }
chris@163 232
chris@163 233 static inline void __hv_tanh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 234 #if HV_SIMD_AVX
chris@163 235 #warning __hv_tanh_f() not implemented
chris@163 236 #elif HV_SIMD_SSE
chris@163 237 #warning __hv_tanh_f() not implemented
chris@163 238 #elif HV_SIMD_NEON
chris@163 239 #warning __hv_tanh_f() not implemented
chris@163 240 #else // HV_SIMD_NONE
chris@163 241 *bOut = hv_tanh_f(bIn);
chris@163 242 #endif
chris@163 243 }
chris@163 244
chris@163 245 static inline void __hv_atanh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 246 #if HV_SIMD_AVX
chris@163 247 #warning __hv_atanh_f() not implemented
chris@163 248 #elif HV_SIMD_SSE
chris@163 249 #warning __hv_atanh_f() not implemented
chris@163 250 #elif HV_SIMD_NEON
chris@163 251 #warning __hv_atanh_f() not implemented
chris@163 252 #else // HV_SIMD_NONE
chris@163 253 *bOut = hv_atanh_f(bIn);
chris@163 254 #endif
chris@163 255 }
chris@163 256
chris@163 257 // NOTE(mhroth): use of sqrt is absolute and total MURDER. Make do with recipocal sqrt if possible!!
chris@163 258 static inline void __hv_sqrt_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 259 #if HV_SIMD_AVX
chris@163 260 *bOut = _mm256_sqrt_ps(bIn);
chris@163 261 #elif HV_SIMD_SSE
chris@163 262 *bOut = _mm_sqrt_ps(bIn);
chris@163 263 #elif HV_SIMD_NEON
chris@163 264 #warning __hv_sqrt_f() numerical results may be inexact
chris@163 265 *bOut = vrecpeq_f32(vrsqrteq_f32(bIn));
chris@163 266 #else // HV_SIMD_NONE
chris@163 267 *bOut = hv_sqrt_f(bIn);
chris@163 268 #endif
chris@163 269 }
chris@163 270
chris@163 271 static inline void __hv_rsqrt_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 272 #if HV_SIMD_AVX
chris@163 273 *bOut = _mm256_rsqrt_ps(bIn);
chris@163 274 #elif HV_SIMD_SSE
chris@163 275 *bOut = _mm_rsqrt_ps(bIn);
chris@163 276 #elif HV_SIMD_NEON
chris@163 277 #warning __hv_rsqrt_f() numerical results may be inexact
chris@163 278 *bOut = vrsqrteq_f32(bIn);
chris@163 279 #else // HV_SIMD_NONE
chris@163 280 *bOut = 1.0f/hv_sqrt_f(bIn);
chris@163 281 #endif
chris@163 282 }
chris@163 283
chris@163 284 static inline void __hv_abs_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 285 #if HV_SIMD_AVX
chris@163 286 *bOut = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), bIn);
chris@163 287 #elif HV_SIMD_SSE
chris@163 288 *bOut = _mm_andnot_ps(_mm_set1_ps(-0.0f), bIn); // == 1 << 31
chris@163 289 #elif HV_SIMD_NEON
chris@163 290 *bOut = vabsq_f32(bIn);
chris@163 291 #else // HV_SIMD_NONE
chris@163 292 *bOut = hv_abs_f(bIn);
chris@163 293 #endif
chris@163 294 }
chris@163 295
chris@163 296 static inline void __hv_exp_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 297 #if HV_SIMD_AVX
chris@163 298 #warning __hv_exp_f() not implemented
chris@163 299 #elif HV_SIMD_SSE
chris@163 300 #warning __hv_exp_f() not implemented
chris@163 301 #elif HV_SIMD_NEON
chris@163 302 #warning __hv_exp_f() not implemented
chris@163 303 #else // HV_SIMD_NONE
chris@163 304 *bOut = hv_exp_f(bIn);
chris@163 305 #endif
chris@163 306 }
chris@163 307
chris@163 308 static inline void __hv_ceil_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 309 #if HV_SIMD_AVX
chris@163 310 *bOut = _mm256_ceil_ps(bIn);
chris@163 311 #elif HV_SIMD_SSE
chris@163 312 *bOut = _mm_ceil_ps(bIn);
chris@163 313 #elif HV_SIMD_NEON
chris@163 314 #if __ARM_ARCH >= 8
chris@163 315 *bOut = vrndpq_f32(bIn);
chris@163 316 #else
chris@163 317 #warning A slow NEON implementation of __hv_ceil_f() is being used because the necessary intrinsic cannot be found. It is only available in ARMv8.
chris@163 318 *bOut = (float32x4_t) {hv_ceil_f(bIn[0]), hv_ceil_f(bIn[1]), hv_ceil_f(bIn[2]), hv_ceil_f(bIn[3])};
chris@163 319 #endif // vrndpq_f32
chris@163 320 #else // HV_SIMD_NONE
chris@163 321 *bOut = hv_ceil_f(bIn);
chris@163 322 #endif
chris@163 323 }
chris@163 324
chris@163 325 static inline void __hv_floor_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@163 326 #if HV_SIMD_AVX
chris@163 327 *bOut = _mm256_floor_ps(bIn);
chris@163 328 #elif HV_SIMD_SSE
chris@163 329 *bOut = _mm_floor_ps(bIn);
chris@163 330 #elif HV_SIMD_NEON
chris@163 331 #if __ARM_ARCH >= 8
chris@163 332 *bOut = vrndmq_f32(bIn);
chris@163 333 #else
chris@163 334 #warning A slow NEON implementation of __hv_floor_f() is being used because the necessary intrinsic cannot be found. It is only available in ARMv8.
chris@163 335 *bOut = (float32x4_t) {hv_floor_f(bIn[0]), hv_floor_f(bIn[1]), hv_floor_f(bIn[2]), hv_floor_f(bIn[3])};
chris@163 336 #endif // vrndmq_f32
chris@163 337 #else // HV_SIMD_NONE
chris@163 338 *bOut = hv_floor_f(bIn);
chris@163 339 #endif
chris@163 340 }
chris@163 341
chris@163 342 // __add~f
chris@163 343 static inline void __hv_add_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 344 #if HV_SIMD_AVX
chris@163 345 *bOut = _mm256_add_ps(bIn0, bIn1);
chris@163 346 #elif HV_SIMD_SSE
chris@163 347 *bOut = _mm_add_ps(bIn0, bIn1);
chris@163 348 #elif HV_SIMD_NEON
chris@163 349 *bOut = vaddq_f32(bIn0, bIn1);
chris@163 350 #else // HV_SIMD_NONE
chris@163 351 *bOut = bIn0 + bIn1;
chris@163 352 #endif
chris@163 353 }
chris@163 354
chris@163 355 // __add~i
chris@163 356 static inline void __hv_add_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
chris@163 357 #if HV_SIMD_AVX
chris@163 358 __m128i x = _mm_add_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
chris@163 359 __m128i y = _mm_add_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
chris@163 360 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
chris@163 361 #elif HV_SIMD_SSE
chris@163 362 *bOut = _mm_add_epi32(bIn0, bIn1);
chris@163 363 #elif HV_SIMD_NEON
chris@163 364 *bOut = vaddq_s32(bIn0, bIn1);
chris@163 365 #else // HV_SIMD_NONE
chris@163 366 *bOut = bIn0 + bIn1;
chris@163 367 #endif
chris@163 368 }
chris@163 369
chris@163 370 // __sub~f
chris@163 371 static inline void __hv_sub_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 372 #if HV_SIMD_AVX
chris@163 373 *bOut = _mm256_sub_ps(bIn0, bIn1);
chris@163 374 #elif HV_SIMD_SSE
chris@163 375 *bOut = _mm_sub_ps(bIn0, bIn1);
chris@163 376 #elif HV_SIMD_NEON
chris@163 377 *bOut = vsubq_f32(bIn0, bIn1);
chris@163 378 #else // HV_SIMD_NONE
chris@163 379 *bOut = bIn0 - bIn1;
chris@163 380 #endif
chris@163 381 }
chris@163 382
chris@163 383 // __mul~f
chris@163 384 static inline void __hv_mul_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 385 #if HV_SIMD_AVX
chris@163 386 *bOut = _mm256_mul_ps(bIn0, bIn1);
chris@163 387 #elif HV_SIMD_SSE
chris@163 388 *bOut = _mm_mul_ps(bIn0, bIn1);
chris@163 389 #elif HV_SIMD_NEON
chris@163 390 *bOut = vmulq_f32(bIn0, bIn1);
chris@163 391 #else // HV_SIMD_NONE
chris@163 392 *bOut = bIn0 * bIn1;
chris@163 393 #endif
chris@163 394 }
chris@163 395
chris@163 396 // __*~i
chris@163 397 static inline void __hv_mul_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
chris@163 398 #if HV_SIMD_AVX
chris@163 399 __m128i x = _mm_mullo_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
chris@163 400 __m128i y = _mm_mullo_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
chris@163 401 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
chris@163 402 #elif HV_SIMD_SSE
chris@163 403 *bOut = _mm_mullo_epi32(bIn0, bIn1);
chris@163 404 #elif HV_SIMD_NEON
chris@163 405 *bOut = vmulq_s32(bIn0, bIn1);
chris@163 406 #else // HV_SIMD_NONE
chris@163 407 *bOut = bIn0 * bIn1;
chris@163 408 #endif
chris@163 409 }
chris@163 410
chris@163 411 // __cast~if
chris@163 412 static inline void __hv_cast_if(hv_bIni_t bIn, hv_bOutf_t bOut) {
chris@163 413 #if HV_SIMD_AVX
chris@163 414 *bOut = _mm256_cvtepi32_ps(bIn);
chris@163 415 #elif HV_SIMD_SSE
chris@163 416 *bOut = _mm_cvtepi32_ps(bIn);
chris@163 417 #elif HV_SIMD_NEON
chris@163 418 *bOut = vcvtq_f32_s32(bIn);
chris@163 419 #else // HV_SIMD_NONE
chris@163 420 *bOut = (float) bIn;
chris@163 421 #endif
chris@163 422 }
chris@163 423
chris@163 424 // __cast~fi
chris@163 425 static inline void __hv_cast_fi(hv_bInf_t bIn, hv_bOuti_t bOut) {
chris@163 426 #if HV_SIMD_AVX
chris@163 427 *bOut = _mm256_cvtps_epi32(bIn);
chris@163 428 #elif HV_SIMD_SSE
chris@163 429 *bOut = _mm_cvtps_epi32(bIn);
chris@163 430 #elif HV_SIMD_NEON
chris@163 431 *bOut = vcvtq_s32_f32(bIn);
chris@163 432 #else // HV_SIMD_NONE
chris@163 433 *bOut = (int) bIn;
chris@163 434 #endif
chris@163 435 }
chris@163 436
chris@163 437 static inline void __hv_div_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 438 #if HV_SIMD_AVX
chris@163 439 *bOut = _mm256_div_ps(bIn0, bIn1);
chris@163 440 #elif HV_SIMD_SSE
chris@163 441 *bOut = _mm_div_ps(bIn0, bIn1);
chris@163 442 #elif HV_SIMD_NEON
chris@163 443 #warning __hv_div_f() numerical results may be inexact
chris@163 444 *bOut = vmulq_f32(bIn0, vrecpeq_f32(bIn1));
chris@163 445 #else // HV_SIMD_NONE
chris@163 446 *bOut = (bIn1 != 0.0f) ? (bIn0 / bIn1) : 0.0f;
chris@163 447 #endif
chris@163 448 }
chris@163 449
chris@163 450 static inline void __hv_min_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 451 #if HV_SIMD_AVX
chris@163 452 *bOut = _mm256_min_ps(bIn0, bIn1);
chris@163 453 #elif HV_SIMD_SSE
chris@163 454 *bOut = _mm_min_ps(bIn0, bIn1);
chris@163 455 #elif HV_SIMD_NEON
chris@163 456 *bOut = vminq_f32(bIn0, bIn1);
chris@163 457 #else // HV_SIMD_NONE
chris@163 458 *bOut = hv_min_f(bIn0, bIn1);
chris@163 459 #endif
chris@163 460 }
chris@163 461
chris@163 462 static inline void __hv_min_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
chris@163 463 #if HV_SIMD_AVX
chris@163 464 __m128i x = _mm_min_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
chris@163 465 __m128i y = _mm_min_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
chris@163 466 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
chris@163 467 #elif HV_SIMD_SSE
chris@163 468 *bOut = _mm_min_epi32(bIn0, bIn1);
chris@163 469 #elif HV_SIMD_NEON
chris@163 470 *bOut = vminq_s32(bIn0, bIn1);
chris@163 471 #else // HV_SIMD_NONE
chris@163 472 *bOut = hv_min_i(bIn0, bIn1);
chris@163 473 #endif
chris@163 474 }
chris@163 475
chris@163 476 static inline void __hv_max_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 477 #if HV_SIMD_AVX
chris@163 478 *bOut = _mm256_max_ps(bIn0, bIn1);
chris@163 479 #elif HV_SIMD_SSE
chris@163 480 *bOut = _mm_max_ps(bIn0, bIn1);
chris@163 481 #elif HV_SIMD_NEON
chris@163 482 *bOut = vmaxq_f32(bIn0, bIn1);
chris@163 483 #else // HV_SIMD_NONE
chris@163 484 *bOut = hv_max_f(bIn0, bIn1);
chris@163 485 #endif
chris@163 486 }
chris@163 487
chris@163 488 static inline void __hv_max_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
chris@163 489 #if HV_SIMD_AVX
chris@163 490 __m128i x = _mm_max_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
chris@163 491 __m128i y = _mm_max_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
chris@163 492 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
chris@163 493 #elif HV_SIMD_SSE
chris@163 494 *bOut = _mm_max_epi32(bIn0, bIn1);
chris@163 495 #elif HV_SIMD_NEON
chris@163 496 *bOut = vmaxq_s32(bIn0, bIn1);
chris@163 497 #else // HV_SIMD_NONE
chris@163 498 *bOut = hv_max_i(bIn0, bIn1);
chris@163 499 #endif
chris@163 500 }
chris@163 501
chris@163 502 static inline void __hv_pow_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 503 #if HV_SIMD_AVX
chris@163 504 *bOut = _mm256_set_ps(
chris@163 505 hv_pow_f(bIn0[7], bIn1[7]),
chris@163 506 hv_pow_f(bIn0[6], bIn1[6]),
chris@163 507 hv_pow_f(bIn0[5], bIn1[5]),
chris@163 508 hv_pow_f(bIn0[4], bIn1[4]),
chris@163 509 hv_pow_f(bIn0[3], bIn1[3]),
chris@163 510 hv_pow_f(bIn0[2], bIn1[2]),
chris@163 511 hv_pow_f(bIn0[1], bIn1[1]),
chris@163 512 hv_pow_f(bIn0[0], bIn1[0]));
chris@163 513 #elif HV_SIMD_SSE
chris@163 514 *bOut = _mm_set_ps(
chris@163 515 hv_pow_f(bIn0[3], bIn1[3]),
chris@163 516 hv_pow_f(bIn0[2], bIn1[2]),
chris@163 517 hv_pow_f(bIn0[1], bIn1[1]),
chris@163 518 hv_pow_f(bIn0[0], bIn1[0]));
chris@163 519 #elif HV_SIMD_NEON
chris@163 520 *bOut = (float32x4_t) {
chris@163 521 hv_pow_f(bIn0[0], bIn1[0]),
chris@163 522 hv_pow_f(bIn0[1], bIn1[1]),
chris@163 523 hv_pow_f(bIn0[2], bIn1[2]),
chris@163 524 hv_pow_f(bIn0[3], bIn1[3])};
chris@163 525 #else // HV_SIMD_NONE
chris@163 526 *bOut = hv_pow_f(bIn0, bIn1);
chris@163 527 #endif
chris@163 528 }
chris@163 529
chris@163 530 static inline void __hv_gt_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 531 #if HV_SIMD_AVX
chris@163 532 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_GT_OQ);
chris@163 533 #elif HV_SIMD_SSE
chris@163 534 *bOut = _mm_cmpgt_ps(bIn0, bIn1);
chris@163 535 #elif HV_SIMD_NEON
chris@163 536 *bOut = vreinterpretq_f32_u32(vcgtq_f32(bIn0, bIn1));
chris@163 537 #else // HV_SIMD_NONE
chris@163 538 *bOut = (bIn0 > bIn1) ? 1.0f : 0.0f;
chris@163 539 #endif
chris@163 540 }
chris@163 541
chris@163 542 static inline void __hv_gte_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 543 #if HV_SIMD_AVX
chris@163 544 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_GE_OQ);
chris@163 545 #elif HV_SIMD_SSE
chris@163 546 *bOut = _mm_cmpge_ps(bIn0, bIn1);
chris@163 547 #elif HV_SIMD_NEON
chris@163 548 *bOut = vreinterpretq_f32_u32(vcgeq_f32(bIn0, bIn1));
chris@163 549 #else // HV_SIMD_NONE
chris@163 550 *bOut = (bIn0 >= bIn1) ? 1.0f : 0.0f;
chris@163 551 #endif
chris@163 552 }
chris@163 553
chris@163 554 static inline void __hv_lt_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 555 #if HV_SIMD_AVX
chris@163 556 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_LT_OQ);
chris@163 557 #elif HV_SIMD_SSE
chris@163 558 *bOut = _mm_cmplt_ps(bIn0, bIn1);
chris@163 559 #elif HV_SIMD_NEON
chris@163 560 *bOut = vreinterpretq_f32_u32(vcltq_f32(bIn0, bIn1));
chris@163 561 #else // HV_SIMD_NONE
chris@163 562 *bOut = (bIn0 < bIn1) ? 1.0f : 0.0f;
chris@163 563 #endif
chris@163 564 }
chris@163 565
chris@163 566 static inline void __hv_lte_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 567 #if HV_SIMD_AVX
chris@163 568 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_LE_OQ);
chris@163 569 #elif HV_SIMD_SSE
chris@163 570 *bOut = _mm_cmple_ps(bIn0, bIn1);
chris@163 571 #elif HV_SIMD_NEON
chris@163 572 *bOut = vreinterpretq_f32_u32(vcleq_f32(bIn0, bIn1));
chris@163 573 #else // HV_SIMD_NONE
chris@163 574 *bOut = (bIn0 <= bIn1) ? 1.0f : 0.0f;
chris@163 575 #endif
chris@163 576 }
chris@163 577
chris@163 578 static inline void __hv_neq_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 579 #if HV_SIMD_AVX
chris@163 580 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_NEQ_OQ);
chris@163 581 #elif HV_SIMD_SSE
chris@163 582 *bOut = _mm_cmpneq_ps(bIn0, bIn1);
chris@163 583 #elif HV_SIMD_NEON
chris@163 584 *bOut = vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(bIn0, bIn1)));
chris@163 585 #else // HV_SIMD_NONE
chris@163 586 *bOut = (bIn0 != bIn1) ? 1.0f : 0.0f;
chris@163 587 #endif
chris@163 588 }
chris@163 589
chris@163 590 static inline void __hv_xor_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 591 #if HV_SIMD_AVX
chris@163 592 #warning __hv_xor_f() not implemented
chris@163 593 #elif HV_SIMD_SSE
chris@163 594 #warning __hv_xor_f() not implemented
chris@163 595 #elif HV_SIMD_NEON
chris@163 596 #warning __hv_xor_f() not implemented
chris@163 597 #else // HV_SIMD_NONE
chris@163 598 *bOut = (float) (((int) bIn0) ^ ((int) bIn1));
chris@163 599 #endif
chris@163 600 }
chris@163 601
chris@163 602 static inline void __hv_and_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@163 603 #if HV_SIMD_AVX
chris@163 604 *bOut = _mm256_and_ps(bIn1, bIn0);
chris@163 605 #elif HV_SIMD_SSE
chris@163 606 *bOut = _mm_and_ps(bIn1, bIn0);
chris@163 607 #elif HV_SIMD_NEON
chris@163 608 *bOut = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(bIn1), vreinterpretq_u32_f32(bIn0)));
chris@163 609 #else // HV_SIMD_NONE
chris@163 610 if (bIn0 == 0.0f || bIn1 == 0.0f) *bOut = 0.0f;
chris@163 611 else if (bIn0 == 1.0f) *bOut = bIn1;
chris@163 612 else if (bIn1 == 1.0f) *bOut = bIn0;
chris@163 613 else hv_assert(0); // NOTE(mhroth): floating point & is pretty much a bad idea, only used for if~
chris@163 614 #endif
chris@163 615 }
chris@163 616
chris@163 617 // bOut = (bIn0 * bIn1) + bIn2
chris@163 618 static inline void __hv_fma_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bInf_t bIn2, hv_bOutf_t bOut) {
chris@163 619 #if HV_SIMD_AVX
chris@163 620 #if HV_SIMD_FMA
chris@163 621 *bOut = _mm256_fmadd_ps(bIn0, bIn1, bIn2);
chris@163 622 #else
chris@163 623 *bOut = _mm256_add_ps(_mm256_mul_ps(bIn0, bIn1), bIn2);
chris@163 624 #endif // HV_SIMD_FMA
chris@163 625 #elif HV_SIMD_SSE
chris@163 626 #if HV_SIMD_FMA
chris@163 627 *bOut = _mm_fmadd_ps(bIn0, bIn1, bIn2);
chris@163 628 #else
chris@163 629 *bOut = _mm_add_ps(_mm_mul_ps(bIn0, bIn1), bIn2);
chris@163 630 #endif // HV_SIMD_FMA
chris@163 631 #elif HV_SIMD_NEON
chris@163 632 #if __ARM_ARCH >= 8
chris@163 633 *bOut = vfmaq_f32(bIn2, bIn0, bIn1);
chris@163 634 #else
chris@163 635 // NOTE(mhroth): it turns out, fma SUUUUCKS on lesser ARM architectures
chris@163 636 // But in fact ideally fma would be disabled in ir2c for ARM architectures.
chris@163 637 // LLVM does a much better job handling fma than we do.
chris@163 638 *bOut = vaddq_f32(vmulq_f32(bIn0, bIn1), bIn2);
chris@163 639 #endif
chris@163 640 #else // HV_SIMD_NONE
chris@163 641 *bOut = hv_fma_f(bIn0, bIn1, bIn2);
chris@163 642 #endif
chris@163 643 }
chris@163 644
chris@163 645 #endif // _HEAVY_MATH_H_