annotate projects/heavy/samphold/HeavyMath.h @ 160:5bcf04234f80 heavy-updated

- added -std=c99 to Makefile for user-supplied C files (required for heavy files) - changed heavy core render.cpp file to use latest API and removed all redundant functions (e.g. foleyDesigner/touchkey stuff) - use build_pd.sh to compile and run pd files (-h for usage instructions)
author chnrx <chris.heinrichs@gmail.com>
date Thu, 05 Nov 2015 18:58:26 +0000
parents
children
rev   line source
chris@160 1 /**
chris@160 2 * Copyright (c) 2014, 2015, Enzien Audio Ltd.
chris@160 3 *
chris@160 4 * Permission to use, copy, modify, and/or distribute this software for any
chris@160 5 * purpose with or without fee is hereby granted, provided that the above
chris@160 6 * copyright notice and this permission notice appear in all copies.
chris@160 7 *
chris@160 8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
chris@160 9 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
chris@160 10 * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
chris@160 11 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
chris@160 12 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
chris@160 13 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
chris@160 14 * PERFORMANCE OF THIS SOFTWARE.
chris@160 15 */
chris@160 16
chris@160 17 #ifndef _HEAVY_MATH_H_
chris@160 18 #define _HEAVY_MATH_H_
chris@160 19
chris@160 20 #include "Utils.h"
chris@160 21
chris@160 22 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/
chris@160 23 // https://gcc.gnu.org/onlinedocs/gcc-4.8.1/gcc/ARM-NEON-Intrinsics.html
chris@160 24 // http://codesuppository.blogspot.co.uk/2015/02/sse2neonh-porting-guide-and-header-file.html
chris@160 25
chris@160 26 static inline void __hv_zero_f(hv_bOutf_t bOut) {
chris@160 27 #if HV_SIMD_AVX
chris@160 28 *bOut = _mm256_setzero_ps();
chris@160 29 #elif HV_SIMD_SSE
chris@160 30 *bOut = _mm_setzero_ps();
chris@160 31 #elif HV_SIMD_NEON
chris@160 32 *bOut = vdupq_n_f32(0.0f);
chris@160 33 #else // HV_SIMD_NONE
chris@160 34 *bOut = 0.0f;
chris@160 35 #endif
chris@160 36 }
chris@160 37
chris@160 38 static inline void __hv_load_f(float *bIn, hv_bOutf_t bOut) {
chris@160 39 #if HV_SIMD_AVX
chris@160 40 *bOut = _mm256_load_ps(bIn);
chris@160 41 #elif HV_SIMD_SSE
chris@160 42 *bOut = _mm_load_ps(bIn);
chris@160 43 #elif HV_SIMD_NEON
chris@160 44 *bOut = vld1q_f32(bIn);
chris@160 45 #else // HV_SIMD_NONE
chris@160 46 *bOut = *bIn;
chris@160 47 #endif
chris@160 48 }
chris@160 49
chris@160 50 static inline void __hv_store_f(float *bOut, hv_bInf_t bIn) {
chris@160 51 #if HV_SIMD_AVX
chris@160 52 _mm256_store_ps(bOut, bIn);
chris@160 53 #elif HV_SIMD_SSE
chris@160 54 _mm_store_ps(bOut, bIn);
chris@160 55 #elif HV_SIMD_NEON
chris@160 56 vst1q_f32(bOut, bIn);
chris@160 57 #else // HV_SIMD_NONE
chris@160 58 *bOut = bIn;
chris@160 59 #endif
chris@160 60 }
chris@160 61
chris@160 62 static inline void __hv_log_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 63 #if HV_SIMD_AVX
chris@160 64 #warning __hv_log_f() not implemented
chris@160 65 #elif HV_SIMD_SSE
chris@160 66 #warning __hv_log_f() not implemented
chris@160 67 #elif HV_SIMD_NEON
chris@160 68 #warning __hv_log_f() not implemented
chris@160 69 #else // HV_SIMD_NONE
chris@160 70 *bOut = (bIn > 0.0f) ? hv_log_f(bIn) : 0.0f;
chris@160 71 #endif
chris@160 72 }
chris@160 73
chris@160 74 static inline void __hv_log10_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 75 #if HV_SIMD_AVX
chris@160 76 #warning __hv_log10_f() not implemented
chris@160 77 #elif HV_SIMD_SSE
chris@160 78 #warning __hv_log10_f() not implemented
chris@160 79 #elif HV_SIMD_NEON
chris@160 80 #warning __hv_log10_f() not implemented
chris@160 81 #else // HV_SIMD_NONE
chris@160 82 *bOut = (bIn > 0.0f) ? hv_log10_f(bIn) : 0.0f;
chris@160 83 #endif
chris@160 84 }
chris@160 85
chris@160 86 static inline void __hv_log2_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 87 #if HV_SIMD_AVX
chris@160 88 #warning __hv_log2_f() not implemented
chris@160 89 #elif HV_SIMD_SSE
chris@160 90 #warning __hv_log2_f() not implemented
chris@160 91 #elif HV_SIMD_NEON
chris@160 92 #warning __hv_log2_f() not implemented
chris@160 93 #else // HV_SIMD_NONE
chris@160 94 *bOut = (bIn > 0.0f) ? hv_log2_f(bIn) : 0.0f;
chris@160 95 #endif
chris@160 96 }
chris@160 97
chris@160 98 // NOTE(mhroth): this is a pretty ghetto implementation
chris@160 99 static inline void __hv_cos_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 100 #if HV_SIMD_AVX
chris@160 101 *bOut = _mm256_set_ps(
chris@160 102 hv_cos_f(bIn[7]), hv_cos_f(bIn[6]), hv_cos_f(bIn[5]), hv_cos_f(bIn[4]),
chris@160 103 hv_cos_f(bIn[3]), hv_cos_f(bIn[2]), hv_cos_f(bIn[1]), hv_cos_f(bIn[0]));
chris@160 104 #elif HV_SIMD_SSE
chris@160 105 *bOut = _mm_set_ps(hv_cos_f(bIn[3]), hv_cos_f(bIn[2]), hv_cos_f(bIn[1]), hv_cos_f(bIn[0]));
chris@160 106 #elif HV_SIMD_NEON
chris@160 107 *bOut = (float32x4_t) {hv_cos_f(bIn[0]), hv_cos_f(bIn[1]), hv_cos_f(bIn[2]), hv_cos_f(bIn[3])};
chris@160 108 #else // HV_SIMD_NONE
chris@160 109 *bOut = hv_cos_f(bIn);
chris@160 110 #endif
chris@160 111 }
chris@160 112
chris@160 113 static inline void __hv_acos_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 114 #if HV_SIMD_AVX
chris@160 115 #warning __hv_acos_f() not implemented
chris@160 116 #elif HV_SIMD_SSE
chris@160 117 #warning __hv_acos_f() not implemented
chris@160 118 #elif HV_SIMD_NEON
chris@160 119 #warning __hv_acos_f() not implemented
chris@160 120 #else // HV_SIMD_NONE
chris@160 121 *bOut = hv_acos_f(bIn);
chris@160 122 #endif
chris@160 123 }
chris@160 124
chris@160 125 static inline void __hv_cosh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 126 #if HV_SIMD_AVX
chris@160 127 #warning __hv_cosh_f() not implemented
chris@160 128 #elif HV_SIMD_SSE
chris@160 129 #warning __hv_cosh_f() not implemented
chris@160 130 #elif HV_SIMD_NEON
chris@160 131 #warning __hv_cosh_f() not implemented
chris@160 132 #else // HV_SIMD_NONE
chris@160 133 *bOut = hv_cosh_f(bIn);
chris@160 134 #endif
chris@160 135 }
chris@160 136
chris@160 137 static inline void __hv_acosh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 138 #if HV_SIMD_AVX
chris@160 139 #warning __hv_acosh_f() not implemented
chris@160 140 #elif HV_SIMD_SSE
chris@160 141 #warning __hv_acosh_f() not implemented
chris@160 142 #elif HV_SIMD_NEON
chris@160 143 #warning __hv_acosh_f() not implemented
chris@160 144 #else // HV_SIMD_NONE
chris@160 145 *bOut = hv_acosh_f(bIn);
chris@160 146 #endif
chris@160 147 }
chris@160 148
chris@160 149 static inline void __hv_sin_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 150 #if HV_SIMD_AVX
chris@160 151 #warning __hv_sin_f() not implemented
chris@160 152 #elif HV_SIMD_SSE
chris@160 153 #warning __hv_sin_f() not implemented
chris@160 154 #elif HV_SIMD_NEON
chris@160 155 #warning __hv_sin_f() not implemented
chris@160 156 #else // HV_SIMD_NONE
chris@160 157 *bOut = hv_sin_f(bIn);
chris@160 158 #endif
chris@160 159 }
chris@160 160
chris@160 161 static inline void __hv_asin_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 162 #if HV_SIMD_AVX
chris@160 163 #warning __hv_asin_f() not implemented
chris@160 164 #elif HV_SIMD_SSE
chris@160 165 #warning __hv_asin_f() not implemented
chris@160 166 #elif HV_SIMD_NEON
chris@160 167 #warning __hv_asin_f() not implemented
chris@160 168 #else // HV_SIMD_NONE
chris@160 169 *bOut = hv_asin_f(bIn);
chris@160 170 #endif
chris@160 171 }
chris@160 172
chris@160 173 static inline void __hv_sinh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 174 #if HV_SIMD_AVX
chris@160 175 #warning __hv_sinh_f() not implemented
chris@160 176 #elif HV_SIMD_SSE
chris@160 177 #warning __hv_sinh_f() not implemented
chris@160 178 #elif HV_SIMD_NEON
chris@160 179 #warning __hv_sinh_f() not implemented
chris@160 180 #else // HV_SIMD_NONE
chris@160 181 *bOut = hv_sinh_f(bIn);
chris@160 182 #endif
chris@160 183 }
chris@160 184
chris@160 185 static inline void __hv_asinh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 186 #if HV_SIMD_AVX
chris@160 187 #warning __hv_asinh_f() not implemented
chris@160 188 #elif HV_SIMD_SSE
chris@160 189 #warning __hv_asinh_f() not implemented
chris@160 190 #elif HV_SIMD_NEON
chris@160 191 #warning __hv_asinh_f() not implemented
chris@160 192 #else // HV_SIMD_NONE
chris@160 193 *bOut = hv_asinh_f(bIn);
chris@160 194 #endif
chris@160 195 }
chris@160 196
chris@160 197 static inline void __hv_tan_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 198 #if HV_SIMD_AVX
chris@160 199 #warning __hv_tan_f() not implemented
chris@160 200 #elif HV_SIMD_SSE
chris@160 201 #warning __hv_tan_f() not implemented
chris@160 202 #elif HV_SIMD_NEON
chris@160 203 #warning __hv_tan_f() not implemented
chris@160 204 #else // HV_SIMD_NONE
chris@160 205 *bOut = hv_tan_f(bIn);
chris@160 206 #endif
chris@160 207 }
chris@160 208
chris@160 209 static inline void __hv_atan_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 210 #if HV_SIMD_AVX
chris@160 211 #warning __hv_atan_f() not implemented
chris@160 212 #elif HV_SIMD_SSE
chris@160 213 #warning __hv_atan_f() not implemented
chris@160 214 #elif HV_SIMD_NEON
chris@160 215 #warning __hv_atan_f() not implemented
chris@160 216 #else // HV_SIMD_NONE
chris@160 217 *bOut = hv_atan_f(bIn);
chris@160 218 #endif
chris@160 219 }
chris@160 220
chris@160 221 static inline void __hv_atan2_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 222 #if HV_SIMD_AVX
chris@160 223 #warning __hv_atan2_f() not implemented
chris@160 224 #elif HV_SIMD_SSE
chris@160 225 #warning __hv_atan2_f() not implemented
chris@160 226 #elif HV_SIMD_NEON
chris@160 227 #warning __hv_atan2_f() not implemented
chris@160 228 #else // HV_SIMD_NONE
chris@160 229 *bOut = hv_atan2_f(bIn0, bIn1);
chris@160 230 #endif
chris@160 231 }
chris@160 232
chris@160 233 static inline void __hv_tanh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 234 #if HV_SIMD_AVX
chris@160 235 #warning __hv_tanh_f() not implemented
chris@160 236 #elif HV_SIMD_SSE
chris@160 237 #warning __hv_tanh_f() not implemented
chris@160 238 #elif HV_SIMD_NEON
chris@160 239 #warning __hv_tanh_f() not implemented
chris@160 240 #else // HV_SIMD_NONE
chris@160 241 *bOut = hv_tanh_f(bIn);
chris@160 242 #endif
chris@160 243 }
chris@160 244
chris@160 245 static inline void __hv_atanh_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 246 #if HV_SIMD_AVX
chris@160 247 #warning __hv_atanh_f() not implemented
chris@160 248 #elif HV_SIMD_SSE
chris@160 249 #warning __hv_atanh_f() not implemented
chris@160 250 #elif HV_SIMD_NEON
chris@160 251 #warning __hv_atanh_f() not implemented
chris@160 252 #else // HV_SIMD_NONE
chris@160 253 *bOut = hv_atanh_f(bIn);
chris@160 254 #endif
chris@160 255 }
chris@160 256
chris@160 257 // NOTE(mhroth): use of sqrt is absolute and total MURDER. Make do with recipocal sqrt if possible!!
chris@160 258 static inline void __hv_sqrt_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 259 #if HV_SIMD_AVX
chris@160 260 *bOut = _mm256_sqrt_ps(bIn);
chris@160 261 #elif HV_SIMD_SSE
chris@160 262 *bOut = _mm_sqrt_ps(bIn);
chris@160 263 #elif HV_SIMD_NEON
chris@160 264 #warning __hv_sqrt_f() numerical results may be inexact
chris@160 265 *bOut = vrecpeq_f32(vrsqrteq_f32(bIn));
chris@160 266 #else // HV_SIMD_NONE
chris@160 267 *bOut = hv_sqrt_f(bIn);
chris@160 268 #endif
chris@160 269 }
chris@160 270
chris@160 271 static inline void __hv_rsqrt_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 272 #if HV_SIMD_AVX
chris@160 273 *bOut = _mm256_rsqrt_ps(bIn);
chris@160 274 #elif HV_SIMD_SSE
chris@160 275 *bOut = _mm_rsqrt_ps(bIn);
chris@160 276 #elif HV_SIMD_NEON
chris@160 277 #warning __hv_rsqrt_f() numerical results may be inexact
chris@160 278 *bOut = vrsqrteq_f32(bIn);
chris@160 279 #else // HV_SIMD_NONE
chris@160 280 *bOut = 1.0f/hv_sqrt_f(bIn);
chris@160 281 #endif
chris@160 282 }
chris@160 283
chris@160 284 static inline void __hv_abs_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 285 #if HV_SIMD_AVX
chris@160 286 *bOut = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), bIn);
chris@160 287 #elif HV_SIMD_SSE
chris@160 288 *bOut = _mm_andnot_ps(_mm_set1_ps(-0.0f), bIn); // == 1 << 31
chris@160 289 #elif HV_SIMD_NEON
chris@160 290 *bOut = vabsq_f32(bIn);
chris@160 291 #else // HV_SIMD_NONE
chris@160 292 *bOut = hv_abs_f(bIn);
chris@160 293 #endif
chris@160 294 }
chris@160 295
chris@160 296 static inline void __hv_exp_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 297 #if HV_SIMD_AVX
chris@160 298 #warning __hv_exp_f() not implemented
chris@160 299 #elif HV_SIMD_SSE
chris@160 300 #warning __hv_exp_f() not implemented
chris@160 301 #elif HV_SIMD_NEON
chris@160 302 #warning __hv_exp_f() not implemented
chris@160 303 #else // HV_SIMD_NONE
chris@160 304 *bOut = hv_exp_f(bIn);
chris@160 305 #endif
chris@160 306 }
chris@160 307
chris@160 308 static inline void __hv_ceil_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 309 #if HV_SIMD_AVX
chris@160 310 *bOut = _mm256_ceil_ps(bIn);
chris@160 311 #elif HV_SIMD_SSE
chris@160 312 *bOut = _mm_ceil_ps(bIn);
chris@160 313 #elif HV_SIMD_NEON
chris@160 314 #if __ARM_ARCH >= 8
chris@160 315 *bOut = vrndpq_f32(bIn);
chris@160 316 #else
chris@160 317 #warning A slow NEON implementation of __hv_ceil_f() is being used because the necessary intrinsic cannot be found. It is only available in ARMv8.
chris@160 318 *bOut = (float32x4_t) {hv_ceil_f(bIn[0]), hv_ceil_f(bIn[1]), hv_ceil_f(bIn[2]), hv_ceil_f(bIn[3])};
chris@160 319 #endif // vrndpq_f32
chris@160 320 #else // HV_SIMD_NONE
chris@160 321 *bOut = hv_ceil_f(bIn);
chris@160 322 #endif
chris@160 323 }
chris@160 324
chris@160 325 static inline void __hv_floor_f(hv_bInf_t bIn, hv_bOutf_t bOut) {
chris@160 326 #if HV_SIMD_AVX
chris@160 327 *bOut = _mm256_floor_ps(bIn);
chris@160 328 #elif HV_SIMD_SSE
chris@160 329 *bOut = _mm_floor_ps(bIn);
chris@160 330 #elif HV_SIMD_NEON
chris@160 331 #if __ARM_ARCH >= 8
chris@160 332 *bOut = vrndmq_f32(bIn);
chris@160 333 #else
chris@160 334 #warning A slow NEON implementation of __hv_floor_f() is being used because the necessary intrinsic cannot be found. It is only available in ARMv8.
chris@160 335 *bOut = (float32x4_t) {hv_floor_f(bIn[0]), hv_floor_f(bIn[1]), hv_floor_f(bIn[2]), hv_floor_f(bIn[3])};
chris@160 336 #endif // vrndmq_f32
chris@160 337 #else // HV_SIMD_NONE
chris@160 338 *bOut = hv_floor_f(bIn);
chris@160 339 #endif
chris@160 340 }
chris@160 341
chris@160 342 // __add~f
chris@160 343 static inline void __hv_add_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 344 #if HV_SIMD_AVX
chris@160 345 *bOut = _mm256_add_ps(bIn0, bIn1);
chris@160 346 #elif HV_SIMD_SSE
chris@160 347 *bOut = _mm_add_ps(bIn0, bIn1);
chris@160 348 #elif HV_SIMD_NEON
chris@160 349 *bOut = vaddq_f32(bIn0, bIn1);
chris@160 350 #else // HV_SIMD_NONE
chris@160 351 *bOut = bIn0 + bIn1;
chris@160 352 #endif
chris@160 353 }
chris@160 354
chris@160 355 // __add~i
chris@160 356 static inline void __hv_add_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
chris@160 357 #if HV_SIMD_AVX
chris@160 358 __m128i x = _mm_add_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
chris@160 359 __m128i y = _mm_add_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
chris@160 360 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
chris@160 361 #elif HV_SIMD_SSE
chris@160 362 *bOut = _mm_add_epi32(bIn0, bIn1);
chris@160 363 #elif HV_SIMD_NEON
chris@160 364 *bOut = vaddq_s32(bIn0, bIn1);
chris@160 365 #else // HV_SIMD_NONE
chris@160 366 *bOut = bIn0 + bIn1;
chris@160 367 #endif
chris@160 368 }
chris@160 369
chris@160 370 // __sub~f
chris@160 371 static inline void __hv_sub_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 372 #if HV_SIMD_AVX
chris@160 373 *bOut = _mm256_sub_ps(bIn0, bIn1);
chris@160 374 #elif HV_SIMD_SSE
chris@160 375 *bOut = _mm_sub_ps(bIn0, bIn1);
chris@160 376 #elif HV_SIMD_NEON
chris@160 377 *bOut = vsubq_f32(bIn0, bIn1);
chris@160 378 #else // HV_SIMD_NONE
chris@160 379 *bOut = bIn0 - bIn1;
chris@160 380 #endif
chris@160 381 }
chris@160 382
chris@160 383 // __mul~f
chris@160 384 static inline void __hv_mul_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 385 #if HV_SIMD_AVX
chris@160 386 *bOut = _mm256_mul_ps(bIn0, bIn1);
chris@160 387 #elif HV_SIMD_SSE
chris@160 388 *bOut = _mm_mul_ps(bIn0, bIn1);
chris@160 389 #elif HV_SIMD_NEON
chris@160 390 *bOut = vmulq_f32(bIn0, bIn1);
chris@160 391 #else // HV_SIMD_NONE
chris@160 392 *bOut = bIn0 * bIn1;
chris@160 393 #endif
chris@160 394 }
chris@160 395
chris@160 396 // __*~i
chris@160 397 static inline void __hv_mul_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
chris@160 398 #if HV_SIMD_AVX
chris@160 399 __m128i x = _mm_mullo_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
chris@160 400 __m128i y = _mm_mullo_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
chris@160 401 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
chris@160 402 #elif HV_SIMD_SSE
chris@160 403 *bOut = _mm_mullo_epi32(bIn0, bIn1);
chris@160 404 #elif HV_SIMD_NEON
chris@160 405 *bOut = vmulq_s32(bIn0, bIn1);
chris@160 406 #else // HV_SIMD_NONE
chris@160 407 *bOut = bIn0 * bIn1;
chris@160 408 #endif
chris@160 409 }
chris@160 410
chris@160 411 // __cast~if
chris@160 412 static inline void __hv_cast_if(hv_bIni_t bIn, hv_bOutf_t bOut) {
chris@160 413 #if HV_SIMD_AVX
chris@160 414 *bOut = _mm256_cvtepi32_ps(bIn);
chris@160 415 #elif HV_SIMD_SSE
chris@160 416 *bOut = _mm_cvtepi32_ps(bIn);
chris@160 417 #elif HV_SIMD_NEON
chris@160 418 *bOut = vcvtq_f32_s32(bIn);
chris@160 419 #else // HV_SIMD_NONE
chris@160 420 *bOut = (float) bIn;
chris@160 421 #endif
chris@160 422 }
chris@160 423
chris@160 424 // __cast~fi
chris@160 425 static inline void __hv_cast_fi(hv_bInf_t bIn, hv_bOuti_t bOut) {
chris@160 426 #if HV_SIMD_AVX
chris@160 427 *bOut = _mm256_cvtps_epi32(bIn);
chris@160 428 #elif HV_SIMD_SSE
chris@160 429 *bOut = _mm_cvtps_epi32(bIn);
chris@160 430 #elif HV_SIMD_NEON
chris@160 431 *bOut = vcvtq_s32_f32(bIn);
chris@160 432 #else // HV_SIMD_NONE
chris@160 433 *bOut = (int) bIn;
chris@160 434 #endif
chris@160 435 }
chris@160 436
chris@160 437 static inline void __hv_div_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 438 #if HV_SIMD_AVX
chris@160 439 *bOut = _mm256_div_ps(bIn0, bIn1);
chris@160 440 #elif HV_SIMD_SSE
chris@160 441 *bOut = _mm_div_ps(bIn0, bIn1);
chris@160 442 #elif HV_SIMD_NEON
chris@160 443 #warning __hv_div_f() numerical results may be inexact
chris@160 444 *bOut = vmulq_f32(bIn0, vrecpeq_f32(bIn1));
chris@160 445 #else // HV_SIMD_NONE
chris@160 446 *bOut = (bIn1 != 0.0f) ? (bIn0 / bIn1) : 0.0f;
chris@160 447 #endif
chris@160 448 }
chris@160 449
chris@160 450 static inline void __hv_min_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 451 #if HV_SIMD_AVX
chris@160 452 *bOut = _mm256_min_ps(bIn0, bIn1);
chris@160 453 #elif HV_SIMD_SSE
chris@160 454 *bOut = _mm_min_ps(bIn0, bIn1);
chris@160 455 #elif HV_SIMD_NEON
chris@160 456 *bOut = vminq_f32(bIn0, bIn1);
chris@160 457 #else // HV_SIMD_NONE
chris@160 458 *bOut = hv_min_f(bIn0, bIn1);
chris@160 459 #endif
chris@160 460 }
chris@160 461
chris@160 462 static inline void __hv_min_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
chris@160 463 #if HV_SIMD_AVX
chris@160 464 __m128i x = _mm_min_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
chris@160 465 __m128i y = _mm_min_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
chris@160 466 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
chris@160 467 #elif HV_SIMD_SSE
chris@160 468 *bOut = _mm_min_epi32(bIn0, bIn1);
chris@160 469 #elif HV_SIMD_NEON
chris@160 470 *bOut = vminq_s32(bIn0, bIn1);
chris@160 471 #else // HV_SIMD_NONE
chris@160 472 *bOut = hv_min_i(bIn0, bIn1);
chris@160 473 #endif
chris@160 474 }
chris@160 475
chris@160 476 static inline void __hv_max_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 477 #if HV_SIMD_AVX
chris@160 478 *bOut = _mm256_max_ps(bIn0, bIn1);
chris@160 479 #elif HV_SIMD_SSE
chris@160 480 *bOut = _mm_max_ps(bIn0, bIn1);
chris@160 481 #elif HV_SIMD_NEON
chris@160 482 *bOut = vmaxq_f32(bIn0, bIn1);
chris@160 483 #else // HV_SIMD_NONE
chris@160 484 *bOut = hv_max_f(bIn0, bIn1);
chris@160 485 #endif
chris@160 486 }
chris@160 487
chris@160 488 static inline void __hv_max_i(hv_bIni_t bIn0, hv_bIni_t bIn1, hv_bOuti_t bOut) {
chris@160 489 #if HV_SIMD_AVX
chris@160 490 __m128i x = _mm_max_epi32(_mm256_castsi256_si128(bIn0), _mm256_castsi256_si128(bIn1));
chris@160 491 __m128i y = _mm_max_epi32(_mm256_extractf128_si256(bIn0, 1), _mm256_extractf128_si256(bIn1, 1));
chris@160 492 *bOut = _mm256_insertf128_si256(_mm256_castsi128_si256(x), y, 1);
chris@160 493 #elif HV_SIMD_SSE
chris@160 494 *bOut = _mm_max_epi32(bIn0, bIn1);
chris@160 495 #elif HV_SIMD_NEON
chris@160 496 *bOut = vmaxq_s32(bIn0, bIn1);
chris@160 497 #else // HV_SIMD_NONE
chris@160 498 *bOut = hv_max_i(bIn0, bIn1);
chris@160 499 #endif
chris@160 500 }
chris@160 501
chris@160 502 static inline void __hv_pow_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 503 #if HV_SIMD_AVX
chris@160 504 *bOut = _mm256_set_ps(
chris@160 505 hv_pow_f(bIn0[7], bIn1[7]),
chris@160 506 hv_pow_f(bIn0[6], bIn1[6]),
chris@160 507 hv_pow_f(bIn0[5], bIn1[5]),
chris@160 508 hv_pow_f(bIn0[4], bIn1[4]),
chris@160 509 hv_pow_f(bIn0[3], bIn1[3]),
chris@160 510 hv_pow_f(bIn0[2], bIn1[2]),
chris@160 511 hv_pow_f(bIn0[1], bIn1[1]),
chris@160 512 hv_pow_f(bIn0[0], bIn1[0]));
chris@160 513 #elif HV_SIMD_SSE
chris@160 514 *bOut = _mm_set_ps(
chris@160 515 hv_pow_f(bIn0[3], bIn1[3]),
chris@160 516 hv_pow_f(bIn0[2], bIn1[2]),
chris@160 517 hv_pow_f(bIn0[1], bIn1[1]),
chris@160 518 hv_pow_f(bIn0[0], bIn1[0]));
chris@160 519 #elif HV_SIMD_NEON
chris@160 520 *bOut = (float32x4_t) {
chris@160 521 hv_pow_f(bIn0[0], bIn1[0]),
chris@160 522 hv_pow_f(bIn0[1], bIn1[1]),
chris@160 523 hv_pow_f(bIn0[2], bIn1[2]),
chris@160 524 hv_pow_f(bIn0[3], bIn1[3])};
chris@160 525 #else // HV_SIMD_NONE
chris@160 526 *bOut = hv_pow_f(bIn0, bIn1);
chris@160 527 #endif
chris@160 528 }
chris@160 529
chris@160 530 static inline void __hv_gt_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 531 #if HV_SIMD_AVX
chris@160 532 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_GT_OQ);
chris@160 533 #elif HV_SIMD_SSE
chris@160 534 *bOut = _mm_cmpgt_ps(bIn0, bIn1);
chris@160 535 #elif HV_SIMD_NEON
chris@160 536 *bOut = vreinterpretq_f32_u32(vcgtq_f32(bIn0, bIn1));
chris@160 537 #else // HV_SIMD_NONE
chris@160 538 *bOut = (bIn0 > bIn1) ? 1.0f : 0.0f;
chris@160 539 #endif
chris@160 540 }
chris@160 541
chris@160 542 static inline void __hv_gte_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 543 #if HV_SIMD_AVX
chris@160 544 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_GE_OQ);
chris@160 545 #elif HV_SIMD_SSE
chris@160 546 *bOut = _mm_cmpge_ps(bIn0, bIn1);
chris@160 547 #elif HV_SIMD_NEON
chris@160 548 *bOut = vreinterpretq_f32_u32(vcgeq_f32(bIn0, bIn1));
chris@160 549 #else // HV_SIMD_NONE
chris@160 550 *bOut = (bIn0 >= bIn1) ? 1.0f : 0.0f;
chris@160 551 #endif
chris@160 552 }
chris@160 553
chris@160 554 static inline void __hv_lt_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 555 #if HV_SIMD_AVX
chris@160 556 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_LT_OQ);
chris@160 557 #elif HV_SIMD_SSE
chris@160 558 *bOut = _mm_cmplt_ps(bIn0, bIn1);
chris@160 559 #elif HV_SIMD_NEON
chris@160 560 *bOut = vreinterpretq_f32_u32(vcltq_f32(bIn0, bIn1));
chris@160 561 #else // HV_SIMD_NONE
chris@160 562 *bOut = (bIn0 < bIn1) ? 1.0f : 0.0f;
chris@160 563 #endif
chris@160 564 }
chris@160 565
chris@160 566 static inline void __hv_lte_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 567 #if HV_SIMD_AVX
chris@160 568 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_LE_OQ);
chris@160 569 #elif HV_SIMD_SSE
chris@160 570 *bOut = _mm_cmple_ps(bIn0, bIn1);
chris@160 571 #elif HV_SIMD_NEON
chris@160 572 *bOut = vreinterpretq_f32_u32(vcleq_f32(bIn0, bIn1));
chris@160 573 #else // HV_SIMD_NONE
chris@160 574 *bOut = (bIn0 <= bIn1) ? 1.0f : 0.0f;
chris@160 575 #endif
chris@160 576 }
chris@160 577
chris@160 578 static inline void __hv_neq_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 579 #if HV_SIMD_AVX
chris@160 580 *bOut = _mm256_cmp_ps(bIn0, bIn1, _CMP_NEQ_OQ);
chris@160 581 #elif HV_SIMD_SSE
chris@160 582 *bOut = _mm_cmpneq_ps(bIn0, bIn1);
chris@160 583 #elif HV_SIMD_NEON
chris@160 584 *bOut = vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(bIn0, bIn1)));
chris@160 585 #else // HV_SIMD_NONE
chris@160 586 *bOut = (bIn0 != bIn1) ? 1.0f : 0.0f;
chris@160 587 #endif
chris@160 588 }
chris@160 589
chris@160 590 static inline void __hv_xor_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 591 #if HV_SIMD_AVX
chris@160 592 #warning __hv_xor_f() not implemented
chris@160 593 #elif HV_SIMD_SSE
chris@160 594 #warning __hv_xor_f() not implemented
chris@160 595 #elif HV_SIMD_NEON
chris@160 596 #warning __hv_xor_f() not implemented
chris@160 597 #else // HV_SIMD_NONE
chris@160 598 *bOut = (float) (((int) bIn0) ^ ((int) bIn1));
chris@160 599 #endif
chris@160 600 }
chris@160 601
chris@160 602 static inline void __hv_and_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bOutf_t bOut) {
chris@160 603 #if HV_SIMD_AVX
chris@160 604 *bOut = _mm256_and_ps(bIn1, bIn0);
chris@160 605 #elif HV_SIMD_SSE
chris@160 606 *bOut = _mm_and_ps(bIn1, bIn0);
chris@160 607 #elif HV_SIMD_NEON
chris@160 608 *bOut = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(bIn1), vreinterpretq_u32_f32(bIn0)));
chris@160 609 #else // HV_SIMD_NONE
chris@160 610 if (bIn0 == 0.0f || bIn1 == 0.0f) *bOut = 0.0f;
chris@160 611 else if (bIn0 == 1.0f) *bOut = bIn1;
chris@160 612 else if (bIn1 == 1.0f) *bOut = bIn0;
chris@160 613 else hv_assert(0); // TODO(mhroth): floating point & is pretty much a bad idea, only used for if~
chris@160 614 #endif
chris@160 615 }
chris@160 616
chris@160 617 // bOut = (bIn0 * bIn1) + bIn2
chris@160 618 static inline void __hv_fma_f(hv_bInf_t bIn0, hv_bInf_t bIn1, hv_bInf_t bIn2, hv_bOutf_t bOut) {
chris@160 619 #if HV_SIMD_AVX
chris@160 620 #if HV_SIMD_FMA
chris@160 621 *bOut = _mm256_fmadd_ps(bIn0, bIn1, bIn2);
chris@160 622 #else
chris@160 623 *bOut = _mm256_add_ps(_mm256_mul_ps(bIn0, bIn1), bIn2);
chris@160 624 #endif // HV_SIMD_FMA
chris@160 625 #elif HV_SIMD_SSE
chris@160 626 #if HV_SIMD_FMA
chris@160 627 *bOut = _mm_fmadd_ps(bIn0, bIn1, bIn2);
chris@160 628 #else
chris@160 629 *bOut = _mm_add_ps(_mm_mul_ps(bIn0, bIn1), bIn2);
chris@160 630 #endif // HV_SIMD_FMA
chris@160 631 #elif HV_SIMD_NEON
chris@160 632 #if __ARM_ARCH >= 8
chris@160 633 *bOut = vfmaq_f32(bIn2, bIn0, bIn1);
chris@160 634 #else
chris@160 635 // NOTE(mhroth): it turns out, fma SUUUUCKS on lesser ARM architectures
chris@160 636 // But in fact ideally fma would be disabled in ir2c for ARM architectures.
chris@160 637 // LLVM does a much better job handling fma than we do.
chris@160 638 *bOut = vaddq_f32(vmulq_f32(bIn0, bIn1), bIn2);
chris@160 639 #endif
chris@160 640 #else // HV_SIMD_NONE
chris@160 641 *bOut = hv_fma_f(bIn0, bIn1, bIn2);
chris@160 642 #endif
chris@160 643 }
chris@160 644
chris@160 645 #endif // _HEAVY_MATH_H_