annotate src/opus-1.3/celt/arm/pitch_neon_intr.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 7aeed7906520
children
rev   line source
Chris@69 1 /***********************************************************************
Chris@69 2 Copyright (c) 2017 Google Inc.
Chris@69 3 Redistribution and use in source and binary forms, with or without
Chris@69 4 modification, are permitted provided that the following conditions
Chris@69 5 are met:
Chris@69 6 - Redistributions of source code must retain the above copyright notice,
Chris@69 7 this list of conditions and the following disclaimer.
Chris@69 8 - Redistributions in binary form must reproduce the above copyright
Chris@69 9 notice, this list of conditions and the following disclaimer in the
Chris@69 10 documentation and/or other materials provided with the distribution.
Chris@69 11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
Chris@69 12 names of specific contributors, may be used to endorse or promote
Chris@69 13 products derived from this software without specific prior written
Chris@69 14 permission.
Chris@69 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
Chris@69 16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
Chris@69 17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
Chris@69 18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
Chris@69 19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
Chris@69 20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
Chris@69 21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
Chris@69 22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
Chris@69 23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
Chris@69 24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
Chris@69 25 POSSIBILITY OF SUCH DAMAGE.
Chris@69 26 ***********************************************************************/
Chris@69 27
Chris@69 28 #ifdef HAVE_CONFIG_H
Chris@69 29 #include "config.h"
Chris@69 30 #endif
Chris@69 31
Chris@69 32 #include <arm_neon.h>
Chris@69 33 #include "pitch.h"
Chris@69 34
Chris@69 35 #ifdef FIXED_POINT
Chris@69 36
Chris@69 37 opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
Chris@69 38 {
Chris@69 39 int i;
Chris@69 40 opus_val32 xy;
Chris@69 41 int16x8_t x_s16x8, y_s16x8;
Chris@69 42 int32x4_t xy_s32x4 = vdupq_n_s32(0);
Chris@69 43 int64x2_t xy_s64x2;
Chris@69 44 int64x1_t xy_s64x1;
Chris@69 45
Chris@69 46 for (i = 0; i < N - 7; i += 8) {
Chris@69 47 x_s16x8 = vld1q_s16(&x[i]);
Chris@69 48 y_s16x8 = vld1q_s16(&y[i]);
Chris@69 49 xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
Chris@69 50 xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
Chris@69 51 }
Chris@69 52
Chris@69 53 if (N - i >= 4) {
Chris@69 54 const int16x4_t x_s16x4 = vld1_s16(&x[i]);
Chris@69 55 const int16x4_t y_s16x4 = vld1_s16(&y[i]);
Chris@69 56 xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
Chris@69 57 i += 4;
Chris@69 58 }
Chris@69 59
Chris@69 60 xy_s64x2 = vpaddlq_s32(xy_s32x4);
Chris@69 61 xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
Chris@69 62 xy = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);
Chris@69 63
Chris@69 64 for (; i < N; i++) {
Chris@69 65 xy = MAC16_16(xy, x[i], y[i]);
Chris@69 66 }
Chris@69 67
Chris@69 68 #ifdef OPUS_CHECK_ASM
Chris@69 69 celt_assert(celt_inner_prod_c(x, y, N) == xy);
Chris@69 70 #endif
Chris@69 71
Chris@69 72 return xy;
Chris@69 73 }
Chris@69 74
Chris@69 75 void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
Chris@69 76 int N, opus_val32 *xy1, opus_val32 *xy2)
Chris@69 77 {
Chris@69 78 int i;
Chris@69 79 opus_val32 xy01, xy02;
Chris@69 80 int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
Chris@69 81 int32x4_t xy01_s32x4 = vdupq_n_s32(0);
Chris@69 82 int32x4_t xy02_s32x4 = vdupq_n_s32(0);
Chris@69 83 int64x2_t xy01_s64x2, xy02_s64x2;
Chris@69 84 int64x1_t xy01_s64x1, xy02_s64x1;
Chris@69 85
Chris@69 86 for (i = 0; i < N - 7; i += 8) {
Chris@69 87 x_s16x8 = vld1q_s16(&x[i]);
Chris@69 88 y01_s16x8 = vld1q_s16(&y01[i]);
Chris@69 89 y02_s16x8 = vld1q_s16(&y02[i]);
Chris@69 90 xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
Chris@69 91 xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
Chris@69 92 xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
Chris@69 93 xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
Chris@69 94 }
Chris@69 95
Chris@69 96 if (N - i >= 4) {
Chris@69 97 const int16x4_t x_s16x4 = vld1_s16(&x[i]);
Chris@69 98 const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
Chris@69 99 const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
Chris@69 100 xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
Chris@69 101 xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
Chris@69 102 i += 4;
Chris@69 103 }
Chris@69 104
Chris@69 105 xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
Chris@69 106 xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
Chris@69 107 xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
Chris@69 108 xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
Chris@69 109 xy01 = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
Chris@69 110 xy02 = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);
Chris@69 111
Chris@69 112 for (; i < N; i++) {
Chris@69 113 xy01 = MAC16_16(xy01, x[i], y01[i]);
Chris@69 114 xy02 = MAC16_16(xy02, x[i], y02[i]);
Chris@69 115 }
Chris@69 116 *xy1 = xy01;
Chris@69 117 *xy2 = xy02;
Chris@69 118
Chris@69 119 #ifdef OPUS_CHECK_ASM
Chris@69 120 {
Chris@69 121 opus_val32 xy1_c, xy2_c;
Chris@69 122 dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
Chris@69 123 celt_assert(xy1_c == *xy1);
Chris@69 124 celt_assert(xy2_c == *xy2);
Chris@69 125 }
Chris@69 126 #endif
Chris@69 127 }
Chris@69 128
Chris@69 129 #else /* !FIXED_POINT */
Chris@69 130
Chris@69 131 /* ========================================================================== */
Chris@69 132
Chris@69 133 #ifdef OPUS_CHECK_ASM
Chris@69 134
Chris@69 135 /* This part of code simulates floating-point NEON operations. */
Chris@69 136
Chris@69 137 /* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */
Chris@69 138 /* operations of celt_inner_prod_neon(), and both functions should have bit */
Chris@69 139 /* exact output. */
Chris@69 140 static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
Chris@69 141 {
Chris@69 142 int i;
Chris@69 143 opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
Chris@69 144 for (i = 0; i < N - 3; i += 4) {
Chris@69 145 xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
Chris@69 146 xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
Chris@69 147 xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
Chris@69 148 xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
Chris@69 149 }
Chris@69 150 xy0 += xy2;
Chris@69 151 xy1 += xy3;
Chris@69 152 xy = xy0 + xy1;
Chris@69 153 for (; i < N; i++) {
Chris@69 154 xy = MAC16_16(xy, x[i], y[i]);
Chris@69 155 }
Chris@69 156 return xy;
Chris@69 157 }
Chris@69 158
Chris@69 159 /* dual_inner_prod_neon_float_c_simulation() simulates the floating-point */
Chris@69 160 /* operations of dual_inner_prod_neon(), and both functions should have bit */
Chris@69 161 /* exact output. */
Chris@69 162 static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
Chris@69 163 int N, opus_val32 *xy1, opus_val32 *xy2)
Chris@69 164 {
Chris@69 165 int i;
Chris@69 166 opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
Chris@69 167 for (i = 0; i < N - 3; i += 4) {
Chris@69 168 xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
Chris@69 169 xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
Chris@69 170 xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
Chris@69 171 xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
Chris@69 172 xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
Chris@69 173 xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
Chris@69 174 xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
Chris@69 175 xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
Chris@69 176 }
Chris@69 177 xy01_0 += xy01_2;
Chris@69 178 xy02_0 += xy02_2;
Chris@69 179 xy01_1 += xy01_3;
Chris@69 180 xy02_1 += xy02_3;
Chris@69 181 xy01 = xy01_0 + xy01_1;
Chris@69 182 xy02 = xy02_0 + xy02_1;
Chris@69 183 for (; i < N; i++) {
Chris@69 184 xy01 = MAC16_16(xy01, x[i], y01[i]);
Chris@69 185 xy02 = MAC16_16(xy02, x[i], y02[i]);
Chris@69 186 }
Chris@69 187 *xy1 = xy01;
Chris@69 188 *xy2 = xy02;
Chris@69 189 }
Chris@69 190
Chris@69 191 #endif /* OPUS_CHECK_ASM */
Chris@69 192
Chris@69 193 /* ========================================================================== */
Chris@69 194
Chris@69 195 opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
Chris@69 196 {
Chris@69 197 int i;
Chris@69 198 opus_val32 xy;
Chris@69 199 float32x4_t xy_f32x4 = vdupq_n_f32(0);
Chris@69 200 float32x2_t xy_f32x2;
Chris@69 201
Chris@69 202 for (i = 0; i < N - 7; i += 8) {
Chris@69 203 float32x4_t x_f32x4, y_f32x4;
Chris@69 204 x_f32x4 = vld1q_f32(&x[i]);
Chris@69 205 y_f32x4 = vld1q_f32(&y[i]);
Chris@69 206 xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
Chris@69 207 x_f32x4 = vld1q_f32(&x[i + 4]);
Chris@69 208 y_f32x4 = vld1q_f32(&y[i + 4]);
Chris@69 209 xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
Chris@69 210 }
Chris@69 211
Chris@69 212 if (N - i >= 4) {
Chris@69 213 const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
Chris@69 214 const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
Chris@69 215 xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
Chris@69 216 i += 4;
Chris@69 217 }
Chris@69 218
Chris@69 219 xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
Chris@69 220 xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
Chris@69 221 xy = vget_lane_f32(xy_f32x2, 0);
Chris@69 222
Chris@69 223 for (; i < N; i++) {
Chris@69 224 xy = MAC16_16(xy, x[i], y[i]);
Chris@69 225 }
Chris@69 226
Chris@69 227 #ifdef OPUS_CHECK_ASM
Chris@69 228 celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
Chris@69 229 #endif
Chris@69 230
Chris@69 231 return xy;
Chris@69 232 }
Chris@69 233
Chris@69 234 void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
Chris@69 235 int N, opus_val32 *xy1, opus_val32 *xy2)
Chris@69 236 {
Chris@69 237 int i;
Chris@69 238 opus_val32 xy01, xy02;
Chris@69 239 float32x4_t xy01_f32x4 = vdupq_n_f32(0);
Chris@69 240 float32x4_t xy02_f32x4 = vdupq_n_f32(0);
Chris@69 241 float32x2_t xy01_f32x2, xy02_f32x2;
Chris@69 242
Chris@69 243 for (i = 0; i < N - 7; i += 8) {
Chris@69 244 float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
Chris@69 245 x_f32x4 = vld1q_f32(&x[i]);
Chris@69 246 y01_f32x4 = vld1q_f32(&y01[i]);
Chris@69 247 y02_f32x4 = vld1q_f32(&y02[i]);
Chris@69 248 xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
Chris@69 249 xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
Chris@69 250 x_f32x4 = vld1q_f32(&x[i + 4]);
Chris@69 251 y01_f32x4 = vld1q_f32(&y01[i + 4]);
Chris@69 252 y02_f32x4 = vld1q_f32(&y02[i + 4]);
Chris@69 253 xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
Chris@69 254 xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
Chris@69 255 }
Chris@69 256
Chris@69 257 if (N - i >= 4) {
Chris@69 258 const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
Chris@69 259 const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
Chris@69 260 const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
Chris@69 261 xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
Chris@69 262 xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
Chris@69 263 i += 4;
Chris@69 264 }
Chris@69 265
Chris@69 266 xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
Chris@69 267 xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
Chris@69 268 xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
Chris@69 269 xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
Chris@69 270 xy01 = vget_lane_f32(xy01_f32x2, 0);
Chris@69 271 xy02 = vget_lane_f32(xy02_f32x2, 0);
Chris@69 272
Chris@69 273 for (; i < N; i++) {
Chris@69 274 xy01 = MAC16_16(xy01, x[i], y01[i]);
Chris@69 275 xy02 = MAC16_16(xy02, x[i], y02[i]);
Chris@69 276 }
Chris@69 277 *xy1 = xy01;
Chris@69 278 *xy2 = xy02;
Chris@69 279
Chris@69 280 #ifdef OPUS_CHECK_ASM
Chris@69 281 {
Chris@69 282 opus_val32 xy1_c, xy2_c;
Chris@69 283 dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
Chris@69 284 celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
Chris@69 285 celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
Chris@69 286 }
Chris@69 287 #endif
Chris@69 288 }
Chris@69 289
Chris@69 290 #endif /* FIXED_POINT */