cannam@154
|
1 /***********************************************************************
|
cannam@154
|
2 Copyright (c) 2017 Google Inc.
|
cannam@154
|
3 Redistribution and use in source and binary forms, with or without
|
cannam@154
|
4 modification, are permitted provided that the following conditions
|
cannam@154
|
5 are met:
|
cannam@154
|
6 - Redistributions of source code must retain the above copyright notice,
|
cannam@154
|
7 this list of conditions and the following disclaimer.
|
cannam@154
|
8 - Redistributions in binary form must reproduce the above copyright
|
cannam@154
|
9 notice, this list of conditions and the following disclaimer in the
|
cannam@154
|
10 documentation and/or other materials provided with the distribution.
|
cannam@154
|
11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
|
cannam@154
|
12 names of specific contributors, may be used to endorse or promote
|
cannam@154
|
13 products derived from this software without specific prior written
|
cannam@154
|
14 permission.
|
cannam@154
|
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
cannam@154
|
16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
cannam@154
|
17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
cannam@154
|
18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
cannam@154
|
19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
cannam@154
|
20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
cannam@154
|
21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
cannam@154
|
22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
cannam@154
|
23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
cannam@154
|
24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
cannam@154
|
25 POSSIBILITY OF SUCH DAMAGE.
|
cannam@154
|
26 ***********************************************************************/
|
cannam@154
|
27
|
cannam@154
|
28 #ifdef HAVE_CONFIG_H
|
cannam@154
|
29 #include "config.h"
|
cannam@154
|
30 #endif
|
cannam@154
|
31
|
cannam@154
|
32 #include <arm_neon.h>
|
cannam@154
|
33 #include "pitch.h"
|
cannam@154
|
34
|
cannam@154
|
35 #ifdef FIXED_POINT
|
cannam@154
|
36
|
cannam@154
|
37 opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
|
cannam@154
|
38 {
|
cannam@154
|
39 int i;
|
cannam@154
|
40 opus_val32 xy;
|
cannam@154
|
41 int16x8_t x_s16x8, y_s16x8;
|
cannam@154
|
42 int32x4_t xy_s32x4 = vdupq_n_s32(0);
|
cannam@154
|
43 int64x2_t xy_s64x2;
|
cannam@154
|
44 int64x1_t xy_s64x1;
|
cannam@154
|
45
|
cannam@154
|
46 for (i = 0; i < N - 7; i += 8) {
|
cannam@154
|
47 x_s16x8 = vld1q_s16(&x[i]);
|
cannam@154
|
48 y_s16x8 = vld1q_s16(&y[i]);
|
cannam@154
|
49 xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
|
cannam@154
|
50 xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
|
cannam@154
|
51 }
|
cannam@154
|
52
|
cannam@154
|
53 if (N - i >= 4) {
|
cannam@154
|
54 const int16x4_t x_s16x4 = vld1_s16(&x[i]);
|
cannam@154
|
55 const int16x4_t y_s16x4 = vld1_s16(&y[i]);
|
cannam@154
|
56 xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
|
cannam@154
|
57 i += 4;
|
cannam@154
|
58 }
|
cannam@154
|
59
|
cannam@154
|
60 xy_s64x2 = vpaddlq_s32(xy_s32x4);
|
cannam@154
|
61 xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
|
cannam@154
|
62 xy = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);
|
cannam@154
|
63
|
cannam@154
|
64 for (; i < N; i++) {
|
cannam@154
|
65 xy = MAC16_16(xy, x[i], y[i]);
|
cannam@154
|
66 }
|
cannam@154
|
67
|
cannam@154
|
68 #ifdef OPUS_CHECK_ASM
|
cannam@154
|
69 celt_assert(celt_inner_prod_c(x, y, N) == xy);
|
cannam@154
|
70 #endif
|
cannam@154
|
71
|
cannam@154
|
72 return xy;
|
cannam@154
|
73 }
|
cannam@154
|
74
|
cannam@154
|
75 void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
|
cannam@154
|
76 int N, opus_val32 *xy1, opus_val32 *xy2)
|
cannam@154
|
77 {
|
cannam@154
|
78 int i;
|
cannam@154
|
79 opus_val32 xy01, xy02;
|
cannam@154
|
80 int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
|
cannam@154
|
81 int32x4_t xy01_s32x4 = vdupq_n_s32(0);
|
cannam@154
|
82 int32x4_t xy02_s32x4 = vdupq_n_s32(0);
|
cannam@154
|
83 int64x2_t xy01_s64x2, xy02_s64x2;
|
cannam@154
|
84 int64x1_t xy01_s64x1, xy02_s64x1;
|
cannam@154
|
85
|
cannam@154
|
86 for (i = 0; i < N - 7; i += 8) {
|
cannam@154
|
87 x_s16x8 = vld1q_s16(&x[i]);
|
cannam@154
|
88 y01_s16x8 = vld1q_s16(&y01[i]);
|
cannam@154
|
89 y02_s16x8 = vld1q_s16(&y02[i]);
|
cannam@154
|
90 xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
|
cannam@154
|
91 xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
|
cannam@154
|
92 xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
|
cannam@154
|
93 xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
|
cannam@154
|
94 }
|
cannam@154
|
95
|
cannam@154
|
96 if (N - i >= 4) {
|
cannam@154
|
97 const int16x4_t x_s16x4 = vld1_s16(&x[i]);
|
cannam@154
|
98 const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
|
cannam@154
|
99 const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
|
cannam@154
|
100 xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
|
cannam@154
|
101 xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
|
cannam@154
|
102 i += 4;
|
cannam@154
|
103 }
|
cannam@154
|
104
|
cannam@154
|
105 xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
|
cannam@154
|
106 xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
|
cannam@154
|
107 xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
|
cannam@154
|
108 xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
|
cannam@154
|
109 xy01 = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
|
cannam@154
|
110 xy02 = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);
|
cannam@154
|
111
|
cannam@154
|
112 for (; i < N; i++) {
|
cannam@154
|
113 xy01 = MAC16_16(xy01, x[i], y01[i]);
|
cannam@154
|
114 xy02 = MAC16_16(xy02, x[i], y02[i]);
|
cannam@154
|
115 }
|
cannam@154
|
116 *xy1 = xy01;
|
cannam@154
|
117 *xy2 = xy02;
|
cannam@154
|
118
|
cannam@154
|
119 #ifdef OPUS_CHECK_ASM
|
cannam@154
|
120 {
|
cannam@154
|
121 opus_val32 xy1_c, xy2_c;
|
cannam@154
|
122 dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
|
cannam@154
|
123 celt_assert(xy1_c == *xy1);
|
cannam@154
|
124 celt_assert(xy2_c == *xy2);
|
cannam@154
|
125 }
|
cannam@154
|
126 #endif
|
cannam@154
|
127 }
|
cannam@154
|
128
|
cannam@154
|
129 #else /* !FIXED_POINT */
|
cannam@154
|
130
|
cannam@154
|
131 /* ========================================================================== */
|
cannam@154
|
132
|
cannam@154
|
133 #ifdef OPUS_CHECK_ASM
|
cannam@154
|
134
|
cannam@154
|
135 /* This part of the code simulates floating-point NEON operations. */
|
cannam@154
|
136
|
cannam@154
|
137 /* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */
|
cannam@154
|
138 /* operations of celt_inner_prod_neon(), and both functions should have bit */
|
cannam@154
|
139 /* exact output. */
|
cannam@154
|
140 static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
|
cannam@154
|
141 {
|
cannam@154
|
142 int i;
|
cannam@154
|
143 opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
|
cannam@154
|
144 for (i = 0; i < N - 3; i += 4) {
|
cannam@154
|
145 xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
|
cannam@154
|
146 xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
|
cannam@154
|
147 xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
|
cannam@154
|
148 xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
|
cannam@154
|
149 }
|
cannam@154
|
150 xy0 += xy2;
|
cannam@154
|
151 xy1 += xy3;
|
cannam@154
|
152 xy = xy0 + xy1;
|
cannam@154
|
153 for (; i < N; i++) {
|
cannam@154
|
154 xy = MAC16_16(xy, x[i], y[i]);
|
cannam@154
|
155 }
|
cannam@154
|
156 return xy;
|
cannam@154
|
157 }
|
cannam@154
|
158
|
cannam@154
|
159 /* dual_inner_prod_neon_float_c_simulation() simulates the floating-point */
|
cannam@154
|
160 /* operations of dual_inner_prod_neon(), and both functions should have bit */
|
cannam@154
|
161 /* exact output. */
|
cannam@154
|
162 static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
|
cannam@154
|
163 int N, opus_val32 *xy1, opus_val32 *xy2)
|
cannam@154
|
164 {
|
cannam@154
|
165 int i;
|
cannam@154
|
166 opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
|
cannam@154
|
167 for (i = 0; i < N - 3; i += 4) {
|
cannam@154
|
168 xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
|
cannam@154
|
169 xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
|
cannam@154
|
170 xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
|
cannam@154
|
171 xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
|
cannam@154
|
172 xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
|
cannam@154
|
173 xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
|
cannam@154
|
174 xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
|
cannam@154
|
175 xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
|
cannam@154
|
176 }
|
cannam@154
|
177 xy01_0 += xy01_2;
|
cannam@154
|
178 xy02_0 += xy02_2;
|
cannam@154
|
179 xy01_1 += xy01_3;
|
cannam@154
|
180 xy02_1 += xy02_3;
|
cannam@154
|
181 xy01 = xy01_0 + xy01_1;
|
cannam@154
|
182 xy02 = xy02_0 + xy02_1;
|
cannam@154
|
183 for (; i < N; i++) {
|
cannam@154
|
184 xy01 = MAC16_16(xy01, x[i], y01[i]);
|
cannam@154
|
185 xy02 = MAC16_16(xy02, x[i], y02[i]);
|
cannam@154
|
186 }
|
cannam@154
|
187 *xy1 = xy01;
|
cannam@154
|
188 *xy2 = xy02;
|
cannam@154
|
189 }
|
cannam@154
|
190
|
cannam@154
|
191 #endif /* OPUS_CHECK_ASM */
|
cannam@154
|
192
|
cannam@154
|
193 /* ========================================================================== */
|
cannam@154
|
194
|
cannam@154
|
195 opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
|
cannam@154
|
196 {
|
cannam@154
|
197 int i;
|
cannam@154
|
198 opus_val32 xy;
|
cannam@154
|
199 float32x4_t xy_f32x4 = vdupq_n_f32(0);
|
cannam@154
|
200 float32x2_t xy_f32x2;
|
cannam@154
|
201
|
cannam@154
|
202 for (i = 0; i < N - 7; i += 8) {
|
cannam@154
|
203 float32x4_t x_f32x4, y_f32x4;
|
cannam@154
|
204 x_f32x4 = vld1q_f32(&x[i]);
|
cannam@154
|
205 y_f32x4 = vld1q_f32(&y[i]);
|
cannam@154
|
206 xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
|
cannam@154
|
207 x_f32x4 = vld1q_f32(&x[i + 4]);
|
cannam@154
|
208 y_f32x4 = vld1q_f32(&y[i + 4]);
|
cannam@154
|
209 xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
|
cannam@154
|
210 }
|
cannam@154
|
211
|
cannam@154
|
212 if (N - i >= 4) {
|
cannam@154
|
213 const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
|
cannam@154
|
214 const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
|
cannam@154
|
215 xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
|
cannam@154
|
216 i += 4;
|
cannam@154
|
217 }
|
cannam@154
|
218
|
cannam@154
|
219 xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
|
cannam@154
|
220 xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
|
cannam@154
|
221 xy = vget_lane_f32(xy_f32x2, 0);
|
cannam@154
|
222
|
cannam@154
|
223 for (; i < N; i++) {
|
cannam@154
|
224 xy = MAC16_16(xy, x[i], y[i]);
|
cannam@154
|
225 }
|
cannam@154
|
226
|
cannam@154
|
227 #ifdef OPUS_CHECK_ASM
|
cannam@154
|
228 celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
|
cannam@154
|
229 #endif
|
cannam@154
|
230
|
cannam@154
|
231 return xy;
|
cannam@154
|
232 }
|
cannam@154
|
233
|
cannam@154
|
234 void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
|
cannam@154
|
235 int N, opus_val32 *xy1, opus_val32 *xy2)
|
cannam@154
|
236 {
|
cannam@154
|
237 int i;
|
cannam@154
|
238 opus_val32 xy01, xy02;
|
cannam@154
|
239 float32x4_t xy01_f32x4 = vdupq_n_f32(0);
|
cannam@154
|
240 float32x4_t xy02_f32x4 = vdupq_n_f32(0);
|
cannam@154
|
241 float32x2_t xy01_f32x2, xy02_f32x2;
|
cannam@154
|
242
|
cannam@154
|
243 for (i = 0; i < N - 7; i += 8) {
|
cannam@154
|
244 float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
|
cannam@154
|
245 x_f32x4 = vld1q_f32(&x[i]);
|
cannam@154
|
246 y01_f32x4 = vld1q_f32(&y01[i]);
|
cannam@154
|
247 y02_f32x4 = vld1q_f32(&y02[i]);
|
cannam@154
|
248 xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
|
cannam@154
|
249 xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
|
cannam@154
|
250 x_f32x4 = vld1q_f32(&x[i + 4]);
|
cannam@154
|
251 y01_f32x4 = vld1q_f32(&y01[i + 4]);
|
cannam@154
|
252 y02_f32x4 = vld1q_f32(&y02[i + 4]);
|
cannam@154
|
253 xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
|
cannam@154
|
254 xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
|
cannam@154
|
255 }
|
cannam@154
|
256
|
cannam@154
|
257 if (N - i >= 4) {
|
cannam@154
|
258 const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
|
cannam@154
|
259 const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
|
cannam@154
|
260 const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
|
cannam@154
|
261 xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
|
cannam@154
|
262 xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
|
cannam@154
|
263 i += 4;
|
cannam@154
|
264 }
|
cannam@154
|
265
|
cannam@154
|
266 xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
|
cannam@154
|
267 xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
|
cannam@154
|
268 xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
|
cannam@154
|
269 xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
|
cannam@154
|
270 xy01 = vget_lane_f32(xy01_f32x2, 0);
|
cannam@154
|
271 xy02 = vget_lane_f32(xy02_f32x2, 0);
|
cannam@154
|
272
|
cannam@154
|
273 for (; i < N; i++) {
|
cannam@154
|
274 xy01 = MAC16_16(xy01, x[i], y01[i]);
|
cannam@154
|
275 xy02 = MAC16_16(xy02, x[i], y02[i]);
|
cannam@154
|
276 }
|
cannam@154
|
277 *xy1 = xy01;
|
cannam@154
|
278 *xy2 = xy02;
|
cannam@154
|
279
|
cannam@154
|
280 #ifdef OPUS_CHECK_ASM
|
cannam@154
|
281 {
|
cannam@154
|
282 opus_val32 xy1_c, xy2_c;
|
cannam@154
|
283 dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
|
cannam@154
|
284 celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
|
cannam@154
|
285 celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
|
cannam@154
|
286 }
|
cannam@154
|
287 #endif
|
cannam@154
|
288 }
|
cannam@154
|
289
|
cannam@154
|
290 #endif /* FIXED_POINT */
|