comparison src/opus-1.3/celt/arm/pitch_neon_intr.c @ 69:7aeed7906520

Add Opus sources and macOS builds

author    Chris Cannam
date      Wed, 23 Jan 2019 13:48:08 +0000
parents
children
comparing 68:85d5306e114e with 69:7aeed7906520

/***********************************************************************
Copyright (c) 2017 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#include "pitch.h"

#ifdef FIXED_POINT

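/* Fixed-point inner product: accumulates x[i]*y[i] over N 16-bit samples,
   eight samples per loop iteration. */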
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy;
    int16x8_t x_s16x8, y_s16x8;
    int32x4_t xy_s32x4 = vdupq_n_s32(0);
    int64x2_t xy_s64x2;
    int64x1_t xy_s64x1;

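    /* Main loop: vmlal_s16 widens each 16x16-bit product to 32 bits and
       accumulates it into one of four 32-bit lanes, so individual products
       cannot overflow before accumulation. */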
    for (i = 0; i < N - 7; i += 8) {
        x_s16x8 = vld1q_s16(&x[i]);
        y_s16x8 = vld1q_s16(&y[i]);
        xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
        xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
    }

    if (N - i >= 4) {
        const int16x4_t x_s16x4 = vld1_s16(&x[i]);
        const int16x4_t y_s16x4 = vld1_s16(&y[i]);
        xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
        i += 4;
    }

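    /* Horizontal reduction: vpaddlq_s32 pairwise-adds the four 32-bit lanes
       into two 64-bit sums, which are then added together and truncated back
       to 32 bits. */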
    xy_s64x2 = vpaddlq_s32(xy_s32x4);
    xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
    xy = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);

    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    celt_assert(celt_inner_prod_c(x, y, N) == xy);
#endif

    return xy;
}

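/* For reference, the vector kernel above computes the same result as this
   scalar sketch (illustrative only, not part of the Opus API):

       opus_val32 acc = 0;
       for (i = 0; i < N; i++)
           acc = MAC16_16(acc, x[i], y[i]);

   Only the summation order differs, which is harmless here because wrapping
   32-bit integer addition is associative. */

/* dual_inner_prod_neon() computes two inner products against a shared x in
   a single pass, so each block of x is loaded once instead of twice. */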
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02;
    int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
    int32x4_t xy01_s32x4 = vdupq_n_s32(0);
    int32x4_t xy02_s32x4 = vdupq_n_s32(0);
    int64x2_t xy01_s64x2, xy02_s64x2;
    int64x1_t xy01_s64x1, xy02_s64x1;

    for (i = 0; i < N - 7; i += 8) {
        x_s16x8   = vld1q_s16(&x[i]);
        y01_s16x8 = vld1q_s16(&y01[i]);
        y02_s16x8 = vld1q_s16(&y02[i]);
        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
    }

    if (N - i >= 4) {
        const int16x4_t x_s16x4   = vld1_s16(&x[i]);
        const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
        const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
        xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
        xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
        i += 4;
    }

    xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
    xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
    xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
    xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
    xy01 = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
    xy02 = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);

    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    {
        opus_val32 xy1_c, xy2_c;
        dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
        celt_assert(xy1_c == *xy1);
        celt_assert(xy2_c == *xy2);
    }
#endif
}

#else /* !FIXED_POINT */

/* ========================================================================== */

#ifdef OPUS_CHECK_ASM

/* This part of the code simulates floating-point NEON operations. */

/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point  */
/* operations of celt_inner_prod_neon(), and both functions should have    */
/* bit-exact output.                                                       */
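/* Each of the four scalar accumulators below mirrors one lane of the NEON  */
/* float accumulator, and the final reduction repeats the NEON lane-combine */
/* order, so the floating-point roundings happen in the same sequence.      */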
static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
    for (i = 0; i < N - 3; i += 4) {
        xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
        xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
        xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
        xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
    }
    xy0 += xy2;
    xy1 += xy3;
    xy = xy0 + xy1;
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }
    return xy;
}

/* dual_inner_prod_neon_float_c_simulation() simulates the floating-point  */
/* operations of dual_inner_prod_neon(), and both functions should have    */
/* bit-exact output.                                                       */
static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
    for (i = 0; i < N - 3; i += 4) {
        xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
        xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
        xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
        xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
        xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
        xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
        xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
        xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
    }
    xy01_0 += xy01_2;
    xy02_0 += xy02_2;
    xy01_1 += xy01_3;
    xy02_1 += xy02_3;
    xy01 = xy01_0 + xy01_1;
    xy02 = xy02_0 + xy02_1;
    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;
}

#endif /* OPUS_CHECK_ASM */

/* ========================================================================== */

opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy;
    float32x4_t xy_f32x4 = vdupq_n_f32(0);
    float32x2_t xy_f32x2;

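    /* Main loop: two vmlaq_f32 multiply-accumulates per iteration process
       eight floats; lane k of xy_f32x4 gathers the elements whose index is
       congruent to k modulo 4. */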
    for (i = 0; i < N - 7; i += 8) {
        float32x4_t x_f32x4, y_f32x4;
        x_f32x4 = vld1q_f32(&x[i]);
        y_f32x4 = vld1q_f32(&y[i]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
        x_f32x4 = vld1q_f32(&x[i + 4]);
        y_f32x4 = vld1q_f32(&y[i + 4]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
    }

    if (N - i >= 4) {
        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
        i += 4;
    }

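    /* Reduce: add the high half of the vector to the low half, then
       vpadd_f32 sums the remaining pair, matching the reduction order of
       the C simulation above. */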
    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
    xy = vget_lane_f32(xy_f32x2, 0);

    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
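    /* The check allows a VERY_SMALL tolerance rather than exact equality,
       presumably because the compiler may contract vmlaq_f32 into a fused
       multiply-add, which rounds slightly differently. */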
    celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
#endif

    return xy;
}

void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02;
    float32x4_t xy01_f32x4 = vdupq_n_f32(0);
    float32x4_t xy02_f32x4 = vdupq_n_f32(0);
    float32x2_t xy01_f32x2, xy02_f32x2;

    for (i = 0; i < N - 7; i += 8) {
        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
        x_f32x4   = vld1q_f32(&x[i]);
        y01_f32x4 = vld1q_f32(&y01[i]);
        y02_f32x4 = vld1q_f32(&y02[i]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
        x_f32x4   = vld1q_f32(&x[i + 4]);
        y01_f32x4 = vld1q_f32(&y01[i + 4]);
        y02_f32x4 = vld1q_f32(&y02[i + 4]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
    }

    if (N - i >= 4) {
        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
        i += 4;
    }

    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
    xy01 = vget_lane_f32(xy01_f32x2, 0);
    xy02 = vget_lane_f32(xy02_f32x2, 0);

    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    {
        opus_val32 xy1_c, xy2_c;
        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
        celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
        celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
    }
#endif
}

#endif /* FIXED_POINT */