yading@10
|
1 /*
|
yading@10
|
2 * Copyright (c) 2012 Mans Rullgard
|
yading@10
|
3 *
|
yading@10
|
4 * This file is part of Libav.
|
yading@10
|
5 *
|
yading@10
|
6 * Libav is free software; you can redistribute it and/or
|
yading@10
|
7 * modify it under the terms of the GNU Lesser General Public
|
yading@10
|
8 * License as published by the Free Software Foundation; either
|
yading@10
|
9 * version 2.1 of the License, or (at your option) any later version.
|
yading@10
|
10 *
|
yading@10
|
11 * Libav is distributed in the hope that it will be useful,
|
yading@10
|
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
yading@10
|
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
yading@10
|
14 * Lesser General Public License for more details.
|
yading@10
|
15 *
|
yading@10
|
16 * You should have received a copy of the GNU Lesser General Public
|
yading@10
|
17 * License along with Libav; if not, write to the Free Software
|
yading@10
|
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
yading@10
|
19 */
|
yading@10
|
20
|
yading@10
|
21 #include "libavutil/arm/asm.S"
|
yading@10
|
22
|
yading@10
|
/*
 * ff_sbr_sum64x5_neon
 *
 * In-place five-way sum over the float buffer addressed by r0: each of the
 * first 64 floats is replaced by itself plus the floats found 64, 128, 192
 * and 256 elements further on.
 *
 * In:    r0 = buffer base (all accesses carry a 128-bit alignment hint)
 * Clobb: r0-r3, r12, q0-q3, q8, flags (lr is pushed and restored via pc)
 */
function ff_sbr_sum64x5_neon, export=1
        push            {lr}
        add             r1,  r0,  #64*4         @ r1 -> element  64
        add             r2,  r0,  #128*4        @ r2 -> element 128
        add             r3,  r0,  #192*4        @ r3 -> element 192
        add             lr,  r0,  #256*4        @ lr -> element 256
        mov             r12, #64                @ floats still to produce
1:
        vld1.32         {q0},     [r0,:128]     @ no writeback: r0 is also the store address
        vld1.32         {q1},     [r1,:128]!
        vadd.f32        q0,  q0,  q1
        vld1.32         {q2},     [r2,:128]!
        vadd.f32        q0,  q0,  q2
        vld1.32         {q3},     [r3,:128]!
        vadd.f32        q0,  q0,  q3
        vld1.32         {q8},     [lr,:128]!
        vadd.f32        q0,  q0,  q8
        vst1.32         {q0},     [r0,:128]!    @ four sums written back in place
        subs            r12, r12, #4
        bgt             1b
        pop             {pc}
endfunc
|
yading@10
|
45
|
yading@10
|
/*
 * ff_sbr_sum_square_neon
 *
 * Sums the squares of the floats read from r0.  Every loop pass loads four
 * floats and subtracts 2 from the count in r1, so r1 presumably counts
 * pairs of floats (complex samples) - TODO confirm against the C prototype.
 *
 * In:    r0 = input (128-bit alignment hint), r1 = count
 * Out:   total in d0[0]; the NOVFP line also copies it to r0 (presumably
 *        assembled only when floats are not returned in VFP registers)
 * Clobb: r0, r1, q0, q1, flags
 *
 * The loop body runs before the count is tested, so four floats are always
 * read, even when r1 is zero or negative on entry.
 */
function ff_sbr_sum_square_neon, export=1
        vmov.f32        q0,  #0.0               @ four partial sums
1:
        vld1.32         {q1},     [r0,:128]!
        vmla.f32        q0,  q1,  q1            @ partial sums += x * x
        subs            r1,  r1,  #2
        bgt             1b
        vadd.f32        d0,  d0,  d1            @ fold four partial sums into two
        vpadd.f32       d0,  d0,  d0            @ ... and two into one
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc
|
yading@10
|
58
|
yading@10
|
/*
 * ff_sbr_neg_odd_64_neon
 *
 * Flips the sign bit of every odd-indexed float among the 64 floats at r0,
 * in place.  vld2 de-interleaves each group of eight floats so that the
 * odd-indexed ones land in a single q register, which is XORed with the
 * sign-bit mask and re-interleaved by vst2.
 *
 * In:    r0 = buffer (128-bit alignment hint)
 * Clobb: r0, r1, q0-q3, q8
 */
function ff_sbr_neg_odd_64_neon, export=1
        mov             r1,  r0                 @ r1 = store pointer, r0 = load pointer
        vmov.i32        q8,  #1<<31             @ sign-bit mask
        vld2.32         {q0,q1},  [r0,:128]!    @ q0 = even-indexed, q1 = odd-indexed
        veor            q1,  q1,  q8
        vld2.32         {q2,q3},  [r0,:128]!
    .rept 3                                     @ two groups of eight floats per repetition
        vst2.32         {q0,q1},  [r1,:128]!
        veor            q3,  q3,  q8
        vld2.32         {q0,q1},  [r0,:128]!
        vst2.32         {q2,q3},  [r1,:128]!
        veor            q1,  q1,  q8
        vld2.32         {q2,q3},  [r0,:128]!
    .endr
        veor            q3,  q3,  q8
        vst2.32         {q0,q1},  [r1,:128]!
        vst2.32         {q2,q3},  [r1,:128]!
        bx              lr
endfunc
|
yading@10
|
78
|
yading@10
|
/*
 * ff_sbr_qmf_pre_shuffle_neon
 *
 * Works on one float buffer z addressed by r0.  z[0] and z[1] are copied to
 * z[64] and z[65]; after that, interleaved pairs (-z[64-k], z[k+1]) are
 * written for k = 1..31, starting at z[66].
 *
 * Pointer roles:
 *   r0  forward read pointer (starts at z[0])
 *   r1  backward read pointer (starts at z[60], moves 16 bytes down per load)
 *   r2  write pointer (starts at z[64])
 *   r12 pairs left for the pipelined loop; the last seven pairs are produced
 *       by the straight-line tail after the loop
 *
 * The loop is software pipelined: q0 and d2 are loaded one step ahead, so
 * the instruction order must be kept as is.
 *
 * Clobb: r0-r3, r12, q0-q3, q8-q10, flags
 */
function ff_sbr_qmf_pre_shuffle_neon, export=1
        add             r1,  r0,  #60*4
        add             r2,  r0,  #64*4
        vld1.32         {d0},     [r0,:64]!
        vst1.32         {d0},     [r2,:64]!     @ z[64], z[65] = z[0], z[1]
        mov             r3,  #-16               @ backward stride for r1
        mov             r12, #24
        vmov.i32        q8,  #1<<31             @ sign-bit mask
        vld1.32         {q0},     [r1,:128], r3
        vld1.32         {d2},     [r0,:64]!
1:
        vld1.32         {d3,d4},  [r0,:128]!
        vrev64.32       q0,  q0                 @ vrev64 + vswp = reverse four floats
        vld1.32         {q9},     [r1,:128], r3
        veor            q0,  q0,  q8            @ negate
        vld1.32         {d5,d6},  [r0,:128]!
        vswp            d0,  d1
        vrev64.32       q9,  q9
        vst2.32         {q0,q1},  [r2,:64]!     @ interleave negated-reversed with forward
        vmov            q10, q2
        veor            q9,  q9,  q8
        vmov            d2,  d6                 @ carry the last two forward floats over
        vswp            d18, d19
        vld1.32         {q0},     [r1,:128], r3
        vst2.32         {q9,q10}, [r2,:64]!
        subs            r12, r12, #8
        bgt             1b
        @ tail: four pairs, then two pairs, then one single pair
        vld1.32         {d3,d4},  [r0,:128]!
        vrev64.32       q0,  q0
        vld1.32         {q9},     [r1,:128], r3
        veor            q0,  q0,  q8
        vld1.32         {d5},     [r0,:64]!
        vswp            d0,  d1
        vrev64.32       q9,  q9
        vst2.32         {q0,q1},  [r2,:64]!
        vswp            d4,  d5
        veor            q1,  q9,  q8
        vst2.32         {d3,d5},  [r2,:64]!
        vst2.32         {d2[0],d4[0]}, [r2,:64]!
        bx              lr
endfunc
|
yading@10
|
120
|
yading@10
|
/*
 * ff_sbr_qmf_post_shuffle_neon
 *
 * Writes 32 interleaved float pairs to r0; pair k is (-z[63-k], z[k]),
 * where z is the float buffer addressed by r1.
 *
 * Pointer roles:
 *   r0  output, advanced by every vst2
 *   r1  forward read pointer into z
 *   r2  backward read pointer, starts at z[60], moves 16 bytes down per load
 *   r12 pairs left; eight are written per loop pass
 *
 * q0/q1 (and q2/q3) are loaded one step ahead of their use, so the final
 * pass performs one extra pair of loads whose data is never stored.
 *
 * Clobb: r0-r3, r12, q0-q3, q8, flags
 */
function ff_sbr_qmf_post_shuffle_neon, export=1
        add             r2,  r1,  #60*4
        mov             r3,  #-16               @ backward stride for r2
        mov             r12, #32
        vmov.i32        q8,  #1<<31             @ sign-bit mask
        vld1.32         {q0},     [r2,:128], r3
        vld1.32         {q1},     [r1,:128]!
1:
        pld             [r2, #-32]
        vrev64.32       q0,  q0                 @ swap floats inside each half ...
        vswp            d2,  d3                 @ ... and swap the forward halves to match
        veor            q0,  q0,  q8            @ negate the backward-read values
        vld1.32         {q2},     [r2,:128], r3
        vld1.32         {q3},     [r1,:128]!
        vst2.32         {d1,d3},  [r0,:128]!    @ high half first = reversed order
        vst2.32         {d0,d2},  [r0,:128]!
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vswp            d6,  d7
        veor            q2,  q2,  q8
        vld1.32         {q0},     [r2,:128], r3
        vld1.32         {q1},     [r1,:128]!
        vst2.32         {d5,d7},  [r0,:128]!
        vst2.32         {d4,d6},  [r0,:128]!
        subs            r12, r12, #8
        bgt             1b
        bx              lr
endfunc
|
yading@10
|
149
|
yading@10
|
/*
 * ff_sbr_qmf_deint_neg_neon
 *
 * De-interleaves the 64 floats at r1 (src) into the 64 floats at r0 (v),
 * reading src from its end towards its start:
 *   v[i]      =  src[63 - 2*i]
 *   v[63 - i] = -src[62 - 2*i]        for i = 0..31
 *
 * Clobb: r0-r3, r12, d0-d2, flags
 */
function ff_sbr_qmf_deint_neg_neon, export=1
        add             r1,  r1,  #60*4         @ start at the last four source floats
        add             r2,  r0,  #62*4         @ r2 = descending store pointer
        mov             r3,  #-16               @ backward stride for r1
        mov             r12, #32
        vmov.i32        d2,  #1<<31             @ sign-bit mask
1:
        vld2.32         {d0,d1},  [r1,:128], r3 @ d0 = even-indexed, d1 = odd-indexed
        veor            d0,  d0,  d2            @ negate the even-indexed pair
        vrev64.32       d1,  d1                 @ reverse the odd-indexed pair
        vst1.32         {d0},     [r2,:64]
        vst1.32         {d1},     [r0,:64]!
        sub             r2,  r2,  #8
        subs            r12, r12, #2
        bgt             1b
        bx              lr
endfunc
|
yading@10
|
167
|
yading@10
|
/*
 * ff_sbr_qmf_deint_bfly_neon
 *
 * Butterfly of two 64-float inputs into one 128-float output:
 *   v[i]       = src0[i] - src1[63 - i]
 *   v[127 - i] = src0[i] + src1[63 - i]      for i = 0..63
 *
 * In:    r0 = v, r1 = src0 (read forwards), r2 = src1 (read backwards)
 * Clobb: r0-r3, r12, q0-q3, flags (lr is pushed and restored via pc)
 */
function ff_sbr_qmf_deint_bfly_neon, export=1
        push            {lr}
        add             r2,  r2,  #60*4         @ last four floats of src1
        add             r3,  r0,  #124*4        @ last four floats of v
        mov             r12, #64
        mov             lr,  #-16               @ backward stride for r2 and r3
1:
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q1},     [r2,:128], lr
        vrev64.32       q2,  q0
        vrev64.32       q3,  q1
        vadd.f32        d3,  d4,  d3            @ sums, in reversed element order
        vadd.f32        d2,  d5,  d2
        vsub.f32        d0,  d0,  d7            @ differences, in forward order
        vsub.f32        d1,  d1,  d6
        vst1.32         {q1},     [r3,:128], lr
        vst1.32         {q0},     [r0,:128]!
        subs            r12, r12, #4
        bgt             1b
        pop             {pc}
endfunc
|
yading@10
|
189
|
yading@10
|
/*
 * ff_sbr_hf_g_filt_neon
 *
 * Multiplies float pairs by per-pair scalar gains.  The pairs are read with
 * a stride of 40*2*4 bytes from r1, after r1 has been offset by 8 bytes
 * times the first stack argument.
 *
 * In:    r0   = output pairs
 *        r1   = input base
 *        r2   = gains, one float per pair
 *        r3   = number of pairs
 *        [sp] = index added to r1 (scaled by 8)
 * Clobb: r0-r3, r12, q0, q1, q3, flags
 *
 * The next input pair and gain are always loaded one step ahead.
 * NOTE(review): the loop body runs before the count is tested, so two
 * pairs are written even when r3 is 1 or less on entry - confirm callers
 * tolerate that.
 */
function ff_sbr_hf_g_filt_neon, export=1
        ldr             r12, [sp]
        add             r1,  r1,  r12, lsl #3
        mov             r12, #40*2*4            @ byte stride between input pairs
        sub             r3,  r3,  #1            @ ends at 0 when one pair is left over
        vld2.32         {d2[],d3[]},[r2,:64]!   @ d2 = gain 0 twice, d3 = gain 1 twice
        vld1.32         {d0},     [r1,:64], r12
1:
        vld1.32         {d1},     [r1,:64], r12
        vmul.f32        q3,  q0,  q1            @ two pairs times their gains
        vld2.32         {d2[],d3[]},[r2,:64]!
        vld1.32         {d0},     [r1,:64], r12
        vst1.32         {q3},     [r0,:64]!
        subs            r3,  r3,  #2
        bgt             1b
        it              lt
        bxlt            lr                      @ even count: nothing left over
        vmul.f32        d0,  d0,  d2            @ odd count: one final pair
        vst1.32         {d0},     [r0,:64]!
        bx              lr
endfunc
|
yading@10
|
211
|
yading@10
|
/*
 * ff_sbr_hf_gen_neon
 *
 * Filter over float pairs: each output pair is the input pair at the same
 * index plus multiply-accumulate terms built from the two preceding input
 * pairs and coefficients derived from the two-float vectors at r2 and r3
 * and a scalar factor.
 *
 * In:    r0 = output pairs, r1 = input pairs
 *        r2, r3 = pointers to two floats each (64-bit alignment hint)
 *        scalar factor: d0[0] on the VFP line, [sp] on the NOVFP line
 *        (presumably selected by the float argument passing convention)
 *        two further words on the stack at offset 4*!HAVE_VFP_ARGS:
 *        start index (into r2) and end index (into r3)
 * Clobb: r0-r3, r12 untouched, q0-q3, q8-q10, flags
 *
 * Two output pairs are produced per loop pass; q1 carries the previous two
 * input pairs into the next pass.  The instruction order is kept exactly.
 */
function ff_sbr_hf_gen_neon, export=1
NOVFP   vld1.32         {d1[]},   [sp,:32]
VFP     vdup.32         d1,  d0[0]
        vmul.f32        d0,  d1,  d1            @ d0 = factor squared, d1 = factor
        vld1.32         {d3},     [r2,:64]
        vld1.32         {d2},     [r3,:64]
        vmul.f32        q0,  q0,  q1            @ scale both coefficient pairs
        ldrd            r2,  r3,  [sp, #4*!HAVE_VFP_ARGS]
        vtrn.32         d0,  d1
        vneg.f32        d18, d1
        vtrn.32         d18, d1
        add             r0,  r0,  r2,  lsl #3   @ skip to the start index (8 bytes per pair)
        add             r1,  r1,  r2,  lsl #3
        sub             r1,  r1,  #2*8          @ back up two pairs for the filter history
        sub             r3,  r3,  r2            @ r3 = number of pairs to produce
        vld1.32         {q1},     [r1,:128]!
1:
        vld1.32         {q3},     [r1,:128]!
        vrev64.32       q2,  q1
        vmov            q8,  q3                 @ keep unmodified input for the next pass
        vrev64.32       d20, d3
        vrev64.32       d21, d6
        vmla.f32        q3,  q1,  d0[0]
        vmla.f32        d6,  d4,  d18
        vmla.f32        d7,  d20, d18
        vmla.f32        d6,  d3,  d0[1]
        vmla.f32        d7,  d16, d0[1]
        vmla.f32        d6,  d5,  d1
        vmla.f32        d7,  d21, d1
        vmov            q1,  q8
        vst1.32         {q3},     [r0,:128]!
        subs            r3,  r3,  #2
        bgt             1b
        bx              lr
endfunc
|
yading@10
|
247
|
yading@10
|
/*
 * ff_sbr_autocorrelate_neon
 *
 * Accumulates products between float pairs read sequentially from r0 and
 * their one- and two-pair-delayed neighbours, together with sums of
 * squares, then stores the reduced results at several offsets from r1.
 *
 * In:    r0 = input pairs (128-bit alignment hint), read two pairs per load
 * Out:   r1 = result area: four floats at +0, one float at +16, two floats
 *        at +24 and one float at +40 bytes
 * Clobb: r0, r1, r12, q0-q3, q8-q12, flags
 *
 * Accumulators: q1 and q3 hold cross products, q10 holds squares.  The
 * first and last input blocks are handled outside the loop.  The order of
 * the accumulate instructions is kept exactly as in the original.
 */
function ff_sbr_autocorrelate_neon, export=1
        vld1.32         {q0},     [r0,:128]!
        vmov.f32        q1,  #0.0
        vmov.f32        q3,  #0.0
        vmov.f32        d20, #0.0
        vmul.f32        d21, d1,  d1
        vmov            q8,  q0                 @ keep the first block for the epilogue
        vmov            q11, q0
        mov             r12, #36
1:
        vld1.32         {q2},     [r0,:128]!
        vrev64.32       q12, q2                 @ swapped copy for the cross terms
        vmla.f32        q10, q2,  q2
        vmla.f32        d2,  d1,  d4
        vmla.f32        d3,  d1,  d24
        vmla.f32        d6,  d0,  d4
        vmla.f32        d7,  d0,  d24
        vmla.f32        d2,  d4,  d5
        vmla.f32        d3,  d4,  d25
        vmla.f32        d6,  d1,  d5
        vmla.f32        d7,  d1,  d25
        vmov            q0,  q2                 @ current block becomes the history
        subs            r12, r12, #2
        bgt             1b
        @ epilogue: last block, then reduce and store
        vld1.32         {q2},     [r0,:128]!
        vrev64.32       q12, q2
        vmla.f32        d2,  d1,  d4
        vmla.f32        d3,  d1,  d24
        vmla.f32        d6,  d0,  d4
        vmla.f32        d7,  d0,  d24
        vadd.f32        d20, d20, d21
        vrev64.32       d18, d17
        vmla.f32        d6,  d1,  d5
        vmla.f32        d7,  d1,  d25
        vmov            q0,  q1
        vmla.f32        d0,  d16, d17
        vmla.f32        d1,  d16, d18
        vmla.f32        d2,  d4,  d5
        vmla.f32        d3,  d4,  d25
        vneg.f32        s15, s15
        vmov            d21, d20
        vpadd.f32       d0,  d0,  d2
        vpadd.f32       d7,  d6,  d7
        vtrn.32         d1,  d3
        vsub.f32        d6,  d1,  d3
        vmla.f32        d20, d22, d22
        vmla.f32        d21, d4,  d4
        vtrn.32         d0,  d6
        vpadd.f32       d20, d20, d21
        vst1.32         {q3},     [r1,:128]!
        vst1.32         {d20[1]}, [r1,:32]
        add             r1,  r1,  #2*4
        vst1.32         {d0},     [r1,:64]
        add             r1,  r1,  #4*4
        vst1.32         {d20[0]}, [r1,:32]
        bx              lr
endfunc
|
yading@10
|
305
|
yading@10
|
/*
 * ff_sbr_hf_apply_noise_0_neon
 *
 * For every float pair at r0: when the matching float at r1 is non-zero,
 * that float (XORed with the sign mask in d3) is added to the first
 * component of the pair; otherwise a pair from ff_sbr_noise_table, scaled
 * by the matching float at r2, is added to both components.
 *
 * In:    r0      = pairs, updated in place
 *        r1      = per-pair addend (zero selects the noise branch)
 *        r2      = per-pair noise scale
 *        r3      = noise table index; 1 is added and the value wrapped to
 *                  9 bits before first use
 *        [sp+4]  = number of pairs (read at [sp, #12] after the push)
 * Clobb: r0-r3, r12, q0-q3, q8, d18, flags (r4 and lr are saved)
 *
 * .Lhf_apply_noise_0 is a shared entry point: it expects d3 to hold the
 * sign mask and nothing pushed yet.  This function enters with d3 = 0.
 *
 * Fix: the odd-count tail used to load the noise scale into d3, which
 * destroyed the sign mask right before the veor that applies it, so the
 * addend became (r1 value XOR r2 value).  The scale now goes to d18, as in
 * the main loop.
 *
 * NOTE(review): the loop body runs before the count is tested, so two
 * pairs are processed even when the count is 1 or less.
 * NOTE(review): with r3 == 511 the two-entry table load also reads entry
 * 512 - confirm the table is padded or that this cannot occur.
 */
function ff_sbr_hf_apply_noise_0_neon, export=1
        vmov.i32        d3,  #0                 @ sign mask: keep the addend sign
.Lhf_apply_noise_0:
        push            {r4,lr}
        movrelx         r4,  X(ff_sbr_noise_table)
        ldr             r12, [sp, #12]          @ pair count (second stack argument)
        add             r3,  r3,  #1
        bfc             r3,  #9,  #23           @ wrap the index to 0..511
        sub             r12, r12, #1            @ ends at 0 when one pair is left over
1:
        add             lr,  r4,  r3,  lsl #3   @ lr -> noise entry r3 (8 bytes each)
        vld2.32         {q0},     [r0,:64]      @ d0 = first components, d1 = second
        vld2.32         {q3},     [lr,:64]      @ same split for two noise entries
        vld1.32         {d2},     [r1,:64]!
        vld1.32         {d18},    [r2,:64]!
        vceq.f32        d16, d2,  #0            @ all-ones lanes select the noise branch
        veor            d2,  d2,  d3            @ apply the sign mask to the addend
        vmov            q2,  q0
        vmla.f32        d0,  d6,  d18           @ noise branch, first component
        vmla.f32        d1,  d7,  d18           @ noise branch, second component
        vadd.f32        d4,  d4,  d2            @ addend branch, first component only
        add             r3,  r3,  #2
        bfc             r3,  #9,  #23
        vbif            d0,  d4,  d16           @ take the addend result where r1 != 0
        vbif            d1,  d5,  d16
        vst2.32         {q0},     [r0,:64]!
        subs            r12, r12, #2
        bgt             1b
        blt             2f                      @ even count: done
        @ odd count: one final pair, laid out as d0 = (first, second)
        add             lr,  r4,  r3,  lsl #3
        vld1.32         {d0},     [r0,:64]
        vld1.32         {d6},     [lr,:64]
        vld1.32         {d2[]},   [r1,:32]!
        vld1.32         {d18[]},  [r2,:32]!     @ scale goes to d18; d3 must survive
        vceq.f32        d4,  d2,  #0
        veor            d2,  d2,  d3
        vmov            d1,  d0
        vmla.f32        d0,  d6,  d18
        vadd.f32        s2,  s2,  s4            @ d1[0] += masked addend
        vbif            d0,  d1,  d4
        vst1.32         {d0},     [r0,:64]!
2:
        pop             {r4,pc}
endfunc
|
yading@10
|
350
|
yading@10
|
/*
 * ff_sbr_hf_apply_noise_1_neon
 *
 * For every float pair at r0: when the matching float at r1 is non-zero,
 * that float is added to the second component of the pair with a sign that
 * alternates from pair to pair; otherwise a pair from ff_sbr_noise_table,
 * scaled by the matching float at r2, is added to both components.
 *
 * In:    r0      = pairs, updated in place
 *        r1      = per-pair addend (zero selects the noise branch)
 *        r2      = per-pair noise scale
 *        r3      = noise table index; 1 is added and the value wrapped to
 *                  9 bits before first use
 *        [sp]    = word whose bit 0 picks the sign of the first pair
 *        [sp+4]  = number of pairs (read at [sp, #12] after the push)
 * Clobb: r0-r3, r12, q0-q3, q8, d18, flags (r4 and lr are saved)
 *
 * d3 holds two sign masks: lane 0 for pairs at even positions, lane 1 for
 * pairs at odd positions.  .Lhf_apply_noise_1 is a shared entry point that
 * expects d3 set up and {r4,lr} already pushed.
 *
 * Fix: the odd-count tail handles a pair at an even position, so it must
 * use the lane 0 mask.  It used to add s5 (lane 1 of the masked addend),
 * which inverted the sign of the last pair; it now adds s4.
 *
 * NOTE(review): the loop body runs before the count is tested, so two
 * pairs are processed even when the count is 1 or less.
 * NOTE(review): with r3 == 511 the two-entry table load also reads entry
 * 512 - confirm the table is padded or that this cannot occur.
 */
function ff_sbr_hf_apply_noise_1_neon, export=1
        ldr             r12, [sp]
        push            {r4,lr}
        lsl             r12, r12, #31           @ bit 0 moved to the sign position
        eor             lr,  r12, #1<<31        @ opposite sign for odd positions
        vmov            d3,  r12, lr            @ d3 = (even mask, odd mask)
.Lhf_apply_noise_1:
        movrelx         r4,  X(ff_sbr_noise_table)
        ldr             r12, [sp, #12]          @ pair count (second stack argument)
        add             r3,  r3,  #1
        bfc             r3,  #9,  #23           @ wrap the index to 0..511
        sub             r12, r12, #1            @ ends at 0 when one pair is left over
1:
        add             lr,  r4,  r3,  lsl #3   @ lr -> noise entry r3 (8 bytes each)
        vld2.32         {q0},     [r0,:64]      @ d0 = first components, d1 = second
        vld2.32         {q3},     [lr,:64]      @ same split for two noise entries
        vld1.32         {d2},     [r1,:64]!
        vld1.32         {d18},    [r2,:64]!
        vceq.f32        d16, d2,  #0            @ all-ones lanes select the noise branch
        veor            d2,  d2,  d3            @ apply the per-position sign masks
        vmov            q2,  q0
        vmla.f32        d0,  d6,  d18           @ noise branch, first component
        vmla.f32        d1,  d7,  d18           @ noise branch, second component
        vadd.f32        d5,  d5,  d2            @ addend branch, second component only
        add             r3,  r3,  #2
        bfc             r3,  #9,  #23
        vbif            d0,  d4,  d16           @ take the addend result where r1 != 0
        vbif            d1,  d5,  d16
        vst2.32         {q0},     [r0,:64]!
        subs            r12, r12, #2
        bgt             1b
        blt             2f                      @ even count: done
        @ odd count: one final pair, laid out as d0 = (first, second)
        add             lr,  r4,  r3,  lsl #3
        vld1.32         {d0},     [r0,:64]
        vld1.32         {d6},     [lr,:64]
        vld1.32         {d2[]},   [r1,:32]!
        vld1.32         {d18[]},  [r2,:32]!
        vceq.f32        d4,  d2,  #0
        veor            d2,  d2,  d3            @ d2[0] = even-position sign applied
        vmov            d1,  d0
        vmla.f32        d0,  d6,  d18
        vadd.f32        s3,  s3,  s4            @ d1[1] += addend with the lane 0 sign
        vbif            d0,  d1,  d4
        vst1.32         {d0},     [r0,:64]!
2:
        pop             {r4,pc}
endfunc
|
yading@10
|
398
|
yading@10
|
/*
 * ff_sbr_hf_apply_noise_2_neon
 *
 * Sets the sign-bit mask in both lanes of d3 and continues at the shared
 * .Lhf_apply_noise_0 body, which XORs that mask into the addend.  Arguments
 * and stack are passed through untouched.
 */
function ff_sbr_hf_apply_noise_2_neon, export=1
        vmov.i32        d3,  #1<<31             @ sign mask: negate the addend
        b               .Lhf_apply_noise_0
endfunc
|
yading@10
|
403
|
yading@10
|
/*
 * ff_sbr_hf_apply_noise_3_neon
 *
 * Builds the two-lane sign mask in d3 from bit 0 of the first stack word,
 * with the inverted sign in lane 0 and the plain sign in lane 1, pushes
 * {r4,lr} and continues at the shared .Lhf_apply_noise_1 body.
 */
function ff_sbr_hf_apply_noise_3_neon, export=1
        ldr             r12, [sp]
        push            {r4,lr}                 @ the shared body expects this frame
        lsl             r12, r12, #31           @ bit 0 moved to the sign position
        eor             lr,  r12, #1<<31
        vmov            d3,  lr,  r12           @ d3 = (inverted sign, plain sign)
        b               .Lhf_apply_noise_1
endfunc
|