/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
yading@10
|
24 #include "libavutil/arm/asm.S"
|
yading@10
|
25 #include "neon.S"
|
yading@10
|
26
|
yading@10
|
27 function ff_vp8_luma_dc_wht_neon, export=1
|
yading@10
|
28 vld1.16 {q0-q1}, [r1,:128]
|
yading@10
|
29 vmov.i16 q15, #0
|
yading@10
|
30
|
yading@10
|
31 vadd.i16 d4, d0, d3
|
yading@10
|
32 vadd.i16 d6, d1, d2
|
yading@10
|
33 vst1.16 {q15}, [r1,:128]!
|
yading@10
|
34 vsub.i16 d7, d1, d2
|
yading@10
|
35 vsub.i16 d5, d0, d3
|
yading@10
|
36 vst1.16 {q15}, [r1,:128]
|
yading@10
|
37 vadd.i16 q0, q2, q3
|
yading@10
|
38 vsub.i16 q1, q2, q3
|
yading@10
|
39
|
yading@10
|
40 vmov.i16 q8, #3
|
yading@10
|
41
|
yading@10
|
42 vtrn.32 d0, d2
|
yading@10
|
43 vtrn.32 d1, d3
|
yading@10
|
44 vtrn.16 d0, d1
|
yading@10
|
45 vtrn.16 d2, d3
|
yading@10
|
46
|
yading@10
|
47 vadd.i16 d0, d0, d16
|
yading@10
|
48
|
yading@10
|
49 vadd.i16 d4, d0, d3
|
yading@10
|
50 vadd.i16 d6, d1, d2
|
yading@10
|
51 vsub.i16 d7, d1, d2
|
yading@10
|
52 vsub.i16 d5, d0, d3
|
yading@10
|
53 vadd.i16 q0, q2, q3
|
yading@10
|
54 vsub.i16 q1, q2, q3
|
yading@10
|
55
|
yading@10
|
56 vshr.s16 q0, q0, #3
|
yading@10
|
57 vshr.s16 q1, q1, #3
|
yading@10
|
58
|
yading@10
|
59 mov r3, #32
|
yading@10
|
60 vst1.16 {d0[0]}, [r0,:16], r3
|
yading@10
|
61 vst1.16 {d1[0]}, [r0,:16], r3
|
yading@10
|
62 vst1.16 {d2[0]}, [r0,:16], r3
|
yading@10
|
63 vst1.16 {d3[0]}, [r0,:16], r3
|
yading@10
|
64 vst1.16 {d0[1]}, [r0,:16], r3
|
yading@10
|
65 vst1.16 {d1[1]}, [r0,:16], r3
|
yading@10
|
66 vst1.16 {d2[1]}, [r0,:16], r3
|
yading@10
|
67 vst1.16 {d3[1]}, [r0,:16], r3
|
yading@10
|
68 vst1.16 {d0[2]}, [r0,:16], r3
|
yading@10
|
69 vst1.16 {d1[2]}, [r0,:16], r3
|
yading@10
|
70 vst1.16 {d2[2]}, [r0,:16], r3
|
yading@10
|
71 vst1.16 {d3[2]}, [r0,:16], r3
|
yading@10
|
72 vst1.16 {d0[3]}, [r0,:16], r3
|
yading@10
|
73 vst1.16 {d1[3]}, [r0,:16], r3
|
yading@10
|
74 vst1.16 {d2[3]}, [r0,:16], r3
|
yading@10
|
75 vst1.16 {d3[3]}, [r0,:16], r3
|
yading@10
|
76
|
yading@10
|
77 bx lr
|
yading@10
|
78 endfunc
|
yading@10
|
79
|
yading@10
|
80 function ff_vp8_idct_add_neon, export=1
|
yading@10
|
81 vld1.16 {q0-q1}, [r1,:128]
|
yading@10
|
82 movw r3, #20091
|
yading@10
|
83 movt r3, #35468/2
|
yading@10
|
84 vdup.32 d4, r3
|
yading@10
|
85
|
yading@10
|
86 vmull.s16 q12, d1, d4[0]
|
yading@10
|
87 vmull.s16 q13, d3, d4[0]
|
yading@10
|
88 vqdmulh.s16 d20, d1, d4[1]
|
yading@10
|
89 vqdmulh.s16 d23, d3, d4[1]
|
yading@10
|
90 vshrn.s32 d21, q12, #16
|
yading@10
|
91 vshrn.s32 d22, q13, #16
|
yading@10
|
92 vadd.s16 d21, d21, d1
|
yading@10
|
93 vadd.s16 d22, d22, d3
|
yading@10
|
94
|
yading@10
|
95 vadd.s16 d16, d0, d2
|
yading@10
|
96 vsub.s16 d17, d0, d2
|
yading@10
|
97 vadd.s16 d18, d21, d23
|
yading@10
|
98 vsub.s16 d19, d20, d22
|
yading@10
|
99 vadd.s16 q0, q8, q9
|
yading@10
|
100 vsub.s16 q1, q8, q9
|
yading@10
|
101
|
yading@10
|
102 vtrn.32 d0, d3
|
yading@10
|
103 vtrn.32 d1, d2
|
yading@10
|
104 vtrn.16 d0, d1
|
yading@10
|
105 vtrn.16 d3, d2
|
yading@10
|
106
|
yading@10
|
107 vmov.i16 q15, #0
|
yading@10
|
108 vmull.s16 q12, d1, d4[0]
|
yading@10
|
109 vst1.16 {q15}, [r1,:128]!
|
yading@10
|
110 vmull.s16 q13, d2, d4[0]
|
yading@10
|
111 vst1.16 {q15}, [r1,:128]
|
yading@10
|
112 vqdmulh.s16 d21, d1, d4[1]
|
yading@10
|
113 vqdmulh.s16 d23, d2, d4[1]
|
yading@10
|
114 vshrn.s32 d20, q12, #16
|
yading@10
|
115 vshrn.s32 d22, q13, #16
|
yading@10
|
116 vadd.i16 d20, d20, d1
|
yading@10
|
117 vadd.i16 d22, d22, d2
|
yading@10
|
118
|
yading@10
|
119 vadd.i16 d16, d0, d3
|
yading@10
|
120 vsub.i16 d17, d0, d3
|
yading@10
|
121 vadd.i16 d18, d20, d23
|
yading@10
|
122 vld1.32 {d20[]}, [r0,:32], r2
|
yading@10
|
123 vsub.i16 d19, d21, d22
|
yading@10
|
124 vld1.32 {d22[]}, [r0,:32], r2
|
yading@10
|
125 vadd.s16 q0, q8, q9
|
yading@10
|
126 vld1.32 {d23[]}, [r0,:32], r2
|
yading@10
|
127 vsub.s16 q1, q8, q9
|
yading@10
|
128 vld1.32 {d21[]}, [r0,:32], r2
|
yading@10
|
129 vrshr.s16 q0, q0, #3
|
yading@10
|
130 vtrn.32 q10, q11
|
yading@10
|
131 vrshr.s16 q1, q1, #3
|
yading@10
|
132
|
yading@10
|
133 sub r0, r0, r2, lsl #2
|
yading@10
|
134
|
yading@10
|
135 vtrn.32 d0, d3
|
yading@10
|
136 vtrn.32 d1, d2
|
yading@10
|
137 vtrn.16 d0, d1
|
yading@10
|
138 vtrn.16 d3, d2
|
yading@10
|
139
|
yading@10
|
140 vaddw.u8 q0, q0, d20
|
yading@10
|
141 vaddw.u8 q1, q1, d21
|
yading@10
|
142 vqmovun.s16 d0, q0
|
yading@10
|
143 vqmovun.s16 d1, q1
|
yading@10
|
144
|
yading@10
|
145 vst1.32 {d0[0]}, [r0,:32], r2
|
yading@10
|
146 vst1.32 {d0[1]}, [r0,:32], r2
|
yading@10
|
147 vst1.32 {d1[1]}, [r0,:32], r2
|
yading@10
|
148 vst1.32 {d1[0]}, [r0,:32], r2
|
yading@10
|
149
|
yading@10
|
150 bx lr
|
yading@10
|
151 endfunc
|
yading@10
|
152
|
yading@10
|
153 function ff_vp8_idct_dc_add_neon, export=1
|
yading@10
|
154 mov r3, #0
|
yading@10
|
155 ldrsh r12, [r1]
|
yading@10
|
156 strh r3, [r1]
|
yading@10
|
157 vdup.16 q1, r12
|
yading@10
|
158 vrshr.s16 q1, q1, #3
|
yading@10
|
159 vld1.32 {d0[]}, [r0,:32], r2
|
yading@10
|
160 vld1.32 {d1[]}, [r0,:32], r2
|
yading@10
|
161 vld1.32 {d0[1]}, [r0,:32], r2
|
yading@10
|
162 vld1.32 {d1[1]}, [r0,:32], r2
|
yading@10
|
163 vaddw.u8 q2, q1, d0
|
yading@10
|
164 vaddw.u8 q3, q1, d1
|
yading@10
|
165 sub r0, r0, r2, lsl #2
|
yading@10
|
166 vqmovun.s16 d0, q2
|
yading@10
|
167 vqmovun.s16 d1, q3
|
yading@10
|
168 vst1.32 {d0[0]}, [r0,:32], r2
|
yading@10
|
169 vst1.32 {d1[0]}, [r0,:32], r2
|
yading@10
|
170 vst1.32 {d0[1]}, [r0,:32], r2
|
yading@10
|
171 vst1.32 {d1[1]}, [r0,:32], r2
|
yading@10
|
172 bx lr
|
yading@10
|
173 endfunc
|
yading@10
|
174
|
yading@10
|
175 function ff_vp8_idct_dc_add4uv_neon, export=1
|
yading@10
|
176 vmov.i16 d0, #0
|
yading@10
|
177 mov r3, #32
|
yading@10
|
178 vld1.16 {d16[]}, [r1,:16]
|
yading@10
|
179 vst1.16 {d0[0]}, [r1,:16], r3
|
yading@10
|
180 vld1.16 {d17[]}, [r1,:16]
|
yading@10
|
181 vst1.16 {d0[0]}, [r1,:16], r3
|
yading@10
|
182 vld1.16 {d18[]}, [r1,:16]
|
yading@10
|
183 vst1.16 {d0[0]}, [r1,:16], r3
|
yading@10
|
184 vld1.16 {d19[]}, [r1,:16]
|
yading@10
|
185 vst1.16 {d0[0]}, [r1,:16], r3
|
yading@10
|
186 mov r3, r0
|
yading@10
|
187 vrshr.s16 q8, q8, #3 @ dc >>= 3
|
yading@10
|
188 vld1.8 {d0}, [r0,:64], r2
|
yading@10
|
189 vrshr.s16 q9, q9, #3
|
yading@10
|
190 vld1.8 {d1}, [r0,:64], r2
|
yading@10
|
191 vaddw.u8 q10, q8, d0
|
yading@10
|
192 vld1.8 {d2}, [r0,:64], r2
|
yading@10
|
193 vaddw.u8 q0, q8, d1
|
yading@10
|
194 vld1.8 {d3}, [r0,:64], r2
|
yading@10
|
195 vaddw.u8 q11, q8, d2
|
yading@10
|
196 vld1.8 {d4}, [r0,:64], r2
|
yading@10
|
197 vaddw.u8 q1, q8, d3
|
yading@10
|
198 vld1.8 {d5}, [r0,:64], r2
|
yading@10
|
199 vaddw.u8 q12, q9, d4
|
yading@10
|
200 vld1.8 {d6}, [r0,:64], r2
|
yading@10
|
201 vaddw.u8 q2, q9, d5
|
yading@10
|
202 vld1.8 {d7}, [r0,:64], r2
|
yading@10
|
203 vaddw.u8 q13, q9, d6
|
yading@10
|
204 vqmovun.s16 d20, q10
|
yading@10
|
205 vaddw.u8 q3, q9, d7
|
yading@10
|
206 vqmovun.s16 d21, q0
|
yading@10
|
207 vqmovun.s16 d22, q11
|
yading@10
|
208 vst1.8 {d20}, [r3,:64], r2
|
yading@10
|
209 vqmovun.s16 d23, q1
|
yading@10
|
210 vst1.8 {d21}, [r3,:64], r2
|
yading@10
|
211 vqmovun.s16 d24, q12
|
yading@10
|
212 vst1.8 {d22}, [r3,:64], r2
|
yading@10
|
213 vqmovun.s16 d25, q2
|
yading@10
|
214 vst1.8 {d23}, [r3,:64], r2
|
yading@10
|
215 vqmovun.s16 d26, q13
|
yading@10
|
216 vst1.8 {d24}, [r3,:64], r2
|
yading@10
|
217 vqmovun.s16 d27, q3
|
yading@10
|
218 vst1.8 {d25}, [r3,:64], r2
|
yading@10
|
219 vst1.8 {d26}, [r3,:64], r2
|
yading@10
|
220 vst1.8 {d27}, [r3,:64], r2
|
yading@10
|
221
|
yading@10
|
222 bx lr
|
yading@10
|
223 endfunc
|
yading@10
|
224
|
yading@10
|
225 function ff_vp8_idct_dc_add4y_neon, export=1
|
yading@10
|
226 vmov.i16 d0, #0
|
yading@10
|
227 mov r3, #32
|
yading@10
|
228 vld1.16 {d16[]}, [r1,:16]
|
yading@10
|
229 vst1.16 {d0[0]}, [r1,:16], r3
|
yading@10
|
230 vld1.16 {d17[]}, [r1,:16]
|
yading@10
|
231 vst1.16 {d0[0]}, [r1,:16], r3
|
yading@10
|
232 vld1.16 {d18[]}, [r1,:16]
|
yading@10
|
233 vst1.16 {d0[0]}, [r1,:16], r3
|
yading@10
|
234 vld1.16 {d19[]}, [r1,:16]
|
yading@10
|
235 vst1.16 {d0[0]}, [r1,:16], r3
|
yading@10
|
236 vrshr.s16 q8, q8, #3 @ dc >>= 3
|
yading@10
|
237 vld1.8 {q0}, [r0,:128], r2
|
yading@10
|
238 vrshr.s16 q9, q9, #3
|
yading@10
|
239 vld1.8 {q1}, [r0,:128], r2
|
yading@10
|
240 vaddw.u8 q10, q8, d0
|
yading@10
|
241 vld1.8 {q2}, [r0,:128], r2
|
yading@10
|
242 vaddw.u8 q0, q9, d1
|
yading@10
|
243 vld1.8 {q3}, [r0,:128], r2
|
yading@10
|
244 vaddw.u8 q11, q8, d2
|
yading@10
|
245 vaddw.u8 q1, q9, d3
|
yading@10
|
246 vaddw.u8 q12, q8, d4
|
yading@10
|
247 vaddw.u8 q2, q9, d5
|
yading@10
|
248 vaddw.u8 q13, q8, d6
|
yading@10
|
249 vaddw.u8 q3, q9, d7
|
yading@10
|
250 sub r0, r0, r2, lsl #2
|
yading@10
|
251 vqmovun.s16 d20, q10
|
yading@10
|
252 vqmovun.s16 d21, q0
|
yading@10
|
253 vqmovun.s16 d22, q11
|
yading@10
|
254 vqmovun.s16 d23, q1
|
yading@10
|
255 vqmovun.s16 d24, q12
|
yading@10
|
256 vst1.8 {q10}, [r0,:128], r2
|
yading@10
|
257 vqmovun.s16 d25, q2
|
yading@10
|
258 vst1.8 {q11}, [r0,:128], r2
|
yading@10
|
259 vqmovun.s16 d26, q13
|
yading@10
|
260 vst1.8 {q12}, [r0,:128], r2
|
yading@10
|
261 vqmovun.s16 d27, q3
|
yading@10
|
262 vst1.8 {q13}, [r0,:128], r2
|
yading@10
|
263
|
yading@10
|
264 bx lr
|
yading@10
|
265 endfunc
|
yading@10
|
266
|
yading@10
|
267 @ Register layout:
|
yading@10
|
268 @ P3..Q3 -> q0..q7
|
yading@10
|
269 @ flim_E -> q14
|
yading@10
|
270 @ flim_I -> q15
|
yading@10
|
271 @ hev_thresh -> r12
|
yading@10
|
272 @
|
yading@10
|
273 .macro vp8_loop_filter, inner=0, simple=0
|
yading@10
|
274 .if \simple
|
yading@10
|
275 vabd.u8 q9, q3, q4 @ abs(P0-Q0)
|
yading@10
|
276 vabd.u8 q15, q2, q5 @ abs(P1-Q1)
|
yading@10
|
277 vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
|
yading@10
|
278 vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
|
yading@10
|
279 vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
|
yading@10
|
280 vmov.i8 q13, #0x80
|
yading@10
|
281 vcle.u8 q8, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
|
yading@10
|
282 .else
|
yading@10
|
283 @ calculate hev and normal_limit:
|
yading@10
|
284 vabd.u8 q12, q2, q3 @ abs(P1-P0)
|
yading@10
|
285 vabd.u8 q13, q5, q4 @ abs(Q1-Q0)
|
yading@10
|
286 vabd.u8 q10, q0, q1 @ abs(P3-P2)
|
yading@10
|
287 vabd.u8 q11, q1, q2 @ abs(P2-P1)
|
yading@10
|
288 vcle.u8 q8, q12, q15 @ abs(P1-P0) <= flim_I
|
yading@10
|
289 vcle.u8 q9, q13, q15 @ abs(Q1-Q0) <= flim_I
|
yading@10
|
290 vcle.u8 q10, q10, q15 @ abs(P3-P2) <= flim_I
|
yading@10
|
291 vcle.u8 q11, q11, q15 @ abs(P2-P1) <= flim_I
|
yading@10
|
292 vand q8, q8, q9
|
yading@10
|
293 vabd.u8 q9, q7, q6 @ abs(Q3-Q2)
|
yading@10
|
294 vand q8, q8, q11
|
yading@10
|
295 vabd.u8 q11, q6, q5 @ abs(Q2-Q1)
|
yading@10
|
296 vand q8, q8, q10
|
yading@10
|
297 vcle.u8 q10, q9, q15 @ abs(Q3-Q2) <= flim_I
|
yading@10
|
298 vcle.u8 q11, q11, q15 @ abs(Q2-Q1) <= flim_I
|
yading@10
|
299 vabd.u8 q9, q3, q4 @ abs(P0-Q0)
|
yading@10
|
300 vabd.u8 q15, q2, q5 @ abs(P1-Q1)
|
yading@10
|
301 vand q8, q8, q10
|
yading@10
|
302 vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
|
yading@10
|
303 vand q8, q8, q11
|
yading@10
|
304 vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
|
yading@10
|
305 vdup.8 q15, r12 @ hev_thresh
|
yading@10
|
306 vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
|
yading@10
|
307 vcgt.u8 q12, q12, q15 @ abs(P1-P0) > hev_thresh
|
yading@10
|
308 vcle.u8 q11, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
|
yading@10
|
309 vcgt.u8 q14, q13, q15 @ abs(Q1-Q0) > hev_thresh
|
yading@10
|
310 vand q8, q8, q11
|
yading@10
|
311 vmov.i8 q13, #0x80
|
yading@10
|
312 vorr q9, q12, q14
|
yading@10
|
313 .endif
|
yading@10
|
314
|
yading@10
|
315 @ at this point:
|
yading@10
|
316 @ q8: normal_limit
|
yading@10
|
317 @ q9: hev
|
yading@10
|
318
|
yading@10
|
319 @ convert to signed value:
|
yading@10
|
320 veor q3, q3, q13 @ PS0 = P0 ^ 0x80
|
yading@10
|
321 veor q4, q4, q13 @ QS0 = Q0 ^ 0x80
|
yading@10
|
322
|
yading@10
|
323 vmov.i16 q12, #3
|
yading@10
|
324 vsubl.s8 q10, d8, d6 @ QS0 - PS0
|
yading@10
|
325 vsubl.s8 q11, d9, d7 @ (widened to 16bit)
|
yading@10
|
326 veor q2, q2, q13 @ PS1 = P1 ^ 0x80
|
yading@10
|
327 veor q5, q5, q13 @ QS1 = Q1 ^ 0x80
|
yading@10
|
328 vmul.i16 q10, q10, q12 @ w = 3 * (QS0 - PS0)
|
yading@10
|
329 vmul.i16 q11, q11, q12
|
yading@10
|
330
|
yading@10
|
331 vqsub.s8 q12, q2, q5 @ clamp(PS1-QS1)
|
yading@10
|
332 vmov.i8 q14, #4
|
yading@10
|
333 vmov.i8 q15, #3
|
yading@10
|
334 .if \inner
|
yading@10
|
335 vand q12, q12, q9 @ if(hev) w += clamp(PS1-QS1)
|
yading@10
|
336 .endif
|
yading@10
|
337 vaddw.s8 q10, q10, d24 @ w += clamp(PS1-QS1)
|
yading@10
|
338 vaddw.s8 q11, q11, d25
|
yading@10
|
339 vqmovn.s16 d20, q10 @ narrow result back into q10
|
yading@10
|
340 vqmovn.s16 d21, q11
|
yading@10
|
341 .if !\inner && !\simple
|
yading@10
|
342 veor q1, q1, q13 @ PS2 = P2 ^ 0x80
|
yading@10
|
343 veor q6, q6, q13 @ QS2 = Q2 ^ 0x80
|
yading@10
|
344 .endif
|
yading@10
|
345 vand q10, q10, q8 @ w &= normal_limit
|
yading@10
|
346
|
yading@10
|
347 @ registers used at this point..
|
yading@10
|
348 @ q0 -> P3 (don't corrupt)
|
yading@10
|
349 @ q1-q6 -> PS2-QS2
|
yading@10
|
350 @ q7 -> Q3 (don't corrupt)
|
yading@10
|
351 @ q9 -> hev
|
yading@10
|
352 @ q10 -> w
|
yading@10
|
353 @ q13 -> #0x80
|
yading@10
|
354 @ q14 -> #4
|
yading@10
|
355 @ q15 -> #3
|
yading@10
|
356 @ q8, q11, q12 -> unused
|
yading@10
|
357
|
yading@10
|
358 @ filter_common: is4tap==1
|
yading@10
|
359 @ c1 = clamp(w + 4) >> 3;
|
yading@10
|
360 @ c2 = clamp(w + 3) >> 3;
|
yading@10
|
361 @ Q0 = s2u(QS0 - c1);
|
yading@10
|
362 @ P0 = s2u(PS0 + c2);
|
yading@10
|
363
|
yading@10
|
364 .if \simple
|
yading@10
|
365 vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
|
yading@10
|
366 vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
|
yading@10
|
367 vshr.s8 q11, q11, #3 @ c1 >>= 3
|
yading@10
|
368 vshr.s8 q12, q12, #3 @ c2 >>= 3
|
yading@10
|
369 vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
|
yading@10
|
370 vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
|
yading@10
|
371 veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
|
yading@10
|
372 veor q3, q3, q13 @ P0 = PS0 ^ 0x80
|
yading@10
|
373 veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
|
yading@10
|
374 veor q2, q2, q13 @ P1 = PS1 ^ 0x80
|
yading@10
|
375 .elseif \inner
|
yading@10
|
376 @ the !is4tap case of filter_common, only used for inner blocks
|
yading@10
|
377 @ c3 = ((c1&~hev) + 1) >> 1;
|
yading@10
|
378 @ Q1 = s2u(QS1 - c3);
|
yading@10
|
379 @ P1 = s2u(PS1 + c3);
|
yading@10
|
380 vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
|
yading@10
|
381 vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
|
yading@10
|
382 vshr.s8 q11, q11, #3 @ c1 >>= 3
|
yading@10
|
383 vshr.s8 q12, q12, #3 @ c2 >>= 3
|
yading@10
|
384 vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
|
yading@10
|
385 vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
|
yading@10
|
386 vbic q11, q11, q9 @ c1 & ~hev
|
yading@10
|
387 veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
|
yading@10
|
388 vrshr.s8 q11, q11, #1 @ c3 >>= 1
|
yading@10
|
389 veor q3, q3, q13 @ P0 = PS0 ^ 0x80
|
yading@10
|
390 vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-c3)
|
yading@10
|
391 vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+c3)
|
yading@10
|
392 veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
|
yading@10
|
393 veor q2, q2, q13 @ P1 = PS1 ^ 0x80
|
yading@10
|
394 .else
|
yading@10
|
395 vand q12, q10, q9 @ w & hev
|
yading@10
|
396 vqadd.s8 q11, q12, q14 @ c1 = clamp((w&hev)+4)
|
yading@10
|
397 vqadd.s8 q12, q12, q15 @ c2 = clamp((w&hev)+3)
|
yading@10
|
398 vshr.s8 q11, q11, #3 @ c1 >>= 3
|
yading@10
|
399 vshr.s8 q12, q12, #3 @ c2 >>= 3
|
yading@10
|
400 vbic q10, q10, q9 @ w &= ~hev
|
yading@10
|
401 vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
|
yading@10
|
402 vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
|
yading@10
|
403
|
yading@10
|
404 @ filter_mbedge:
|
yading@10
|
405 @ a = clamp((27*w + 63) >> 7);
|
yading@10
|
406 @ Q0 = s2u(QS0 - a);
|
yading@10
|
407 @ P0 = s2u(PS0 + a);
|
yading@10
|
408 @ a = clamp((18*w + 63) >> 7);
|
yading@10
|
409 @ Q1 = s2u(QS1 - a);
|
yading@10
|
410 @ P1 = s2u(PS1 + a);
|
yading@10
|
411 @ a = clamp((9*w + 63) >> 7);
|
yading@10
|
412 @ Q2 = s2u(QS2 - a);
|
yading@10
|
413 @ P2 = s2u(PS2 + a);
|
yading@10
|
414 vmov.i16 q9, #63
|
yading@10
|
415 vshll.s8 q14, d20, #3
|
yading@10
|
416 vshll.s8 q15, d21, #3
|
yading@10
|
417 vaddw.s8 q14, q14, d20
|
yading@10
|
418 vaddw.s8 q15, q15, d21
|
yading@10
|
419 vadd.s16 q8, q9, q14
|
yading@10
|
420 vadd.s16 q9, q9, q15 @ 9*w + 63
|
yading@10
|
421 vadd.s16 q11, q8, q14
|
yading@10
|
422 vadd.s16 q12, q9, q15 @ 18*w + 63
|
yading@10
|
423 vadd.s16 q14, q11, q14
|
yading@10
|
424 vadd.s16 q15, q12, q15 @ 27*w + 63
|
yading@10
|
425 vqshrn.s16 d16, q8, #7
|
yading@10
|
426 vqshrn.s16 d17, q9, #7 @ clamp(( 9*w + 63)>>7)
|
yading@10
|
427 vqshrn.s16 d22, q11, #7
|
yading@10
|
428 vqshrn.s16 d23, q12, #7 @ clamp((18*w + 63)>>7)
|
yading@10
|
429 vqshrn.s16 d28, q14, #7
|
yading@10
|
430 vqshrn.s16 d29, q15, #7 @ clamp((27*w + 63)>>7)
|
yading@10
|
431 vqadd.s8 q1, q1, q8 @ PS2 = clamp(PS2+a)
|
yading@10
|
432 vqsub.s8 q6, q6, q8 @ QS2 = clamp(QS2-a)
|
yading@10
|
433 vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+a)
|
yading@10
|
434 vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-a)
|
yading@10
|
435 vqadd.s8 q3, q3, q14 @ PS0 = clamp(PS0+a)
|
yading@10
|
436 vqsub.s8 q4, q4, q14 @ QS0 = clamp(QS0-a)
|
yading@10
|
437 veor q3, q3, q13 @ P0 = PS0 ^ 0x80
|
yading@10
|
438 veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
|
yading@10
|
439 veor q2, q2, q13 @ P1 = PS1 ^ 0x80
|
yading@10
|
440 veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
|
yading@10
|
441 veor q1, q1, q13 @ P2 = PS2 ^ 0x80
|
yading@10
|
442 veor q6, q6, q13 @ Q2 = QS2 ^ 0x80
|
yading@10
|
443 .endif
|
yading@10
|
444 .endm
|
yading@10
|
445
|
yading@10
|
446 .macro vp8_v_loop_filter16 name, inner=0, simple=0
|
yading@10
|
447 function ff_vp8_v_loop_filter16\name\()_neon, export=1
|
yading@10
|
448 vpush {q4-q7}
|
yading@10
|
449 sub r0, r0, r1, lsl #1+!\simple
|
yading@10
|
450
|
yading@10
|
451 @ Load pixels:
|
yading@10
|
452 .if !\simple
|
yading@10
|
453 ldr r12, [sp, #64] @ hev_thresh
|
yading@10
|
454 vld1.8 {q0}, [r0,:128], r1 @ P3
|
yading@10
|
455 vld1.8 {q1}, [r0,:128], r1 @ P2
|
yading@10
|
456 .endif
|
yading@10
|
457 vld1.8 {q2}, [r0,:128], r1 @ P1
|
yading@10
|
458 vld1.8 {q3}, [r0,:128], r1 @ P0
|
yading@10
|
459 vld1.8 {q4}, [r0,:128], r1 @ Q0
|
yading@10
|
460 vld1.8 {q5}, [r0,:128], r1 @ Q1
|
yading@10
|
461 .if !\simple
|
yading@10
|
462 vld1.8 {q6}, [r0,:128], r1 @ Q2
|
yading@10
|
463 vld1.8 {q7}, [r0,:128] @ Q3
|
yading@10
|
464 vdup.8 q15, r3 @ flim_I
|
yading@10
|
465 .endif
|
yading@10
|
466 vdup.8 q14, r2 @ flim_E
|
yading@10
|
467
|
yading@10
|
468 vp8_loop_filter inner=\inner, simple=\simple
|
yading@10
|
469
|
yading@10
|
470 @ back up to P2: dst -= stride * 6
|
yading@10
|
471 sub r0, r0, r1, lsl #2
|
yading@10
|
472 .if !\simple
|
yading@10
|
473 sub r0, r0, r1, lsl #1
|
yading@10
|
474
|
yading@10
|
475 @ Store pixels:
|
yading@10
|
476 vst1.8 {q1}, [r0,:128], r1 @ P2
|
yading@10
|
477 .endif
|
yading@10
|
478 vst1.8 {q2}, [r0,:128], r1 @ P1
|
yading@10
|
479 vst1.8 {q3}, [r0,:128], r1 @ P0
|
yading@10
|
480 vst1.8 {q4}, [r0,:128], r1 @ Q0
|
yading@10
|
481 vst1.8 {q5}, [r0,:128], r1 @ Q1
|
yading@10
|
482 .if !\simple
|
yading@10
|
483 vst1.8 {q6}, [r0,:128] @ Q2
|
yading@10
|
484 .endif
|
yading@10
|
485
|
yading@10
|
486 vpop {q4-q7}
|
yading@10
|
487 bx lr
|
yading@10
|
488 endfunc
|
yading@10
|
489 .endm
|
yading@10
|
490
|
yading@10
|
491 vp8_v_loop_filter16
|
yading@10
|
492 vp8_v_loop_filter16 _inner, inner=1
|
yading@10
|
493 vp8_v_loop_filter16 _simple, simple=1
|
yading@10
|
494
|
yading@10
|
495 .macro vp8_v_loop_filter8uv name, inner=0
|
yading@10
|
496 function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
|
yading@10
|
497 vpush {q4-q7}
|
yading@10
|
498 sub r0, r0, r2, lsl #2
|
yading@10
|
499 sub r1, r1, r2, lsl #2
|
yading@10
|
500 ldr r12, [sp, #64] @ flim_I
|
yading@10
|
501
|
yading@10
|
502 @ Load pixels:
|
yading@10
|
503 vld1.8 {d0}, [r0,:64], r2 @ P3
|
yading@10
|
504 vld1.8 {d1}, [r1,:64], r2 @ P3
|
yading@10
|
505 vld1.8 {d2}, [r0,:64], r2 @ P2
|
yading@10
|
506 vld1.8 {d3}, [r1,:64], r2 @ P2
|
yading@10
|
507 vld1.8 {d4}, [r0,:64], r2 @ P1
|
yading@10
|
508 vld1.8 {d5}, [r1,:64], r2 @ P1
|
yading@10
|
509 vld1.8 {d6}, [r0,:64], r2 @ P0
|
yading@10
|
510 vld1.8 {d7}, [r1,:64], r2 @ P0
|
yading@10
|
511 vld1.8 {d8}, [r0,:64], r2 @ Q0
|
yading@10
|
512 vld1.8 {d9}, [r1,:64], r2 @ Q0
|
yading@10
|
513 vld1.8 {d10}, [r0,:64], r2 @ Q1
|
yading@10
|
514 vld1.8 {d11}, [r1,:64], r2 @ Q1
|
yading@10
|
515 vld1.8 {d12}, [r0,:64], r2 @ Q2
|
yading@10
|
516 vld1.8 {d13}, [r1,:64], r2 @ Q2
|
yading@10
|
517 vld1.8 {d14}, [r0,:64] @ Q3
|
yading@10
|
518 vld1.8 {d15}, [r1,:64] @ Q3
|
yading@10
|
519
|
yading@10
|
520 vdup.8 q14, r3 @ flim_E
|
yading@10
|
521 vdup.8 q15, r12 @ flim_I
|
yading@10
|
522 ldr r12, [sp, #68] @ hev_thresh
|
yading@10
|
523
|
yading@10
|
524 vp8_loop_filter inner=\inner
|
yading@10
|
525
|
yading@10
|
526 @ back up to P2: u,v -= stride * 6
|
yading@10
|
527 sub r0, r0, r2, lsl #2
|
yading@10
|
528 sub r1, r1, r2, lsl #2
|
yading@10
|
529 sub r0, r0, r2, lsl #1
|
yading@10
|
530 sub r1, r1, r2, lsl #1
|
yading@10
|
531
|
yading@10
|
532 @ Store pixels:
|
yading@10
|
533 vst1.8 {d2}, [r0,:64], r2 @ P2
|
yading@10
|
534 vst1.8 {d3}, [r1,:64], r2 @ P2
|
yading@10
|
535 vst1.8 {d4}, [r0,:64], r2 @ P1
|
yading@10
|
536 vst1.8 {d5}, [r1,:64], r2 @ P1
|
yading@10
|
537 vst1.8 {d6}, [r0,:64], r2 @ P0
|
yading@10
|
538 vst1.8 {d7}, [r1,:64], r2 @ P0
|
yading@10
|
539 vst1.8 {d8}, [r0,:64], r2 @ Q0
|
yading@10
|
540 vst1.8 {d9}, [r1,:64], r2 @ Q0
|
yading@10
|
541 vst1.8 {d10}, [r0,:64], r2 @ Q1
|
yading@10
|
542 vst1.8 {d11}, [r1,:64], r2 @ Q1
|
yading@10
|
543 vst1.8 {d12}, [r0,:64] @ Q2
|
yading@10
|
544 vst1.8 {d13}, [r1,:64] @ Q2
|
yading@10
|
545
|
yading@10
|
546 vpop {q4-q7}
|
yading@10
|
547 bx lr
|
yading@10
|
548 endfunc
|
yading@10
|
549 .endm
|
yading@10
|
550
|
yading@10
|
551 vp8_v_loop_filter8uv
|
yading@10
|
552 vp8_v_loop_filter8uv _inner, inner=1
|
yading@10
|
553
|
yading@10
|
554 .macro vp8_h_loop_filter16 name, inner=0, simple=0
|
yading@10
|
555 function ff_vp8_h_loop_filter16\name\()_neon, export=1
|
yading@10
|
556 vpush {q4-q7}
|
yading@10
|
557 sub r0, r0, #4
|
yading@10
|
558 .if !\simple
|
yading@10
|
559 ldr r12, [sp, #64] @ hev_thresh
|
yading@10
|
560 .endif
|
yading@10
|
561
|
yading@10
|
562 @ Load pixels:
|
yading@10
|
563 vld1.8 {d0}, [r0], r1 @ load first 8-line src data
|
yading@10
|
564 vld1.8 {d2}, [r0], r1
|
yading@10
|
565 vld1.8 {d4}, [r0], r1
|
yading@10
|
566 vld1.8 {d6}, [r0], r1
|
yading@10
|
567 vld1.8 {d8}, [r0], r1
|
yading@10
|
568 vld1.8 {d10}, [r0], r1
|
yading@10
|
569 vld1.8 {d12}, [r0], r1
|
yading@10
|
570 vld1.8 {d14}, [r0], r1
|
yading@10
|
571 vld1.8 {d1}, [r0], r1 @ load second 8-line src data
|
yading@10
|
572 vld1.8 {d3}, [r0], r1
|
yading@10
|
573 vld1.8 {d5}, [r0], r1
|
yading@10
|
574 vld1.8 {d7}, [r0], r1
|
yading@10
|
575 vld1.8 {d9}, [r0], r1
|
yading@10
|
576 vld1.8 {d11}, [r0], r1
|
yading@10
|
577 vld1.8 {d13}, [r0], r1
|
yading@10
|
578 vld1.8 {d15}, [r0], r1
|
yading@10
|
579
|
yading@10
|
580 transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
|
yading@10
|
581
|
yading@10
|
582 vdup.8 q14, r2 @ flim_E
|
yading@10
|
583 .if !\simple
|
yading@10
|
584 vdup.8 q15, r3 @ flim_I
|
yading@10
|
585 .endif
|
yading@10
|
586
|
yading@10
|
587 vp8_loop_filter inner=\inner, simple=\simple
|
yading@10
|
588
|
yading@10
|
589 sub r0, r0, r1, lsl #4 @ backup 16 rows
|
yading@10
|
590
|
yading@10
|
591 transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
|
yading@10
|
592
|
yading@10
|
593 @ Store pixels:
|
yading@10
|
594 vst1.8 {d0}, [r0], r1
|
yading@10
|
595 vst1.8 {d2}, [r0], r1
|
yading@10
|
596 vst1.8 {d4}, [r0], r1
|
yading@10
|
597 vst1.8 {d6}, [r0], r1
|
yading@10
|
598 vst1.8 {d8}, [r0], r1
|
yading@10
|
599 vst1.8 {d10}, [r0], r1
|
yading@10
|
600 vst1.8 {d12}, [r0], r1
|
yading@10
|
601 vst1.8 {d14}, [r0], r1
|
yading@10
|
602 vst1.8 {d1}, [r0], r1
|
yading@10
|
603 vst1.8 {d3}, [r0], r1
|
yading@10
|
604 vst1.8 {d5}, [r0], r1
|
yading@10
|
605 vst1.8 {d7}, [r0], r1
|
yading@10
|
606 vst1.8 {d9}, [r0], r1
|
yading@10
|
607 vst1.8 {d11}, [r0], r1
|
yading@10
|
608 vst1.8 {d13}, [r0], r1
|
yading@10
|
609 vst1.8 {d15}, [r0]
|
yading@10
|
610
|
yading@10
|
611 vpop {q4-q7}
|
yading@10
|
612 bx lr
|
yading@10
|
613 endfunc
|
yading@10
|
614 .endm
|
yading@10
|
615
|
yading@10
|
616 vp8_h_loop_filter16
|
yading@10
|
617 vp8_h_loop_filter16 _inner, inner=1
|
yading@10
|
618 vp8_h_loop_filter16 _simple, simple=1
|
yading@10
|
619
|
yading@10
|
620 .macro vp8_h_loop_filter8uv name, inner=0
|
yading@10
|
621 function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
|
yading@10
|
622 vpush {q4-q7}
|
yading@10
|
623 sub r0, r0, #4
|
yading@10
|
624 sub r1, r1, #4
|
yading@10
|
625 ldr r12, [sp, #64] @ flim_I
|
yading@10
|
626
|
yading@10
|
627 @ Load pixels:
|
yading@10
|
628 vld1.8 {d0}, [r0], r2 @ load u
|
yading@10
|
629 vld1.8 {d1}, [r1], r2 @ load v
|
yading@10
|
630 vld1.8 {d2}, [r0], r2
|
yading@10
|
631 vld1.8 {d3}, [r1], r2
|
yading@10
|
632 vld1.8 {d4}, [r0], r2
|
yading@10
|
633 vld1.8 {d5}, [r1], r2
|
yading@10
|
634 vld1.8 {d6}, [r0], r2
|
yading@10
|
635 vld1.8 {d7}, [r1], r2
|
yading@10
|
636 vld1.8 {d8}, [r0], r2
|
yading@10
|
637 vld1.8 {d9}, [r1], r2
|
yading@10
|
638 vld1.8 {d10}, [r0], r2
|
yading@10
|
639 vld1.8 {d11}, [r1], r2
|
yading@10
|
640 vld1.8 {d12}, [r0], r2
|
yading@10
|
641 vld1.8 {d13}, [r1], r2
|
yading@10
|
642 vld1.8 {d14}, [r0], r2
|
yading@10
|
643 vld1.8 {d15}, [r1], r2
|
yading@10
|
644
|
yading@10
|
645 transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
|
yading@10
|
646
|
yading@10
|
647 vdup.8 q14, r3 @ flim_E
|
yading@10
|
648 vdup.8 q15, r12 @ flim_I
|
yading@10
|
649 ldr r12, [sp, #68] @ hev_thresh
|
yading@10
|
650
|
yading@10
|
651 vp8_loop_filter inner=\inner
|
yading@10
|
652
|
yading@10
|
653 sub r0, r0, r2, lsl #3 @ backup u 8 rows
|
yading@10
|
654 sub r1, r1, r2, lsl #3 @ backup v 8 rows
|
yading@10
|
655
|
yading@10
|
656 transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
|
yading@10
|
657
|
yading@10
|
658 @ Store pixels:
|
yading@10
|
659 vst1.8 {d0}, [r0], r2
|
yading@10
|
660 vst1.8 {d1}, [r1], r2
|
yading@10
|
661 vst1.8 {d2}, [r0], r2
|
yading@10
|
662 vst1.8 {d3}, [r1], r2
|
yading@10
|
663 vst1.8 {d4}, [r0], r2
|
yading@10
|
664 vst1.8 {d5}, [r1], r2
|
yading@10
|
665 vst1.8 {d6}, [r0], r2
|
yading@10
|
666 vst1.8 {d7}, [r1], r2
|
yading@10
|
667 vst1.8 {d8}, [r0], r2
|
yading@10
|
668 vst1.8 {d9}, [r1], r2
|
yading@10
|
669 vst1.8 {d10}, [r0], r2
|
yading@10
|
670 vst1.8 {d11}, [r1], r2
|
yading@10
|
671 vst1.8 {d12}, [r0], r2
|
yading@10
|
672 vst1.8 {d13}, [r1], r2
|
yading@10
|
673 vst1.8 {d14}, [r0]
|
yading@10
|
674 vst1.8 {d15}, [r1]
|
yading@10
|
675
|
yading@10
|
676 vpop {q4-q7}
|
yading@10
|
677 bx lr
|
yading@10
|
678 endfunc
|
yading@10
|
679 .endm
|
yading@10
|
680
|
yading@10
|
681 vp8_h_loop_filter8uv
|
yading@10
|
682 vp8_h_loop_filter8uv _inner, inner=1
|
yading@10
|
683
|
yading@10
|
684 function ff_put_vp8_pixels16_neon, export=1
|
yading@10
|
685 ldr r12, [sp, #0] @ h
|
yading@10
|
686 1:
|
yading@10
|
687 subs r12, r12, #4
|
yading@10
|
688 vld1.8 {q0}, [r2], r3
|
yading@10
|
689 vld1.8 {q1}, [r2], r3
|
yading@10
|
690 vld1.8 {q2}, [r2], r3
|
yading@10
|
691 vld1.8 {q3}, [r2], r3
|
yading@10
|
692 vst1.8 {q0}, [r0,:128], r1
|
yading@10
|
693 vst1.8 {q1}, [r0,:128], r1
|
yading@10
|
694 vst1.8 {q2}, [r0,:128], r1
|
yading@10
|
695 vst1.8 {q3}, [r0,:128], r1
|
yading@10
|
696 bgt 1b
|
yading@10
|
697 bx lr
|
yading@10
|
698 endfunc
|
yading@10
|
699
|
yading@10
|
700 function ff_put_vp8_pixels8_neon, export=1
|
yading@10
|
701 ldr r12, [sp, #0] @ h
|
yading@10
|
702 1:
|
yading@10
|
703 subs r12, r12, #4
|
yading@10
|
704 vld1.8 {d0}, [r2], r3
|
yading@10
|
705 vld1.8 {d1}, [r2], r3
|
yading@10
|
706 vld1.8 {d2}, [r2], r3
|
yading@10
|
707 vld1.8 {d3}, [r2], r3
|
yading@10
|
708 vst1.8 {d0}, [r0,:64], r1
|
yading@10
|
709 vst1.8 {d1}, [r0,:64], r1
|
yading@10
|
710 vst1.8 {d2}, [r0,:64], r1
|
yading@10
|
711 vst1.8 {d3}, [r0,:64], r1
|
yading@10
|
712 bgt 1b
|
yading@10
|
713 bx lr
|
yading@10
|
714 endfunc
|
yading@10
|
715
|
yading@10
|
716 /* 4/6-tap 8th-pel MC */
|
yading@10
|
717
|
yading@10
|
718 .macro vp8_epel8_h6 d, a, b
|
yading@10
|
719 vext.8 d27, \a, \b, #1
|
yading@10
|
720 vmovl.u8 q8, \a
|
yading@10
|
721 vext.8 d28, \a, \b, #2
|
yading@10
|
722 vmovl.u8 q9, d27
|
yading@10
|
723 vext.8 d29, \a, \b, #3
|
yading@10
|
724 vmovl.u8 q10, d28
|
yading@10
|
725 vext.8 d30, \a, \b, #4
|
yading@10
|
726 vmovl.u8 q11, d29
|
yading@10
|
727 vext.8 d31, \a, \b, #5
|
yading@10
|
728 vmovl.u8 q12, d30
|
yading@10
|
729 vmul.u16 q10, q10, d0[2]
|
yading@10
|
730 vmovl.u8 q13, d31
|
yading@10
|
731 vmul.u16 q11, q11, d0[3]
|
yading@10
|
732 vmls.u16 q10, q9, d0[1]
|
yading@10
|
733 vmls.u16 q11, q12, d1[0]
|
yading@10
|
734 vmla.u16 q10, q8, d0[0]
|
yading@10
|
735 vmla.u16 q11, q13, d1[1]
|
yading@10
|
736 vqadd.s16 q11, q10, q11
|
yading@10
|
737 vqrshrun.s16 \d, q11, #7
|
yading@10
|
738 .endm
|
yading@10
|
739
|
yading@10
|
740 .macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
|
yading@10
|
741 vext.8 q14, \q0, \q1, #3
|
yading@10
|
742 vext.8 q15, \q0, \q1, #4
|
yading@10
|
743 vmovl.u8 q11, d28
|
yading@10
|
744 vmovl.u8 q14, d29
|
yading@10
|
745 vext.8 q3, \q0, \q1, #2
|
yading@10
|
746 vmovl.u8 q12, d30
|
yading@10
|
747 vmovl.u8 q15, d31
|
yading@10
|
748 vext.8 q8, \q0, \q1, #1
|
yading@10
|
749 vmovl.u8 q10, d6
|
yading@10
|
750 vmovl.u8 q3, d7
|
yading@10
|
751 vext.8 q2, \q0, \q1, #5
|
yading@10
|
752 vmovl.u8 q13, d4
|
yading@10
|
753 vmovl.u8 q2, d5
|
yading@10
|
754 vmovl.u8 q9, d16
|
yading@10
|
755 vmovl.u8 q8, d17
|
yading@10
|
756 vmul.u16 q11, q11, d0[3]
|
yading@10
|
757 vmul.u16 q10, q10, d0[2]
|
yading@10
|
758 vmul.u16 q3, q3, d0[2]
|
yading@10
|
759 vmul.u16 q14, q14, d0[3]
|
yading@10
|
760 vmls.u16 q11, q12, d1[0]
|
yading@10
|
761 vmovl.u8 q12, \s0
|
yading@10
|
762 vmovl.u8 q1, \s1
|
yading@10
|
763 vmls.u16 q10, q9, d0[1]
|
yading@10
|
764 vmls.u16 q3, q8, d0[1]
|
yading@10
|
765 vmls.u16 q14, q15, d1[0]
|
yading@10
|
766 vmla.u16 q10, q12, d0[0]
|
yading@10
|
767 vmla.u16 q11, q13, d1[1]
|
yading@10
|
768 vmla.u16 q3, q1, d0[0]
|
yading@10
|
769 vmla.u16 q14, q2, d1[1]
|
yading@10
|
770 vqadd.s16 q11, q10, q11
|
yading@10
|
771 vqadd.s16 q14, q3, q14
|
yading@10
|
772 vqrshrun.s16 \d0, q11, #7
|
yading@10
|
773 vqrshrun.s16 \d1, q14, #7
|
yading@10
|
774 .endm
|
yading@10
|
775
|
yading@10
|
776 .macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
|
yading@10
|
777 vmovl.u8 q10, \s2
|
yading@10
|
778 vmovl.u8 q11, \s3
|
yading@10
|
779 vmovl.u8 q9, \s1
|
yading@10
|
780 vmovl.u8 q12, \s4
|
yading@10
|
781 vmovl.u8 q8, \s0
|
yading@10
|
782 vmovl.u8 q13, \s5
|
yading@10
|
783 vmul.u16 q10, q10, d0[2]
|
yading@10
|
784 vmul.u16 q11, q11, d0[3]
|
yading@10
|
785 vmls.u16 q10, q9, d0[1]
|
yading@10
|
786 vmls.u16 q11, q12, d1[0]
|
yading@10
|
787 vmla.u16 q10, q8, d0[0]
|
yading@10
|
788 vmla.u16 q11, q13, d1[1]
|
yading@10
|
789 vqadd.s16 q11, q10, q11
|
yading@10
|
790 vqrshrun.s16 \d0, q11, #7
|
yading@10
|
791 .endm
|
yading@10
|
792
|
yading@10
|
@ 6-tap vertical filter producing TWO 8-pixel output rows (y and y+1)
@ from seven consecutive source rows \s0-\s6.
@ Row y uses \s0-\s5, row y+1 uses \s1-\s6.  \d0 = row y, \d1 = row y+1.
.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]         @ row y:   tap0*s0
        vmul.u16        q15, q11, d0[3]         @ row y:   tap3*s3
        vmul.u16        q11, q11, d0[2]         @ row y+1: tap2*s3
        vmul.u16        q14, q14, d1[1]         @ row y+1: tap5*s6
        vmls.u16        q10, q9,  d0[1]         @ row y:   - tap1*s1
        vmls.u16        q15, q12, d1[0]         @ row y:   - tap4*s4
        vmls.u16        q11, q8,  d0[1]         @ row y+1: - tap1*s2
        vmls.u16        q14, q13, d1[0]         @ row y+1: - tap4*s5
        vmla.u16        q10, q8,  d0[2]         @ row y:   + tap2*s2
        vmla.u16        q15, q13, d1[1]         @ row y:   + tap5*s5
        vmla.u16        q11, q9,  d0[0]         @ row y+1: + tap0*s1
        vmla.u16        q14, q12, d0[3]         @ row y+1: + tap3*s4
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm
|
yading@10
|
818
|
yading@10
|
@ 4-tap horizontal filter, 8 output pixels.  The 4-tap coefficients sit in
@ the middle of the 6-tap filter row: d0[1..3] and d1[0] (outer taps are 0).
@ \a:\b give the source bytes; \d receives the rounded result.
.macro  vp8_epel8_h4   d,   a,   b
        vext.8          d28, \a, \b, #1         @ src[x+1]
        vmovl.u8        q9,  \a                 @ src[x]
        vext.8          d29, \a, \b, #2         @ src[x+2]
        vmovl.u8        q10, d28
        vext.8          d30, \a, \b, #3         @ src[x+3]
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]         @ tap2 * src[x+1]
        vmul.u16        q11, q11, d0[3]         @ tap3 * src[x+2]
        vmls.u16        q10, q9,  d0[1]         @ - tap1 * src[x]
        vmls.u16        q11, q12, d1[0]         @ - tap4 * src[x+3]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7            @ round, >>7, narrow
.endm
|
yading@10
|
834
|
yading@10
|
@ 4-tap vertical filter producing two 8-pixel output rows from five
@ consecutive source rows \s0-\s4 (row y uses s0-s3, row y+1 uses s1-s4).
.macro  vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]         @ row y:   tap2*s1
        vmul.u16        q14, q11, d0[3]         @ row y:   tap3*s2
        vmul.u16        q11, q11, d0[2]         @ row y+1: tap2*s2
        vmul.u16        q15, q12, d0[3]         @ row y+1: tap3*s3
        vmls.u16        q8,  q9,  d0[1]         @ row y:   - tap1*s0
        vmls.u16        q14, q12, d1[0]         @ row y:   - tap4*s3
        vmls.u16        q11, q10, d0[1]         @ row y+1: - tap1*s1
        vmls.u16        q15, q13, d1[0]         @ row y+1: - tap4*s4
        vqadd.s16       q8,  q8,  q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7
        vqrshrun.s16    \d1, q11, #7
.endm
|
yading@10
|
854
|
yading@10
|
@ 16-wide, 6-tap vertical subpel MC.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride;
@     stack args: h, mx, my (mx is unused here; my selects the filter).
function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ filter needs rows y-2..y+3
        push            {r4,lr}
        vpush           {d8-d15}                @ loop uses callee-saved d8-d15

        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4,  lsl #4   @ &subpel_filters[my-1]
        vld1.16         {q0},     [r4,:128]     @ filter row -> q0
1:
        vld1.8          {d2-d3},  [r2], r3      @ seven consecutive 16-byte rows
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2   @ rewind 4: net advance 2 rows/iter

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14  @ left half
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15  @ right half

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2            @ two output rows per iteration
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc
|
yading@10
|
886
|
yading@10
|
@ 16-wide, 6-tap horizontal subpel MC.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride;
@     stack args: h, mx, my (my is unused here).
function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2            @ filter needs pixels x-2..x+3
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4   @ &subpel_filters[mx-1]
        vld1.16         {q0},  [r4,:128]
1:
        vld1.8          {d2-d4}, [r2], r3       @ 24 bytes: 16 outputs + 5 extra

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
|
yading@10
|
907
|
yading@10
|
@ 16-wide, 6-tap horizontal + 6-tap vertical subpel MC: two passes through
@ a 16-byte-aligned stack scratch buffer of (h+5) 16-byte rows (336 bytes
@ covers the maximum h).
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ start 2 rows above and
        sub             r2,  r2,  #2            @ 2 pixels left of the block
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4,  [sp, #28]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #336+16       @ scratch + room for alignment
        vld1.16         {q0},  [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ vertical pass needs h+5 rows
        bic             lr,  lr,  #15           @ 16-byte align scratch pointer
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+32]   @ my (above the scratch area)
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},  [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ six 16-byte scratch rows
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d28-d31},[lr,:128]
        sub             lr,  lr,  #48           @ net advance = 1 row/iter

        vp8_epel8_v6    d2,  d2,  d4,  d6,  d8,  d28, d30   @ left half
        vp8_epel8_v6    d3,  d3,  d5,  d7,  d9,  d29, d31   @ right half

        vst1.8          {d2-d3},  [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp,  sp,  #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc
|
yading@10
|
958
|
yading@10
|
@ 8-wide, 6-tap vertical subpel MC.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ filter needs rows y-2..y+3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},  [r4,:128]
1:
        vld1.8          {d2},  [r2], r3         @ seven consecutive 8-byte rows
        vld1.8          {d3},  [r2], r3
        vld1.8          {d4},  [r2], r3
        vld1.8          {d5},  [r2], r3
        vld1.8          {d6},  [r2], r3
        vld1.8          {d7},  [r2], r3
        vld1.8          {d28}, [r2]

        sub             r2,  r2,  r3,  lsl #2   @ net advance 2 rows/iter

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.8          {d2},  [r0,:64], r1
        vst1.8          {d3},  [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc
|
yading@10
|
988
|
yading@10
|
@ 8-wide, 6-tap horizontal subpel MC.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2,  r2,  #2            @ filter needs pixels x-2..x+3
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},  [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3       @ 16 bytes: 8 outputs + 5 extra

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
|
yading@10
|
1009
|
yading@10
|
@ 8-wide, 6-tap horizontal + 6-tap vertical subpel MC via a 16-byte-aligned
@ stack scratch buffer of (h+5) 8-byte rows.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ 2 rows up (6-tap vertical)
        sub             r2,  r2,  #2            @ 2 pixels left (6-tap horizontal)
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16       @ scratch + alignment slack
        vld1.16         {q0},  [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ vertical pass needs h+5 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},  [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5}, [lr,:128]!     @ seven 8-byte scratch rows
        vld1.8          {d6-d7}, [lr,:128]!
        vld1.8          {d30},   [lr,:64]
        sub             lr,  lr,  #32           @ net advance = 2 rows/iter

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
|
yading@10
|
1058
|
yading@10
|
@ 8-wide, 4-tap vertical subpel MC.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2,  r2,  r3            @ 4-tap filter needs rows y-1..y+2
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},  [r4,:128]
1:
        vld1.8          {d2}, [r2], r3          @ five consecutive 8-byte rows
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
        vld1.8          {d5}, [r2], r3
        vld1.8          {d6}, [r2]
        sub             r2,  r2,  r3,  lsl #1   @ net advance 2 rows/iter

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc
|
yading@10
|
1085
|
yading@10
|
@ 8-wide, 4-tap horizontal subpel MC.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2,  r2,  #1            @ 4-tap filter needs x-1..x+2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},  [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
|
yading@10
|
1106
|
yading@10
|
@ 8-wide, 4-tap horizontal + 4-tap vertical subpel MC via a stack scratch
@ buffer of (h+3) 8-byte rows.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2,  r2,  r3            @ 1 row up (4-tap vertical)
        sub             r2,  r2,  #1            @ 1 pixel left (4-tap horizontal)
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},  [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ vertical pass needs h+3 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},  [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5}, [lr,:128]!     @ five 8-byte scratch rows
        vld1.8          {d6},    [lr,:64]
        sub             lr,  lr,  #16           @ net advance = 2 rows/iter

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
|
yading@10
|
1154
|
yading@10
|
@ 8-wide, 6-tap horizontal + 4-tap vertical subpel MC via a stack scratch
@ buffer of (h+3) 8-byte rows.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2,  r2,  r3            @ 1 row up (4-tap vertical)
        sub             r2,  r2,  #2            @ 2 pixels left (6-tap horizontal)
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},  [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ vertical pass needs h+3 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},  [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5}, [lr,:128]!     @ five 8-byte scratch rows
        vld1.8          {d6},    [lr,:64]
        sub             lr,  lr,  #16           @ net advance = 2 rows/iter

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
|
yading@10
|
1202
|
yading@10
|
@ 8-wide, 4-tap horizontal + 6-tap vertical subpel MC via a stack scratch
@ buffer of (h+5) 8-byte rows.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ 2 rows up (6-tap vertical)
        sub             r2,  r2,  #1            @ 1 pixel left (4-tap horizontal)
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},  [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ vertical pass needs h+5 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},  [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5}, [lr,:128]!     @ seven 8-byte scratch rows
        vld1.8          {d6-d7}, [lr,:128]!
        vld1.8          {d30},   [lr,:64]
        sub             lr,  lr,  #32           @ net advance = 2 rows/iter

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
|
yading@10
|
1251
|
yading@10
|
1252 .ltorg
|
yading@10
|
1253
|
yading@10
|
@ 4-wide, 6-tap vertical subpel MC.  Two 4-pixel columns are packed into
@ the two 32-bit lanes of each d register so the 8-wide filter macro can
@ produce four output rows per iteration.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ filter needs rows y-2..y+3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},  [r4,:128]
1:
        vld1.32         {d2[]},  [r2], r3       @ rows 0..6 -> lane 0
        vld1.32         {d3[]},  [r2], r3
        vld1.32         {d4[]},  [r2], r3
        vld1.32         {d5[]},  [r2], r3
        vld1.32         {d6[]},  [r2], r3
        vld1.32         {d7[]},  [r2], r3
        vld1.32         {d28[]}, [r2]
        sub             r2,  r2,  r3,  lsl #2
        vld1.32         {d2[1]}, [r2], r3       @ rows 2..8 -> lane 1
        vld1.32         {d3[1]}, [r2], r3
        vld1.32         {d4[1]}, [r2], r3
        vld1.32         {d5[1]}, [r2], r3
        vld1.32         {d6[1]}, [r2], r3
        vld1.32         {d7[1]}, [r2], r3
        vld1.32         {d28[1]},[r2]
        sub             r2,  r2,  r3,  lsl #2   @ net advance 4 rows/iter

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4            @ four output rows per iteration
        bne             1b

        pop             {r4,pc}
endfunc
|
yading@10
|
1292
|
yading@10
|
@ 4-wide, 6-tap horizontal subpel MC (computes 8 pixels, stores the low 4).
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel4_h6_neon, export=1
        sub             r2,  r2,  #2            @ filter needs pixels x-2..x+3
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},  [r4,:128]
1:
        vld1.8          {q1},  [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]}, [r0,:32], r1   @ only 4 pixels are valid
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
|
yading@10
|
1311
|
yading@10
|
@ 4-wide, 6-tap horizontal + 6-tap vertical subpel MC via a stack scratch
@ buffer of (h+5) 4-byte rows.  The vertical pass packs pairs of rows into
@ d-register lanes (vtrn) so vp8_epel8_v6_y2 yields four rows per iteration.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ 2 rows up, 2 pixels left
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #52+16        @ scratch + alignment slack
        vld1.16         {q0},  [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ vertical pass needs h+5 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},  [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},  [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3}, [lr,:128]!     @ load two overlapping 7-row
        vld1.8          {d6},    [lr,:64]!      @ windows (offset 2 rows) ...
        vld1.32         {d28[]}, [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5}, [lr]!
        vld1.8          {d7},    [lr,:64]!
        vld1.32         {d28[1]},[lr,:32]
        sub             lr,  lr,  #16           @ net advance = 4 rows/iter
        vtrn.32         q1,  q2                 @ ... and interleave them into
        vtrn.32         d6,  d7                 @ the two 32-bit lanes
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc
|
yading@10
|
1362
|
yading@10
|
@ 4-wide, 4-tap horizontal + 6-tap vertical subpel MC via a stack scratch
@ buffer of (h+5) 4-byte rows; vertical pass as in epel4_h6v6.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ 2 rows up, 1 pixel left
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0},  [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ vertical pass needs h+5 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},  [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},  [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3}, [lr,:128]!     @ two overlapping 7-row windows
        vld1.8          {d6},    [lr,:64]!
        vld1.32         {d28[]}, [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5}, [lr]!
        vld1.8          {d7},    [lr,:64]!
        vld1.32         {d28[1]},[lr,:32]
        sub             lr,  lr,  #16           @ net advance = 4 rows/iter
        vtrn.32         q1,  q2                 @ interleave windows into lanes
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc
|
yading@10
|
1413
|
yading@10
|
@ 4-wide, 6-tap horizontal + 4-tap vertical subpel MC via a stack scratch
@ buffer of (h+3) 4-byte rows; vertical pass interleaves two windows via
@ vtrn and emits four rows per iteration.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2,  r2,  r3            @ 1 row up, 2 pixels left
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},  [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ vertical pass needs h+3 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},  [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},  [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3}, [lr,:128]!     @ two overlapping 5-row windows
        vld1.32         {d6[]},  [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5}, [lr]!
        vld1.32         {d6[1]}, [lr,:32]
        sub             lr,  lr,  #8            @ net advance = 4 rows/iter
        vtrn.32         q1,  q2                 @ interleave windows into lanes
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc
|
yading@10
|
1461
|
yading@10
|
@ 4-wide, 4-tap horizontal subpel MC (computes 8 pixels, stores the low 4).
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2,  r2,  #1            @ 4-tap filter needs x-1..x+2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},  [r4,:128]
1:
        vld1.8          {d2},  [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]}, [r0,:32], r1   @ only 4 pixels are valid
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
|
yading@10
|
1480
|
yading@10
|
@ 4-wide, 4-tap vertical subpel MC.  Two 4-pixel columns are packed into
@ the two 32-bit lanes of each d register; four output rows per iteration.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2,  r2,  r3            @ filter needs rows y-1..y+2
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},  [r4,:128]
1:
        vld1.32         {d2[]},  [r2], r3       @ rows 0..4 -> lane 0
        vld1.32         {d3[]},  [r2], r3
        vld1.32         {d4[]},  [r2], r3
        vld1.32         {d5[]},  [r2], r3
        vld1.32         {d6[]},  [r2]
        sub             r2,  r2,  r3,  lsl #1
        vld1.32         {d2[1]}, [r2], r3       @ rows 2..6 -> lane 1
        vld1.32         {d3[1]}, [r2], r3
        vld1.32         {d4[1]}, [r2], r3
        vld1.32         {d5[1]}, [r2], r3
        vld1.32         {d6[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #1   @ net advance 4 rows/iter

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc
|
yading@10
|
1515
|
yading@10
|
@ 4-wide, 4-tap horizontal + 4-tap vertical subpel MC via a stack scratch
@ buffer of (h+3) 4-byte rows; vertical pass as in epel4_h6v4.
@ In: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride; stack: h, mx, my.
function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2,  r2,  r3            @ 1 row up, 1 pixel left
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},  [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ vertical pass needs h+3 rows
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},  [r2], r3
        vp8_epel8_h4    d2,  d2,  d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},  [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3}, [lr,:128]!     @ two overlapping 5-row windows
        vld1.32         {d6[]},  [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5}, [lr]!
        vld1.32         {d6[1]}, [lr,:32]
        sub             lr,  lr,  #8            @ net advance = 4 rows/iter
        vtrn.32         q1,  q2                 @ interleave windows into lanes
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc
|
yading@10
|
1563
|
yading@10
|
1564 @ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
|
yading@10
|
1565 @ arithmetic can be used to apply filters
|
yading@10
|
@ VP8 subpel filter coefficients: one 8x16-bit row per mx/my value 1..7.
@ Callers index with subpel_filters-16 + (mx_or_my << 4).  Taps 1 and 4
@ are stored as magnitudes; the filter macros subtract them with vmls.
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst
|
yading@10
|
1575
|
yading@10
|
1576 /* Bilinear MC */
|
yading@10
|
1577
|
yading@10
|
@ 16-wide horizontal bilinear MC:
@   dst[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3
@ In: r0 = dst, r2 = src; r1 steps BOTH dst and src — assumes the two
@ stride arguments are equal (r3, the src-stride slot, is overwritten).
@ Stack: h, mx, my.  Two rows per iteration.
function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8            @ 8-mx
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2-d4},  [r2], r1      @ row + 1 extra pixel
        vext.8          q2,  q1,  q2,  #1       @ src[x+1]
        vmull.u8        q8,  d2,  d1            @ src[x]*(8-mx)
        vmlal.u8        q8,  d4,  d0            @ + src[x+1]*mx
        vld1.8          {d18-d20},[r2], r1      @ second row
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d5,  d0
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3            @ (+4) >> 3 with rounding
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2},  [r0,:128], r1
        vst1.8          {q3},  [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
|
yading@10
|
1608
|
yading@10
|
@ 16-wide vertical bilinear MC:
@   dst[y] = (src[y]*(8-my) + src[y+1]*my + 4) >> 3
@ In: r0 = dst, r2 = src; r1 steps BOTH dst and src — assumes the two
@ stride arguments are equal.  Stack: h, mx, my.  Two rows per iteration;
@ the last loaded row is carried in q1 across iterations.
function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8            @ 8-my
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {q1},  [r2], r1         @ prime q1 with row 0
1:
        subs            r12, r12, #2
        vld1.8          {q2},  [r2], r1         @ next row
        vmull.u8        q3,  d2,  d1            @ row y*(8-my)
        vmlal.u8        q3,  d4,  d0            @ + row y+1 * my
        vmull.u8        q8,  d3,  d1
        vmlal.u8        q8,  d5,  d0
        vld1.8          {q1},  [r2], r1         @ row after that (also next carry)
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d2,  d0
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3            @ (+4) >> 3 with rounding
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},  [r0,:128], r1
        vst1.8          {q3},  [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
|
yading@10
|
1638
|
yading@10
|
@ ff_put_vp8_bilin16_hv_neon: 16-pixel-wide 2-D (horizontal + vertical)
@ bilinear interpolation. Each source row is first filtered horizontally
@ with weights (8-mx, mx), rounded ((sum+4)>>3); the horizontally filtered
@ rows are then blended vertically with weights (8-my, my) and rounded again.
@ Registers: r0 = dst, r1 = stride, r2 = src;
@ stack: [sp] = h, [sp,#4] = mx, [sp,#8] = my.
@ q2 (d4/d5) carries the previous horizontally-filtered row across iterations.
function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8            @ 8 - mx
        vdup.8          d0,  r3                 @ horizontal weight mx
        vdup.8          d1,  r12                @ horizontal weight 8-mx
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8            @ 8 - my
        vdup.8          d2,  r3                 @ vertical weight my
        vdup.8          d3,  r12                @ vertical weight 8-my
        ldr             r12, [sp]               @ h; two rows per iteration

        vld1.8          {d4-d6},  [r2], r1      @ 17+ bytes of row 0
        vext.8          q3,  q2,  q3,  #1       @ q3 = row 0 shifted left by 1
        vmull.u8        q8,  d4,  d1            @ src[x]   * (8-mx)
        vmlal.u8        q8,  d6,  d0            @ + src[x+1] * mx
        vmull.u8        q9,  d5,  d1
        vmlal.u8        q9,  d7,  d0
        vrshrn.u16      d4,  q8,  #3            @ q2 = h-filtered row 0
        vrshrn.u16      d5,  q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d18-d20},[r2], r1      @ next source row
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r1      @ source row after that
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3            @ q3 = h-filtered row N+1
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3            @ prev row * (8-my)
        vmlal.u8        q12, d6,  d2            @ + row N+1 * my
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3            @ q2 = h-filtered row N+2
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3            @ second output row
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12}, [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10}, [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
|
yading@10
|
1694
|
yading@10
|
@ ff_put_vp8_bilin8_h_neon: 8-pixel-wide horizontal bilinear interpolation.
@ dst[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3
@ Registers: r0 = dst, r1 = stride, r2 = src; stack: [sp] = h, [sp,#4] = mx.
function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8            @ 8 - mx
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h; two rows per iteration
1:
        subs            r12, r12, #2
        vld1.8          {q1},     [r2], r1      @ 9+ bytes of row N
        vext.8          d3,  d2,  d3,  #1       @ d3 = row N shifted left by 1
        vmull.u8        q2,  d2,  d1            @ src[x]   * (8-mx)
        vmlal.u8        q2,  d3,  d0            @ + src[x+1] * mx
        vld1.8          {q3},     [r2], r1      @ row N+1
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d4,  q2,  #3            @ (sum + 4) >> 3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d16},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc
|
yading@10
|
1719
|
yading@10
|
@ ff_put_vp8_bilin8_v_neon: 8-pixel-wide vertical bilinear interpolation.
@ dst[x] = (src[x]*(8-my) + src[x+stride]*my + 4) >> 3
@ Registers: r0 = dst, r1 = stride, r2 = src; stack: [sp] = h, [sp,#8] = my.
@ d2 carries the last loaded row across loop iterations.
function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8            @ 8 - my
        vdup.8          d0,  r3                 @ weight of lower row (my)
        vdup.8          d1,  r12                @ weight of upper row (8-my)
        ldr             r12, [sp]               @ h; two rows per iteration
        vld1.8          {d2},     [r2], r1      @ prime d2 with first source row
1:
        subs            r12, r12, #2
        vld1.8          {d3},     [r2], r1      @ row N+1
        vmull.u8        q2,  d2,  d1            @ row N   * (8-my)
        vmlal.u8        q2,  d3,  d0            @ + row N+1 * my
        vld1.8          {d2},     [r2], r1      @ row N+2 (reused next iteration)
        vmull.u8        q3,  d3,  d1            @ second output row
        vmlal.u8        q3,  d2,  d0
        vrshrn.u16      d4,  q2,  #3            @ (sum + 4) >> 3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d6},     [r0,:64], r1
        bgt             1b

        bx              lr
endfunc
|
yading@10
|
1743
|
yading@10
|
@ ff_put_vp8_bilin8_hv_neon: 8-pixel-wide 2-D bilinear interpolation.
@ Rows are filtered horizontally with (8-mx, mx), rounded, then the
@ filtered rows are blended vertically with (8-my, my) and rounded again.
@ Registers: r0 = dst, r1 = stride, r2 = src;
@ stack: [sp] = h, [sp,#4] = mx, [sp,#8] = my.
@ d22 carries the previous horizontally-filtered row across iterations.
function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8            @ 8 - mx
        vdup.8          d0,  r3                 @ horizontal weight mx
        vdup.8          d1,  r12                @ horizontal weight 8-mx
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8            @ 8 - my
        vdup.8          d2,  r3                 @ vertical weight my
        vdup.8          d3,  r12                @ vertical weight 8-my
        ldr             r12, [sp]               @ h; two rows per iteration

        vld1.8          {q2},     [r2], r1      @ 9+ bytes of row 0
        vext.8          d5,  d4,  d5,  #1       @ row 0 shifted left by 1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3            @ d22 = h-filtered row 0
1:
        subs            r12, r12, #2
        vld1.8          {q3},     [r2], r1      @ row N+1
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vld1.8          {q2},     [r2], r1      @ row N+2
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d16, q8,  #3            @ d16 = h-filtered row N+1
        vmull.u8        q10, d22, d3            @ prev row  * (8-my)
        vmlal.u8        q10, d16, d2            @ + row N+1 * my
        vrshrn.u16      d22, q9,  #3            @ d22 = h-filtered row N+2
        vmull.u8        q12, d16, d3            @ second output row
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20},    [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc
|
yading@10
|
1784
|
yading@10
|
@ ff_put_vp8_bilin4_h_neon: 4-pixel-wide horizontal bilinear interpolation.
@ dst[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3
@ Two rows are packed into one d register via vtrn.32 so a single
@ vmull/vmlal pair filters both rows at once.
@ Registers: r0 = dst, r1 = stride, r2 = src; stack: [sp] = h, [sp,#4] = mx.
function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8            @ 8 - mx
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h; two rows per iteration
1:
        subs            r12, r12, #2
        vld1.8          {d2},     [r2], r1      @ row N
        vext.8          d3,  d2,  d3,  #1       @ row N shifted left by 1
        vld1.8          {d6},     [r2], r1      @ row N+1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         q1,  q3                 @ pack both rows into d2/d3
        vmull.u8        q2,  d2,  d1            @ src[x]   * (8-mx)
        vmlal.u8        q2,  d3,  d0            @ + src[x+1] * mx
        vrshrn.u16      d4,  q2,  #3            @ (sum + 4) >> 3
        vst1.32         {d4[0]},  [r0,:32], r1  @ row N result
        vst1.32         {d4[1]},  [r0,:32], r1  @ row N+1 result
        bgt             1b

        bx              lr
endfunc
|
yading@10
|
1807
|
yading@10
|
@ ff_put_vp8_bilin4_v_neon: 4-pixel-wide vertical bilinear interpolation.
@ dst[x] = (src[x]*(8-my) + src[x+stride]*my + 4) >> 3
@ Two row pairs are packed per d register (d2 = rows N,N+1; d3 = rows
@ N+1,N+2) so one vmull/vmlal pair produces two output rows; the vtrn.32
@ re-packs the registers to prime the next iteration.
@ Registers: r0 = dst, r1 = stride, r2 = src; stack: [sp] = h, [sp,#8] = my.
function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8            @ 8 - my
        vdup.8          d0,  r3                 @ weight of lower row (my)
        vdup.8          d1,  r12                @ weight of upper row (8-my)
        ldr             r12, [sp]               @ h; two rows per iteration
        vld1.32         {d2[]},   [r2], r1      @ prime d2 with first source row
1:
        vld1.32         {d3[]},   [r2]          @ d3 = row N+1 in both lanes
        vld1.32         {d2[1]},  [r2], r1      @ d2 = {row N, row N+1}
        vld1.32         {d3[1]},  [r2], r1      @ d3 = {row N+1, row N+2}
        vmull.u8        q2,  d2,  d1            @ upper rows * (8-my)
        vmlal.u8        q2,  d3,  d0            @ + lower rows * my
        vtrn.32         d3,  d2                 @ carry row N+2 into d2[0]
        vrshrn.u16      d4,  q2,  #3            @ (sum + 4) >> 3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        bx              lr
endfunc
|
yading@10
|
1830
|
yading@10
|
@ ff_put_vp8_bilin4_hv_neon: 4-pixel-wide 2-D bilinear interpolation.
@ Rows are filtered horizontally with (8-mx, mx), rounded, then blended
@ vertically with (8-my, my) and rounded again. Two rows are packed per
@ d register (vtrn.32) so each vmull/vmlal pair covers both.
@ Registers: r0 = dst, r1 = stride, r2 = src;
@ stack: [sp] = h, [sp,#4] = mx, [sp,#8] = my.
@ d22 carries the previous horizontally-filtered row across iterations.
function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8            @ 8 - mx
        vdup.8          d0,  r3                 @ horizontal weight mx
        vdup.8          d1,  r12                @ horizontal weight 8-mx
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8            @ 8 - my
        vdup.8          d2,  r3                 @ vertical weight my
        vdup.8          d3,  r12                @ vertical weight 8-my
        ldr             r12, [sp]               @ h; two rows per iteration

        vld1.8          {d4},     [r2], r1      @ row 0
        vext.8          d5,  d4,  d4,  #1       @ row 0 shifted left by 1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3            @ d22 = h-filtered row 0
1:
        subs            r12, r12, #2
        vld1.8          {d6},     [r2], r1      @ row N+1
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},     [r2], r1      @ row N+2
        vext.8          d5,  d4,  d4,  #1
        vtrn.32         q3,  q2                 @ pack both rows into d6/d7
        vmull.u8        q8,  d6,  d1            @ horizontal filter, both rows
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d16, q8,  #3            @ d16 = h-filtered rows N+1,N+2
        vmull.u8        q10, d16, d2            @ lower rows * my
        vtrn.32         d22, d16                @ pair each row with the one above
        vmlal.u8        q10, d22, d3            @ + upper rows * (8-my)
        vrev64.32       d22, d16                @ carry row N+2 for next iteration
        vrshrn.u16      d20, q10, #3            @ (sum + 4) >> 3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc
|