/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
yading@10
|
#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
 *
 * 8-pixel-wide bilinear chroma interpolation, two rows per loop iteration.
 * The four 1/8-pel tap weights A = (8-x)(8-y), B = x(8-y), C = (8-x)y and
 * D = xy sum to 64, hence the final narrowing shift by 6.
 * \type is put or avg; \codec selects H.264 rounding (vrshrn) or the RV40
 * bias table (vadd of q11 then vshrn).
 */
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]     @ r4 = x, r5 = y (stack args 5/6)
.ifc \type,avg
        mov             lr,  r0                 @ lr = read pointer over dst for averaging
.endif
        pld             [r1]
        pld             [r1, r2]

.ifc \codec,rv40
        @ splat bias = rv40bias[(y>>1)*4 + (x>>1)] into every lane of q11
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3   @ + (y>>1) * 8 bytes (one row of 4 .short)
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1   @ + (x>>1) * 2 bytes
        vld1.16         {d22[],d23[]}, [r6,:16]
.endif

        @ Weight setup.  A/T prefixes are the ARM/Thumb alternatives
        @ (macros from asm.S); Thumb needs an explicit cmp since its
        @ mul form here does not set the flags.
A       muls            r7,  r4,  r5            @ r7  = x*y                 (D)
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3   @ r6  = 8*y - x*y           (C)
        rsb             r12, r7,  r4,  lsl #3   @ r12 = 8*x - x*y           (B)
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64           @ r4  = 64 - 8*x - 8*y + x*y (A)

        beq             2f                      @ x*y == 0: fall back to 1-D / copy paths

        vdup.8          d0,  r4                 @ d0 = A
        vdup.8          d1,  r12                @ d1 = B
        vld1.8          {d4, d5}, [r1], r2
        vdup.8          d2,  r6                 @ d2 = C
        vdup.8          d3,  r7                 @ d3 = D
        vext.8          d5,  d4,  d5,  #1       @ d5 = row shifted left one pixel

        @ Full 2-D loop: q8/q9 accumulate A*p + B*p(+1,0) + C*p(0,+1) + D*p(+1,+1)
1:      vld1.8          {d6, d7}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vext.8          d7,  d6,  d7,  #1
        vld1.8          {d4, d5}, [r1], r2
        vmlal.u8        q8,  d6,  d2
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        pld             [r1, r2]
.ifc \codec,h264
        vrshrn.u16      d16, q8,  #6            @ round-to-nearest >> 6
        vrshrn.u16      d17, q9,  #6
.else
        vadd.u16        q8,  q8,  q11           @ rv40: add table bias, truncate >> 6
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
.endif
.ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10           @ round-half-up average with dst
.endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

        @ x*y == 0: only one of B/C can be non-zero, so fold them into a
        @ single second tap weight in r12.
2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4                 @ d0 = A
        vdup.8          d1,  r12                @ d1 = B + C
        beq             4f                      @ C == 0: horizontal-only loop

        @ Vertical-only loop (x == 0), two rows per iteration
        vld1.8          {d4}, [r1], r2

3:      vld1.8          {d6}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        pld             [r1]
.ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
.endif
        pld             [r1, r2]
.ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

        @ Horizontal-only loop (y == 0), two rows per iteration
4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1, r2]
.ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
.endif
.ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             4b

        pop             {r4-r7, pc}
endfunc
.endm
|
yading@10
|
166
|
yading@10
|
/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
 *
 * 4-pixel-wide bilinear chroma interpolation.  Same weight math as the
 * mc8 variant (A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy, sum 64),
 * but two 4-pixel row halves are packed into one d-register with
 * vtrn.32 so each vmull processes a whole output row pair's worth of
 * taps; the cross-half sums are folded with vadd.i16 before narrowing.
 */
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]     @ r4 = x, r5 = y (stack args 5/6)
.ifc \type,avg
        mov             lr,  r0                 @ lr = read pointer over dst for averaging
.endif
        pld             [r1]
        pld             [r1, r2]

.ifc \codec,rv40
        @ splat bias = rv40bias[(y>>1)*4 + (x>>1)] into every lane of q11
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
.endif

        @ Weight setup (A/T = ARM/Thumb alternatives from asm.S)
A       muls            r7,  r4,  r5            @ r7  = x*y                 (D)
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3   @ r6  = 8*y - x*y           (C)
        rsb             r12, r7,  r4,  lsl #3   @ r12 = 8*x - x*y           (B)
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64           @ r4  = 64 - 8*x - 8*y + x*y (A)

        beq             2f                      @ x*y == 0: 1-D / copy paths

        vdup.8          d0,  r4                 @ d0 = A
        vdup.8          d1,  r12                @ d1 = B
        vld1.8          {d4}, [r1], r2
        vdup.8          d2,  r6                 @ d2 = C
        vdup.8          d3,  r7                 @ d3 = D

        vext.8          d5,  d4,  d5,  #1       @ shifted copy of the row
        vtrn.32         d4,  d5                 @ pack {row, row+1px} halves together

        vtrn.32         d0,  d1                 @ interleave weights to match the packing
        vtrn.32         d2,  d3

        @ Full 2-D loop, two output rows per iteration
1:      vld1.8          {d6}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        pld             [r1]
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vadd.i16        d16, d16, d17           @ fold packed halves: row 0 result
        vadd.i16        d17, d18, d19           @ row 1 result
.ifc \codec,h264
        vrshrn.u16      d16, q8,  #6            @ round-to-nearest >> 6
.else
        vadd.u16        q8,  q8,  q11           @ rv40: add table bias, truncate >> 6
        vshrn.u16       d16, q8,  #6
.endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20           @ round-half-up average with dst
.endif
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

        @ x*y == 0: only one of B/C is non-zero; fold into one weight
2:      tst             r6,  r6
        add             r12, r12, r6
        vdup.8          d0,  r4                 @ d0 = A
        vdup.8          d1,  r12                @ d1 = B + C
        vtrn.32         d0,  d1
        beq             4f                      @ C == 0: horizontal-only loop

        @ Vertical-only loop (x == 0); successive rows share d4 lanes
        vext.32         d1,  d0,  d1,  #1
        vld1.32         {d4[0]}, [r1], r2

3:      vld1.32         {d4[1]}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]}, [r1], r2
        vmull.u8        q9,  d4,  d1
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
.ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
.else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
.endif
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

        @ Horizontal-only loop (y == 0), two rows per iteration
4:      vld1.8          {d4}, [r1], r2
        vld1.8          {d6}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
.ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
.else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
.endif
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             4b

        pop             {r4-r7, pc}
endfunc
.endm
|
yading@10
|
310
|
yading@10
|
/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
 *
 * 2-pixel-wide bilinear chroma interpolation, H.264 rounding only
 * (no rv40 variant).  Three input rows are gathered into q2 per
 * iteration so one vmull/vmlal pair produces two output rows; the
 * packed partial sums are folded with vtrn.32 + vadd.i16.
 */
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]          @ r4 = x (stack arg 5)
        ldr             lr,  [sp, #20]          @ lr = y (stack arg 6)
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f                      @ x == 0 && y == 0: plain copy/avg path

        @ Bilinear weights, same identities as mc8/mc4:
        mul             r5,  r4,  lr            @ r5  = x*y                 (D)
        rsb             r6,  r5,  lr,  lsl #3   @ r6  = 8*y - x*y           (C)
        rsb             r12, r5,  r4,  lsl #3   @ r12 = 8*x - x*y           (B)
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64           @ r4  = 64 - 8*x - 8*y + x*y (A)
        vdup.8          d0,  r4                 @ d0 = A
        vdup.8          d2,  r12                @ d2 = B
        vdup.8          d1,  r6                 @ d1 = C
        vdup.8          d3,  r5                 @ d3 = D
        vtrn.16         q0,  q1                 @ interleave weights to match pixel packing
1:
        @ Gather rows n, n+1 (d4) and n+1, n+2 (d5); build the +1-pixel
        @ neighbours with vext/vtrn, then one mull+mlal covers both rows.
        vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]},  [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
.ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2            @ rewind dst after the read-ahead
.endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17           @ fold packed partial sums
        vrshrn.u16      d16, q8,  #6            @ round-to-nearest >> 6
.ifc \type,avg
        vrhadd.u8       d16, d16, d18           @ round-half-up average with dst
.endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b
        pop             {r4-r6, pc}
2:
.ifc \type,put
        @ Zero-offset put: straight 16-bit copies (ldrh_post/strh_post
        @ are post-increment helpers from asm.S)
        ldrh_post       r5,  r1,  r2
        strh_post       r5,  r0,  r2
        ldrh_post       r6,  r1,  r2
        strh_post       r6,  r0,  r2
.else
        @ Zero-offset avg: average src directly with dst
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
.endif
        subs            r3,  r3,  #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm
|
yading@10
|
378
|
yading@10
|
/* H.264: instantiate put and avg variants for 8-, 4- and 2-pixel widths */
#if CONFIG_H264_DECODER
h264_chroma_mc8 put
h264_chroma_mc8 avg
h264_chroma_mc4 put
h264_chroma_mc4 avg
h264_chroma_mc2 put
h264_chroma_mc2 avg
#endif
|
yading@10
|
387
|
yading@10
|
#if CONFIG_RV40_DECODER
/* RV40 rounding bias added before the truncating >>6 in the mc macros.
 * The macros index this as rv40bias[(y>>1)*4 + (x>>1)] (row stride
 * 8 bytes = 4 .short entries), so rows select y>>1, columns x>>1. */
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

/* RV40: 8- and 4-wide variants only (no 2-wide) */
h264_chroma_mc8 put, rv40
h264_chroma_mc8 avg, rv40
h264_chroma_mc4 put, rv40
h264_chroma_mc4 avg, rv40
#endif
|