/*
 * ARM NEON IDCT
 *
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * Based on Simple IDCT
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * Fixed-point scaling.
 * ROW_SHIFT is the right shift applied to the 32-bit results of the first
 * (row) pass, COL_SHIFT the total right shift applied to the results of the
 * second (column) pass.
 */
#define ROW_SHIFT 11
#define COL_SHIFT 20

/*
 * Transform coefficients: Wi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5,
 * truncated to an integer.  Note that W4 is 16383 although the formula
 * evaluates to 16384 for i = 4.
 */
#define W1  22725
#define W2  21407
#define W3  19266
#define W4  16383
#define W5  12873
#define W6  8867
#define W7  4520

/*
 * Rounding term of the column pass, pre-divided by W4: it is added to
 * input 0 before the multiplication by W4, so the product carries roughly
 * 1 << (COL_SHIFT-1).  With the values above this is 32 in integer
 * arithmetic.
 */
#define W4c ((1<<(COL_SHIFT-1))/W4)

/*
 * NEON scalar (lane) aliases for the coefficients.  idct_start loads the
 * idct_coeff_neon table (W1..W7, W4c, in that order) into d0/d1, so lane k
 * of d0 holds W(k+1) and d1 continues with W5, W6, W7 and W4c.
 */
#define w1  d0[0]
#define w2  d0[1]
#define w3  d0[2]
#define w4  d0[3]
#define w5  d1[0]
#define w6  d1[1]
#define w7  d1[2]
#define w4c d1[3]

/*
 * idct_col4_top: the part of four parallel 8-point transforms that depends
 * on inputs 0-3 only.  Shared by idct_row4_neon and idct_col4_neon.
 *
 * In:   q15     W4 * col[0] plus rounding term (4 x s32), set up by the caller
 *       d4      col[1]  (4 x s16)
 *       d6      col[2]  (4 x s16)
 *       d8      col[3]  (4 x s16)
 *       d0/d1   coefficients, addressed through the w1..w7 lane aliases
 *
 * Out:  q11 = q15 + W2*col[2]            q12 = q15 + W6*col[2]
 *       q13 = q15 - W6*col[2]            q14 = q15 - W2*col[2]
 *       q9  = W1*col[1] + W3*col[3]      q10 = W3*col[1] - W7*col[3]
 *       q5  = W5*col[1] - W1*col[3]      q6  = W7*col[1] - W5*col[3]
 *
 * Clobbers: q7 (left as W2*col[2]) and q8 (left as W6*col[2]).
 *
 * The macro contains NEON data-processing instructions only, so the ARM
 * condition flags are not modified; idct_col4_neon sets the flags before
 * invoking the macro and tests them afterwards.
 */
        .macro idct_col4_top
        vmull.s16       q7,  d6,  w2    /* q7   = W2 * col[2] */
        vmull.s16       q8,  d6,  w6    /* q8   = W6 * col[2] */
        vmull.s16       q9,  d4,  w1    /* q9   = W1 * col[1] */
        vadd.i32        q11, q15, q7    /* q11  = q15 + W2 * col[2] */
        vmull.s16       q10, d4,  w3    /* q10  = W3 * col[1] */
        vadd.i32        q12, q15, q8    /* q12  = q15 + W6 * col[2] */
        vmull.s16       q5,  d4,  w5    /* q5   = W5 * col[1] */
        vsub.i32        q13, q15, q8    /* q13  = q15 - W6 * col[2] */
        vmull.s16       q6,  d4,  w7    /* q6   = W7 * col[1] */
        vsub.i32        q14, q15, q7    /* q14  = q15 - W2 * col[2] */

        vmlal.s16       q9,  d8,  w3    /* q9  += W3 * col[3] */
        vmlsl.s16       q10, d8,  w7    /* q10 -= W7 * col[3] */
        vmlsl.s16       q5,  d8,  w1    /* q5  -= W1 * col[3] */
        vmlsl.s16       q6,  d8,  w5    /* q6  -= W5 * col[3] */
        .endm

66 .text
|
yading@10
|
67 .align 6
|
yading@10
|
68
|
yading@10
|
69 function idct_row4_pld_neon
|
yading@10
|
70 pld [r0]
|
yading@10
|
71 add r3, r0, r1, lsl #2
|
yading@10
|
72 pld [r0, r1]
|
yading@10
|
73 pld [r0, r1, lsl #1]
|
yading@10
|
74 A pld [r3, -r1]
|
yading@10
|
75 pld [r3]
|
yading@10
|
76 pld [r3, r1]
|
yading@10
|
77 add r3, r3, r1, lsl #1
|
yading@10
|
78 pld [r3]
|
yading@10
|
79 pld [r3, r1]
|
yading@10
|
80 endfunc
|
yading@10
|
81
|
yading@10
|
82 function idct_row4_neon
|
yading@10
|
83 vmov.i32 q15, #(1<<(ROW_SHIFT-1))
|
yading@10
|
84 vld1.64 {d2-d5}, [r2,:128]!
|
yading@10
|
85 vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */
|
yading@10
|
86 vld1.64 {d6,d7}, [r2,:128]!
|
yading@10
|
87 vorr d10, d3, d5
|
yading@10
|
88 vld1.64 {d8,d9}, [r2,:128]!
|
yading@10
|
89 add r2, r2, #-64
|
yading@10
|
90
|
yading@10
|
91 vorr d11, d7, d9
|
yading@10
|
92 vorr d10, d10, d11
|
yading@10
|
93 vmov r3, r4, d10
|
yading@10
|
94
|
yading@10
|
95 idct_col4_top
|
yading@10
|
96
|
yading@10
|
97 orrs r3, r3, r4
|
yading@10
|
98 beq 1f
|
yading@10
|
99
|
yading@10
|
100 vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
|
yading@10
|
101 vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
|
yading@10
|
102 vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
|
yading@10
|
103 vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
|
yading@10
|
104 vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
|
yading@10
|
105 vadd.i32 q11, q11, q7
|
yading@10
|
106 vsub.i32 q12, q12, q7
|
yading@10
|
107 vsub.i32 q13, q13, q7
|
yading@10
|
108 vadd.i32 q14, q14, q7
|
yading@10
|
109 vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
|
yading@10
|
110 vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
|
yading@10
|
111 vmlal.s16 q9, d9, w7
|
yading@10
|
112 vmlsl.s16 q10, d9, w5
|
yading@10
|
113 vmlal.s16 q5, d9, w3
|
yading@10
|
114 vmlsl.s16 q6, d9, w1
|
yading@10
|
115 vadd.i32 q11, q11, q7
|
yading@10
|
116 vsub.i32 q12, q12, q8
|
yading@10
|
117 vadd.i32 q13, q13, q8
|
yading@10
|
118 vsub.i32 q14, q14, q7
|
yading@10
|
119
|
yading@10
|
120 1: vadd.i32 q3, q11, q9
|
yading@10
|
121 vadd.i32 q4, q12, q10
|
yading@10
|
122 vshrn.i32 d2, q3, #ROW_SHIFT
|
yading@10
|
123 vshrn.i32 d4, q4, #ROW_SHIFT
|
yading@10
|
124 vadd.i32 q7, q13, q5
|
yading@10
|
125 vadd.i32 q8, q14, q6
|
yading@10
|
126 vtrn.16 d2, d4
|
yading@10
|
127 vshrn.i32 d6, q7, #ROW_SHIFT
|
yading@10
|
128 vshrn.i32 d8, q8, #ROW_SHIFT
|
yading@10
|
129 vsub.i32 q14, q14, q6
|
yading@10
|
130 vsub.i32 q11, q11, q9
|
yading@10
|
131 vtrn.16 d6, d8
|
yading@10
|
132 vsub.i32 q13, q13, q5
|
yading@10
|
133 vshrn.i32 d3, q14, #ROW_SHIFT
|
yading@10
|
134 vtrn.32 d2, d6
|
yading@10
|
135 vsub.i32 q12, q12, q10
|
yading@10
|
136 vtrn.32 d4, d8
|
yading@10
|
137 vshrn.i32 d5, q13, #ROW_SHIFT
|
yading@10
|
138 vshrn.i32 d7, q12, #ROW_SHIFT
|
yading@10
|
139 vshrn.i32 d9, q11, #ROW_SHIFT
|
yading@10
|
140
|
yading@10
|
141 vtrn.16 d3, d5
|
yading@10
|
142 vtrn.16 d7, d9
|
yading@10
|
143 vtrn.32 d3, d7
|
yading@10
|
144 vtrn.32 d5, d9
|
yading@10
|
145
|
yading@10
|
146 vst1.64 {d2-d5}, [r2,:128]!
|
yading@10
|
147 vst1.64 {d6-d9}, [r2,:128]!
|
yading@10
|
148
|
yading@10
|
149 bx lr
|
yading@10
|
150 endfunc
|
yading@10
|
151
|
yading@10
|
/*
 * idct_col4_neon: second-pass 8-point transform of four adjacent columns.
 *
 * In:   r2      pointer to the first of the four columns in row 0; rows
 *               are 16 bytes apart and each access needs 8-byte alignment
 *       d0/d1   coefficient table (loaded by idct_start)
 * Out:  d2..d9  output rows 0..7 (4 x s16 each), already shifted right by
 *               16 (vaddhn/vsubhn keep the high halves); the remaining
 *               COL_SHIFT-16 bits are shifted out by the store helpers
 *       r2      advanced by 128 on every path
 * Clobbers: r4-r7, ip, q5-q15, flags.
 *
 * Rows 4-7 are first inspected with integer loads; a row whose four
 * coefficients are all zero is skipped (r2 is still stepped past it).
 * The flags produced by the first orrs are consumed only after
 * idct_col4_top, which is safe because that macro contains NEON
 * instructions only.
 */
function idct_col4_neon
        mov             ip,  #16                   /* row stride in bytes */
        vld1.64         {d2}, [r2,:64], ip /* d2 = col[0] */
        vdup.16         d30, w4c
        vld1.64         {d4}, [r2,:64], ip /* d4 = col[1] */
        vadd.i16        d30, d30, d2
        vld1.64         {d6}, [r2,:64], ip /* d6 = col[2] */
        vmull.s16       q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/
        vld1.64         {d8}, [r2,:64], ip /* d8 = col[3] */

        ldrd            r4,  r5,  [r2]          /* r4:r5 = raw bits of col[4] */
        ldrd            r6,  r7,  [r2, #16]     /* r6:r7 = raw bits of col[5] */
        orrs            r4,  r4,  r5            /* Z set if col[4] is all zero */

        idct_col4_top
        it              eq
        addeq           r2,  r2,  #16           /* skip col[4] */
        beq             1f

        vld1.64         {d3}, [r2,:64], ip /* d3 = col[4] */
        vmull.s16       q7,  d3,  w4    /* q7   = W4 * col[4] */
        vadd.i32        q11, q11, q7
        vsub.i32        q12, q12, q7
        vsub.i32        q13, q13, q7
        vadd.i32        q14, q14, q7

1:      orrs            r6,  r6,  r7            /* Z set if col[5] is all zero */
        ldrd            r4,  r5,  [r2, #16]     /* r4:r5 = raw bits of col[6]; flags unchanged */
        it              eq
        addeq           r2,  r2,  #16           /* skip col[5] */
        beq             2f

        vld1.64         {d5}, [r2,:64], ip /* d5 = col[5] */
        vmlal.s16       q9,  d5,  w5    /* q9  += W5 * col[5] */
        vmlsl.s16       q10, d5,  w1    /* q10 -= W1 * col[5] */
        vmlal.s16       q5,  d5,  w7    /* q5  += W7 * col[5] */
        vmlal.s16       q6,  d5,  w3    /* q6  += W3 * col[5] */

2:      orrs            r4,  r4,  r5            /* Z set if col[6] is all zero */
        ldrd            r4,  r5,  [r2, #16]     /* r4:r5 = raw bits of col[7]; flags unchanged */
        it              eq
        addeq           r2,  r2,  #16           /* skip col[6] */
        beq             3f

        vld1.64         {d7}, [r2,:64], ip /* d7 = col[6] */
        vmull.s16       q7,  d7,  w6    /* q7   = W6 * col[6] */
        vmull.s16       q8,  d7,  w2    /* q8   = W2 * col[6] */
        vadd.i32        q11, q11, q7
        vsub.i32        q14, q14, q7
        vsub.i32        q12, q12, q8
        vadd.i32        q13, q13, q8

3:      orrs            r4,  r4,  r5            /* Z set if col[7] is all zero */
        it              eq
        addeq           r2,  r2,  #16           /* skip col[7] */
        beq             4f

        vld1.64         {d9}, [r2,:64], ip /* d9 = col[7] */
        vmlal.s16       q9,  d9,  w7    /* q9  += W7 * col[7] */
        vmlsl.s16       q10, d9,  w5    /* q10 -= W5 * col[7] */
        vmlal.s16       q5,  d9,  w3    /* q5  += W3 * col[7] */
        vmlsl.s16       q6,  d9,  w1    /* q6  -= W1 * col[7] */

        /* sums give output rows 0-3, differences give rows 7-4 */
4:      vaddhn.i32      d2,  q11, q9    /* row 0 */
        vaddhn.i32      d3,  q12, q10   /* row 1 */
        vaddhn.i32      d4,  q13, q5    /* row 2 */
        vaddhn.i32      d5,  q14, q6    /* row 3 */
        vsubhn.i32      d9,  q11, q9    /* row 7 */
        vsubhn.i32      d8,  q12, q10   /* row 6 */
        vsubhn.i32      d7,  q13, q5    /* row 5 */
        vsubhn.i32      d6,  q14, q6    /* row 4 */

        bx              lr
endfunc

227 .align 6
|
yading@10
|
228
|
yading@10
|
229 function idct_col4_st8_neon
|
yading@10
|
230 vqshrun.s16 d2, q1, #COL_SHIFT-16
|
yading@10
|
231 vqshrun.s16 d3, q2, #COL_SHIFT-16
|
yading@10
|
232 vqshrun.s16 d4, q3, #COL_SHIFT-16
|
yading@10
|
233 vqshrun.s16 d5, q4, #COL_SHIFT-16
|
yading@10
|
234 vst1.32 {d2[0]}, [r0,:32], r1
|
yading@10
|
235 vst1.32 {d2[1]}, [r0,:32], r1
|
yading@10
|
236 vst1.32 {d3[0]}, [r0,:32], r1
|
yading@10
|
237 vst1.32 {d3[1]}, [r0,:32], r1
|
yading@10
|
238 vst1.32 {d4[0]}, [r0,:32], r1
|
yading@10
|
239 vst1.32 {d4[1]}, [r0,:32], r1
|
yading@10
|
240 vst1.32 {d5[0]}, [r0,:32], r1
|
yading@10
|
241 vst1.32 {d5[1]}, [r0,:32], r1
|
yading@10
|
242
|
yading@10
|
243 bx lr
|
yading@10
|
244 endfunc
|
yading@10
|
245
|
yading@10
|
/*
 * Coefficient table, eight 16-bit values.  idct_start loads it into d0/d1,
 * where the entries are addressed through the w1..w7 / w4c lane aliases, so
 * the order here must match those aliases.
 * NOTE(review): idct_start reads the table with a :128 (16-byte) alignment
 * hint; align=4 is interpreted by the const macro in asm.S and presumably
 * requests 2^4-byte alignment - confirm against asm.S.
 */
const   idct_coeff_neon, align=4
        .short W1, W2, W3, W4, W5, W6, W7, W4c
endconst

/*
 * idct_start data: common prologue of the exported entry points.
 *
 *   data  register holding the coefficient block pointer; only used to
 *         prefetch the block (at data and data + 64)
 *
 * Saves r4-r7 and lr (r4-r7 are used as scratch by the row/column helpers)
 * and d8-d15 (q4-q7, overwritten by the transform), then loads the
 * coefficient table into d0/d1.
 * Clobbers: r3.  Must be paired with idct_end, which undoes both pushes.
 */
        .macro idct_start data
        push            {r4-r7, lr}
        pld             [\data]
        pld             [\data, #64]
        vpush           {d8-d15}
        movrel          r3,  idct_coeff_neon
        vld1.64         {d0,d1}, [r3,:128]      /* d0/d1 = W1..W7, W4c */
        .endm

/*
 * idct_end: common epilogue of the exported entry points.  Restores the
 * registers saved by idct_start, in reverse order, and returns to the
 * caller by popping the saved lr into pc.
 */
        .macro idct_end
        vpop            {d8-d15}
        pop             {r4-r7, pc}
        .endm

/*
 * void ff_simple_idct_put_neon(uint8_t *dst, int line_size, int16_t *data);
 *
 * Inverse-transforms the 128-byte coefficient block at data and writes the
 * result, saturated to 8 bits, to eight lines of dst.
 *
 * In:   r0 = dst, r1 = line_size, r2 = data
 * The row pass rewrites the coefficient block in place.
 */
function ff_simple_idct_put_neon, export=1
        idct_start      r2

        /* row pass: 2 x 64 bytes; the first call also prefetches dst and
         * falls through into idct_row4_neon */
        bl              idct_row4_pld_neon
        bl              idct_row4_neon
        add             r2,  r2,  #-128         /* back to the start of the block */
        /* column pass, columns 0-3 */
        bl              idct_col4_neon          /* advances r2 by 128 */
        bl              idct_col4_st8_neon      /* advances r0 by 8 lines */
        sub             r0,  r0,  r1, lsl #3    /* back to line 0 ... */
        add             r0,  r0,  #4            /* ... 4 pixels to the right */
        add             r2,  r2,  #-120         /* block start + 8 bytes = column 4 */
        /* column pass, columns 4-7 */
        bl              idct_col4_neon
        bl              idct_col4_st8_neon

        idct_end
endfunc

282 .align 6
|
yading@10
|
283
|
yading@10
|
284 function idct_col4_add8_neon
|
yading@10
|
285 mov ip, r0
|
yading@10
|
286
|
yading@10
|
287 vld1.32 {d10[0]}, [r0,:32], r1
|
yading@10
|
288 vshr.s16 q1, q1, #COL_SHIFT-16
|
yading@10
|
289 vld1.32 {d10[1]}, [r0,:32], r1
|
yading@10
|
290 vshr.s16 q2, q2, #COL_SHIFT-16
|
yading@10
|
291 vld1.32 {d11[0]}, [r0,:32], r1
|
yading@10
|
292 vshr.s16 q3, q3, #COL_SHIFT-16
|
yading@10
|
293 vld1.32 {d11[1]}, [r0,:32], r1
|
yading@10
|
294 vshr.s16 q4, q4, #COL_SHIFT-16
|
yading@10
|
295 vld1.32 {d12[0]}, [r0,:32], r1
|
yading@10
|
296 vaddw.u8 q1, q1, d10
|
yading@10
|
297 vld1.32 {d12[1]}, [r0,:32], r1
|
yading@10
|
298 vaddw.u8 q2, q2, d11
|
yading@10
|
299 vld1.32 {d13[0]}, [r0,:32], r1
|
yading@10
|
300 vqmovun.s16 d2, q1
|
yading@10
|
301 vld1.32 {d13[1]}, [r0,:32], r1
|
yading@10
|
302 vaddw.u8 q3, q3, d12
|
yading@10
|
303 vst1.32 {d2[0]}, [ip,:32], r1
|
yading@10
|
304 vqmovun.s16 d3, q2
|
yading@10
|
305 vst1.32 {d2[1]}, [ip,:32], r1
|
yading@10
|
306 vaddw.u8 q4, q4, d13
|
yading@10
|
307 vst1.32 {d3[0]}, [ip,:32], r1
|
yading@10
|
308 vqmovun.s16 d4, q3
|
yading@10
|
309 vst1.32 {d3[1]}, [ip,:32], r1
|
yading@10
|
310 vqmovun.s16 d5, q4
|
yading@10
|
311 vst1.32 {d4[0]}, [ip,:32], r1
|
yading@10
|
312 vst1.32 {d4[1]}, [ip,:32], r1
|
yading@10
|
313 vst1.32 {d5[0]}, [ip,:32], r1
|
yading@10
|
314 vst1.32 {d5[1]}, [ip,:32], r1
|
yading@10
|
315
|
yading@10
|
316 bx lr
|
yading@10
|
317 endfunc
|
yading@10
|
318
|
yading@10
|
/*
 * void ff_simple_idct_add_neon(uint8_t *dst, int line_size, int16_t *data);
 *
 * Inverse-transforms the 128-byte coefficient block at data and adds the
 * result to eight lines of dst, saturating to 8 bits.
 *
 * In:   r0 = dst, r1 = line_size, r2 = data
 * The row pass rewrites the coefficient block in place.
 */
function ff_simple_idct_add_neon, export=1
        idct_start      r2

        /* row pass: 2 x 64 bytes; the first call also prefetches dst and
         * falls through into idct_row4_neon */
        bl              idct_row4_pld_neon
        bl              idct_row4_neon
        add             r2,  r2,  #-128         /* back to the start of the block */
        /* column pass, columns 0-3 */
        bl              idct_col4_neon          /* advances r2 by 128 */
        bl              idct_col4_add8_neon     /* advances r0 by 8 lines */
        sub             r0,  r0,  r1, lsl #3    /* back to line 0 ... */
        add             r0,  r0,  #4            /* ... 4 pixels to the right */
        add             r2,  r2,  #-120         /* block start + 8 bytes = column 4 */
        /* column pass, columns 4-7 */
        bl              idct_col4_neon
        bl              idct_col4_add8_neon

        idct_end
endfunc

337 .align 6
|
yading@10
|
338
|
yading@10
|
339 function idct_col4_st16_neon
|
yading@10
|
340 mov ip, #16
|
yading@10
|
341
|
yading@10
|
342 vshr.s16 q1, q1, #COL_SHIFT-16
|
yading@10
|
343 vshr.s16 q2, q2, #COL_SHIFT-16
|
yading@10
|
344 vst1.64 {d2}, [r2,:64], ip
|
yading@10
|
345 vshr.s16 q3, q3, #COL_SHIFT-16
|
yading@10
|
346 vst1.64 {d3}, [r2,:64], ip
|
yading@10
|
347 vshr.s16 q4, q4, #COL_SHIFT-16
|
yading@10
|
348 vst1.64 {d4}, [r2,:64], ip
|
yading@10
|
349 vst1.64 {d5}, [r2,:64], ip
|
yading@10
|
350 vst1.64 {d6}, [r2,:64], ip
|
yading@10
|
351 vst1.64 {d7}, [r2,:64], ip
|
yading@10
|
352 vst1.64 {d8}, [r2,:64], ip
|
yading@10
|
353 vst1.64 {d9}, [r2,:64], ip
|
yading@10
|
354
|
yading@10
|
355 bx lr
|
yading@10
|
356 endfunc
|
yading@10
|
357
|
yading@10
|
/*
 * void ff_simple_idct_neon(int16_t *data);
 *
 * In-place inverse transform of the 128-byte coefficient block at data.
 *
 * In:   r0 = data
 * r2 is used as the working pointer because all helpers address the block
 * through r2.
 */
function ff_simple_idct_neon, export=1
        idct_start      r0

        mov             r2,  r0
        /* row pass: 2 x 64 bytes, each call advances r2 by 64 */
        bl              idct_row4_neon
        bl              idct_row4_neon
        add             r2,  r2,  #-128         /* back to the start of the block */
        /* column pass, columns 0-3 */
        bl              idct_col4_neon          /* advances r2 by 128 */
        add             r2,  r2,  #-128         /* back to column 0 for the store */
        bl              idct_col4_st16_neon     /* advances r2 by 128 */
        add             r2,  r2,  #-120         /* block start + 8 bytes = column 4 */
        /* column pass, columns 4-7 */
        bl              idct_col4_neon
        add             r2,  r2,  #-128         /* back to column 4 for the store */
        bl              idct_col4_st16_neon

        idct_end
endfunc