;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

; MMX-optimized functions cribbed from the original VP3 source code.

SECTION_RODATA

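; The seven rows below are the IDCT cosine constants c1..c7, i.e.
; cos(n*pi/16) for n = 1..7 in 0.16 fixed point (64277 ~= 0.98079 * 65536).
; The constants above 0x7fff read as c - 65536 once pmulhw treats them as
; signed, so those multiplies yield c*x - x and the code adds x back.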
vp3_idct_data: times 8 dw 64277
               times 8 dw 60547
               times 8 dw 54491
               times 8 dw 46341
               times 8 dw 36410
               times 8 dw 25080
               times 8 dw 12785

pb_7:  times 8 db 0x07
pb_1F: times 8 db 0x1f
pb_81: times 8 db 0x81

cextern pb_1
cextern pb_3
cextern pb_80

cextern pw_8

SECTION .text

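; The filter below computes d = (p0 - p3 + 3*(p2 - p1) + 4) >> 3 entirely in
; unsigned bytes: a +129 bias splits d into its positive part (m7) and its
; negative part (m6), both are run through the VP3 bounding function, and the
; result is added to p1 and subtracted from p2.  A scalar sketch of the
; bounding step, with L the per-byte limit loaded from [r2+516]:
;     a  = FFMIN(|d|, L);
;     d' = FFMIN(2 * a, L) - a;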
; this is off by one or two for some cases when filter_limit is greater than 63
; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
; out: p1 in mm4, p2 in mm3
%macro VP3_LOOP_FILTER 0
    movq          m7, m6
    pand          m6, [pb_7]    ; p0&7
    psrlw         m7, 3
    pand          m7, [pb_1F]   ; p0>>3
    movq          m3, m2        ; p2
    pxor          m2, m4
    pand          m2, [pb_1]    ; (p2^p1)&1
    movq          m5, m2
    paddb         m2, m2
    paddb         m2, m5        ; 3*(p2^p1)&1
    paddb         m2, m6        ; extra bits lost in shifts
    pcmpeqb       m0, m0
    pxor          m1, m0        ; 255 - p3
    pavgb         m1, m2        ; (256 - p3 + extrabits) >> 1
    pxor          m0, m4        ; 255 - p1
    pavgb         m0, m3        ; (256 + p2-p1) >> 1
    paddb         m1, [pb_3]
    pavgb         m1, m0        ; 128+2+(   p2-p1 - p3) >> 2
    pavgb         m1, m0        ; 128+1+(3*(p2-p1) - p3) >> 3
    paddusb       m7, m1        ; d+128+1
    movq          m6, [pb_81]
    psubusb       m6, m7
    psubusb       m7, [pb_81]

    movq          m5, [r2+516]  ; flim
    pminub        m6, m5
    pminub        m7, m5
    movq          m0, m6
    movq          m1, m7
    paddb         m6, m6
    paddb         m7, m7
    pminub        m6, m5
    pminub        m7, m5
    psubb         m6, m0
    psubb         m7, m1
    paddusb       m4, m7
    psubusb       m4, m6
    psubusb       m3, m7
    paddusb       m3, m6
%endmacro

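; scatter the four words of %1 to x = -1 of four consecutive rows; as used by
; vp3_h_loop_filter each word is a filtered (p1, p2) byte pair straddling the
; vertical edge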
%macro STORE_4_WORDS 1
    movd         r2d, %1
    mov  [r0     -1], r2w
    psrlq         %1, 32
    shr           r2, 16
    mov  [r0+r1  -1], r2w
    movd         r2d, %1
    mov  [r0+r1*2-1], r2w
    shr           r2, 16
    mov  [r0+r3  -1], r2w
%endmacro

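; Loop filter entry points: r0 = first pixel past the edge (p2), r1 = stride,
; r2 = the decoder's bounding-values table; the 8-byte filter limit read at
; [r2+516] is presumably replicated there by the C setup code.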
INIT_MMX mmxext
cglobal vp3_v_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d
%endif
    mov           r3, r1
    neg           r1
    movq          m6, [r0+r1*2]
    movq          m4, [r0+r1  ]
    movq          m2, [r0     ]
    movq          m1, [r0+r3  ]

    VP3_LOOP_FILTER

    movq     [r0+r1], m4
    movq     [r0   ], m3
    RET

cglobal vp3_h_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d
%endif
    lea           r3, [r1*3]

    movd          m6, [r0     -2]
    movd          m4, [r0+r1  -2]
    movd          m2, [r0+r1*2-2]
    movd          m1, [r0+r3  -2]
    lea           r0, [r0+r1*4  ]
    punpcklbw     m6, [r0     -2]
    punpcklbw     m4, [r0+r1  -2]
    punpcklbw     m2, [r0+r1*2-2]
    punpcklbw     m1, [r0+r3  -2]
    sub           r0, r3
    sub           r0, r1

    TRANSPOSE4x4B  6, 4, 2, 1, 0
    VP3_LOOP_FILTER
    SBUTTERFLY    bw, 4, 3, 5

    STORE_4_WORDS m4
    lea           r0, [r0+r1*4  ]
    STORE_4_WORDS m3
    RET

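; The I()/J() macros used below address rows of the coefficient block; they
; are (re)defined by VP3_IDCT for each 8x4 half, so the same code serves both
; the row and the column passes.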
; from original comments: The Macro does IDct on 4 1-D Dcts
%macro BeginIDCT 0
    movq          m2, I(3)
    movq          m6, C(3)
    movq          m4, m2
    movq          m7, J(5)
    pmulhw        m4, m6      ; r4 = c3*i3 - i3
    movq          m1, C(5)
    pmulhw        m6, m7      ; r6 = c3*i5 - i5
    movq          m5, m1
    pmulhw        m1, m2      ; r1 = c5*i3 - i3
    movq          m3, I(1)
    pmulhw        m5, m7      ; r5 = c5*i5 - i5
    movq          m0, C(1)
    paddw         m4, m2      ; r4 = c3*i3
    paddw         m6, m7      ; r6 = c3*i5
    paddw         m2, m1      ; r2 = c5*i3
    movq          m1, J(7)
    paddw         m7, m5      ; r7 = c5*i5
    movq          m5, m0      ; r5 = c1
    pmulhw        m0, m3      ; r0 = c1*i1 - i1
    paddsw        m4, m7      ; r4 = C = c3*i3 + c5*i5
    pmulhw        m5, m1      ; r5 = c1*i7 - i7
    movq          m7, C(7)
    psubsw        m6, m2      ; r6 = D = c3*i5 - c5*i3
    paddw         m0, m3      ; r0 = c1*i1
    pmulhw        m3, m7      ; r3 = c7*i1
    movq          m2, I(2)
    pmulhw        m7, m1      ; r7 = c7*i7
    paddw         m5, m1      ; r5 = c1*i7
    movq          m1, m2      ; r1 = i2
    pmulhw        m2, C(2)    ; r2 = c2*i2 - i2
    psubsw        m3, m5      ; r3 = B = c7*i1 - c1*i7
    movq          m5, J(6)
    paddsw        m0, m7      ; r0 = A = c1*i1 + c7*i7
    movq          m7, m5      ; r7 = i6
    psubsw        m0, m4      ; r0 = A - C
    pmulhw        m5, C(2)    ; r5 = c2*i6 - i6
    paddw         m2, m1      ; r2 = c2*i2
    pmulhw        m1, C(6)    ; r1 = c6*i2
    paddsw        m4, m4      ; r4 = C + C
    paddsw        m4, m0      ; r4 = C. = A + C
    psubsw        m3, m6      ; r3 = B - D
    paddw         m5, m7      ; r5 = c2*i6
    paddsw        m6, m6      ; r6 = D + D
    pmulhw        m7, C(6)    ; r7 = c6*i6
    paddsw        m6, m3      ; r6 = D. = B + D
    movq        I(1), m4      ; save C. at I(1)
    psubsw        m1, m5      ; r1 = H = c6*i2 - c2*i6
    movq          m4, C(4)
    movq          m5, m3      ; r5 = B - D
    pmulhw        m3, m4      ; r3 = (c4 - 1) * (B - D)
    paddsw        m7, m2      ; r7 = G = c2*i2 + c6*i6
    movq        I(2), m6      ; save D. at I(2)
    movq          m2, m0      ; r2 = A - C
    movq          m6, I(0)
    pmulhw        m0, m4      ; r0 = (c4 - 1) * (A - C)
    paddw         m5, m3      ; r5 = B. = c4 * (B - D)
    movq          m3, J(4)
    psubsw        m5, m1      ; r5 = B.. = B. - H
    paddw         m2, m0      ; r2 = A. = c4 * (A - C)
    psubsw        m6, m3      ; r6 = i0 - i4
    movq          m0, m6
    pmulhw        m6, m4      ; r6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3      ; r3 = i4 + i4
    paddsw        m1, m1      ; r1 = H + H
    paddsw        m3, m0      ; r3 = i0 + i4
    paddsw        m1, m5      ; r1 = H. = B + H
    pmulhw        m4, m3      ; r4 = (c4 - 1) * (i0 + i4)
    paddsw        m6, m0      ; r6 = F = c4 * (i0 - i4)
    psubsw        m6, m2      ; r6 = F. = F - A.
    paddsw        m2, m2      ; r2 = A. + A.
    movq          m0, I(1)    ; r0 = C.
    paddsw        m2, m6      ; r2 = A.. = F + A.
    paddw         m4, m3      ; r4 = E = c4 * (i0 + i4)
    psubsw        m2, m1      ; r2 = R2 = A.. - H.
%endmacro

; RowIDCT gets ready to transpose
%macro RowIDCT 0
    BeginIDCT
    movq          m3, I(2)    ; r3 = D.
    psubsw        m4, m7      ; r4 = E. = E - G
    paddsw        m1, m1      ; r1 = H. + H.
    paddsw        m7, m7      ; r7 = G + G
    paddsw        m1, m2      ; r1 = R1 = A.. + H.
    paddsw        m7, m4      ; r7 = G. = E + G
    psubsw        m4, m3      ; r4 = R4 = E. - D.
    paddsw        m3, m3
    psubsw        m6, m5      ; r6 = R6 = F. - B..
    paddsw        m5, m5
    paddsw        m3, m4      ; r3 = R3 = E. + D.
    paddsw        m5, m6      ; r5 = R5 = F. + B..
    psubsw        m7, m0      ; r7 = R7 = G. - C.
    paddsw        m0, m0
    movq        I(1), m1      ; save R1
    paddsw        m0, m7      ; r0 = R0 = G. + C.
%endmacro

; Column IDCT normalizes and stores final results
%macro ColumnIDCT 0
    BeginIDCT
    paddsw        m2, OC_8    ; adjust R2 (and R1) for shift
    paddsw        m1, m1      ; r1 = H. + H.
    paddsw        m1, m2      ; r1 = R1 = A.. + H.
    psraw         m2, 4       ; r2 = NR2
    psubsw        m4, m7      ; r4 = E. = E - G
    psraw         m1, 4       ; r1 = NR1
    movq          m3, I(2)    ; r3 = D.
    paddsw        m7, m7      ; r7 = G + G
    movq        I(2), m2      ; store NR2 at I2
    paddsw        m7, m4      ; r7 = G. = E + G
    movq        I(1), m1      ; store NR1 at I1
    psubsw        m4, m3      ; r4 = R4 = E. - D.
    paddsw        m4, OC_8    ; adjust R4 (and R3) for shift
    paddsw        m3, m3      ; r3 = D. + D.
    paddsw        m3, m4      ; r3 = R3 = E. + D.
    psraw         m4, 4       ; r4 = NR4
    psubsw        m6, m5      ; r6 = R6 = F. - B..
    psraw         m3, 4       ; r3 = NR3
    paddsw        m6, OC_8    ; adjust R6 (and R5) for shift
    paddsw        m5, m5      ; r5 = B.. + B..
    paddsw        m5, m6      ; r5 = R5 = F. + B..
    psraw         m6, 4       ; r6 = NR6
    movq        J(4), m4      ; store NR4 at J4
    psraw         m5, 4       ; r5 = NR5
    movq        I(3), m3      ; store NR3 at I3
    psubsw        m7, m0      ; r7 = R7 = G. - C.
    paddsw        m7, OC_8    ; adjust R7 (and R0) for shift
    paddsw        m0, m0      ; r0 = C. + C.
    paddsw        m0, m7      ; r0 = R0 = G. + C.
    psraw         m7, 4       ; r7 = NR7
    movq        J(6), m6      ; store NR6 at J6
    psraw         m0, 4       ; r0 = NR0
    movq        J(5), m5      ; store NR5 at J5
    movq        J(7), m7      ; store NR7 at J7
    movq        I(0), m0      ; store NR0 at I0
%endmacro

; Following macro does two 4x4 transposes in place.
;
; At entry (we assume):
;
;   r0 = a3 a2 a1 a0
; I(1) = b3 b2 b1 b0
;   r2 = c3 c2 c1 c0
;   r3 = d3 d2 d1 d0
;
;   r4 = e3 e2 e1 e0
;   r5 = f3 f2 f1 f0
;   r6 = g3 g2 g1 g0
;   r7 = h3 h2 h1 h0
;
; At exit, we have:
;
; I(0) = d0 c0 b0 a0
; I(1) = d1 c1 b1 a1
; I(2) = d2 c2 b2 a2
; I(3) = d3 c3 b3 a3
;
; J(4) = h0 g0 f0 e0
; J(5) = h1 g1 f1 e1
; J(6) = h2 g2 f2 e2
; J(7) = h3 g3 f3 e3
;
; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
; J(4) J(5) J(6) J(7) is the transpose of r4 r5   r6 r7.
;
; Since r1 is free at entry, we calculate the Js first.
%macro Transpose 0
    movq          m1, m4      ; r1 = e3 e2 e1 e0
    punpcklwd     m4, m5      ; r4 = f1 e1 f0 e0
    movq        I(0), m0      ; save a3 a2 a1 a0
    punpckhwd     m1, m5      ; r1 = f3 e3 f2 e2
    movq          m0, m6      ; r0 = g3 g2 g1 g0
    punpcklwd     m6, m7      ; r6 = h1 g1 h0 g0
    movq          m5, m4      ; r5 = f1 e1 f0 e0
    punpckldq     m4, m6      ; r4 = h0 g0 f0 e0 = R4
    punpckhdq     m5, m6      ; r5 = h1 g1 f1 e1 = R5
    movq          m6, m1      ; r6 = f3 e3 f2 e2
    movq        J(4), m4
    punpckhwd     m0, m7      ; r0 = h3 g3 h2 g2
    movq        J(5), m5
    punpckhdq     m6, m0      ; r6 = h3 g3 f3 e3 = R7
    movq          m4, I(0)    ; r4 = a3 a2 a1 a0
    punpckldq     m1, m0      ; r1 = h2 g2 f2 e2 = R6
    movq          m5, I(1)    ; r5 = b3 b2 b1 b0
    movq          m0, m4      ; r0 = a3 a2 a1 a0
    movq        J(7), m6
    punpcklwd     m0, m5      ; r0 = b1 a1 b0 a0
    movq        J(6), m1
    punpckhwd     m4, m5      ; r4 = b3 a3 b2 a2
    movq          m5, m2      ; r5 = c3 c2 c1 c0
    punpcklwd     m2, m3      ; r2 = d1 c1 d0 c0
    movq          m1, m0      ; r1 = b1 a1 b0 a0
    punpckldq     m0, m2      ; r0 = d0 c0 b0 a0 = R0
    punpckhdq     m1, m2      ; r1 = d1 c1 b1 a1 = R1
    movq          m2, m4      ; r2 = b3 a3 b2 a2
    movq        I(0), m0
    punpckhwd     m5, m3      ; r5 = d3 c3 d2 c2
    movq        I(1), m1
    punpckhdq     m4, m5      ; r4 = d3 c3 b3 a3 = R3
    punpckldq     m2, m5      ; r2 = d2 c2 b2 a2 = R2
    movq        I(3), m4
    movq        I(2), m2
%endmacro

%macro VP3_1D_IDCT_SSE2 0
    movdqa        m2, I(3)    ; xmm2 = i3
    movdqa        m6, C(3)    ; xmm6 = c3
    movdqa        m4, m2      ; xmm4 = i3
    movdqa        m7, I(5)    ; xmm7 = i5
    pmulhw        m4, m6      ; xmm4 = c3 * i3 - i3
    movdqa        m1, C(5)    ; xmm1 = c5
    pmulhw        m6, m7      ; xmm6 = c3 * i5 - i5
    movdqa        m5, m1      ; xmm5 = c5
    pmulhw        m1, m2      ; xmm1 = c5 * i3 - i3
    movdqa        m3, I(1)    ; xmm3 = i1
    pmulhw        m5, m7      ; xmm5 = c5 * i5 - i5
    movdqa        m0, C(1)    ; xmm0 = c1
    paddw         m4, m2      ; xmm4 = c3 * i3
    paddw         m6, m7      ; xmm6 = c3 * i5
    paddw         m2, m1      ; xmm2 = c5 * i3
    movdqa        m1, I(7)    ; xmm1 = i7
    paddw         m7, m5      ; xmm7 = c5 * i5
    movdqa        m5, m0      ; xmm5 = c1
    pmulhw        m0, m3      ; xmm0 = c1 * i1 - i1
    paddsw        m4, m7      ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw        m5, m1      ; xmm5 = c1 * i7 - i7
    movdqa        m7, C(7)    ; xmm7 = c7
    psubsw        m6, m2      ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw         m0, m3      ; xmm0 = c1 * i1
    pmulhw        m3, m7      ; xmm3 = c7 * i1
    movdqa        m2, I(2)    ; xmm2 = i2
    pmulhw        m7, m1      ; xmm7 = c7 * i7
    paddw         m5, m1      ; xmm5 = c1 * i7
    movdqa        m1, m2      ; xmm1 = i2
    pmulhw        m2, C(2)    ; xmm2 = c2 * i2 - i2
    psubsw        m3, m5      ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa        m5, I(6)    ; xmm5 = i6
    paddsw        m0, m7      ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa        m7, m5      ; xmm7 = i6
    psubsw        m0, m4      ; xmm0 = A - C
    pmulhw        m5, C(2)    ; xmm5 = c2 * i6 - i6
    paddw         m2, m1      ; xmm2 = c2 * i2
    pmulhw        m1, C(6)    ; xmm1 = c6 * i2
    paddsw        m4, m4      ; xmm4 = C + C
    paddsw        m4, m0      ; xmm4 = A + C = C.
    psubsw        m3, m6      ; xmm3 = B - D
    paddw         m5, m7      ; xmm5 = c2 * i6
    paddsw        m6, m6      ; xmm6 = D + D
    pmulhw        m7, C(6)    ; xmm7 = c6 * i6
    paddsw        m6, m3      ; xmm6 = B + D = D.
    movdqa      I(1), m4      ; Save C. at I(1)
    psubsw        m1, m5      ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa        m4, C(4)    ; xmm4 = c4
    movdqa        m5, m3      ; xmm5 = B - D
    pmulhw        m3, m4      ; xmm3 = ( c4 - 1 ) * ( B - D )
    paddsw        m7, m2      ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa      I(2), m6      ; save D. at I(2)
    movdqa        m2, m0      ; xmm2 = A - C
    movdqa        m6, I(0)    ; xmm6 = i0
    pmulhw        m0, m4      ; xmm0 = ( c4 - 1 ) * ( A - C )
    paddw         m5, m3      ; xmm5 = c4 * ( B - D ) = B.
    movdqa        m3, I(4)    ; xmm3 = i4
    psubsw        m5, m1      ; xmm5 = B. - H = B..
    paddw         m2, m0      ; xmm2 = c4 * ( A - C ) = A.
    psubsw        m6, m3      ; xmm6 = i0 - i4
    movdqa        m0, m6      ; xmm0 = i0 - i4
    pmulhw        m6, m4      ; xmm6 = ( c4 - 1 ) * ( i0 - i4 )
    paddsw        m3, m3      ; xmm3 = i4 + i4
    paddsw        m1, m1      ; xmm1 = H + H
    paddsw        m3, m0      ; xmm3 = i0 + i4
    paddsw        m1, m5      ; xmm1 = B. + H = H.
    pmulhw        m4, m3      ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
    paddw         m6, m0      ; xmm6 = c4 * ( i0 - i4 ) = F
    psubsw        m6, m2      ; xmm6 = F - A. = F.
    paddsw        m2, m2      ; xmm2 = A. + A.
    movdqa        m0, I(1)    ; Load C. from I(1)
    paddsw        m2, m6      ; xmm2 = F + A. = A..
    paddw         m4, m3      ; xmm4 = c4 * ( i0 + i4 ) = E
    psubsw        m2, m1      ; xmm2 = A.. - H. = R2
    ADD(m2)                   ; Adjust R2 and R1 before shifting
    paddsw        m1, m1      ; xmm1 = H. + H.
    paddsw        m1, m2      ; xmm1 = A.. + H. = R1
    SHIFT(m2)                 ; xmm2 = op2
    psubsw        m4, m7      ; xmm4 = E - G = E.
    SHIFT(m1)                 ; xmm1 = op1
    movdqa        m3, I(2)    ; Load D. from I(2)
    paddsw        m7, m7      ; xmm7 = G + G
    paddsw        m7, m4      ; xmm7 = E + G = G.
    psubsw        m4, m3      ; xmm4 = E. - D. = R4
    ADD(m4)                   ; Adjust R4 and R3 before shifting
    paddsw        m3, m3      ; xmm3 = D. + D.
    paddsw        m3, m4      ; xmm3 = E. + D. = R3
    SHIFT(m4)                 ; xmm4 = op4
    psubsw        m6, m5      ; xmm6 = F. - B.. = R6
    SHIFT(m3)                 ; xmm3 = op3
    ADD(m6)                   ; Adjust R6 and R5 before shifting
    paddsw        m5, m5      ; xmm5 = B.. + B..
    paddsw        m5, m6      ; xmm5 = F. + B.. = R5
    SHIFT(m6)                 ; xmm6 = op6
    SHIFT(m5)                 ; xmm5 = op5
    psubsw        m7, m0      ; xmm7 = G. - C. = R7
    ADD(m7)                   ; Adjust R7 and R0 before shifting
    paddsw        m0, m0      ; xmm0 = C. + C.
    paddsw        m0, m7      ; xmm0 = G. + C. = R0
    SHIFT(m7)                 ; xmm7 = op7
    SHIFT(m0)                 ; xmm0 = op0
%endmacro

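; write the eight IDCT output rows held in the given registers back to the
; block buffer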
%macro PUT_BLOCK 8
    movdqa      O(0), m%1
    movdqa      O(1), m%2
    movdqa      O(2), m%3
    movdqa      O(3), m%4
    movdqa      O(4), m%5
    movdqa      O(5), m%6
    movdqa      O(6), m%7
    movdqa      O(7), m%8
%endmacro

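; Whole-block IDCT.  The SSE2 path runs one unscaled 1-D pass over all eight
; columns, transposes, then a second pass with the +8 bias and >>4 descale
; (via the ADD/SHIFT hooks).  The MMX path runs RowIDCT/Transpose on each
; 8x4 half, then ColumnIDCT on the recombined halves through the I()/J()
; addressing macros.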
%macro VP3_IDCT 1
%if mmsize == 16
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
    VP3_1D_IDCT_SSE2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

%define SHIFT(x) psraw x, 4
%define ADD(x)   paddsw x, [pw_8]
    VP3_1D_IDCT_SSE2
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
    ; eax = quantized input
    ; ebx = dequantizer matrix
    ; ecx = IDCT constants
    ;  M(I) = ecx + MaskOffset(0) + I * 8
    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
    ; edx = output
    ; r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]

    ; at this point, function has completed dequantization + dezigzag +
    ; partial transposition; now do the idct itself
%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
    RowIDCT
    Transpose

%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
    RowIDCT
    Transpose

%define I(x) [%1+16* x]
%define J(x) [%1+16*(x-4)+8]
    ColumnIDCT

%define I(x) [%1+16* x   +64]
%define J(x) [%1+16*(x-4)+72]
    ColumnIDCT
%endif ; mmsize == 16/8
%endmacro

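; idct_put converts the block back to unsigned pixels via the pb_80 bias and
; stores it; idct_add sums the residual onto the existing pixels.  Both clear
; the coefficient block afterwards (r0 = dest, r1 = stride, r2 = block, per
; the presumed (uint8_t *dest, int line_size, int16_t *block) prototype).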
%macro vp3_idct_funcs 0
cglobal vp3_idct_put, 3, 4, 9
    VP3_IDCT      r2

    movsxdifnidn  r1, r1d
    mova          m4, [pb_80]
    lea           r3, [r1*3]
%assign %%i 0
%rep 16/mmsize
    mova          m0, [r2+mmsize*0+%%i]
    mova          m1, [r2+mmsize*2+%%i]
    mova          m2, [r2+mmsize*4+%%i]
    mova          m3, [r2+mmsize*6+%%i]
%if mmsize == 8
    packsswb      m0, [r2+mmsize*8+%%i]
    packsswb      m1, [r2+mmsize*10+%%i]
    packsswb      m2, [r2+mmsize*12+%%i]
    packsswb      m3, [r2+mmsize*14+%%i]
%else
    packsswb      m0, [r2+mmsize*1+%%i]
    packsswb      m1, [r2+mmsize*3+%%i]
    packsswb      m2, [r2+mmsize*5+%%i]
    packsswb      m3, [r2+mmsize*7+%%i]
%endif
    paddb         m0, m4
    paddb         m1, m4
    paddb         m2, m4
    paddb         m3, m4
    movq   [r0     ], m0
%if mmsize == 8
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%else
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m1
    movhps [r0+r3  ], m1
%endif
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%if mmsize == 16
    movq   [r0     ], m2
    movhps [r0+r1  ], m2
    movq   [r0+r1*2], m3
    movhps [r0+r3  ], m3
%endif
%assign %%i %%i+8
%endrep

    pxor          m0, m0
%assign %%offset 0
%rep 128/mmsize
    mova [r2+%%offset], m0
%assign %%offset %%offset+mmsize
%endrep
    RET

cglobal vp3_idct_add, 3, 4, 9
    VP3_IDCT      r2

    movsxdifnidn  r1, r1d
    lea           r3, [r1*3]
    pxor          m4, m4
%if mmsize == 16
%assign %%i 0
%rep 2
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpcklbw     m3, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m3, [r2+48+%%i]
    packuswb      m0, m1
    packuswb      m2, m3
    movq   [r0     ], m0
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m2
    movhps [r0+r3  ], m2
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%assign %%i %%i+64
%endrep
%else
%assign %%i 0
%rep 2
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    movq          m5, m0
    movq          m6, m1
    movq          m7, m2
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpckhbw     m5, m4
    punpckhbw     m6, m4
    punpckhbw     m7, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m5, [r2+64+%%i]
    paddsw        m6, [r2+80+%%i]
    paddsw        m7, [r2+96+%%i]
    packuswb      m0, m5
    movq          m5, m3
    punpcklbw     m3, m4
    punpckhbw     m5, m4
    packuswb      m1, m6
    paddsw        m3, [r2+48+%%i]
    paddsw        m5, [r2+112+%%i]
    packuswb      m2, m7
    packuswb      m3, m5
    movq   [r0     ], m0
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%assign %%i %%i+8
%endrep
%endif
%assign %%i 0
%rep 128/mmsize
    mova     [r2+%%i], m4
%assign %%i %%i+mmsize
%endrep
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
vp3_idct_funcs
%endif

INIT_XMM sse2
vp3_idct_funcs

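; add a splatted DC value to four rows at r0: m0 holds the DC clamped to
; [0, 255] and m1 its negation clamped likewise, so the paddusb/psubusb pair
; implements a saturated signed add using unsigned ops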
%macro DC_ADD 0
    movq          m2, [r0     ]
    movq          m3, [r0+r1  ]
    paddusb       m2, m0
    movq          m4, [r0+r1*2]
    paddusb       m3, m0
    movq          m5, [r0+r2  ]
    paddusb       m4, m0
    paddusb       m5, m0
    psubusb       m2, m1
    psubusb       m3, m1
    movq   [r0     ], m2
    psubusb       m4, m1
    movq   [r0+r1  ], m3
    psubusb       m5, m1
    movq   [r0+r1*2], m4
    movq   [r0+r2  ], m5
%endmacro

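; DC-only block: dc = (block[0] + 15) >> 5, matching the full IDCT's scaling
; for a block with only a DC coefficient; the coefficient is cleared and dc
; is added to all 64 pixels.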
INIT_MMX mmxext
cglobal vp3_idct_dc_add, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d
%endif
    movsx         r3, word [r2]
    mov    word [r2], 0
    lea           r2, [r1*3]
    add           r3, 15
    sar           r3, 5
    movd          m0, r3d
    pshufw        m0, m0, 0x0
    pxor          m1, m1
    psubw         m1, m0
    packuswb      m0, m0
    packuswb      m1, m1
    DC_ADD
    lea           r0, [r0+r1*4]
    DC_ADD
    RET