;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001@163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif
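
; scan8 maps a block index (the 16 luma 4x4 blocks, then chroma) to its slot
; in the nnzc[6*8] non-zero-count cache, which is laid out 8 entries per row
; with a border of dummy entries; the "4+" column and "*8" row terms above
; encode that offset, mirroring the scan8 table on the C side.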

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    movq    [%2+ 0], m7
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
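
; The macro above is the spec's 4x4 inverse transform; each IDCT4_1D pass
; computes, per row/column (roughly, in C):
;   z0 = a + c;          z1 = a - c;
;   z2 = (b >> 1) - d;   z3 = b + (d >> 1);
;   out = { z0 + z3, z1 + z2, z1 - z2, z0 - z3 };
; with the +32 bias added between the two passes and the final >>6 rounding
; done by the shift argument to STORE_DIFFx2.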

INIT_MMX mmx
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET

%macro IDCT8_1D 2
    mova         m0, m1
    psraw        m1, 1
    mova         m4, m5
    psraw        m4, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    mova         m7, m1
    psraw        m1, 2
    mova         m3, m4
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    mova         m5, m6
    psraw        m6, 1
    mova         m4, m2
    psraw        m4, 1
    paddw        m6, m2
    psubw        m4, m5

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA     w, 5, 2
    SUMSUB_BA     w, 6, 5
    SUMSUB_BA     w, 4, 2
    SUMSUB_BA     w, 7, 6
    SUMSUB_BA     w, 0, 4
    SUMSUB_BA     w, 3, 2
    SUMSUB_BA     w, 1, 5
    SWAP          7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
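
; One pass of the 8x8 inverse transform: the first half above forms the
; odd-part terms from rows 1, 3, 5 and 7 (the >>1/>>2 shifts are the spec's
; fixed coefficients), the second half forms the even part from rows 2 and 6
; plus rows 0 and 4 taken from memory, and the closing SUMSUB_BA/SWAP chain
; butterflies and reorders the results back to 0-7.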

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova      [%2 ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro
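
; MMX registers hold only four coefficients, so the 8x8 transform is split
; into 4-wide halves: each _START call runs the first 1-D pass on one half
; and transposes it out to scratch space, and each _END call (below) runs
; the second pass there and adds the result to dst.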

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova      [%2 ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
%if %0 == 4
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2, r1
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%if ARCH_X86_64 == 0
    mova      [%2 ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova      [%2 ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    mova   [%2+  0], m7
    mova   [%2+ 16], m7
    mova   [%2+ 32], m7
    mova   [%2+ 48], m7
    mova   [%2+ 64], m7
    mova   [%2+ 80], m7
    mova   [%2+ 96], m7
    mova   [%2+112], m7
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro
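
; The SSE2 version keeps a whole 8-coefficient row per xmm register, so one
; pass per direction is enough; on x86-64 the two rows that do not fit
; during the second pass stay in m8/m9 (hence the SWAPs) instead of being
; spilled through the block buffer as on x86-32.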

INIT_XMM sse2
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro
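
; DC-add trick: after _INIT, m0 holds packuswb(dc) (all zeros when dc is
; negative) and m1 holds packuswb(-dc) (all zeros when dc is positive), so
; paddusb m0 followed by psubusb m1 adds a signed DC to the unsigned pixels
; with saturation, without unpacking bytes to words.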

INIT_MMX mmxext
; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP movh, r0, r2, r3
    RET

; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    RET
%else
cglobal h264_idct_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2
    RET

; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif

INIT_MMX mmx
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
;                        int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
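
; The add16/add16intra/add8 entry points all follow the loop pattern above:
; nnzc[scan8[i]] == 0 means block i is all zero and is skipped, otherwise
; the 4x4 idct-and-add runs at dst + block_offset[i]; the mmxext variants
; additionally route blocks whose only non-zero coefficient is the DC
; through the cheap DC-add path.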

; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
;                        int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

INIT_MMX mmxext
; ff_h264_idct_add16_mmxext(uint8_t *dst, const int *block_offset,
;                           int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
;                             int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmxext
; ff_h264_idct_add16intra_mmxext(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmxext(uint8_t *dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET

INIT_XMM sse2
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
;                         int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
h264_idct_add8_mmx_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
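
; add8 handles the two chroma planes: the callers below advance r2 past the
; 16 luma blocks (16*32 bytes) before the first call, each plane call loops
; over four 4x4 blocks ("test r5, 3"), and dst is loaded indirectly from the
; uint8_t **dest array, stepping one pointer per plane.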

; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
;                       int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif
    call h264_idct_add8_mmx_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmx_plane
    RET

h264_idct_add8_mmxext_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; ff_h264_idct_add8_mmxext(uint8_t **dest, const int *block_offset,
;                          int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%if ARCH_X86_64
    mov       dst2q, r0
%endif
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
    call h264_idct_add8_mmxext_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmxext_plane
    RET

; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6 = clobbered
h264_idct_dc_add8_mmxext:
    movd         m0, [r2   ]          ;    0 0 X D
    mov word [r2+ 0], 0
    punpcklwd    m0, [r2+32]          ;  x X d D
    mov word [r2+32], 0
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
h264_add8x4_idct_sse2:
    movq         m0, [r2+ 0]
    movq         m1, [r2+ 8]
    movq         m2, [r2+16]
    movq         m3, [r2+24]
    movhps       m0, [r2+32]
    movhps       m1, [r2+40]
    movhps       m2, [r2+48]
    movhps       m3, [r2+56]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    paddw        m0, [pw_32]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    mova    [r2+ 0], m7
    mova    [r2+16], m7
    mova    [r2+32], m7
    mova    [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea          r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret
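
; h264_add8x4_idct_sse2 packs two horizontally adjacent 4x4 blocks into the
; low and high halves of each xmm register (movq + movhps), so one call
; transforms and adds an 8x4 region; the sse2 add16/add16intra/add8
; functions below therefore walk the blocks in pairs.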

%macro add16_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r5
%else
    add          r0, r0m
%endif
    call h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro
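
; The word-sized nnzc load in the macro above tests two adjacent blocks at
; once; the hex offsets passed below are the scan8 cache positions of each
; pair (e.g. 0xc = 4+1*8 for blocks 0 and 1, 0x14 = 4+2*8 for blocks 2
; and 3).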

; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
;                         int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov          r5, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

%macro add16intra_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
;                              int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov          r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

%macro add8_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add          r2, 384+64
%elif %1 < 3
    add          r2, 64
%endif
%endmacro

; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
;                        int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    add          r2, 512
%if ARCH_X86_64
    mov          r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
    RET

; void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)

%macro WALSH4_1D 5
    SUMSUB_BADC   w, %4, %3, %2, %1, %5
    SUMSUB_BADC   w, %4, %2, %3, %1, %5
    SWAP         %1, %4, %3
%endmacro
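
; The 4x4 luma DC block uses a Walsh-Hadamard transform: the same butterfly
; structure as the AC transform but without the >>1 terms, applied once per
; direction around a transpose.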

%macro DEQUANT_MMX 3
    mova         m7, [pw_1]
    mova         m4, %1
    punpcklwd    %1, m7
    punpckhwd    m4, m7
    mova         m5, %2
    punpcklwd    %2, m7
    punpckhwd    m5, m7
    movd         m7, t3d
    punpckldq    m7, m7
    pmaddwd      %1, m7
    pmaddwd      %2, m7
    pmaddwd      m4, m7
    pmaddwd      m5, m7
    psrad        %1, %3
    psrad        %2, %3
    psrad        m4, %3
    psrad        m5, %3
    packssdw     %1, m4
    packssdw     %2, m5
%endmacro

%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    psrldq       %1, 4
    mov  [t2+%2*32], t0w
    mov  [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov  [t2+%3*32], t0w
    mov  [t2+%5*32], t1w
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    mov  [t2+%6*32], t0w
    mov  [t2+%8*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov  [t2+%7*32], t0w
    mov  [t2+%9*32], t1w
%else
    movd        t0d, %1
    psrlq        %1, 32
    movd        t1d, %1
    mov  [t2+%2*32], t0w
    mov  [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov  [t2+%3*32], t0w
    mov  [t2+%5*32], t1w
%endif
%endmacro

%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    movd       xmm4, t3d
    movq       xmm5, [pw_1]
    pshufd     xmm4, xmm4, 0
    movq2dq    xmm0, m0
    movq2dq    xmm1, m1
    movq2dq    xmm2, m2
    movq2dq    xmm3, m3
    punpcklwd  xmm0, xmm5
    punpcklwd  xmm1, xmm5
    punpcklwd  xmm2, xmm5
    punpcklwd  xmm3, xmm5
    pmaddwd    xmm0, xmm4
    pmaddwd    xmm1, xmm4
    pmaddwd    xmm2, xmm4
    pmaddwd    xmm3, xmm4
    psrad      xmm0, %1
    psrad      xmm1, %1
    psrad      xmm2, %1
    psrad      xmm3, %1
    packssdw   xmm0, xmm1
    packssdw   xmm2, xmm3
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT_MMX  m0, m1, %1
    STORE_WORDS  m0,  0,  1,  4,  5
    STORE_WORDS  m1,  2,  3,  6,  7

    DEQUANT_MMX  m2, m3, %1
    STORE_WORDS  m2,  8,  9, 12, 13
    STORE_WORDS  m3, 10, 11, 14, 15
%endif
%endmacro
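
; Dequantization computes (dc * qmul + 128) >> 8 per coefficient (the
; .big_qmul path below pre-shifts qmul and shrinks the final shift to keep
; the multiply in range); the rounding constant sits in the high word of
; t3d so pmaddwd against the interleaved pw_1 lane adds it for free, and
; the *32 steps in STORE_WORDS jump from one 4x4 block's 16-coefficient
; array to the next, writing each result to that block's DC slot.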

%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq         m3, [r1+24]
    movq         m2, [r1+16]
    movq         m1, [r1+ 8]
    movq         m0, [r1+ 0]
    WALSH4_1D     0, 1, 2, 3, 4
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    WALSH4_1D     0, 1, 2, 3, 4

; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0, 3, 1, 2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg         r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3, 1, 0, 2
%else
    DECLARE_REG_TMP 1, 3, 0, 2
%endif

    cmp         t3d, 32767
    jg .big_qmul
    add         t3d, 128 << 16
    DEQUANT_STORE 8
    RET
.big_qmul:
    bsr         t0d, t3d
    add         t3d, 128 << 16
    mov         t1d, 7
    cmp         t0d, t1d
    cmovg       t0d, t1d
    inc         t1d
    shr         t3d, t0b
    sub         t1d, t0d
%if cpuflag(sse2)
    movd       xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd         m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro

INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7