;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_16
cextern pw_5
cextern pb_0

SECTION .text

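; Store helpers shared by the filters below: the "put" variants write the
; packed result to the destination, the "avg" variants first average it with
; the pixels already there (pavgb). Each filter macro is instantiated once
; with "put" and once with "avg" to produce both function families.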
%macro op_avgh 3
    movh      %3, %2
    pavgb     %1, %3
    movh      %2, %1
%endmacro

%macro op_avg 2-3
    pavgb     %1, %2
    mova      %2, %1
%endmacro

%macro op_puth 2-3
    movh      %2, %1
%endmacro

%macro op_put 2-3
    mova      %2, %1
%endmacro

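; Horizontal 4-wide half-pel lowpass: the standard H.264 6-tap filter
; (1, -5, 20, 20, -5, 1) with rounding, i.e. per output pixel
;   dst[x] = clip(( (src[x-2]+src[x+3])
;                 -  5*(src[x-1]+src[x+2])
;                 + 20*(src[x]  +src[x+1]) + 16) >> 5)
; computed below as 5*(4*(s0+s1) - (s-1+s2)) + (s-2+s3) + 16, then >> 5.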
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor      m7, m7
    mova      m4, [pw_5]
    mova      m5, [pw_16]
    mov      r4d, 4
.loop:
    movh      m1, [r1-1]
    movh      m2, [r1+0]
    movh      m3, [r1+1]
    movh      m0, [r1+2]
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m0, m7
    paddw     m1, m0
    paddw     m2, m3
    movh      m0, [r1-2]
    movh      m3, [r1+3]
    punpcklbw m0, m7
    punpcklbw m3, m7
    paddw     m0, m3
    psllw     m2, 2
    psubw     m2, m1
    pmullw    m2, m4
    paddw     m0, m5
    paddw     m0, m2
    psraw     m0, 5
    packuswb  m0, m0
    op_%1h    m0, [r0], m6
    add       r0, r2
    add       r1, r3
    dec      r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg

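; 8-wide MMX version of the same horizontal filter: each 8-pixel row is
; processed as two 4-word halves (punpcklbw/punpckhbw against the zero
; register m7) so a whole row fits in 64-bit MMX registers.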
%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    mov      r4d, 8
    pxor      m7, m7
    mova      m6, [pw_5]
.loop:
    mova      m0, [r1]
    mova      m2, [r1+1]
    mova      m1, m0
    mova      m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    paddw     m0, m2
    paddw     m1, m3
    psllw     m0, 2
    psllw     m1, 2
    mova      m2, [r1-1]
    mova      m4, [r1+2]
    mova      m3, m2
    mova      m5, m4
    punpcklbw m2, m7
    punpckhbw m3, m7
    punpcklbw m4, m7
    punpckhbw m5, m7
    paddw     m2, m4
    paddw     m5, m3
    psubw     m0, m2
    psubw     m1, m5
    pmullw    m0, m6
    pmullw    m1, m6
    movd      m2, [r1-2]
    movd      m5, [r1+7]
    punpcklbw m2, m7
    punpcklbw m5, m7
    paddw     m2, m3
    paddw     m4, m5
    mova      m5, [pw_16]
    paddw     m2, m5
    paddw     m4, m5
    paddw     m0, m2
    paddw     m1, m4
    psraw     m0, 5
    psraw     m1, 5
    packuswb  m0, m1
    op_%1     m0, [r0], m4
    add       r0, r2
    add       r1, r3
    dec      r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg

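; SSSE3 8-wide version: one unaligned 16-byte load at src-2 provides all six
; taps; palignr builds the five shifted copies from the word-expanded low and
; high halves, so no further loads are needed per row.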
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    mov      r4d, 8
    pxor      m7, m7
    mova      m6, [pw_5]
.loop:
    movu      m1, [r1-2]
    mova      m0, m1
    punpckhbw m1, m7
    punpcklbw m0, m7
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    mova      m5, m1
    palignr   m4, m0, 2
    palignr   m3, m0, 4
    palignr   m2, m0, 6
    palignr   m1, m0, 8
    palignr   m5, m0, 10
    paddw     m0, m5
    paddw     m2, m3
    paddw     m1, m4
    psllw     m2, 2
    psubw     m2, m1
    paddw     m0, [pw_16]
    pmullw    m2, m6
    paddw     m2, m0
    psraw     m2, 5
    packuswb  m2, m2
    op_%1h    m2, [r0], m4
    add       r1, r3
    add       r0, r2
    dec      r4d
    jne .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg

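; "_l2" variants: the same horizontal lowpass, but the 8-bit result is
; averaged (pavgb) with a second source plane (src2) before being stored.
; These serve the quarter-pel positions defined as the average of a half-pel
; sample and another sample.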
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    pxor      m7, m7
    mova      m4, [pw_5]
    mova      m5, [pw_16]
    mov      r5d, 4
.loop:
    movh      m1, [r1-1]
    movh      m2, [r1+0]
    movh      m3, [r1+1]
    movh      m0, [r1+2]
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m0, m7
    paddw     m1, m0
    paddw     m2, m3
    movh      m0, [r1-2]
    movh      m3, [r1+3]
    punpcklbw m0, m7
    punpcklbw m3, m7
    paddw     m0, m3
    psllw     m2, 2
    psubw     m2, m1
    pmullw    m2, m4
    paddw     m0, m5
    paddw     m0, m2
    movh      m3, [r2]
    psraw     m0, 5
    packuswb  m0, m0
    pavgb     m0, m3
    op_%1h    m0, [r0], m6
    add       r0, r3
    add       r1, r3
    add       r2, r4
    dec      r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg

%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov      r5d, 8
    pxor      m7, m7
    mova      m6, [pw_5]
.loop:
    mova      m0, [r1]
    mova      m2, [r1+1]
    mova      m1, m0
    mova      m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    paddw     m0, m2
    paddw     m1, m3
    psllw     m0, 2
    psllw     m1, 2
    mova      m2, [r1-1]
    mova      m4, [r1+2]
    mova      m3, m2
    mova      m5, m4
    punpcklbw m2, m7
    punpckhbw m3, m7
    punpcklbw m4, m7
    punpckhbw m5, m7
    paddw     m2, m4
    paddw     m5, m3
    psubw     m0, m2
    psubw     m1, m5
    pmullw    m0, m6
    pmullw    m1, m6
    movd      m2, [r1-2]
    movd      m5, [r1+7]
    punpcklbw m2, m7
    punpcklbw m5, m7
    paddw     m2, m3
    paddw     m4, m5
    mova      m5, [pw_16]
    paddw     m2, m5
    paddw     m4, m5
    paddw     m0, m2
    paddw     m1, m4
    psraw     m0, 5
    psraw     m1, 5
    mova      m4, [r2]
    packuswb  m0, m1
    pavgb     m0, m4
    op_%1     m0, [r0], m4
    add       r0, r3
    add       r1, r3
    add       r2, r4
    dec      r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg

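; SSSE3 8-wide "_l2": same palignr scheme as the plain SSSE3 filter above,
; plus the pavgb against src2 (read with movh, advanced by src2Stride in r4).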
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov      r5d, 8
    pxor      m7, m7
    mova      m6, [pw_5]
.loop:
    lddqu     m1, [r1-2]
    mova      m0, m1
    punpckhbw m1, m7
    punpcklbw m0, m7
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    mova      m5, m1
    palignr   m4, m0, 2
    palignr   m3, m0, 4
    palignr   m2, m0, 6
    palignr   m1, m0, 8
    palignr   m5, m0, 10
    paddw     m0, m5
    paddw     m2, m3
    paddw     m1, m4
    psllw     m2, 2
    movh      m3, [r2]
    psubw     m2, m1
    paddw     m0, [pw_16]
    pmullw    m2, m6
    paddw     m2, m0
    psraw     m2, 5
    packuswb  m2, m2
    pavgb     m2, m3
    op_%1h    m2, [r0], m4
    add       r1, r3
    add       r0, r3
    add       r2, r4
    dec      r5d
    jg .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg

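; Vertical filtering. FILT_V produces one output row from six source rows:
; the previous five are held in m0..m4 (oldest in m0), the sixth is loaded
; from [r1] into m5. It applies the same (1,-5,20,20,-5,1) taps down the
; column, and the final SWAP rotates the registers so the window slides down
; by one row per call.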
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
%macro FILT_V 1
    mova      m6, m2
    movh      m5, [r1]
    paddw     m6, m3
    psllw     m6, 2
    psubw     m6, m1
    psubw     m6, m4
    punpcklbw m5, m7
    pmullw    m6, [pw_5]
    paddw     m0, [pw_16]
    add       r1, r3
    paddw     m0, m5
    paddw     m6, m0
    psraw     m6, 5
    packuswb  m6, m6
    op_%1h    m6, [r0], m0 ; 1
    add       r0, r2
    SWAP 0, 1, 2, 3, 4, 5
%endmacro

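; The v_lowpass entry points rewind src by two rows, preload the first five
; rows into m0..m4, then emit one FILT_V per output row.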
%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    sub       r1, r3
    sub       r1, r3
    pxor      m7, m7
    movh      m0, [r1]
    movh      m1, [r1+r3]
    lea       r1, [r1+2*r3]
    movh      m2, [r1]
    movh      m3, [r1+r3]
    lea       r1, [r1+2*r3]
    movh      m4, [r1]
    add       r1, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    RET
%endmacro

INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg


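; 8- or 16-row vertical filter: the row count is passed in r4d (h) and
; checked against 16 after the first eight FILT_V calls. The sse2 variant
; rewinds src by two rows itself; the mmxext "_op" variant does not.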
%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    sub       r1, r3
    sub       r1, r3
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
%endif
    pxor      m7, m7
    movh      m0, [r1]
    movh      m1, [r1+r3]
    lea       r1, [r1+2*r3]
    movh      m2, [r1]
    movh      m3, [r1+r3]
    lea       r1, [r1+2*r3]
    movh      m4, [r1]
    add       r1, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    cmp      r4d, 16
    jne .end
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
.end:
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg

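; 2D (hv) filtering is done in two passes through a 16-bit tmp buffer.
; FILT_HV is the vertical first pass: like FILT_V it computes
; 20*(m2+m3) - 5*(m1+m4) + m0 + m5 + 16 down a column, but stores the full
; 16-bit sum to tmp at byte offset %1 instead of rounding it down to 8 bits.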
; All functions that use this are required to have args:
; src, tmp, srcStride
%macro FILT_HV 1 ; offset
    mova      m6, m2
    movh      m5, [r0]
    paddw     m6, m3
    psllw     m6, 2
    paddw     m0, [pw_16]
    psubw     m6, m1
    psubw     m6, m4
    punpcklbw m5, m7
    pmullw    m6, [pw_5]
    paddw     m0, m5
    add       r0, r2
    paddw     m6, m0
    mova      [r1+%1], m6
    SWAP 0, 1, 2, 3, 4, 5
%endmacro

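; Horizontal second pass for the 4-wide hv case. The _v function fills tmp
; with 16-bit column-filtered values at a 24-byte row stride; _h then filters
; them horizontally. With a, b, c the outer, middle and inner tap-pair sums
; of a row, the sequence (((a-b)>>2 - b + c)>>2 + c)>>6 evaluates roughly
; (a - 5*b + 20*c)/1024 without a multiply; the +16 carried by every
; intermediate from the first pass supplies the rounding term.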
%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
    movsxdifnidn r2, r2d
    pxor      m7, m7
    movh      m0, [r0]
    movh      m1, [r0+r2]
    lea       r0, [r0+2*r2]
    movh      m2, [r0]
    movh      m3, [r0+r2]
    lea       r0, [r0+2*r2]
    movh      m4, [r0]
    add       r0, r2
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_HV 0*24
    FILT_HV 1*24
    FILT_HV 2*24
    FILT_HV 3*24
    RET

cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
    movsxdifnidn r2, r2d
    mov      r3d, 4
.loop:
    mova      m0, [r0]
    paddw     m0, [r0+10]
    mova      m1, [r0+2]
    paddw     m1, [r0+8]
    mova      m2, [r0+4]
    paddw     m2, [r0+6]
    psubw     m0, m1
    psraw     m0, 2
    psubw     m0, m1
    paddsw    m0, m2
    psraw     m0, 2
    paddw     m0, m2
    psraw     m0, 6
    packuswb  m0, m0
    op_%1h    m0, [r1], m7
    add       r0, 24
    add       r1, r2
    dec      r3d
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg

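; Vertical first pass of the 8/16-wide hv filter: tmp rows are 48 bytes
; apart, and r3d (size) selects 8 or 16 rows, mirroring the 8-or-16 logic of
; the plain vertical filter above.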
%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
    movsxdifnidn r2, r2d
    pxor      m7, m7
    movh      m0, [r0]
    movh      m1, [r0+r2]
    lea       r0, [r0+2*r2]
    movh      m2, [r0]
    movh      m3, [r0+r2]
    lea       r0, [r0+2*r2]
    movh      m4, [r0]
    add       r0, r2
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_HV 0*48
    FILT_HV 1*48
    FILT_HV 2*48
    FILT_HV 3*48
    FILT_HV 4*48
    FILT_HV 5*48
    FILT_HV 6*48
    FILT_HV 7*48
    cmp      r3d, 16
    jne .end
    FILT_HV 8*48
    FILT_HV 9*48
    FILT_HV 10*48
    FILT_HV 11*48
    FILT_HV 12*48
    FILT_HV 13*48
    FILT_HV 14*48
    FILT_HV 15*48
.end:
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV1_LOWPASS_OP put
QPEL8OR16_HV1_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put

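; Horizontal second pass of the 8/16-wide hv filter: reads 48-byte-stride
; tmp rows and uses the same multiply-free shift/add evaluation as the
; 4-wide _h pass above; r4d carries the row count.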
%macro QPEL8OR16_HV2_LOWPASS_OP 1
; unused is to match ssse3 and mmxext args
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
    movsxdifnidn r2, r2d
.loop:
    mova      m0, [r1]
    mova      m3, [r1+8]
    mova      m1, [r1+2]
    mova      m4, [r1+10]
    paddw     m0, m4
    paddw     m1, m3
    paddw     m3, [r1+18]
    paddw     m4, [r1+16]
    mova      m2, [r1+4]
    mova      m5, [r1+12]
    paddw     m2, [r1+6]
    paddw     m5, [r1+14]
    psubw     m0, m1
    psubw     m3, m4
    psraw     m0, 2
    psraw     m3, 2
    psubw     m0, m1
    psubw     m3, m4
    paddsw    m0, m2
    paddsw    m3, m5
    psraw     m0, 2
    psraw     m3, 2
    paddw     m0, m2
    paddw     m3, m5
    psraw     m0, 6
    psraw     m3, 6
    packuswb  m0, m3
    op_%1     m0, [r0], m7
    add       r1, 48
    add       r0, r2
    dec      r4d
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg

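; SSSE3 version of the hv second pass: the shifted tap sums are built with
; palignr from aligned tmp loads. It has a dedicated 8-wide loop and, when
; the size argument is 16, a 16-wide loop producing a whole row per
; iteration.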
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    cmp      r4d, 16
    je .op16
.loop8:
    mova      m1, [r1+16]
    mova      m0, [r1]
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    mova      m5, m1
    palignr   m5, m0, 10
    palignr   m4, m0, 8
    palignr   m3, m0, 6
    palignr   m2, m0, 4
    palignr   m1, m0, 2
    paddw     m0, m5
    paddw     m1, m4
    paddw     m2, m3
    psubw     m0, m1
    psraw     m0, 2
    psubw     m0, m1
    paddw     m0, m2
    psraw     m0, 2
    paddw     m0, m2
    psraw     m0, 6
    packuswb  m0, m0
    op_%1h    m0, [r0], m7
    add       r1, 48
    add       r0, r2
    dec      r4d
    jne .loop8
    jmp .done
.op16:
    mova      m4, [r1+32]
    mova      m5, [r1+16]
    mova      m7, [r1]
    mova      m3, m4
    mova      m2, m4
    mova      m1, m4
    mova      m0, m4
    palignr   m0, m5, 10
    palignr   m1, m5, 8
    palignr   m2, m5, 6
    palignr   m3, m5, 4
    palignr   m4, m5, 2
    paddw     m0, m5
    paddw     m1, m4
    paddw     m2, m3
    mova      m6, m5
    mova      m4, m5
    mova      m3, m5
    palignr   m4, m7, 8
    palignr   m6, m7, 2
    palignr   m3, m7, 10
    paddw     m4, m6
    mova      m6, m5
    palignr   m5, m7, 6
    palignr   m6, m7, 4
    paddw     m3, m7
    paddw     m5, m6
    psubw     m0, m1
    psubw     m3, m4
    psraw     m0, 2
    psraw     m3, 2
    psubw     m0, m1
    psubw     m3, m4
    paddw     m0, m2
    paddw     m3, m5
    psraw     m0, 2
    psraw     m3, 2
    paddw     m0, m2
    paddw     m3, m5
    psraw     m0, 6
    psraw     m3, 6
    packuswb  m3, m0
    op_%1     m3, [r0], m7
    add       r1, 48
    add       r0, r2
    dec      r4d
    jne .op16
.done:
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg

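; pixels*_l2_shift5: read 16-bit filtered intermediates from tmp, scale them
; to 8 bits with >>5 and pack, average with an 8-bit source plane (pavgb),
; then store via op_%1. The 4-wide version walks tmp with a 24-byte row
; stride, the 8-wide one with 48 bytes.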
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mova      m0, [r1]
    mova      m1, [r1+24]
    psraw     m0, 5
    psraw     m1, 5
    packuswb  m0, m0
    packuswb  m1, m1
    pavgb     m0, [r2]
    pavgb     m1, [r2+r4]
    op_%1h    m0, [r0], m4
    op_%1h    m1, [r0+r3], m5
    lea       r2, [r2+r4*2]
    lea       r0, [r0+r3*2]
    mova      m0, [r1+48]
    mova      m1, [r1+72]
    psraw     m0, 5
    psraw     m1, 5
    packuswb  m0, m0
    packuswb  m1, m1
    pavgb     m0, [r2]
    pavgb     m1, [r2+r4]
    op_%1h    m0, [r0], m4
    op_%1h    m1, [r0+r3], m5
    RET
%endmacro

INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg


%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
.loop:
    mova      m0, [r1]
    mova      m1, [r1+8]
    mova      m2, [r1+48]
    mova      m3, [r1+48+8]
    psraw     m0, 5
    psraw     m1, 5
    psraw     m2, 5
    psraw     m3, 5
    packuswb  m0, m1
    packuswb  m2, m3
    pavgb     m0, [r2]
    pavgb     m2, [r2+r4]
    op_%1     m0, [r0], m4
    op_%1     m2, [r0+r3], m5
    lea       r2, [r2+2*r4]
    add       r1, 48*2
    lea       r0, [r0+2*r3]
    sub      r5d, 2
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg

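; x86_64 only: 16-wide SSSE3 horizontal "_l2" filter. A full 16-pixel row is
; handled per iteration, which needs more than eight xmm registers (up to
; m15), hence the 64-bit restriction.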
%if ARCH_X86_64
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov      r5d, 16
    pxor     m15, m15
    mova     m14, [pw_5]
    mova     m13, [pw_16]
.loop:
    lddqu     m1, [r1+6]
    lddqu     m7, [r1-2]
    mova      m0, m1
    punpckhbw m1, m15
    punpcklbw m0, m15
    punpcklbw m7, m15
    mova      m2, m1
    mova      m6, m0
    mova      m3, m1
    mova      m8, m0
    mova      m4, m1
    mova      m9, m0
    mova     m12, m0
    mova     m11, m1
    palignr  m11, m0, 10
    palignr  m12, m7, 10
    palignr   m4, m0, 2
    palignr   m9, m7, 2
    palignr   m3, m0, 4
    palignr   m8, m7, 4
    palignr   m2, m0, 6
    palignr   m6, m7, 6
    paddw    m11, m0
    palignr   m1, m0, 8
    palignr   m0, m7, 8
    paddw     m7, m12
    paddw     m2, m3
    paddw     m6, m8
    paddw     m1, m4
    paddw     m0, m9
    psllw     m2, 2
    psllw     m6, 2
    psubw     m2, m1
    psubw     m6, m0
    paddw    m11, m13
    paddw     m7, m13
    pmullw    m2, m14
    pmullw    m6, m14
    lddqu     m3, [r2]
    paddw     m2, m11
    paddw     m6, m7
    psraw     m2, 5
    psraw     m6, 5
    packuswb  m6, m2
    pavgb     m6, m3
    op_%1     m6, [r0], m11
    add       r1, r3
    add       r0, r3
    add       r2, r4
    dec      r5d
    jg .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif