;*****************************************************************************
;* x86-optimized functions for yadif filter
;*
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Broadcast constants (16 bytes each, enough for one XMM register).
pw_1:    times 8 dw 1           ; word  1      - low-bit mask in CHECK
pw_8000: times 8 dw 0x8000      ; word  0x8000 - un-bias step of the PACK fallback
pd_1:    times 4 dd 1           ; dword 1      - used by CHECK2 and FILTER
pd_8000: times 4 dd 0x8000      ; dword 0x8000 - bias step of the PACK fallback

SECTION .text

; PIXSHIFT1 reg
; Shift %1 right by one 16-bit element (2 bytes): a whole-register byte shift
; when the sse2 cpuflag is set, a 64-bit logical shift otherwise.
; NOTE(review): not referenced by any macro in the visible part of this file;
; CHECK and FILTER open-code the same shifts keyed on mmsize.
%macro PIXSHIFT1 1
%if cpuflag(sse2)
    psrldq %1, 2
%else
    psrlq  %1, 16
%endif
%endmacro

; PIXSHIFT2 reg
; Shift %1 right by two 16-bit elements (4 bytes): a whole-register byte shift
; when the sse2 cpuflag is set, a 64-bit logical shift otherwise.
; NOTE(review): not referenced by any macro in the visible part of this file.
%macro PIXSHIFT2 1
%if cpuflag(sse2)
    psrldq %1, 4
%else
    psrlq  %1, 32
%endif
%endmacro

; PABS reg, tmp
; Per-dword absolute value of %1, in place.
; %2 is a scratch register; it is only written on the pre-SSSE3 path.
%macro PABS 2
%if cpuflag(ssse3)
    pabsd   %1, %1
%else
    pxor    %2, %2
    pcmpgtd %2, %1              ; %2 = all-ones in lanes where %1 < 0
    pxor    %1, %2              ; (x ^ mask) - mask negates the negative lanes
    psubd   %1, %2
%endif
%endmacro

; PACK reg
; Narrow the dwords of %1 to words with unsigned saturation. %1 is used as both
; pack operands, so the packed words appear in both halves of the result.
%macro PACK 1
%if cpuflag(sse4)
    packusdw %1, %1
%else
    ; No unsigned dword->word pack before SSE4: bias into the signed range,
    ; pack with signed saturation, then remove the bias again.
    psubd    %1, [pd_8000]
    packssdw %1, %1
    paddw    %1, [pw_8000]
%endif
%endmacro

; PMINSD dst, src, tmp
; Per-dword signed minimum: %1 = min(%1, %2).
; %3 is a scratch register; it is only written on the pre-SSE4 path, where it
; must be distinct from %1 and %2.
%macro PMINSD 3
%if cpuflag(sse4)
    pminsd  %1, %2
%else
    mova    %3, %2
    pcmpgtd %3, %1              ; %3 = all-ones in lanes where %2 > %1
    pand    %1, %3              ; keep %1 in those lanes
    pandn   %3, %2              ; take %2 in the remaining lanes
    por     %1, %3
%endif
%endmacro

; PMAXSD dst, src, tmp
; Per-dword signed maximum: %1 = max(%1, %2).
; %3 is a scratch register; it is only written on the pre-SSE4 path, where it
; must be distinct from %1 and %2.
%macro PMAXSD 3
%if cpuflag(sse4)
    pmaxsd  %1, %2
%else
    mova    %3, %1
    pcmpgtd %3, %2              ; %3 = all-ones in lanes where %1 > %2
    pand    %1, %3              ; keep %1 in those lanes
    pandn   %3, %2              ; take %2 in the remaining lanes
    por     %1, %3
%endif
%endmacro

; PMAXUW dst, src
; Per-word unsigned maximum: %1 = max(%1, %2). No scratch register is needed.
%macro PMAXUW 2
%if cpuflag(sse4)
    pmaxuw  %1, %2
%else
    ; Saturating subtract leaves max(%1 - %2, 0); adding %2 back gives the max.
    psubusw %1, %2
    paddusw %1, %2
%endif
%endmacro

; CHECK off_a, off_b
; Evaluate one candidate pair of rows, with offsets given in pixels
; (scaled by 2 bytes here):
;   a = words at [curq + t1 + %1*2]
;   b = words at [curq + t0 + %2*2]
; Outputs, as dwords in lane j:
;   m5 = (a[j+1] + b[j+1]) >> 1                      (rounded down)
;   m2 = |a[j]-b[j]| + |a[j+1]-b[j+1]| + |a[j+2]-b[j+2]|
; Expects m7 == 0 (used to zero-extend words to dwords). Clobbers m3 and m4.
%macro CHECK 2
    movu      m2, [curq+t1+%1*2]
    movu      m3, [curq+t0+%2*2]
    mova      m4, m2
    mova      m5, m2
    pxor      m4, m3
    pavgw     m5, m3            ; (a + b + 1) >> 1, i.e. rounded up
    pand      m4, [pw_1]        ; 1 where a + b is odd
    psubusw   m5, m4            ; undo the round-up -> (a + b) >> 1
%if mmsize == 16
    psrldq    m5, 2
%else
    psrlq     m5, 16
%endif
    punpcklwd m5, m7
    mova      m4, m2
    psubusw   m2, m3
    psubusw   m3, m4
    PMAXUW    m2, m3            ; |a - b| per word
    mova      m3, m2
    mova      m4, m2
%if mmsize == 16
    psrldq    m3, 2
    psrldq    m4, 4
%else
    psrlq     m3, 16
    psrlq     m4, 32
%endif
    punpcklwd m2, m7
    punpcklwd m3, m7
    punpcklwd m4, m7
    paddd     m2, m3
    paddd     m2, m4
%endmacro

; CHECK1
; Fold the candidate produced by CHECK (m2 = score, m5 = value) into the
; running best (m0 = score, m1 = value):
;   mask = m0 > m2
;   m0   = min(m0, m2)
;   m1   = mask ? m5 : m1
; The mask is left in m6, where the following CHECK2 reads it.
; Clobbers m3 and m5.
%macro CHECK1 0
    mova    m3, m0
    pcmpgtd m3, m2
    PMINSD  m0, m2, m6
    mova    m6, m3
    pand    m5, m3
    pandn   m3, m1
    por     m3, m5
    mova    m1, m3
%endmacro

; CHECK2
; Same fold as CHECK1, but gated on the mask CHECK1 left in m6. Lanes where
; CHECK1 did not select its candidate (mask == 0) get 1 << 30 added to the new
; score in m2 before the comparison; lanes where it did (mask == -1) get 0.
; NOTE(review): presumably the bias is large enough that a penalised lane can
; never win - confirm against the range of the scores.
; Clobbers m3, m4, m5 and m6.
%macro CHECK2 0
    paddd   m6, [pd_1]          ; -1 -> 0, 0 -> 1
    pslld   m6, 30
    paddd   m2, m6
    mova    m3, m0
    pcmpgtd m3, m2
    PMINSD  m0, m2, m4
    pand    m5, m3
    pandn   m3, m1
    por     m3, m5
    mova    m1, m3
%endmacro

; This version of CHECK2 has 3 fewer instructions on sets older than SSE4 but I
; am not sure whether it is any faster. A rewrite or refactor of the filter
; code should make it possible to eliminate the move instruction at the end. It
; exists to satisfy the expectation that the "score" values are in m1.

; %macro CHECK2 0
; mova m3, m0
; pcmpgtd m0, m2
; pand m0, m6
; mova m6, m0
; pand m5, m6
; pand m2, m0
; pandn m6, m1
; pandn m0, m3
; por m6, m5
; por m0, m2
; mova m1, m6
; %endmacro

; LOAD reg, mem
; Load half a register's worth of 16-bit words from %2 and widen them to
; dwords in %1 by interleaving with m7. Expects m7 == 0.
%macro LOAD 2
    movh      %1, %2
    punpcklwd %1, m7
%endmacro

; FILTER suffix, ptr_a, ptr_b
; Main loop body; produces mmsize/4 output pixels (16-bit) per iteration.
;   %1      label suffix, so the macro can be expanded twice in one function
;   %2, %3  the two rows whose average forms the temporal prediction
; Notation used in the comments below (all values widened to dwords):
;   c = [curq+t1]   e = [curq+t0]   d = ([%2] + [%3]) >> 1
; Stack slots: [rsp+0] = c, [rsp+16] = d, [rsp+32] = e, [rsp+48] = diff.
; The width counter is kept in memory (r4m) and the mode is read from r8m.
%macro FILTER 3
.loop%1:
    pxor         m7, m7         ; zero for LOAD / punpcklwd widening
    LOAD         m0, [curq+t1]
    LOAD         m1, [curq+t0]
    LOAD         m2, [%2]
    LOAD         m3, [%3]
    mova         m4, m3
    paddd        m3, m2
    psrad        m3, 1          ; d
    mova   [rsp+ 0], m0
    mova   [rsp+16], m3
    mova   [rsp+32], m1

    ; diff = max of three temporal differences
    psubd        m2, m4
    PABS         m2, m4         ; |[%2] - [%3]|
    LOAD         m3, [prevq+t1]
    LOAD         m4, [prevq+t0]
    psubd        m3, m0
    psubd        m4, m1
    PABS         m3, m5
    PABS         m4, m5
    paddd        m3, m4         ; |prev[t1] - c| + |prev[t0] - e|
    psrld        m2, 1
    psrld        m3, 1
    PMAXSD       m2, m3, m6
    LOAD         m3, [nextq+t1]
    LOAD         m4, [nextq+t0]
    psubd        m3, m0
    psubd        m4, m1
    PABS         m3, m5
    PABS         m4, m5
    paddd        m3, m4         ; |next[t1] - c| + |next[t0] - e|
    psrld        m3, 1
    PMAXSD       m2, m3, m6
    mova   [rsp+48], m2

    ; initial candidate: m1 = (c + e) >> 1, m0 = |c - e|
    paddd        m1, m0
    paddd        m0, m0
    psubd        m0, m1
    psrld        m1, 1
    PABS         m0, m2

    ; initial score: m0 += |diff at x-1| + |diff at x+1| - 1
    movu         m2, [curq+t1-1*2]
    movu         m3, [curq+t0-1*2]
    mova         m4, m2
    psubusw      m2, m3
    psubusw      m3, m4
    PMAXUW       m2, m3
%if mmsize == 16
    mova         m3, m2
    psrldq       m3, 4
%else
    mova         m3, m2
    psrlq        m3, 32
%endif
    punpcklwd    m2, m7
    punpcklwd    m3, m7
    paddd        m0, m2
    paddd        m0, m3
    psubd        m0, [pd_1]

    ; try the diagonal candidates; each CHECK2 is gated on the CHECK1 before it
    CHECK -2, 0
    CHECK1
    CHECK -3, 1
    CHECK2
    CHECK 0, -2
    CHECK1
    CHECK 1, -3
    CHECK2

    mova         m6, [rsp+48]   ; diff
    cmp   DWORD r8m, 2          ; mode >= 2 skips the extra check below
    jge .end%1
    ; b = ([%2+2*t1] + [%3+2*t1]) >> 1,  f = ([%2+2*t0] + [%3+2*t0]) >> 1
    LOAD         m2, [%2+t1*2]
    LOAD         m4, [%3+t1*2]
    LOAD         m3, [%2+t0*2]
    LOAD         m5, [%3+t0*2]
    paddd        m2, m4
    paddd        m3, m5
    psrld        m2, 1
    psrld        m3, 1
    mova         m4, [rsp+ 0]
    mova         m5, [rsp+16]
    mova         m7, [rsp+32]   ; m7 holds e here, then becomes scratch
    psubd        m2, m4         ; b - c
    psubd        m3, m7         ; f - e
    mova         m0, m5
    psubd        m5, m4         ; d - c
    psubd        m0, m7         ; d - e
    mova         m4, m2
    PMINSD       m2, m3, m7
    PMAXSD       m3, m4, m7
    PMAXSD       m2, m5, m7
    PMINSD       m3, m5, m7
    PMAXSD       m2, m0, m7     ; m2 = max(d-e, d-c, min(b-c, f-e))
    PMINSD       m3, m0, m7     ; m3 = min(d-e, d-c, max(b-c, f-e))
    pxor         m4, m4
    PMAXSD       m6, m3, m7
    psubd        m4, m2
    PMAXSD       m6, m4, m7     ; diff = max(diff, m3, -m2)

.end%1:
    ; clamp the spatial prediction in m1 to [d - diff, d + diff]
    mova         m2, [rsp+16]
    mova         m3, m2
    psubd        m2, m6
    paddd        m3, m6
    PMAXSD       m1, m2, m7
    PMINSD       m1, m3, m7
    PACK         m1

    movh     [dstq], m1
    add        dstq, mmsize/2
    add       prevq, mmsize/2
    add        curq, mmsize/2
    add       nextq, mmsize/2
    sub   DWORD r4m, mmsize/4   ; w -= pixels written
    jg .loop%1
%endmacro

; YADIF
; Emit yadif_filter_line_16bit for the instruction set selected by the
; preceding INIT_* directive.
; Arguments, in order: dst, prev, cur, next, w, prefs, mrefs, parity, mode.
; Only the first four are loaded into registers by cglobal; w, parity and mode
; are accessed in memory (r4m, paritym, r8m). 80 bytes of stack are requested
; for the temporaries FILTER keeps at [rsp+0..63].
; After setup t0 = prefs and t1 = mrefs on both architectures.
%macro YADIF 0
%if ARCH_X86_32
cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
%else
cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
%endif
%if ARCH_X86_32
    mov            r4, r5mp     ; prefs
    mov            r5, r6mp     ; mrefs
    DECLARE_REG_TMP 4,5
%else
    movsxd         r5, DWORD r5m ; prefs, sign-extended for addressing
    movsxd         r6, DWORD r6m ; mrefs, sign-extended for addressing
    DECLARE_REG_TMP 5,6
%endif

    ; parity selects which pair of rows feeds the temporal average
    cmp DWORD paritym, 0
    je .parity0
    FILTER 1, prevq, curq
    jmp .ret

.parity0:
    FILTER 0, curq, nextq

.ret:
    RET
%endmacro

; Instantiate the filter once per supported instruction set. The MMX build is
; only emitted for 32-bit x86.
INIT_XMM sse4
YADIF
INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
%if ARCH_X86_32
INIT_MMX mmxext
YADIF
%endif