;*****************************************************************************
;* x86-optimized functions for yadif filter
;*
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Eight 16-bit words, each holding the value 1 (16 bytes in total).
; Read as a memory operand by CHECK (to isolate bit 0 of every word) and by
; FILTER (to subtract 1 from every word of the initial score).
pw_1: times 8 dw 1

SECTION .text

;------------------------------------------------------------------------------
; PABS reg, tmp
;
; Replace every signed 16-bit word of %1 with its absolute value, in place.
;   %1  register to operate on (input and output)
;   %2  scratch register; only written on the non-SSSE3 path
;------------------------------------------------------------------------------
%macro PABS 2
%if cpuflag(ssse3)
    pabsw   %1, %1
%else
    pxor    %2, %2              ; %2 = 0
    pcmpgtw %2, %1              ; %2 = all-ones in words where 0 > %1
    pxor    %1, %2              ; bitwise-invert the negative words
    psubw   %1, %2              ; ... and subtract -1 to finish negating them
%endif
%endmacro

;------------------------------------------------------------------------------
; PMAXUW dst, src
;
; Per-word unsigned maximum: %1 = max(%1, %2).
;   %1  first operand and destination
;   %2  second operand (not modified)
;
; When cpuflag(sse4) is not set the result is built from saturating
; arithmetic: (%1 - %2) saturates to 0 wherever %2 >= %1, so adding %2 back
; yields %1 where %1 was larger and %2 otherwise.
;------------------------------------------------------------------------------
%macro PMAXUW 2
%if cpuflag(sse4)
    pmaxuw  %1, %2
%else
    psubusw %1, %2
    paddusw %1, %2
%endif
%endmacro

;------------------------------------------------------------------------------
; CHECK a, b
;
; Compute a candidate prediction and its score from two rows of the current
; field, read with different horizontal offsets.
;   %1  offset, in 16-bit samples, applied to the row at curq+t1
;   %2  offset, in 16-bit samples, applied to the row at curq+t0
;
; Out:
;   m5  floor((A + B) / 2) of the two loaded vectors, moved down by one word
;       lane (zeros are shifted into the top lane)
;   m2  for each word lane, |A - B| of that lane plus the two lanes above it
;       (the two top lanes therefore hold incomplete sums)
; Clobbers: m3, m4
;------------------------------------------------------------------------------
%macro CHECK 2
    movu      m2, [curq+t1+%1*2]
    movu      m3, [curq+t0+%2*2]
    mova      m4, m2
    mova      m5, m2
    pxor      m4, m3
    pavgw     m5, m3            ; (A + B + 1) >> 1, i.e. rounded up
    pand      m4, [pw_1]        ; 1 where A + B is odd
    psubusw   m5, m4            ; undo the rounding: floor((A + B) / 2)
%if mmsize == 16
    psrldq    m5, 2             ; shift the whole register down one word
%else
    psrlq     m5, 16
%endif
    mova      m4, m2
    psubusw   m2, m3            ; max(A - B, 0)
    psubusw   m3, m4            ; max(B - A, 0)
    PMAXUW    m2, m3            ; |A - B|
    mova      m3, m2
    mova      m4, m2
%if mmsize == 16
    psrldq    m3, 2             ; lane j <- lane j+1
    psrldq    m4, 4             ; lane j <- lane j+2
%else
    psrlq     m3, 16
    psrlq     m4, 32
%endif
    paddw     m2, m3
    paddw     m2, m4            ; sum of three neighbouring differences
%endmacro

;------------------------------------------------------------------------------
; CHECK1
;
; Accept the candidate produced by CHECK in every word lane where its score
; is lower (signed compare) than the running score.
;
; In:
;   m0  running score            m1  running prediction
;   m2  candidate score          m5  candidate prediction
; Out:
;   m0  min(m0, m2)
;   m1  m5 in lanes where the old m0 > m2, otherwise unchanged
;   m6  all-ones mask of the lanes where the candidate was accepted
;       (read by a following CHECK2)
; Clobbers: m3, m5
;------------------------------------------------------------------------------
%macro CHECK1 0
    mova    m3, m0
    pcmpgtw m3, m2              ; mask = running score > candidate score
    pminsw  m0, m2
    mova    m6, m3              ; keep the mask for CHECK2
    pand    m5, m3              ; candidate prediction where accepted
    pandn   m3, m1              ; running prediction elsewhere
    por     m3, m5
    mova    m1, m3
%endmacro

; Previous implementation of CHECK2, kept for reference:
;
; %macro CHECK2 0
;     paddw   m6, [pw_1]
;     psllw   m6, 14
;     paddsw  m2, m6
;     mova    m3, m0
;     pcmpgtw m3, m2
;     pminsw  m0, m2
;     pand    m5, m3
;     pandn   m3, m1
;     por     m3, m5
;     mova    m1, m3
; %endmacro

; The version of CHECK2 below is required for 14-bit samples: the left-shift
; trick used by the old code does not produce a value large enough to
; correctly select pixels or scores.

;------------------------------------------------------------------------------
; CHECK2
;
; Like CHECK1, but the candidate is accepted only in word lanes where it
; beats the running score (signed compare) AND the mask in m6 is set.
;
; In:
;   m0  running score            m1  running prediction
;   m2  candidate score          m5  candidate prediction
;   m6  acceptance mask left by the preceding CHECK1
; Out:
;   m0  m2 in accepted lanes, otherwise the old m0
;   m1  m5 in accepted lanes, otherwise the old m1 (m6 holds the same value)
; Clobbers: m2, m3, m5, m6
;------------------------------------------------------------------------------
%macro CHECK2 0
    mova    m3, m0              ; save the running score
    pcmpgtw m0, m2              ; running score > candidate score
    pand    m0, m6              ; ... and accepted by the previous CHECK1
    mova    m6, m0              ; m0 and m6 now both hold the final mask
    pand    m5, m6              ; candidate prediction where accepted
    pand    m2, m0              ; candidate score where accepted
    pandn   m6, m1              ; running prediction elsewhere
    pandn   m0, m3              ; running score elsewhere
    por     m6, m5
    por     m0, m2
    mova    m1, m6
%endmacro

;------------------------------------------------------------------------------
; LOAD dst, src
;
; Load a full vector from %2 into %1 with movu (the x86inc unaligned move),
; so %2 does not have to be aligned.
;------------------------------------------------------------------------------
%macro LOAD 2
    movu      %1, %2
%endmacro

;------------------------------------------------------------------------------
; FILTER suffix, ptrA, ptrB
;
; Body of the line filter: one loop that produces mmsize/2-2 output samples
; per iteration.
;   %1  suffix appended to the .loop/.end labels so that the macro can be
;       expanded more than once inside a function
;   %2  first pointer register of the pair averaged into the temporal value
;   %3  second pointer register of that pair
;
; Uses dstq, prevq, curq, nextq and the temporaries t0/t1 set up by YADIF.
; r4m and r8m are the 5th and 9th arguments of the cglobal declaration in
; YADIF, i.e. w and mode.
;
; Stack slots (written with mova, so they must be suitably aligned;
; presumably guaranteed by the cglobal stack allocation - TODO confirm):
;   [rsp+ 0]  c    = cur row at +t1
;   [rsp+16]  d    = (%2 + %3) >> 1
;   [rsp+32]  e    = cur row at +t0
;   [rsp+48]  diff = temporal difference
;
; Every iteration stores a full vector to dstq but advances the pointers by
; only mmsize-4 bytes, because the lane shifts in CHECK leave the top two
; word lanes without complete results; those 4 bytes are rewritten by the
; next iteration. The loop is bottom-tested, so it always runs at least once.
;------------------------------------------------------------------------------
%macro FILTER 3
.loop%1:
    ; NOTE(review): m7 is not read before it is reloaded from [rsp+32]
    ; further down, so this clear looks removable - confirm before dropping.
    pxor         m7, m7
    LOAD         m0, [curq+t1]          ; c
    LOAD         m1, [curq+t0]          ; e
    LOAD         m2, [%2]
    LOAD         m3, [%3]
    mova         m4, m3
    paddw        m3, m2
    psraw        m3, 1                  ; d = (%2 + %3) >> 1
    mova   [rsp+ 0], m0
    mova   [rsp+16], m3
    mova   [rsp+32], m1
    psubw        m2, m4
    PABS         m2, m4                 ; |%2 - %3|
    LOAD         m3, [prevq+t1]
    LOAD         m4, [prevq+t0]
    psubw        m3, m0
    psubw        m4, m1
    PABS         m3, m5                 ; |prev[t1] - c|
    PABS         m4, m5                 ; |prev[t0] - e|
    paddw        m3, m4
    psrlw        m2, 1                  ; |%2 - %3| >> 1
    psrlw        m3, 1
    pmaxsw       m2, m3
    LOAD         m3, [nextq+t1]
    LOAD         m4, [nextq+t0]
    psubw        m3, m0
    psubw        m4, m1
    PABS         m3, m5                 ; |next[t1] - c|
    PABS         m4, m5                 ; |next[t0] - e|
    paddw        m3, m4
    psrlw        m3, 1
    pmaxsw       m2, m3                 ; diff = max of the three differences
    mova   [rsp+48], m2

    ; Initial prediction and score.
    paddw        m1, m0                 ; c + e
    paddw        m0, m0                 ; 2c
    psubw        m0, m1                 ; c - e
    psrlw        m1, 1                  ; prediction = (c + e) >> 1
    PABS         m0, m2                 ; |c - e|

    movu         m2, [curq+t1-1*2]      ; both rows, one sample to the left
    movu         m3, [curq+t0-1*2]
    mova         m4, m2
    psubusw      m2, m3
    psubusw      m3, m4
    PMAXUW       m2, m3                 ; absolute difference of the two rows
    mova         m3, m2
%if mmsize == 16
    psrldq       m3, 4                  ; lane j <- lane j+2
%else
    psrlq        m3, 32
%endif
    paddw        m0, m2
    paddw        m0, m3
    psubw        m0, [pw_1]             ; score = sum of three differences - 1

    ; Try the diagonal candidates. Each CHECK2 only accepts its candidate in
    ; lanes where the preceding CHECK1 accepted one too.
    CHECK -2, 0
    CHECK1
    CHECK -3, 1
    CHECK2
    CHECK 0, -2
    CHECK1
    CHECK 1, -3
    CHECK2

    mova         m6, [rsp+48]           ; diff
    cmp   DWORD r8m, 2
    jge .end%1                          ; mode >= 2: skip the refinement below
    LOAD         m2, [%2+t1*2]
    LOAD         m4, [%3+t1*2]
    LOAD         m3, [%2+t0*2]
    LOAD         m5, [%3+t0*2]
    paddw        m2, m4
    paddw        m3, m5
    psrlw        m2, 1                  ; b = (%2[2*t1] + %3[2*t1]) >> 1
    psrlw        m3, 1                  ; f = (%2[2*t0] + %3[2*t0]) >> 1
    mova         m4, [rsp+ 0]           ; c
    mova         m5, [rsp+16]           ; d
    mova         m7, [rsp+32]           ; e
    psubw        m2, m4                 ; b - c
    psubw        m3, m7                 ; f - e
    mova         m0, m5
    psubw        m5, m4                 ; d - c
    psubw        m0, m7                 ; d - e
    mova         m4, m2
    pminsw       m2, m3                 ; min(b - c, f - e)
    pmaxsw       m3, m4                 ; max(b - c, f - e)
    pmaxsw       m2, m5
    pminsw       m3, m5
    pmaxsw       m2, m0                 ; hi = max(d - e, d - c, min(b-c, f-e))
    pminsw       m3, m0                 ; lo = min(d - e, d - c, max(b-c, f-e))
    pxor         m4, m4
    pmaxsw       m6, m3
    psubw        m4, m2                 ; -hi
    pmaxsw       m6, m4                 ; diff = max(diff, lo, -hi)

.end%1:
    ; Clamp the prediction to [d - diff, d + diff] and store it.
    mova         m2, [rsp+16]
    mova         m3, m2
    psubw        m2, m6
    paddw        m3, m6
    pmaxsw       m1, m2
    pminsw       m1, m3

    movu     [dstq], m1
    add        dstq, mmsize-4
    add       prevq, mmsize-4
    add        curq, mmsize-4
    add       nextq, mmsize-4
    sub   DWORD r4m, mmsize/2-2         ; w -= samples produced
    jg .loop%1
%endmacro

;------------------------------------------------------------------------------
; YADIF
;
; Emits yadif_filter_line_10bit for the instruction set selected by the
; preceding INIT_* macro.
;
; Arguments, in order: dst, prev, cur, next, w, prefs, mrefs, parity, mode.
; cglobal is invoked with 4 arguments loaded into registers, 6 (x86-32) or
; 7 (x86-64) general-purpose registers, 8 vector registers and 80 bytes of
; stack.
;
; prefs and mrefs are fetched by hand into the temporaries used by
; CHECK/FILTER:
;   t0 = prefs, t1 = mrefs
; On x86-64 they are sign-extended from 32 bits with movsxd so that they can
; be added to 64-bit pointers.
;
; parity selects which pair of pointers FILTER averages:
;   parity != 0 -> prevq, curq
;   parity == 0 -> curq, nextq
;------------------------------------------------------------------------------
%macro YADIF 0
%if ARCH_X86_32
cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
    mov            r4, r5mp
    mov            r5, r6mp
    DECLARE_REG_TMP 4,5
%else
cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
    movsxd         r5, DWORD r5m
    movsxd         r6, DWORD r6m
    DECLARE_REG_TMP 5,6
%endif

    cmp DWORD paritym, 0
    je .parity0
    FILTER 1, prevq, curq
    jmp .ret

.parity0:
    FILTER 0, curq, nextq

.ret:
    RET
%endmacro

; Instantiate the filter once per supported instruction set. PABS picks its
; code path from the cpuflags selected here. The MMX-register (mmxext)
; version is only built for 32-bit x86.
INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
%if ARCH_X86_32
INIT_MMX mmxext
YADIF
%endif