yading@10
|
1 ;******************************************************************************
|
yading@10
|
2 ;* VP8 MMXEXT optimizations
|
yading@10
|
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
|
yading@10
|
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
|
yading@10
|
5 ;*
|
yading@10
|
6 ;* This file is part of FFmpeg.
|
yading@10
|
7 ;*
|
yading@10
|
8 ;* FFmpeg is free software; you can redistribute it and/or
|
yading@10
|
9 ;* modify it under the terms of the GNU Lesser General Public
|
yading@10
|
10 ;* License as published by the Free Software Foundation; either
|
yading@10
|
11 ;* version 2.1 of the License, or (at your option) any later version.
|
yading@10
|
12 ;*
|
yading@10
|
13 ;* FFmpeg is distributed in the hope that it will be useful,
|
yading@10
|
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
yading@10
|
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
yading@10
|
16 ;* Lesser General Public License for more details.
|
yading@10
|
17 ;*
|
yading@10
|
18 ;* You should have received a copy of the GNU Lesser General Public
|
yading@10
|
19 ;* License along with FFmpeg; if not, write to the Free Software
|
yading@10
|
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
yading@10
|
21 ;******************************************************************************
|
yading@10
|
22
|
yading@10
|
23 %include "libavutil/x86/x86util.asm"
|
yading@10
|
24
|
yading@10
|
25 SECTION_RODATA
|
yading@10
|
26
|
yading@10
|
27 fourtap_filter_hw_m: times 4 dw -6, 123
|
yading@10
|
28 times 4 dw 12, -1
|
yading@10
|
29 times 4 dw -9, 93
|
yading@10
|
30 times 4 dw 50, -6
|
yading@10
|
31 times 4 dw -6, 50
|
yading@10
|
32 times 4 dw 93, -9
|
yading@10
|
33 times 4 dw -1, 12
|
yading@10
|
34 times 4 dw 123, -6
|
yading@10
|
35
|
yading@10
|
36 sixtap_filter_hw_m: times 4 dw 2, -11
|
yading@10
|
37 times 4 dw 108, 36
|
yading@10
|
38 times 4 dw -8, 1
|
yading@10
|
39 times 4 dw 3, -16
|
yading@10
|
40 times 4 dw 77, 77
|
yading@10
|
41 times 4 dw -16, 3
|
yading@10
|
42 times 4 dw 1, -8
|
yading@10
|
43 times 4 dw 36, 108
|
yading@10
|
44 times 4 dw -11, 2
|
yading@10
|
45
|
yading@10
|
46 fourtap_filter_hb_m: times 8 db -6, 123
|
yading@10
|
47 times 8 db 12, -1
|
yading@10
|
48 times 8 db -9, 93
|
yading@10
|
49 times 8 db 50, -6
|
yading@10
|
50 times 8 db -6, 50
|
yading@10
|
51 times 8 db 93, -9
|
yading@10
|
52 times 8 db -1, 12
|
yading@10
|
53 times 8 db 123, -6
|
yading@10
|
54
|
yading@10
|
55 sixtap_filter_hb_m: times 8 db 2, 1
|
yading@10
|
56 times 8 db -11, 108
|
yading@10
|
57 times 8 db 36, -8
|
yading@10
|
58 times 8 db 3, 3
|
yading@10
|
59 times 8 db -16, 77
|
yading@10
|
60 times 8 db 77, -16
|
yading@10
|
61 times 8 db 1, 2
|
yading@10
|
62 times 8 db -8, 36
|
yading@10
|
63 times 8 db 108, -11
|
yading@10
|
64
|
yading@10
|
65 fourtap_filter_v_m: times 8 dw -6
|
yading@10
|
66 times 8 dw 123
|
yading@10
|
67 times 8 dw 12
|
yading@10
|
68 times 8 dw -1
|
yading@10
|
69 times 8 dw -9
|
yading@10
|
70 times 8 dw 93
|
yading@10
|
71 times 8 dw 50
|
yading@10
|
72 times 8 dw -6
|
yading@10
|
73 times 8 dw -6
|
yading@10
|
74 times 8 dw 50
|
yading@10
|
75 times 8 dw 93
|
yading@10
|
76 times 8 dw -9
|
yading@10
|
77 times 8 dw -1
|
yading@10
|
78 times 8 dw 12
|
yading@10
|
79 times 8 dw 123
|
yading@10
|
80 times 8 dw -6
|
yading@10
|
81
|
yading@10
|
82 sixtap_filter_v_m: times 8 dw 2
|
yading@10
|
83 times 8 dw -11
|
yading@10
|
84 times 8 dw 108
|
yading@10
|
85 times 8 dw 36
|
yading@10
|
86 times 8 dw -8
|
yading@10
|
87 times 8 dw 1
|
yading@10
|
88 times 8 dw 3
|
yading@10
|
89 times 8 dw -16
|
yading@10
|
90 times 8 dw 77
|
yading@10
|
91 times 8 dw 77
|
yading@10
|
92 times 8 dw -16
|
yading@10
|
93 times 8 dw 3
|
yading@10
|
94 times 8 dw 1
|
yading@10
|
95 times 8 dw -8
|
yading@10
|
96 times 8 dw 36
|
yading@10
|
97 times 8 dw 108
|
yading@10
|
98 times 8 dw -11
|
yading@10
|
99 times 8 dw 2
|
yading@10
|
100
|
yading@10
|
101 bilinear_filter_vw_m: times 8 dw 1
|
yading@10
|
102 times 8 dw 2
|
yading@10
|
103 times 8 dw 3
|
yading@10
|
104 times 8 dw 4
|
yading@10
|
105 times 8 dw 5
|
yading@10
|
106 times 8 dw 6
|
yading@10
|
107 times 8 dw 7
|
yading@10
|
108
|
yading@10
|
109 bilinear_filter_vb_m: times 8 db 7, 1
|
yading@10
|
110 times 8 db 6, 2
|
yading@10
|
111 times 8 db 5, 3
|
yading@10
|
112 times 8 db 4, 4
|
yading@10
|
113 times 8 db 3, 5
|
yading@10
|
114 times 8 db 2, 6
|
yading@10
|
115 times 8 db 1, 7
|
yading@10
|
116
|
yading@10
|
117 %ifdef PIC
|
yading@10
|
118 %define fourtap_filter_hw picregq
|
yading@10
|
119 %define sixtap_filter_hw picregq
|
yading@10
|
120 %define fourtap_filter_hb picregq
|
yading@10
|
121 %define sixtap_filter_hb picregq
|
yading@10
|
122 %define fourtap_filter_v picregq
|
yading@10
|
123 %define sixtap_filter_v picregq
|
yading@10
|
124 %define bilinear_filter_vw picregq
|
yading@10
|
125 %define bilinear_filter_vb picregq
|
yading@10
|
126 %define npicregs 1
|
yading@10
|
127 %else
|
yading@10
|
128 %define fourtap_filter_hw fourtap_filter_hw_m
|
yading@10
|
129 %define sixtap_filter_hw sixtap_filter_hw_m
|
yading@10
|
130 %define fourtap_filter_hb fourtap_filter_hb_m
|
yading@10
|
131 %define sixtap_filter_hb sixtap_filter_hb_m
|
yading@10
|
132 %define fourtap_filter_v fourtap_filter_v_m
|
yading@10
|
133 %define sixtap_filter_v sixtap_filter_v_m
|
yading@10
|
134 %define bilinear_filter_vw bilinear_filter_vw_m
|
yading@10
|
135 %define bilinear_filter_vb bilinear_filter_vb_m
|
yading@10
|
136 %define npicregs 0
|
yading@10
|
137 %endif
|
yading@10
|
138
|
yading@10
|
139 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
|
yading@10
|
140 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
|
yading@10
|
141
|
yading@10
|
142 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
|
yading@10
|
143 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
|
yading@10
|
144 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
|
yading@10
|
145
|
yading@10
|
146 pw_27: times 8 dw 27
|
yading@10
|
147 pw_63: times 8 dw 63
|
yading@10
|
148 pw_256: times 8 dw 256
|
yading@10
|
149 pw_20091: times 4 dw 20091
|
yading@10
|
150 pw_17734: times 4 dw 17734
|
yading@10
|
151
|
yading@10
|
152 pb_4: times 16 db 4
|
yading@10
|
153 pb_F8: times 16 db 0xF8
|
yading@10
|
154 pb_FE: times 16 db 0xFE
|
yading@10
|
155 pb_27_63: times 8 db 27, 63
|
yading@10
|
156 pb_18_63: times 8 db 18, 63
|
yading@10
|
157 pb_9_63: times 8 db 9, 63
|
yading@10
|
158
|
yading@10
|
159 cextern pb_1
|
yading@10
|
160 cextern pw_3
|
yading@10
|
161 cextern pb_3
|
yading@10
|
162 cextern pw_4
|
yading@10
|
163 cextern pw_9
|
yading@10
|
164 cextern pw_18
|
yading@10
|
165 cextern pw_64
|
yading@10
|
166 cextern pb_80
|
yading@10
|
167
|
yading@10
|
168 SECTION .text
|
yading@10
|
169
|
yading@10
|
170 ;-----------------------------------------------------------------------------
|
yading@10
|
171 ; subpel MC functions:
|
yading@10
|
172 ;
|
yading@10
|
173 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
|
yading@10
|
174 ; uint8_t *src, int srcstride,
|
yading@10
|
175 ; int height, int mx, int my);
|
yading@10
|
176 ;-----------------------------------------------------------------------------
|
yading@10
|
177
|
yading@10
|
178 %macro FILTER_SSSE3 1
|
yading@10
|
179 cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
|
yading@10
|
180 lea mxd, [mxq*3]
|
yading@10
|
181 mova m3, [filter_h6_shuf2]
|
yading@10
|
182 mova m4, [filter_h6_shuf3]
|
yading@10
|
183 %ifdef PIC
|
yading@10
|
184 lea picregq, [sixtap_filter_hb_m]
|
yading@10
|
185 %endif
|
yading@10
|
186 mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
|
yading@10
|
187 mova m6, [sixtap_filter_hb+mxq*8-32]
|
yading@10
|
188 mova m7, [sixtap_filter_hb+mxq*8-16]
|
yading@10
|
189
|
yading@10
|
190 .nextrow:
|
yading@10
|
191 movu m0, [srcq-2]
|
yading@10
|
192 mova m1, m0
|
yading@10
|
193 mova m2, m0
|
yading@10
|
194 %if mmsize == 8
|
yading@10
|
195 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
|
yading@10
|
196 ; shuffle with a memory operand
|
yading@10
|
197 punpcklbw m0, [srcq+3]
|
yading@10
|
198 %else
|
yading@10
|
199 pshufb m0, [filter_h6_shuf1]
|
yading@10
|
200 %endif
|
yading@10
|
201 pshufb m1, m3
|
yading@10
|
202 pshufb m2, m4
|
yading@10
|
203 pmaddubsw m0, m5
|
yading@10
|
204 pmaddubsw m1, m6
|
yading@10
|
205 pmaddubsw m2, m7
|
yading@10
|
206 paddsw m0, m1
|
yading@10
|
207 paddsw m0, m2
|
yading@10
|
208 pmulhrsw m0, [pw_256]
|
yading@10
|
209 packuswb m0, m0
|
yading@10
|
210 movh [dstq], m0 ; store
|
yading@10
|
211
|
yading@10
|
212 ; go to next line
|
yading@10
|
213 add dstq, dststrideq
|
yading@10
|
214 add srcq, srcstrideq
|
yading@10
|
215 dec heightd ; next row
|
yading@10
|
216 jg .nextrow
|
yading@10
|
217 REP_RET
|
yading@10
|
218
|
yading@10
|
219 cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
|
yading@10
|
220 shl mxd, 4
|
yading@10
|
221 mova m2, [pw_256]
|
yading@10
|
222 mova m3, [filter_h2_shuf]
|
yading@10
|
223 mova m4, [filter_h4_shuf]
|
yading@10
|
224 %ifdef PIC
|
yading@10
|
225 lea picregq, [fourtap_filter_hb_m]
|
yading@10
|
226 %endif
|
yading@10
|
227 mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
|
yading@10
|
228 mova m6, [fourtap_filter_hb+mxq]
|
yading@10
|
229
|
yading@10
|
230 .nextrow:
|
yading@10
|
231 movu m0, [srcq-1]
|
yading@10
|
232 mova m1, m0
|
yading@10
|
233 pshufb m0, m3
|
yading@10
|
234 pshufb m1, m4
|
yading@10
|
235 pmaddubsw m0, m5
|
yading@10
|
236 pmaddubsw m1, m6
|
yading@10
|
237 paddsw m0, m1
|
yading@10
|
238 pmulhrsw m0, m2
|
yading@10
|
239 packuswb m0, m0
|
yading@10
|
240 movh [dstq], m0 ; store
|
yading@10
|
241
|
yading@10
|
242 ; go to next line
|
yading@10
|
243 add dstq, dststrideq
|
yading@10
|
244 add srcq, srcstrideq
|
yading@10
|
245 dec heightd ; next row
|
yading@10
|
246 jg .nextrow
|
yading@10
|
247 REP_RET
|
yading@10
|
248
|
yading@10
|
249 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
|
yading@10
|
250 shl myd, 4
|
yading@10
|
251 %ifdef PIC
|
yading@10
|
252 lea picregq, [fourtap_filter_hb_m]
|
yading@10
|
253 %endif
|
yading@10
|
254 mova m5, [fourtap_filter_hb+myq-16]
|
yading@10
|
255 mova m6, [fourtap_filter_hb+myq]
|
yading@10
|
256 mova m7, [pw_256]
|
yading@10
|
257
|
yading@10
|
258 ; read 3 lines
|
yading@10
|
259 sub srcq, srcstrideq
|
yading@10
|
260 movh m0, [srcq]
|
yading@10
|
261 movh m1, [srcq+ srcstrideq]
|
yading@10
|
262 movh m2, [srcq+2*srcstrideq]
|
yading@10
|
263 add srcq, srcstrideq
|
yading@10
|
264
|
yading@10
|
265 .nextrow:
|
yading@10
|
266 movh m3, [srcq+2*srcstrideq] ; read new row
|
yading@10
|
267 mova m4, m0
|
yading@10
|
268 mova m0, m1
|
yading@10
|
269 punpcklbw m4, m1
|
yading@10
|
270 mova m1, m2
|
yading@10
|
271 punpcklbw m2, m3
|
yading@10
|
272 pmaddubsw m4, m5
|
yading@10
|
273 pmaddubsw m2, m6
|
yading@10
|
274 paddsw m4, m2
|
yading@10
|
275 mova m2, m3
|
yading@10
|
276 pmulhrsw m4, m7
|
yading@10
|
277 packuswb m4, m4
|
yading@10
|
278 movh [dstq], m4
|
yading@10
|
279
|
yading@10
|
280 ; go to next line
|
yading@10
|
281 add dstq, dststrideq
|
yading@10
|
282 add srcq, srcstrideq
|
yading@10
|
283 dec heightd ; next row
|
yading@10
|
284 jg .nextrow
|
yading@10
|
285 REP_RET
|
yading@10
|
286
|
yading@10
|
287 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
|
yading@10
|
288 lea myd, [myq*3]
|
yading@10
|
289 %ifdef PIC
|
yading@10
|
290 lea picregq, [sixtap_filter_hb_m]
|
yading@10
|
291 %endif
|
yading@10
|
292 lea myq, [sixtap_filter_hb+myq*8]
|
yading@10
|
293
|
yading@10
|
294 ; read 5 lines
|
yading@10
|
295 sub srcq, srcstrideq
|
yading@10
|
296 sub srcq, srcstrideq
|
yading@10
|
297 movh m0, [srcq]
|
yading@10
|
298 movh m1, [srcq+srcstrideq]
|
yading@10
|
299 movh m2, [srcq+srcstrideq*2]
|
yading@10
|
300 lea srcq, [srcq+srcstrideq*2]
|
yading@10
|
301 add srcq, srcstrideq
|
yading@10
|
302 movh m3, [srcq]
|
yading@10
|
303 movh m4, [srcq+srcstrideq]
|
yading@10
|
304
|
yading@10
|
305 .nextrow:
|
yading@10
|
306 movh m5, [srcq+2*srcstrideq] ; read new row
|
yading@10
|
307 mova m6, m0
|
yading@10
|
308 punpcklbw m6, m5
|
yading@10
|
309 mova m0, m1
|
yading@10
|
310 punpcklbw m1, m2
|
yading@10
|
311 mova m7, m3
|
yading@10
|
312 punpcklbw m7, m4
|
yading@10
|
313 pmaddubsw m6, [myq-48]
|
yading@10
|
314 pmaddubsw m1, [myq-32]
|
yading@10
|
315 pmaddubsw m7, [myq-16]
|
yading@10
|
316 paddsw m6, m1
|
yading@10
|
317 paddsw m6, m7
|
yading@10
|
318 mova m1, m2
|
yading@10
|
319 mova m2, m3
|
yading@10
|
320 pmulhrsw m6, [pw_256]
|
yading@10
|
321 mova m3, m4
|
yading@10
|
322 packuswb m6, m6
|
yading@10
|
323 mova m4, m5
|
yading@10
|
324 movh [dstq], m6
|
yading@10
|
325
|
yading@10
|
326 ; go to next line
|
yading@10
|
327 add dstq, dststrideq
|
yading@10
|
328 add srcq, srcstrideq
|
yading@10
|
329 dec heightd ; next row
|
yading@10
|
330 jg .nextrow
|
yading@10
|
331 REP_RET
|
yading@10
|
332 %endmacro
|
yading@10
|
333
|
yading@10
|
334 INIT_MMX ssse3
|
yading@10
|
335 FILTER_SSSE3 4
|
yading@10
|
336 INIT_XMM ssse3
|
yading@10
|
337 FILTER_SSSE3 8
|
yading@10
|
338
|
yading@10
|
339 ; 4x4 block, H-only 4-tap filter
|
yading@10
|
340 INIT_MMX mmxext
|
yading@10
|
341 cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
|
yading@10
|
342 shl mxd, 4
|
yading@10
|
343 %ifdef PIC
|
yading@10
|
344 lea picregq, [fourtap_filter_hw_m]
|
yading@10
|
345 %endif
|
yading@10
|
346 movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
|
yading@10
|
347 movq mm5, [fourtap_filter_hw+mxq]
|
yading@10
|
348 movq mm7, [pw_64]
|
yading@10
|
349 pxor mm6, mm6
|
yading@10
|
350
|
yading@10
|
351 .nextrow:
|
yading@10
|
352 movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels
|
yading@10
|
353
|
yading@10
|
354 ; first set of 2 pixels
|
yading@10
|
355 movq mm2, mm1 ; byte ABCD..
|
yading@10
|
356 punpcklbw mm1, mm6 ; byte->word ABCD
|
yading@10
|
357 pshufw mm0, mm2, 9 ; byte CDEF..
|
yading@10
|
358 punpcklbw mm0, mm6 ; byte->word CDEF
|
yading@10
|
359 pshufw mm3, mm1, 0x94 ; word ABBC
|
yading@10
|
360 pshufw mm1, mm0, 0x94 ; word CDDE
|
yading@10
|
361 pmaddwd mm3, mm4 ; multiply 2px with F0/F1
|
yading@10
|
362 movq mm0, mm1 ; backup for second set of pixels
|
yading@10
|
363 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
|
yading@10
|
364 paddd mm3, mm1 ; finish 1st 2px
|
yading@10
|
365
|
yading@10
|
366 ; second set of 2 pixels, use backup of above
|
yading@10
|
367 punpckhbw mm2, mm6 ; byte->word EFGH
|
yading@10
|
368 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
|
yading@10
|
369 pshufw mm1, mm2, 0x94 ; word EFFG
|
yading@10
|
370 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
|
yading@10
|
371 paddd mm0, mm1 ; finish 2nd 2px
|
yading@10
|
372
|
yading@10
|
373 ; merge two sets of 2 pixels into one set of 4, round/clip/store
|
yading@10
|
374 packssdw mm3, mm0 ; merge dword->word (4px)
|
yading@10
|
375 paddsw mm3, mm7 ; rounding
|
yading@10
|
376 psraw mm3, 7
|
yading@10
|
377 packuswb mm3, mm6 ; clip and word->bytes
|
yading@10
|
378 movd [dstq], mm3 ; store
|
yading@10
|
379
|
yading@10
|
380 ; go to next line
|
yading@10
|
381 add dstq, dststrideq
|
yading@10
|
382 add srcq, srcstrideq
|
yading@10
|
383 dec heightd ; next row
|
yading@10
|
384 jg .nextrow
|
yading@10
|
385 REP_RET
|
yading@10
|
386
|
yading@10
|
387 ; 4x4 block, H-only 6-tap filter
|
yading@10
|
388 INIT_MMX mmxext
|
yading@10
|
389 cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
|
yading@10
|
390 lea mxd, [mxq*3]
|
yading@10
|
391 %ifdef PIC
|
yading@10
|
392 lea picregq, [sixtap_filter_hw_m]
|
yading@10
|
393 %endif
|
yading@10
|
394 movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words
|
yading@10
|
395 movq mm5, [sixtap_filter_hw+mxq*8-32]
|
yading@10
|
396 movq mm6, [sixtap_filter_hw+mxq*8-16]
|
yading@10
|
397 movq mm7, [pw_64]
|
yading@10
|
398 pxor mm3, mm3
|
yading@10
|
399
|
yading@10
|
400 .nextrow:
|
yading@10
|
401 movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels
|
yading@10
|
402
|
yading@10
|
403 ; first set of 2 pixels
|
yading@10
|
404 movq mm2, mm1 ; byte ABCD..
|
yading@10
|
405 punpcklbw mm1, mm3 ; byte->word ABCD
|
yading@10
|
406 pshufw mm0, mm2, 0x9 ; byte CDEF..
|
yading@10
|
407 punpckhbw mm2, mm3 ; byte->word EFGH
|
yading@10
|
408 punpcklbw mm0, mm3 ; byte->word CDEF
|
yading@10
|
409 pshufw mm1, mm1, 0x94 ; word ABBC
|
yading@10
|
410 pshufw mm2, mm2, 0x94 ; word EFFG
|
yading@10
|
411 pmaddwd mm1, mm4 ; multiply 2px with F0/F1
|
yading@10
|
412 pshufw mm3, mm0, 0x94 ; word CDDE
|
yading@10
|
413 movq mm0, mm3 ; backup for second set of pixels
|
yading@10
|
414 pmaddwd mm3, mm5 ; multiply 2px with F2/F3
|
yading@10
|
415 paddd mm1, mm3 ; add to 1st 2px cache
|
yading@10
|
416 movq mm3, mm2 ; backup for second set of pixels
|
yading@10
|
417 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
|
yading@10
|
418 paddd mm1, mm2 ; finish 1st 2px
|
yading@10
|
419
|
yading@10
|
420 ; second set of 2 pixels, use backup of above
|
yading@10
|
421 movd mm2, [srcq+3] ; byte FGHI (prevent overreads)
|
yading@10
|
422 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
|
yading@10
|
423 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
|
yading@10
|
424 paddd mm0, mm3 ; add to 2nd 2px cache
|
yading@10
|
425 pxor mm3, mm3
|
yading@10
|
426 punpcklbw mm2, mm3 ; byte->word FGHI
|
yading@10
|
427 pshufw mm2, mm2, 0xE9 ; word GHHI
|
yading@10
|
428 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
|
yading@10
|
429 paddd mm0, mm2 ; finish 2nd 2px
|
yading@10
|
430
|
yading@10
|
431 ; merge two sets of 2 pixels into one set of 4, round/clip/store
|
yading@10
|
432 packssdw mm1, mm0 ; merge dword->word (4px)
|
yading@10
|
433 paddsw mm1, mm7 ; rounding
|
yading@10
|
434 psraw mm1, 7
|
yading@10
|
435 packuswb mm1, mm3 ; clip and word->bytes
|
yading@10
|
436 movd [dstq], mm1 ; store
|
yading@10
|
437
|
yading@10
|
438 ; go to next line
|
yading@10
|
439 add dstq, dststrideq
|
yading@10
|
440 add srcq, srcstrideq
|
yading@10
|
441 dec heightd ; next row
|
yading@10
|
442 jg .nextrow
|
yading@10
|
443 REP_RET
|
yading@10
|
444
|
yading@10
|
445 INIT_XMM sse2
|
yading@10
|
446 cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
|
yading@10
|
447 shl mxd, 5
|
yading@10
|
448 %ifdef PIC
|
yading@10
|
449 lea picregq, [fourtap_filter_v_m]
|
yading@10
|
450 %endif
|
yading@10
|
451 lea mxq, [fourtap_filter_v+mxq-32]
|
yading@10
|
452 pxor m7, m7
|
yading@10
|
453 mova m4, [pw_64]
|
yading@10
|
454 mova m5, [mxq+ 0]
|
yading@10
|
455 mova m6, [mxq+16]
|
yading@10
|
456 %ifdef m8
|
yading@10
|
457 mova m8, [mxq+32]
|
yading@10
|
458 mova m9, [mxq+48]
|
yading@10
|
459 %endif
|
yading@10
|
460 .nextrow:
|
yading@10
|
461 movq m0, [srcq-1]
|
yading@10
|
462 movq m1, [srcq-0]
|
yading@10
|
463 movq m2, [srcq+1]
|
yading@10
|
464 movq m3, [srcq+2]
|
yading@10
|
465 punpcklbw m0, m7
|
yading@10
|
466 punpcklbw m1, m7
|
yading@10
|
467 punpcklbw m2, m7
|
yading@10
|
468 punpcklbw m3, m7
|
yading@10
|
469 pmullw m0, m5
|
yading@10
|
470 pmullw m1, m6
|
yading@10
|
471 %ifdef m8
|
yading@10
|
472 pmullw m2, m8
|
yading@10
|
473 pmullw m3, m9
|
yading@10
|
474 %else
|
yading@10
|
475 pmullw m2, [mxq+32]
|
yading@10
|
476 pmullw m3, [mxq+48]
|
yading@10
|
477 %endif
|
yading@10
|
478 paddsw m0, m1
|
yading@10
|
479 paddsw m2, m3
|
yading@10
|
480 paddsw m0, m2
|
yading@10
|
481 paddsw m0, m4
|
yading@10
|
482 psraw m0, 7
|
yading@10
|
483 packuswb m0, m7
|
yading@10
|
484 movh [dstq], m0 ; store
|
yading@10
|
485
|
yading@10
|
486 ; go to next line
|
yading@10
|
487 add dstq, dststrideq
|
yading@10
|
488 add srcq, srcstrideq
|
yading@10
|
489 dec heightd ; next row
|
yading@10
|
490 jg .nextrow
|
yading@10
|
491 REP_RET
|
yading@10
|
492
|
yading@10
|
493 INIT_XMM sse2
|
yading@10
|
494 cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
|
yading@10
|
495 lea mxd, [mxq*3]
|
yading@10
|
496 shl mxd, 4
|
yading@10
|
497 %ifdef PIC
|
yading@10
|
498 lea picregq, [sixtap_filter_v_m]
|
yading@10
|
499 %endif
|
yading@10
|
500 lea mxq, [sixtap_filter_v+mxq-96]
|
yading@10
|
501 pxor m7, m7
|
yading@10
|
502 mova m6, [pw_64]
|
yading@10
|
503 %ifdef m8
|
yading@10
|
504 mova m8, [mxq+ 0]
|
yading@10
|
505 mova m9, [mxq+16]
|
yading@10
|
506 mova m10, [mxq+32]
|
yading@10
|
507 mova m11, [mxq+48]
|
yading@10
|
508 mova m12, [mxq+64]
|
yading@10
|
509 mova m13, [mxq+80]
|
yading@10
|
510 %endif
|
yading@10
|
511 .nextrow:
|
yading@10
|
512 movq m0, [srcq-2]
|
yading@10
|
513 movq m1, [srcq-1]
|
yading@10
|
514 movq m2, [srcq-0]
|
yading@10
|
515 movq m3, [srcq+1]
|
yading@10
|
516 movq m4, [srcq+2]
|
yading@10
|
517 movq m5, [srcq+3]
|
yading@10
|
518 punpcklbw m0, m7
|
yading@10
|
519 punpcklbw m1, m7
|
yading@10
|
520 punpcklbw m2, m7
|
yading@10
|
521 punpcklbw m3, m7
|
yading@10
|
522 punpcklbw m4, m7
|
yading@10
|
523 punpcklbw m5, m7
|
yading@10
|
524 %ifdef m8
|
yading@10
|
525 pmullw m0, m8
|
yading@10
|
526 pmullw m1, m9
|
yading@10
|
527 pmullw m2, m10
|
yading@10
|
528 pmullw m3, m11
|
yading@10
|
529 pmullw m4, m12
|
yading@10
|
530 pmullw m5, m13
|
yading@10
|
531 %else
|
yading@10
|
532 pmullw m0, [mxq+ 0]
|
yading@10
|
533 pmullw m1, [mxq+16]
|
yading@10
|
534 pmullw m2, [mxq+32]
|
yading@10
|
535 pmullw m3, [mxq+48]
|
yading@10
|
536 pmullw m4, [mxq+64]
|
yading@10
|
537 pmullw m5, [mxq+80]
|
yading@10
|
538 %endif
|
yading@10
|
539 paddsw m1, m4
|
yading@10
|
540 paddsw m0, m5
|
yading@10
|
541 paddsw m1, m2
|
yading@10
|
542 paddsw m0, m3
|
yading@10
|
543 paddsw m0, m1
|
yading@10
|
544 paddsw m0, m6
|
yading@10
|
545 psraw m0, 7
|
yading@10
|
546 packuswb m0, m7
|
yading@10
|
547 movh [dstq], m0 ; store
|
yading@10
|
548
|
yading@10
|
549 ; go to next line
|
yading@10
|
550 add dstq, dststrideq
|
yading@10
|
551 add srcq, srcstrideq
|
yading@10
|
552 dec heightd ; next row
|
yading@10
|
553 jg .nextrow
|
yading@10
|
554 REP_RET
|
yading@10
|
555
|
yading@10
|
556 %macro FILTER_V 1
|
yading@10
|
557 ; 4x4 block, V-only 4-tap filter
|
yading@10
|
558 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
|
yading@10
|
559 shl myd, 5
|
yading@10
|
560 %ifdef PIC
|
yading@10
|
561 lea picregq, [fourtap_filter_v_m]
|
yading@10
|
562 %endif
|
yading@10
|
563 lea myq, [fourtap_filter_v+myq-32]
|
yading@10
|
564 mova m6, [pw_64]
|
yading@10
|
565 pxor m7, m7
|
yading@10
|
566 mova m5, [myq+48]
|
yading@10
|
567
|
yading@10
|
568 ; read 3 lines
|
yading@10
|
569 sub srcq, srcstrideq
|
yading@10
|
570 movh m0, [srcq]
|
yading@10
|
571 movh m1, [srcq+ srcstrideq]
|
yading@10
|
572 movh m2, [srcq+2*srcstrideq]
|
yading@10
|
573 add srcq, srcstrideq
|
yading@10
|
574 punpcklbw m0, m7
|
yading@10
|
575 punpcklbw m1, m7
|
yading@10
|
576 punpcklbw m2, m7
|
yading@10
|
577
|
yading@10
|
578 .nextrow:
|
yading@10
|
579 ; first calculate negative taps (to prevent losing positive overflows)
|
yading@10
|
580 movh m4, [srcq+2*srcstrideq] ; read new row
|
yading@10
|
581 punpcklbw m4, m7
|
yading@10
|
582 mova m3, m4
|
yading@10
|
583 pmullw m0, [myq+0]
|
yading@10
|
584 pmullw m4, m5
|
yading@10
|
585 paddsw m4, m0
|
yading@10
|
586
|
yading@10
|
587 ; then calculate positive taps
|
yading@10
|
588 mova m0, m1
|
yading@10
|
589 pmullw m1, [myq+16]
|
yading@10
|
590 paddsw m4, m1
|
yading@10
|
591 mova m1, m2
|
yading@10
|
592 pmullw m2, [myq+32]
|
yading@10
|
593 paddsw m4, m2
|
yading@10
|
594 mova m2, m3
|
yading@10
|
595
|
yading@10
|
596 ; round/clip/store
|
yading@10
|
597 paddsw m4, m6
|
yading@10
|
598 psraw m4, 7
|
yading@10
|
599 packuswb m4, m7
|
yading@10
|
600 movh [dstq], m4
|
yading@10
|
601
|
yading@10
|
602 ; go to next line
|
yading@10
|
603 add dstq, dststrideq
|
yading@10
|
604 add srcq, srcstrideq
|
yading@10
|
605 dec heightd ; next row
|
yading@10
|
606 jg .nextrow
|
yading@10
|
607 REP_RET
|
yading@10
|
608
|
yading@10
|
609
|
yading@10
|
610 ; 4x4 block, V-only 6-tap filter
|
yading@10
|
611 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
|
yading@10
|
612 shl myd, 4
|
yading@10
|
613 lea myq, [myq*3]
|
yading@10
|
614 %ifdef PIC
|
yading@10
|
615 lea picregq, [sixtap_filter_v_m]
|
yading@10
|
616 %endif
|
yading@10
|
617 lea myq, [sixtap_filter_v+myq-96]
|
yading@10
|
618 pxor m7, m7
|
yading@10
|
619
|
yading@10
|
620 ; read 5 lines
|
yading@10
|
621 sub srcq, srcstrideq
|
yading@10
|
622 sub srcq, srcstrideq
|
yading@10
|
623 movh m0, [srcq]
|
yading@10
|
624 movh m1, [srcq+srcstrideq]
|
yading@10
|
625 movh m2, [srcq+srcstrideq*2]
|
yading@10
|
626 lea srcq, [srcq+srcstrideq*2]
|
yading@10
|
627 add srcq, srcstrideq
|
yading@10
|
628 movh m3, [srcq]
|
yading@10
|
629 movh m4, [srcq+srcstrideq]
|
yading@10
|
630 punpcklbw m0, m7
|
yading@10
|
631 punpcklbw m1, m7
|
yading@10
|
632 punpcklbw m2, m7
|
yading@10
|
633 punpcklbw m3, m7
|
yading@10
|
634 punpcklbw m4, m7
|
yading@10
|
635
|
yading@10
|
636 .nextrow:
|
yading@10
|
637 ; first calculate negative taps (to prevent losing positive overflows)
|
yading@10
|
638 mova m5, m1
|
yading@10
|
639 pmullw m5, [myq+16]
|
yading@10
|
640 mova m6, m4
|
yading@10
|
641 pmullw m6, [myq+64]
|
yading@10
|
642 paddsw m6, m5
|
yading@10
|
643
|
yading@10
|
644 ; then calculate positive taps
|
yading@10
|
645 movh m5, [srcq+2*srcstrideq] ; read new row
|
yading@10
|
646 punpcklbw m5, m7
|
yading@10
|
647 pmullw m0, [myq+0]
|
yading@10
|
648 paddsw m6, m0
|
yading@10
|
649 mova m0, m1
|
yading@10
|
650 mova m1, m2
|
yading@10
|
651 pmullw m2, [myq+32]
|
yading@10
|
652 paddsw m6, m2
|
yading@10
|
653 mova m2, m3
|
yading@10
|
654 pmullw m3, [myq+48]
|
yading@10
|
655 paddsw m6, m3
|
yading@10
|
656 mova m3, m4
|
yading@10
|
657 mova m4, m5
|
yading@10
|
658 pmullw m5, [myq+80]
|
yading@10
|
659 paddsw m6, m5
|
yading@10
|
660
|
yading@10
|
661 ; round/clip/store
|
yading@10
|
662 paddsw m6, [pw_64]
|
yading@10
|
663 psraw m6, 7
|
yading@10
|
664 packuswb m6, m7
|
yading@10
|
665 movh [dstq], m6
|
yading@10
|
666
|
yading@10
|
667 ; go to next line
|
yading@10
|
668 add dstq, dststrideq
|
yading@10
|
669 add srcq, srcstrideq
|
yading@10
|
670 dec heightd ; next row
|
yading@10
|
671 jg .nextrow
|
yading@10
|
672 REP_RET
|
yading@10
|
673 %endmacro
|
yading@10
|
674
|
yading@10
|
675 INIT_MMX mmxext
|
yading@10
|
676 FILTER_V 4
|
yading@10
|
677 INIT_XMM sse2
|
yading@10
|
678 FILTER_V 8
|
yading@10
|
679
|
yading@10
|
680 %macro FILTER_BILINEAR 1
|
yading@10
|
681 cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
|
yading@10
|
682 shl myd, 4
|
yading@10
|
683 %ifdef PIC
|
yading@10
|
684 lea picregq, [bilinear_filter_vw_m]
|
yading@10
|
685 %endif
|
yading@10
|
686 pxor m6, m6
|
yading@10
|
687 mova m5, [bilinear_filter_vw+myq-1*16]
|
yading@10
|
688 neg myq
|
yading@10
|
689 mova m4, [bilinear_filter_vw+myq+7*16]
|
yading@10
|
690 .nextrow:
|
yading@10
|
691 movh m0, [srcq+srcstrideq*0]
|
yading@10
|
692 movh m1, [srcq+srcstrideq*1]
|
yading@10
|
693 movh m3, [srcq+srcstrideq*2]
|
yading@10
|
694 punpcklbw m0, m6
|
yading@10
|
695 punpcklbw m1, m6
|
yading@10
|
696 punpcklbw m3, m6
|
yading@10
|
697 mova m2, m1
|
yading@10
|
698 pmullw m0, m4
|
yading@10
|
699 pmullw m1, m5
|
yading@10
|
700 pmullw m2, m4
|
yading@10
|
701 pmullw m3, m5
|
yading@10
|
702 paddsw m0, m1
|
yading@10
|
703 paddsw m2, m3
|
yading@10
|
704 psraw m0, 2
|
yading@10
|
705 psraw m2, 2
|
yading@10
|
706 pavgw m0, m6
|
yading@10
|
707 pavgw m2, m6
|
yading@10
|
708 %if mmsize == 8
|
yading@10
|
709 packuswb m0, m0
|
yading@10
|
710 packuswb m2, m2
|
yading@10
|
711 movh [dstq+dststrideq*0], m0
|
yading@10
|
712 movh [dstq+dststrideq*1], m2
|
yading@10
|
713 %else
|
yading@10
|
714 packuswb m0, m2
|
yading@10
|
715 movh [dstq+dststrideq*0], m0
|
yading@10
|
716 movhps [dstq+dststrideq*1], m0
|
yading@10
|
717 %endif
|
yading@10
|
718
|
yading@10
|
719 lea dstq, [dstq+dststrideq*2]
|
yading@10
|
720 lea srcq, [srcq+srcstrideq*2]
|
yading@10
|
721 sub heightd, 2
|
yading@10
|
722 jg .nextrow
|
yading@10
|
723 REP_RET
|
yading@10
|
724
|
yading@10
|
725 cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
|
yading@10
|
726 shl mxd, 4
|
yading@10
|
727 %ifdef PIC
|
yading@10
|
728 lea picregq, [bilinear_filter_vw_m]
|
yading@10
|
729 %endif
|
yading@10
|
730 pxor m6, m6
|
yading@10
|
731 mova m5, [bilinear_filter_vw+mxq-1*16]
|
yading@10
|
732 neg mxq
|
yading@10
|
733 mova m4, [bilinear_filter_vw+mxq+7*16]
|
yading@10
|
734 .nextrow:
|
yading@10
|
735 movh m0, [srcq+srcstrideq*0+0]
|
yading@10
|
736 movh m1, [srcq+srcstrideq*0+1]
|
yading@10
|
737 movh m2, [srcq+srcstrideq*1+0]
|
yading@10
|
738 movh m3, [srcq+srcstrideq*1+1]
|
yading@10
|
739 punpcklbw m0, m6
|
yading@10
|
740 punpcklbw m1, m6
|
yading@10
|
741 punpcklbw m2, m6
|
yading@10
|
742 punpcklbw m3, m6
|
yading@10
|
743 pmullw m0, m4
|
yading@10
|
744 pmullw m1, m5
|
yading@10
|
745 pmullw m2, m4
|
yading@10
|
746 pmullw m3, m5
|
yading@10
|
747 paddsw m0, m1
|
yading@10
|
748 paddsw m2, m3
|
yading@10
|
749 psraw m0, 2
|
yading@10
|
750 psraw m2, 2
|
yading@10
|
751 pavgw m0, m6
|
yading@10
|
752 pavgw m2, m6
|
yading@10
|
753 %if mmsize == 8
|
yading@10
|
754 packuswb m0, m0
|
yading@10
|
755 packuswb m2, m2
|
yading@10
|
756 movh [dstq+dststrideq*0], m0
|
yading@10
|
757 movh [dstq+dststrideq*1], m2
|
yading@10
|
758 %else
|
yading@10
|
759 packuswb m0, m2
|
yading@10
|
760 movh [dstq+dststrideq*0], m0
|
yading@10
|
761 movhps [dstq+dststrideq*1], m0
|
yading@10
|
762 %endif
|
yading@10
|
763
|
yading@10
|
764 lea dstq, [dstq+dststrideq*2]
|
yading@10
|
765 lea srcq, [srcq+srcstrideq*2]
|
yading@10
|
766 sub heightd, 2
|
yading@10
|
767 jg .nextrow
|
yading@10
|
768 REP_RET
|
yading@10
|
769 %endmacro
|
yading@10
|
770
|
yading@10
|
771 INIT_MMX mmxext
|
yading@10
|
772 FILTER_BILINEAR 4
|
yading@10
|
773 INIT_XMM sse2
|
yading@10
|
774 FILTER_BILINEAR 8
|
yading@10
|
775
|
yading@10
|
776 %macro FILTER_BILINEAR_SSSE3 1
|
yading@10
|
777 cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
|
yading@10
|
778 shl myd, 4
|
yading@10
|
779 %ifdef PIC
|
yading@10
|
780 lea picregq, [bilinear_filter_vb_m]
|
yading@10
|
781 %endif
|
yading@10
|
782 pxor m4, m4
|
yading@10
|
783 mova m3, [bilinear_filter_vb+myq-16]
|
yading@10
|
784 .nextrow:
|
yading@10
|
785 movh m0, [srcq+srcstrideq*0]
|
yading@10
|
786 movh m1, [srcq+srcstrideq*1]
|
yading@10
|
787 movh m2, [srcq+srcstrideq*2]
|
yading@10
|
788 punpcklbw m0, m1
|
yading@10
|
789 punpcklbw m1, m2
|
yading@10
|
790 pmaddubsw m0, m3
|
yading@10
|
791 pmaddubsw m1, m3
|
yading@10
|
792 psraw m0, 2
|
yading@10
|
793 psraw m1, 2
|
yading@10
|
794 pavgw m0, m4
|
yading@10
|
795 pavgw m1, m4
|
yading@10
|
796 %if mmsize==8
|
yading@10
|
797 packuswb m0, m0
|
yading@10
|
798 packuswb m1, m1
|
yading@10
|
799 movh [dstq+dststrideq*0], m0
|
yading@10
|
800 movh [dstq+dststrideq*1], m1
|
yading@10
|
801 %else
|
yading@10
|
802 packuswb m0, m1
|
yading@10
|
803 movh [dstq+dststrideq*0], m0
|
yading@10
|
804 movhps [dstq+dststrideq*1], m0
|
yading@10
|
805 %endif
|
yading@10
|
806
|
yading@10
|
807 lea dstq, [dstq+dststrideq*2]
|
yading@10
|
808 lea srcq, [srcq+srcstrideq*2]
|
yading@10
|
809 sub heightd, 2
|
yading@10
|
810 jg .nextrow
|
yading@10
|
811 REP_RET
|
yading@10
|
812
|
yading@10
|
813 cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
|
yading@10
|
814 shl mxd, 4
|
yading@10
|
815 %ifdef PIC
|
yading@10
|
816 lea picregq, [bilinear_filter_vb_m]
|
yading@10
|
817 %endif
|
yading@10
|
818 pxor m4, m4
|
yading@10
|
819 mova m2, [filter_h2_shuf]
|
yading@10
|
820 mova m3, [bilinear_filter_vb+mxq-16]
|
yading@10
|
821 .nextrow:
|
yading@10
|
822 movu m0, [srcq+srcstrideq*0]
|
yading@10
|
823 movu m1, [srcq+srcstrideq*1]
|
yading@10
|
824 pshufb m0, m2
|
yading@10
|
825 pshufb m1, m2
|
yading@10
|
826 pmaddubsw m0, m3
|
yading@10
|
827 pmaddubsw m1, m3
|
yading@10
|
828 psraw m0, 2
|
yading@10
|
829 psraw m1, 2
|
yading@10
|
830 pavgw m0, m4
|
yading@10
|
831 pavgw m1, m4
|
yading@10
|
832 %if mmsize==8
|
yading@10
|
833 packuswb m0, m0
|
yading@10
|
834 packuswb m1, m1
|
yading@10
|
835 movh [dstq+dststrideq*0], m0
|
yading@10
|
836 movh [dstq+dststrideq*1], m1
|
yading@10
|
837 %else
|
yading@10
|
838 packuswb m0, m1
|
yading@10
|
839 movh [dstq+dststrideq*0], m0
|
yading@10
|
840 movhps [dstq+dststrideq*1], m0
|
yading@10
|
841 %endif
|
yading@10
|
842
|
yading@10
|
843 lea dstq, [dstq+dststrideq*2]
|
yading@10
|
844 lea srcq, [srcq+srcstrideq*2]
|
yading@10
|
845 sub heightd, 2
|
yading@10
|
846 jg .nextrow
|
yading@10
|
847 REP_RET
|
yading@10
|
848 %endmacro
|
yading@10
|
849
|
yading@10
|
850 INIT_MMX ssse3
|
yading@10
|
851 FILTER_BILINEAR_SSSE3 4
|
yading@10
|
852 INIT_XMM ssse3
|
yading@10
|
853 FILTER_BILINEAR_SSSE3 8
|
yading@10
|
854
|
yading@10
|
; void put_vp8_pixels8(uint8_t *dst, ptrdiff_t dststride,
;                      const uint8_t *src, ptrdiff_t srcstride, int height)
; Plain 8-byte-wide block copy, two rows per loop iteration.
; NOTE(review): assumes height is even (rows are handled in pairs) —
; consistent with VP8 block sizes, but confirm against callers.
855 INIT_MMX mmx
|
yading@10
|
856 cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
|
yading@10
|
857 .nextrow:
|
yading@10
|
; load two source rows of 8 bytes each
858 movq mm0, [srcq+srcstrideq*0]
|
yading@10
|
859 movq mm1, [srcq+srcstrideq*1]
|
yading@10
|
; src += 2*srcstride (lea: no flags clobbered)
860 lea srcq, [srcq+srcstrideq*2]
|
yading@10
|
861 movq [dstq+dststrideq*0], mm0
|
yading@10
|
862 movq [dstq+dststrideq*1], mm1
|
yading@10
|
863 lea dstq, [dstq+dststrideq*2]
|
yading@10
|
864 sub heightd, 2
|
yading@10
|
865 jg .nextrow
|
yading@10
|
866 REP_RET
|
yading@10
|
867
|
yading@10
|
; void put_vp8_pixels16(uint8_t *dst, ptrdiff_t dststride,
;                       const uint8_t *src, ptrdiff_t srcstride, int height)
; 16-byte-wide copy built from four 8-byte MMX moves per row pair.
; Only compiled on x86_32; 64-bit builds use the SSE version below.
868 %if ARCH_X86_32
|
yading@10
|
869 INIT_MMX mmx
|
yading@10
|
870 cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
|
yading@10
|
871 .nextrow:
|
yading@10
|
; two rows x two 8-byte halves = 4 loads
872 movq mm0, [srcq+srcstrideq*0+0]
|
yading@10
|
873 movq mm1, [srcq+srcstrideq*0+8]
|
yading@10
|
874 movq mm2, [srcq+srcstrideq*1+0]
|
yading@10
|
875 movq mm3, [srcq+srcstrideq*1+8]
|
yading@10
|
876 lea srcq, [srcq+srcstrideq*2]
|
yading@10
|
877 movq [dstq+dststrideq*0+0], mm0
|
yading@10
|
878 movq [dstq+dststrideq*0+8], mm1
|
yading@10
|
879 movq [dstq+dststrideq*1+0], mm2
|
yading@10
|
880 movq [dstq+dststrideq*1+8], mm3
|
yading@10
|
881 lea dstq, [dstq+dststrideq*2]
|
yading@10
|
; two rows per iteration; assumes even height
882 sub heightd, 2
|
yading@10
|
883 jg .nextrow
|
yading@10
|
884 REP_RET
|
yading@10
|
885 %endif
|
yading@10
|
886
|
yading@10
|
; void put_vp8_pixels16(uint8_t *dst, ptrdiff_t dststride,
;                       const uint8_t *src, ptrdiff_t srcstride, int height)
; 16-byte-wide copy, one XMM register per row, two rows per iteration.
; Loads are unaligned (movups), stores are aligned (movaps):
; dst must be 16-byte aligned or the movaps stores will fault.
887 INIT_XMM sse
|
yading@10
|
888 cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
|
yading@10
|
889 .nextrow:
|
yading@10
|
890 movups xmm0, [srcq+srcstrideq*0]
|
yading@10
|
891 movups xmm1, [srcq+srcstrideq*1]
|
yading@10
|
892 lea srcq, [srcq+srcstrideq*2]
|
yading@10
|
893 movaps [dstq+dststrideq*0], xmm0
|
yading@10
|
894 movaps [dstq+dststrideq*1], xmm1
|
yading@10
|
895 lea dstq, [dstq+dststrideq*2]
|
yading@10
|
; assumes even height (rows processed in pairs)
896 sub heightd, 2
|
yading@10
|
897 jg .nextrow
|
yading@10
|
898 REP_RET
|
yading@10
|
899
|
yading@10
|
900 ;-----------------------------------------------------------------------------
|
yading@10
|
901 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
|
yading@10
|
902 ;-----------------------------------------------------------------------------
|
yading@10
|
903
|
yading@10
|
; ADD_DC %1, %2, %3, %4 — add a DC offset to a 4-row tile of pixels.
; %1: register of byte-splatted max(+DC, 0)  (packuswb saturates <0 to 0)
; %2: register of byte-splatted max(-DC, 0)
; %3: byte column offset within each row
; %4: load/store mnemonic (callers pass movh or mova)
; Adding %1 then subtracting %2 with unsigned-saturating ops implements
; a signed DC add clamped to [0,255]: only one of %1/%2 is nonzero.
; Uses dst1q/dst2q/strideq from the caller's DEFINE_ARGS; dst2q is
; expected to point two rows below dst1q. Clobbers m2-m5.
904 %macro ADD_DC 4
|
yading@10
|
; load 4 rows: dst1q, dst1q+stride, dst2q, dst2q+stride
905 %4 m2, [dst1q+%3]
|
yading@10
|
906 %4 m3, [dst1q+strideq+%3]
|
yading@10
|
907 %4 m4, [dst2q+%3]
|
yading@10
|
908 %4 m5, [dst2q+strideq+%3]
|
yading@10
|
; pixel + max(DC,0), unsigned saturation
909 paddusb m2, %1
|
yading@10
|
910 paddusb m3, %1
|
yading@10
|
911 paddusb m4, %1
|
yading@10
|
912 paddusb m5, %1
|
yading@10
|
; pixel - max(-DC,0), unsigned saturation
913 psubusb m2, %2
|
yading@10
|
914 psubusb m3, %2
|
yading@10
|
915 psubusb m4, %2
|
yading@10
|
916 psubusb m5, %2
|
yading@10
|
; write the 4 rows back
917 %4 [dst1q+%3], m2
|
yading@10
|
918 %4 [dst1q+strideq+%3], m3
|
yading@10
|
919 %4 [dst2q+%3], m4
|
yading@10
|
920 %4 [dst2q+strideq+%3], m5
|
yading@10
|
921 %endmacro
|
yading@10
|
922
|
yading@10
|
; void vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16], int stride)
; DC-only inverse transform: DC = (block[0] + 4) >> 3, added (clamped)
; to a 4x4 pixel block. Also zeroes block[0] for the next macroblock.
923 INIT_MMX mmx
|
yading@10
|
924 cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
|
yading@10
|
925 ; load data
|
yading@10
|
926 movd m0, [blockq]
|
yading@10
|
927
|
yading@10
|
928 ; calculate DC
|
yading@10
|
; m0 = block[0] + 4 (rounding), then >> 3 below
929 paddw m0, [pw_4]
|
yading@10
|
930 pxor m1, m1
|
yading@10
|
931 psraw m0, 3
|
yading@10
|
; clear the DC coefficient in place
932 movd [blockq], m1
|
yading@10
|
; m1 = -DC
933 psubw m1, m0
|
yading@10
|
; packuswb saturates negatives to 0, so after these two packs
; m0 holds max(DC,0) and m1 holds max(-DC,0) as bytes
934 packuswb m0, m0
|
yading@10
|
935 packuswb m1, m1
|
yading@10
|
; splat the single byte across the low dword
936 punpcklbw m0, m0
|
yading@10
|
937 punpcklbw m1, m1
|
yading@10
|
938 punpcklwd m0, m0
|
yading@10
|
939 punpcklwd m1, m1
|
yading@10
|
940
|
yading@10
|
941 ; add DC
|
yading@10
|
942 DEFINE_ARGS dst1, dst2, stride
|
yading@10
|
; dst2 = two rows below dst1, as ADD_DC expects
943 lea dst2q, [dst1q+strideq*2]
|
yading@10
|
944 ADD_DC m0, m1, 0, movh
|
yading@10
|
945 RET
|
yading@10
|
946
|
yading@10
|
; void vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16], int stride)
; DC-only inverse transform, SSE4 variant: widens the 4x4 pixel block
; to words, adds the signed DC directly, and clamps via packuswb —
; avoiding the +DC/-DC saturation trick of the MMX version.
; Also zeroes block[0] in place.
947 INIT_XMM sse4
|
yading@10
|
948 cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
|
yading@10
|
949 ; load data
|
yading@10
|
950 movd m0, [blockq]
|
yading@10
|
951 pxor m1, m1
|
yading@10
|
952
|
yading@10
|
953 ; calculate DC
|
yading@10
|
; DC = (block[0] + 4) >> 3; the shift happens after the row loads below
954 paddw m0, [pw_4]
|
yading@10
|
; clear the DC coefficient
955 movd [blockq], m1
|
yading@10
|
956 DEFINE_ARGS dst1, dst2, stride
|
yading@10
|
957 lea dst2q, [dst1q+strideq*2]
|
yading@10
|
; fetch the four 4-pixel rows as dwords
958 movd m2, [dst1q]
|
yading@10
|
959 movd m3, [dst1q+strideq]
|
yading@10
|
960 movd m4, [dst2q]
|
yading@10
|
961 movd m5, [dst2q+strideq]
|
yading@10
|
962 psraw m0, 3
|
yading@10
|
; broadcast the DC word across the whole register
963 pshuflw m0, m0, 0
|
yading@10
|
964 punpcklqdq m0, m0
|
yading@10
|
; rows 0/1 in m2, rows 2/3 in m4, widened to u16 with zero (m1)
965 punpckldq m2, m3
|
yading@10
|
966 punpckldq m4, m5
|
yading@10
|
967 punpcklbw m2, m1
|
yading@10
|
968 punpcklbw m4, m1
|
yading@10
|
; signed add of DC; packuswb below clamps to [0,255]
969 paddw m2, m0
|
yading@10
|
970 paddw m4, m0
|
yading@10
|
971 packuswb m2, m4
|
yading@10
|
; scatter the 4 result dwords back to the 4 rows (SSE4 pextrd)
972 movd [dst1q], m2
|
yading@10
|
973 pextrd [dst1q+strideq], m2, 1
|
yading@10
|
974 pextrd [dst2q], m2, 2
|
yading@10
|
975 pextrd [dst2q+strideq], m2, 3
|
yading@10
|
976 RET
|
yading@10
|
977
|
yading@10
|
978 ;-----------------------------------------------------------------------------
|
yading@10
|
979 ; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
|
yading@10
|
980 ;-----------------------------------------------------------------------------
|
yading@10
|
981
|
yading@10
|
982 %if ARCH_X86_32
|
yading@10
|
983 INIT_MMX mmx
|
yading@10
|
984 cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
|
yading@10
|
985 ; load data
|
yading@10
|
986 movd m0, [blockq+32*0] ; A
|
yading@10
|
987 movd m1, [blockq+32*2] ; C
|
yading@10
|
988 punpcklwd m0, [blockq+32*1] ; A B
|
yading@10
|
989 punpcklwd m1, [blockq+32*3] ; C D
|
yading@10
|
990 punpckldq m0, m1 ; A B C D
|
yading@10
|
991 pxor m6, m6
|
yading@10
|
992
|
yading@10
|
993 ; calculate DC
|
yading@10
|
994 paddw m0, [pw_4]
|
yading@10
|
995 movd [blockq+32*0], m6
|
yading@10
|
996 movd [blockq+32*1], m6
|
yading@10
|
997 movd [blockq+32*2], m6
|
yading@10
|
998 movd [blockq+32*3], m6
|
yading@10
|
999 psraw m0, 3
|
yading@10
|
1000 psubw m6, m0
|
yading@10
|
1001 packuswb m0, m0
|
yading@10
|
1002 packuswb m6, m6
|
yading@10
|
1003 punpcklbw m0, m0 ; AABBCCDD
|
yading@10
|
1004 punpcklbw m6, m6 ; AABBCCDD
|
yading@10
|
1005 movq m1, m0
|
yading@10
|
1006 movq m7, m6
|
yading@10
|
1007 punpcklbw m0, m0 ; AAAABBBB
|
yading@10
|
1008 punpckhbw m1, m1 ; CCCCDDDD
|
yading@10
|
1009 punpcklbw m6, m6 ; AAAABBBB
|
yading@10
|
1010 punpckhbw m7, m7 ; CCCCDDDD
|
yading@10
|
1011
|
yading@10
|
1012 ; add DC
|
yading@10
|
1013 DEFINE_ARGS dst1, dst2, stride
|
yading@10
|
1014 lea dst2q, [dst1q+strideq*2]
|
yading@10
|
1015 ADD_DC m0, m6, 0, mova
|
yading@10
|
1016 ADD_DC m1, m7, 8, mova
|
yading@10
|
1017 RET
|
yading@10
|
1018 %endif
|
yading@10
|
1019
|
yading@10
|
; void vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16], int stride)
; DC-only transform for 4 horizontally adjacent luma 4x4 blocks (16px wide).
; Gathers the four DCs (one per 32-byte / 16-coeff sub-block), computes
; (dc+4)>>3 for all four in parallel, splats each to 4 bytes, and applies
; them with one 16-byte-wide ADD_DC. Zeroes all four DC coefficients.
1020 INIT_XMM sse2
|
yading@10
|
1021 cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
|
yading@10
|
1022 ; load data
|
yading@10
|
1023 movd m0, [blockq+32*0] ; A
|
yading@10
|
1024 movd m1, [blockq+32*2] ; C
|
yading@10
|
1025 punpcklwd m0, [blockq+32*1] ; A B
|
yading@10
|
1026 punpcklwd m1, [blockq+32*3] ; C D
|
yading@10
|
1027 punpckldq m0, m1 ; A B C D
|
yading@10
|
1028 pxor m1, m1
|
yading@10
|
1029
|
yading@10
|
1030 ; calculate DC
|
yading@10
|
; all four DCs: (dc + 4) >> 3, done on packed words
1031 paddw m0, [pw_4]
|
yading@10
|
; clear the four DC coefficients in place
1032 movd [blockq+32*0], m1
|
yading@10
|
1033 movd [blockq+32*1], m1
|
yading@10
|
1034 movd [blockq+32*2], m1
|
yading@10
|
1035 movd [blockq+32*3], m1
|
yading@10
|
1036 psraw m0, 3
|
yading@10
|
; m1 = -DCs; packuswb saturates negatives to 0, giving the
; max(+dc,0)/max(-dc,0) byte pair that ADD_DC expects
1037 psubw m1, m0
|
yading@10
|
1038 packuswb m0, m0
|
yading@10
|
1039 packuswb m1, m1
|
yading@10
|
; double unpack: each DC byte replicated 4x -> AAAABBBBCCCCDDDD
1040 punpcklbw m0, m0
|
yading@10
|
1041 punpcklbw m1, m1
|
yading@10
|
1042 punpcklbw m0, m0
|
yading@10
|
1043 punpcklbw m1, m1
|
yading@10
|
1044
|
yading@10
|
1045 ; add DC
|
yading@10
|
1046 DEFINE_ARGS dst1, dst2, stride
|
yading@10
|
1047 lea dst2q, [dst1q+strideq*2]
|
yading@10
|
; one aligned 16-byte-wide pass covers all four blocks' 4 rows
1048 ADD_DC m0, m1, 0, mova
|
yading@10
|
1049 RET
|
yading@10
|
1050
|
yading@10
|
1051 ;-----------------------------------------------------------------------------
|
yading@10
|
1052 ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
|
yading@10
|
1053 ;-----------------------------------------------------------------------------
|
yading@10
|
1054
|
yading@10
|
1055 INIT_MMX mmx
|
yading@10
|
1056 cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
|
yading@10
|
1057 ; load data
|
yading@10
|
1058 movd m0, [blockq+32*0] ; A
|
yading@10
|
1059 movd m1, [blockq+32*2] ; C
|
yading@10
|
1060 punpcklwd m0, [blockq+32*1] ; A B
|
yading@10
|
1061 punpcklwd m1, [blockq+32*3] ; C D
|
yading@10
|
1062 punpckldq m0, m1 ; A B C D
|
yading@10
|
1063 pxor m6, m6
|
yading@10
|
1064
|
yading@10
|
1065 ; calculate DC
|
yading@10
|
1066 paddw m0, [pw_4]
|
yading@10
|
1067 movd [blockq+32*0], m6
|
yading@10
|
1068 movd [blockq+32*1], m6
|
yading@10
|
1069 movd [blockq+32*2], m6
|
yading@10
|
1070 movd [blockq+32*3], m6
|
yading@10
|
1071 psraw m0, 3
|
yading@10
|
1072 psubw m6, m0
|
yading@10
|
1073 packuswb m0, m0
|
yading@10
|
1074 packuswb m6, m6
|
yading@10
|
1075 punpcklbw m0, m0 ; AABBCCDD
|
yading@10
|
1076 punpcklbw m6, m6 ; AABBCCDD
|
yading@10
|
1077 movq m1, m0
|
yading@10
|
1078 movq m7, m6
|
yading@10
|
1079 punpcklbw m0, m0 ; AAAABBBB
|
yading@10
|
1080 punpckhbw m1, m1 ; CCCCDDDD
|
yading@10
|
1081 punpcklbw m6, m6 ; AAAABBBB
|
yading@10
|
1082 punpckhbw m7, m7 ; CCCCDDDD
|
yading@10
|
1083
|
yading@10
|
1084 ; add DC
|
yading@10
|
1085 DEFINE_ARGS dst1, dst2, stride
|
yading@10
|
1086 lea dst2q, [dst1q+strideq*2]
|
yading@10
|
1087 ADD_DC m0, m6, 0, mova
|
yading@10
|
1088 lea dst1q, [dst1q+strideq*4]
|
yading@10
|
1089 lea dst2q, [dst2q+strideq*4]
|
yading@10
|
1090 ADD_DC m1, m7, 0, mova
|
yading@10
|
1091 RET
|
yading@10
|
1092
|
yading@10
|
1093 ;-----------------------------------------------------------------------------
|
yading@10
|
1094 ; void vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
|
yading@10
|
1095 ;-----------------------------------------------------------------------------
|
yading@10
|
1096
|
yading@10
|
1097 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
|
yading@10
|
1098 ; this macro assumes that m6/m7 have words for 20091/17734 loaded
|
yading@10
|
; VP8_MULTIPLY_SUMSUB %1, %2, %3, %4
; Per the header above: %1 = mul_35468(%1) - mul_20091(%2)
;                       %2 = mul_20091(%1) + mul_35468(%2)
; where mul_20091(x) = x + (x*20091 >> 16)  (pmulhw by m6 plus the input)
; and   mul_35468(x) = (2x*17734 >> 16)     (x doubled first because the
; factor 35468 does not fit in a signed 16-bit pmulhw constant; m7=17734).
; %3/%4 are scratch registers; m6/m7 must hold 20091/17734 (caller loads).
1099 %macro VP8_MULTIPLY_SUMSUB 4
|
yading@10
|
1100 mova %3, %1
|
yading@10
|
1101 mova %4, %2
|
yading@10
|
1102 pmulhw %3, m6 ;20091(1)
|
yading@10
|
1103 pmulhw %4, m6 ;20091(2)
|
yading@10
|
; + input completes mul_20091 (factor is 1.0 + 20091/65536)
1104 paddw %3, %1
|
yading@10
|
1105 paddw %4, %2
|
yading@10
|
; double before the 17734 multiply -> effective 35468 factor
1106 paddw %1, %1
|
yading@10
|
1107 paddw %2, %2
|
yading@10
|
1108 pmulhw %1, m7 ;35468(1)
|
yading@10
|
1109 pmulhw %2, m7 ;35468(2)
|
yading@10
|
1110 psubw %1, %4
|
yading@10
|
1111 paddw %2, %3
|
yading@10
|
1112 %endmacro
|
yading@10
|
1113
|
yading@10
|
1114 ; calculate x0=%1+%3; x1=%1-%3
|
yading@10
|
1115 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
|
yading@10
|
1116 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
|
yading@10
|
1117 ; %5/%6 are temporary registers
|
yading@10
|
1118 ; we assume m6/m7 have constant words 20091/17734 loaded in them
|
yading@10
|
; VP8_IDCT_TRANSFORM4x4_1D %1..%4 (row regs), %5/%6 (scratch)
; One 1-D pass of the VP8 4x4 IDCT butterfly (see header comment above):
; x0=%1+%3, x1=%1-%3, x2/x3 from VP8_MULTIPLY_SUMSUB on %2/%4, then the
; output sum/diff stage; SWAPs restore the register-index ordering so
; outputs land back in %1..%4. Requires m6/m7 = 20091/17734.
1119 %macro VP8_IDCT_TRANSFORM4x4_1D 6
|
yading@10
|
1120 SUMSUB_BA w, %3, %1, %5 ;t0, t1
|
yading@10
|
1121 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
|
yading@10
|
1122 SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3
|
yading@10
|
1123 SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2
|
yading@10
|
; rotate register numbers: %1<-tmp0, %3<-tmp2, %4<-tmp3 positions
1124 SWAP %4, %1
|
yading@10
|
1125 SWAP %4, %3
|
yading@10
|
1126 %endmacro
|
yading@10
|
1127
|
yading@10
|
; void vp8_idct_add(uint8_t *dst, int16_t block[16], int stride)
; Full 4x4 inverse transform + add: two 1-D passes with a transpose
; between them (row pass, transpose, column pass, transpose), then the
; residual is added to dst with clamping via STORE_DIFFx2.
; Clears all 16 coefficients in place for the next block.
1128 %macro VP8_IDCT_ADD 0
|
yading@10
|
1129 cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
|
yading@10
|
1130 ; load block data
|
yading@10
|
1131 movq m0, [blockq+ 0]
|
yading@10
|
1132 movq m1, [blockq+ 8]
|
yading@10
|
1133 movq m2, [blockq+16]
|
yading@10
|
1134 movq m3, [blockq+24]
|
yading@10
|
; constants for VP8_MULTIPLY_SUMSUB (expects m6=20091, m7=17734)
1135 movq m6, [pw_20091]
|
yading@10
|
1136 movq m7, [pw_17734]
|
yading@10
|
; zero the 32-byte coefficient block: two aligned 16B stores on SSE,
; four 8B MMX stores otherwise
1137 %if cpuflag(sse)
|
yading@10
|
1138 xorps xmm0, xmm0
|
yading@10
|
1139 movaps [blockq+ 0], xmm0
|
yading@10
|
1140 movaps [blockq+16], xmm0
|
yading@10
|
1141 %else
|
yading@10
|
1142 pxor m4, m4
|
yading@10
|
1143 movq [blockq+ 0], m4
|
yading@10
|
1144 movq [blockq+ 8], m4
|
yading@10
|
1145 movq [blockq+16], m4
|
yading@10
|
1146 movq [blockq+24], m4
|
yading@10
|
1147 %endif
|
yading@10
|
1148
|
yading@10
|
1149 ; actual IDCT
|
yading@10
|
1150 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
|
yading@10
|
1151 TRANSPOSE4x4W 0, 1, 2, 3, 4
|
yading@10
|
; rounding bias before the second pass; the final downshift happens
; in STORE_DIFFx2 (shift argument 3 below) — net effect (x+4)>>3
1152 paddw m0, [pw_4]
|
yading@10
|
1153 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
|
yading@10
|
1154 TRANSPOSE4x4W 0, 1, 2, 3, 4
|
yading@10
|
1155
|
yading@10
|
1156 ; store
|
yading@10
|
1157 pxor m4, m4
|
yading@10
|
1158 DEFINE_ARGS dst1, dst2, stride
|
yading@10
|
1159 lea dst2q, [dst1q+2*strideq]
|
yading@10
|
; add rows 0/1 to dst1, rows 2/3 to dst1+2*stride, clamped
1160 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
|
yading@10
|
1161 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
|
yading@10
|
1162
|
yading@10
|
1163 RET
|
yading@10
|
1164 %endmacro
|
yading@10
|
1165
|
yading@10
|
1166 %if ARCH_X86_32
|
yading@10
|
1167 INIT_MMX mmx
|
yading@10
|
1168 VP8_IDCT_ADD
|
yading@10
|
1169 %endif
|
yading@10
|
1170 INIT_MMX sse
|
yading@10
|
1171 VP8_IDCT_ADD
|
yading@10
|
1172
|
yading@10
|
1173 ;-----------------------------------------------------------------------------
|
yading@10
|
1174 ; void vp8_luma_dc_wht_mmxext(int16_t block[4][4][16], int16_t dc[16])
|
yading@10
|
1175 ;-----------------------------------------------------------------------------
|
yading@10
|
1176
|
yading@10
|
; SCATTER_WHT %1, %2, %3 — scatter 8 WHT output words (4 from m%1, 4 from
; m%2) to the DC slot (word 0) of 8 of the 16 luma sub-blocks.
; Each sub-block is 16 int16 coefficients = 32 bytes, hence the 2*16
; scaling; %3 selects column pair 0/1 (pass 0) or 2/3 (pass 2).
; dc1/dc2 are GPRs named by VP8_DC_WHT's cglobal line; dc1w/dc2w are
; their 16-bit views. Clobbers dc1, dc2 and shifts m%1/m%2 right.
1177 %macro SCATTER_WHT 3
|
yading@10
|
; low dword of each register = two words (rows 0 and 4's values)
1178 movd dc1d, m%1
|
yading@10
|
1179 movd dc2d, m%2
|
yading@10
|
1180 mov [blockq+2*16*(0+%3)], dc1w
|
yading@10
|
1181 mov [blockq+2*16*(1+%3)], dc2w
|
yading@10
|
; high word of the dword -> next row of sub-blocks
1182 shr dc1d, 16
|
yading@10
|
1183 shr dc2d, 16
|
yading@10
|
; advance to the upper two words of each MMX register
1184 psrlq m%1, 32
|
yading@10
|
1185 psrlq m%2, 32
|
yading@10
|
1186 mov [blockq+2*16*(4+%3)], dc1w
|
yading@10
|
1187 mov [blockq+2*16*(5+%3)], dc2w
|
yading@10
|
1188 movd dc1d, m%1
|
yading@10
|
1189 movd dc2d, m%2
|
yading@10
|
1190 mov [blockq+2*16*(8+%3)], dc1w
|
yading@10
|
1191 mov [blockq+2*16*(9+%3)], dc2w
|
yading@10
|
1192 shr dc1d, 16
|
yading@10
|
1193 shr dc2d, 16
|
yading@10
|
1194 mov [blockq+2*16*(12+%3)], dc1w
|
yading@10
|
1195 mov [blockq+2*16*(13+%3)], dc2w
|
yading@10
|
1196 %endmacro
|
yading@10
|
1197
|
yading@10
|
; HADAMARD4_1D %1..%4 — one 1-D 4-point Hadamard (Walsh) butterfly on
; four word-packed registers: two SUMSUB_BADC stages, then SWAP to put
; the outputs back in %1..%4 order. Used by VP8_DC_WHT for both passes.
1198 %macro HADAMARD4_1D 4
|
yading@10
|
1199 SUMSUB_BADC w, %2, %1, %4, %3
|
yading@10
|
1200 SUMSUB_BADC w, %4, %2, %3, %1
|
yading@10
|
1201 SWAP %1, %4, %3
|
yading@10
|
1202 %endmacro
|
yading@10
|
1203
|
yading@10
|
1204 %macro VP8_DC_WHT 0
|
yading@10
|
1205 cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
|
yading@10
|
1206 movq m0, [dc1q]
|
yading@10
|
1207 movq m1, [dc1q+8]
|
yading@10
|
1208 movq m2, [dc1q+16]
|
yading@10
|
1209 movq m3, [dc1q+24]
|
yading@10
|
1210 %if cpuflag(sse)
|
yading@10
|
1211 xorps xmm0, xmm0
|
yading@10
|
1212 movaps [dc1q+ 0], xmm0
|
yading@10
|
1213 movaps [dc1q+16], xmm0
|
yading@10
|
1214 %else
|
yading@10
|
1215 pxor m4, m4
|
yading@10
|
1216 movq [dc1q+ 0], m4
|
yading@10
|
1217 movq [dc1q+ 8], m4
|
yading@10
|
1218 movq [dc1q+16], m4
|
yading@10
|
1219 movq [dc1q+24], m4
|
yading@10
|
1220 %endif
|
yading@10
|
1221 HADAMARD4_1D 0, 1, 2, 3
|
yading@10
|
1222 TRANSPOSE4x4W 0, 1, 2, 3, 4
|
yading@10
|
1223 paddw m0, [pw_3]
|
yading@10
|
1224 HADAMARD4_1D 0, 1, 2, 3
|
yading@10
|
1225 psraw m0, 3
|
yading@10
|
1226 psraw m1, 3
|
yading@10
|
1227 psraw m2, 3
|
yading@10
|
1228 psraw m3, 3
|
yading@10
|
1229 SCATTER_WHT 0, 1, 0
|
yading@10
|
1230 SCATTER_WHT 2, 3, 2
|
yading@10
|
1231 RET
|
yading@10
|
1232 %endmacro
|
yading@10
|
1233
|
yading@10
|
1234 %if ARCH_X86_32
|
yading@10
|
1235 INIT_MMX mmx
|
yading@10
|
1236 VP8_DC_WHT
|
yading@10
|
1237 %endif
|
yading@10
|
1238 INIT_MMX sse
|
yading@10
|
1239 VP8_DC_WHT
|
yading@10
|
1240
|
yading@10
|
1241 ;-----------------------------------------------------------------------------
|
yading@10
|
1242 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
|
yading@10
|
1243 ;-----------------------------------------------------------------------------
|
yading@10
|
1244
|
yading@10
|
1245 ; macro called with 7 mm register indexes as argument, and 4 regular registers
|
yading@10
|
1246 ;
|
yading@10
|
1247 ; first 4 mm registers will carry the transposed pixel data
|
yading@10
|
1248 ; the other three are scratchspace (one would be sufficient, but this allows
|
yading@10
|
1249 ; for more spreading/pipelining and thus faster execution on OOE CPUs)
|
yading@10
|
1250 ;
|
yading@10
|
1251 ; first two regular registers are buf+4*stride and buf+5*stride
|
yading@10
|
1252 ; third is -stride, fourth is +stride
|
yading@10
|
; READ_8x4_INTERLEAVED — per the header above: %1-%4 receive transposed
; pixel data, %5-%7 are scratch mm indexes; %8 = buf+4*stride,
; %9 = buf+5*stride, %10 = -stride, %11 = +stride.
; Loads 8 rows (A-H) of 4 bytes each and byte-interleaves adjacent row
; pairs (A/B, C/D, E/F, G/H) as the first step of a 4x8 transpose.
1253 %macro READ_8x4_INTERLEAVED 11
|
yading@10
|
1254 ; interleave 8 (A-H) rows of 4 pixels each
|
yading@10
|
; rows above the edge are reached from %8 via the negative stride %10,
; rows below from %9 via the positive stride %11
1255 movd m%1, [%8+%10*4] ; A0-3
|
yading@10
|
1256 movd m%5, [%9+%10*4] ; B0-3
|
yading@10
|
1257 movd m%2, [%8+%10*2] ; C0-3
|
yading@10
|
1258 movd m%6, [%8+%10] ; D0-3
|
yading@10
|
1259 movd m%3, [%8] ; E0-3
|
yading@10
|
1260 movd m%7, [%9] ; F0-3
|
yading@10
|
1261 movd m%4, [%9+%11] ; G0-3
|
yading@10
|
1262 punpcklbw m%1, m%5 ; A/B interleaved
|
yading@10
|
; %5 is free after the A/B interleave, reuse it for row H
1263 movd m%5, [%9+%11*2] ; H0-3
|
yading@10
|
1264 punpcklbw m%2, m%6 ; C/D interleaved
|
yading@10
|
1265 punpcklbw m%3, m%7 ; E/F interleaved
|
yading@10
|
1266 punpcklbw m%4, m%5 ; G/H interleaved
|
yading@10
|
1267 %endmacro
|
yading@10
|
1268
|
yading@10
|
1269 ; macro called with 7 mm register indexes as argument, and 5 regular registers
|
yading@10
|
1270 ; first 11 mean the same as READ_8x4_TRANSPOSED above
|
yading@10
|
1271 ; fifth regular register is scratchspace to reach the bottom 8 rows, it
|
yading@10
|
1272 ; will be set to second regular register + 8*stride at the end
|
yading@10
|
1273 %macro READ_16x4_INTERLEAVED 12
|
yading@10
|
1274 ; transpose 16 (A-P) rows of 4 pixels each
|
yading@10
|
1275 lea %12, [r0+8*r2]
|
yading@10
|
1276
|
yading@10
|
1277 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
|
yading@10
|
1278 movd m%1, [%8+%10*4] ; A0-3
|
yading@10
|
1279 movd m%3, [%12+%10*4] ; I0-3
|
yading@10
|
1280 movd m%2, [%8+%10*2] ; C0-3
|
yading@10
|
1281 movd m%4, [%12+%10*2] ; K0-3
|
yading@10
|
1282 movd m%6, [%8+%10] ; D0-3
|
yading@10
|
1283 movd m%5, [%12+%10] ; L0-3
|
yading@10
|
1284 movd m%7, [%12] ; M0-3
|
yading@10
|
1285 add %12, %11
|
yading@10
|
1286 punpcklbw m%1, m%3 ; A/I
|
yading@10
|
1287 movd m%3, [%8] ; E0-3
|
yading@10
|
1288 punpcklbw m%2, m%4 ; C/K
|
yading@10
|
1289 punpcklbw m%6, m%5 ; D/L
|
yading@10
|
1290 punpcklbw m%3, m%7 ; E/M
|
yading@10
|
1291 punpcklbw m%2, m%6 ; C/D/K/L interleaved
|
yading@10
|
1292
|
yading@10
|
1293 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
|
yading@10
|
1294 movd m%5, [%9+%10*4] ; B0-3
|
yading@10
|
1295 movd m%4, [%12+%10*4] ; J0-3
|
yading@10
|
1296 movd m%7, [%9] ; F0-3
|
yading@10
|
1297 movd m%6, [%12] ; N0-3
|
yading@10
|
1298 punpcklbw m%5, m%4 ; B/J
|
yading@10
|
1299 punpcklbw m%7, m%6 ; F/N
|
yading@10
|
1300 punpcklbw m%1, m%5 ; A/B/I/J interleaved
|
yading@10
|
1301 punpcklbw m%3, m%7 ; E/F/M/N interleaved
|
yading@10
|
1302 movd m%4, [%9+%11] ; G0-3
|
yading@10
|
1303 movd m%6, [%12+%11] ; O0-3
|
yading@10
|
1304 movd m%5, [%9+%11*2] ; H0-3
|
yading@10
|
1305 movd m%7, [%12+%11*2] ; P0-3
|
yading@10
|
1306 punpcklbw m%4, m%6 ; G/O
|
yading@10
|
1307 punpcklbw m%5, m%7 ; H/P
|
yading@10
|
1308 punpcklbw m%4, m%5 ; G/H/O/P interleaved
|
yading@10
|
1309 %endmacro
|
yading@10
|
1310
|
yading@10
|
1311 ; write 4 mm registers of 2 dwords each
|
yading@10
|
1312 ; first four arguments are mm register indexes containing source data
|
yading@10
|
1313 ; last four are registers containing buf+4*stride, buf+5*stride,
|
yading@10
|
1314 ; -stride and +stride
|
yading@10
|
; WRITE_4x2D — per the header above: %1-%4 are mm indexes each holding
; 2 dwords (two transposed 4-pixel rows); %5 = buf+4*stride,
; %6 = buf+5*stride, %7 = -stride, %8 = +stride.
; Writes dword 0 of each register to 4 rows, then punpckhdq brings
; dword 1 down and writes the other 4 rows — 8 rows total.
1315 %macro WRITE_4x2D 8
|
yading@10
|
1316 ; write out (2 dwords per register)
|
yading@10
|
1317 movd [%5+%7*4], m%1
|
yading@10
|
1318 movd [%5+%7*2], m%2
|
yading@10
|
1319 movd [%5], m%3
|
yading@10
|
1320 movd [%6+%8], m%4
|
yading@10
|
; replicate the high dword into the low half so movd can store it
1321 punpckhdq m%1, m%1
|
yading@10
|
1322 punpckhdq m%2, m%2
|
yading@10
|
1323 punpckhdq m%3, m%3
|
yading@10
|
1324 punpckhdq m%4, m%4
|
yading@10
|
1325 movd [%6+%7*4], m%1
|
yading@10
|
1326 movd [%5+%7], m%2
|
yading@10
|
1327 movd [%6], m%3
|
yading@10
|
1328 movd [%6+%8*2], m%4
|
yading@10
|
1329 %endmacro
|
yading@10
|
1330
|
yading@10
|
1331 ; write 4 xmm registers of 4 dwords each
|
yading@10
|
1332 ; arguments same as WRITE_2x4D, but with an extra register, so that the 5 regular
|
yading@10
|
1333 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
|
yading@10
|
1334 ; we add 1*stride to the third regular registry in the process
|
yading@10
|
1335 ; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
|
yading@10
|
1336 ; same memory region), or 8 if they cover two separate buffers (third one points to
|
yading@10
|
1337 ; a different memory region than the first two), allowing for more optimal code for
|
yading@10
|
1338 ; the 16-width case
|
yading@10
|
1339 %macro WRITE_4x4D 10
|
yading@10
|
1340 ; write out (4 dwords per register), start with dwords zero
|
yading@10
|
1341 movd [%5+%8*4], m%1
|
yading@10
|
1342 movd [%5], m%2
|
yading@10
|
1343 movd [%7+%8*4], m%3
|
yading@10
|
1344 movd [%7], m%4
|
yading@10
|
1345
|
yading@10
|
1346 ; store dwords 1
|
yading@10
|
1347 psrldq m%1, 4
|
yading@10
|
1348 psrldq m%2, 4
|
yading@10
|
1349 psrldq m%3, 4
|
yading@10
|
1350 psrldq m%4, 4
|
yading@10
|
1351 movd [%6+%8*4], m%1
|
yading@10
|
1352 movd [%6], m%2
|
yading@10
|
1353 %if %10 == 16
|
yading@10
|
1354 movd [%6+%9*4], m%3
|
yading@10
|
1355 %endif
|
yading@10
|
1356 movd [%7+%9], m%4
|
yading@10
|
1357
|
yading@10
|
1358 ; write dwords 2
|
yading@10
|
1359 psrldq m%1, 4
|
yading@10
|
1360 psrldq m%2, 4
|
yading@10
|
1361 %if %10 == 8
|
yading@10
|
1362 movd [%5+%8*2], m%1
|
yading@10
|
1363 movd %5d, m%3
|
yading@10
|
1364 %endif
|
yading@10
|
1365 psrldq m%3, 4
|
yading@10
|
1366 psrldq m%4, 4
|
yading@10
|
1367 %if %10 == 16
|
yading@10
|
1368 movd [%5+%8*2], m%1
|
yading@10
|
1369 %endif
|
yading@10
|
1370 movd [%6+%9], m%2
|
yading@10
|
1371 movd [%7+%8*2], m%3
|
yading@10
|
1372 movd [%7+%9*2], m%4
|
yading@10
|
1373 add %7, %9
|
yading@10
|
1374
|
yading@10
|
1375 ; store dwords 3
|
yading@10
|
1376 psrldq m%1, 4
|
yading@10
|
1377 psrldq m%2, 4
|
yading@10
|
1378 psrldq m%3, 4
|
yading@10
|
1379 psrldq m%4, 4
|
yading@10
|
1380 %if %10 == 8
|
yading@10
|
1381 mov [%7+%8*4], %5d
|
yading@10
|
1382 movd [%6+%8*2], m%1
|
yading@10
|
1383 %else
|
yading@10
|
1384 movd [%5+%8], m%1
|
yading@10
|
1385 %endif
|
yading@10
|
1386 movd [%6+%9*2], m%2
|
yading@10
|
1387 movd [%7+%8*2], m%3
|
yading@10
|
1388 movd [%7+%9*2], m%4
|
yading@10
|
1389 %endmacro
|
yading@10
|
1390
|
yading@10
|
1391 ; write 4 or 8 words in the mmx/xmm registers as 8 lines
|
yading@10
|
1392 ; 1 and 2 are the registers to write, this can be the same (for SSE2)
|
yading@10
|
1393 ; for pre-SSE4:
|
yading@10
|
1394 ; 3 is a general-purpose register that we will clobber
|
yading@10
|
1395 ; for SSE4:
|
yading@10
|
1396 ; 3 is a pointer to the destination's 5th line
|
yading@10
|
1397 ; 4 is a pointer to the destination's 4th line
|
yading@10
|
1398 ; 5/6 is -stride and +stride
|
yading@10
|
1399 %macro WRITE_2x4W 6
|
yading@10
|
1400 movd %3d, %1
|
yading@10
|
1401 punpckhdq %1, %1
|
yading@10
|
1402 mov [%4+%5*4], %3w
|
yading@10
|
1403 shr %3, 16
|
yading@10
|
1404 add %4, %6
|
yading@10
|
1405 mov [%4+%5*4], %3w
|
yading@10
|
1406
|
yading@10
|
1407 movd %3d, %1
|
yading@10
|
1408 add %4, %5
|
yading@10
|
1409 mov [%4+%5*2], %3w
|
yading@10
|
1410 shr %3, 16
|
yading@10
|
1411 mov [%4+%5 ], %3w
|
yading@10
|
1412
|
yading@10
|
1413 movd %3d, %2
|
yading@10
|
1414 punpckhdq %2, %2
|
yading@10
|
1415 mov [%4 ], %3w
|
yading@10
|
1416 shr %3, 16
|
yading@10
|
1417 mov [%4+%6 ], %3w
|
yading@10
|
1418
|
yading@10
|
1419 movd %3d, %2
|
yading@10
|
1420 add %4, %6
|
yading@10
|
1421 mov [%4+%6 ], %3w
|
yading@10
|
1422 shr %3, 16
|
yading@10
|
1423 mov [%4+%6*2], %3w
|
yading@10
|
1424 add %4, %5
|
yading@10
|
1425 %endmacro
|
yading@10
|
1426
|
yading@10
|
1427 %macro WRITE_8W 5
|
yading@10
|
1428 %if cpuflag(sse4)
|
yading@10
|
1429 pextrw [%3+%4*4], %1, 0
|
yading@10
|
1430 pextrw [%2+%4*4], %1, 1
|
yading@10
|
1431 pextrw [%3+%4*2], %1, 2
|
yading@10
|
1432 pextrw [%3+%4 ], %1, 3
|
yading@10
|
1433 pextrw [%3 ], %1, 4
|
yading@10
|
1434 pextrw [%2 ], %1, 5
|
yading@10
|
1435 pextrw [%2+%5 ], %1, 6
|
yading@10
|
1436 pextrw [%2+%5*2], %1, 7
|
yading@10
|
1437 %else
|
yading@10
|
1438 movd %2d, %1
|
yading@10
|
1439 psrldq %1, 4
|
yading@10
|
1440 mov [%3+%4*4], %2w
|
yading@10
|
1441 shr %2, 16
|
yading@10
|
1442 add %3, %5
|
yading@10
|
1443 mov [%3+%4*4], %2w
|
yading@10
|
1444
|
yading@10
|
1445 movd %2d, %1
|
yading@10
|
1446 psrldq %1, 4
|
yading@10
|
1447 add %3, %4
|
yading@10
|
1448 mov [%3+%4*2], %2w
|
yading@10
|
1449 shr %2, 16
|
yading@10
|
1450 mov [%3+%4 ], %2w
|
yading@10
|
1451
|
yading@10
|
1452 movd %2d, %1
|
yading@10
|
1453 psrldq %1, 4
|
yading@10
|
1454 mov [%3 ], %2w
|
yading@10
|
1455 shr %2, 16
|
yading@10
|
1456 mov [%3+%5 ], %2w
|
yading@10
|
1457
|
yading@10
|
1458 movd %2d, %1
|
yading@10
|
1459 add %3, %5
|
yading@10
|
1460 mov [%3+%5 ], %2w
|
yading@10
|
1461 shr %2, 16
|
yading@10
|
1462 mov [%3+%5*2], %2w
|
yading@10
|
1463 %endif
|
yading@10
|
1464 %endmacro
|
yading@10
|
1465
|
yading@10
|
1466 %macro SIMPLE_LOOPFILTER 2
|
yading@10
|
1467 cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
|
yading@10
|
1468 %if mmsize == 8 ; mmx/mmxext
|
yading@10
|
1469 mov cntrq, 2
|
yading@10
|
1470 %endif
|
yading@10
|
1471 %if cpuflag(ssse3)
|
yading@10
|
1472 pxor m0, m0
|
yading@10
|
1473 %endif
|
yading@10
|
1474 SPLATB_REG m7, flim, m0 ; splat "flim" into register
|
yading@10
|
1475
|
yading@10
|
1476 ; set up indexes to address 4 rows
|
yading@10
|
1477 %if mmsize == 8
|
yading@10
|
1478 DEFINE_ARGS dst1, mstride, stride, cntr, dst2
|
yading@10
|
1479 %else
|
yading@10
|
1480 DEFINE_ARGS dst1, mstride, stride, dst3, dst2
|
yading@10
|
1481 %endif
|
yading@10
|
1482 mov strideq, mstrideq
|
yading@10
|
1483 neg mstrideq
|
yading@10
|
1484 %ifidn %1, h
|
yading@10
|
1485 lea dst1q, [dst1q+4*strideq-2]
|
yading@10
|
1486 %endif
|
yading@10
|
1487
|
yading@10
|
1488 %if mmsize == 8 ; mmx / mmxext
|
yading@10
|
1489 .next8px:
|
yading@10
|
1490 %endif
|
yading@10
|
1491 %ifidn %1, v
|
yading@10
|
1492 ; read 4 half/full rows of pixels
|
yading@10
|
1493 mova m0, [dst1q+mstrideq*2] ; p1
|
yading@10
|
1494 mova m1, [dst1q+mstrideq] ; p0
|
yading@10
|
1495 mova m2, [dst1q] ; q0
|
yading@10
|
1496 mova m3, [dst1q+ strideq] ; q1
|
yading@10
|
1497 %else ; h
|
yading@10
|
1498 lea dst2q, [dst1q+ strideq]
|
yading@10
|
1499
|
yading@10
|
1500 %if mmsize == 8 ; mmx/mmxext
|
yading@10
|
1501 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
|
yading@10
|
1502 %else ; sse2
|
yading@10
|
1503 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
|
yading@10
|
1504 %endif
|
yading@10
|
1505 TRANSPOSE4x4W 0, 1, 2, 3, 4
|
yading@10
|
1506 %endif
|
yading@10
|
1507
|
yading@10
|
1508 ; simple_limit
|
yading@10
|
1509 mova m5, m2 ; m5=backup of q0
|
yading@10
|
1510 mova m6, m1 ; m6=backup of p0
|
yading@10
|
1511 psubusb m1, m2 ; p0-q0
|
yading@10
|
1512 psubusb m2, m6 ; q0-p0
|
yading@10
|
1513 por m1, m2 ; FFABS(p0-q0)
|
yading@10
|
1514 paddusb m1, m1 ; m1=FFABS(p0-q0)*2
|
yading@10
|
1515
|
yading@10
|
1516 mova m4, m3
|
yading@10
|
1517 mova m2, m0
|
yading@10
|
1518 psubusb m3, m0 ; q1-p1
|
yading@10
|
1519 psubusb m0, m4 ; p1-q1
|
yading@10
|
1520 por m3, m0 ; FFABS(p1-q1)
|
yading@10
|
1521 mova m0, [pb_80]
|
yading@10
|
1522 pxor m2, m0
|
yading@10
|
1523 pxor m4, m0
|
yading@10
|
1524 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
|
yading@10
|
1525 pand m3, [pb_FE]
|
yading@10
|
1526 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
|
yading@10
|
1527 paddusb m3, m1
|
yading@10
|
1528 psubusb m3, m7
|
yading@10
|
1529 pxor m1, m1
|
yading@10
|
1530 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
|
yading@10
|
1531
|
yading@10
|
1532 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
|
yading@10
|
1533 mova m4, m5
|
yading@10
|
1534 pxor m5, m0
|
yading@10
|
1535 pxor m0, m6
|
yading@10
|
1536 psubsb m5, m0 ; q0-p0 (signed)
|
yading@10
|
1537 paddsb m2, m5
|
yading@10
|
1538 paddsb m2, m5
|
yading@10
|
1539 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
|
yading@10
|
1540 pand m2, m3 ; apply filter mask (m3)
|
yading@10
|
1541
|
yading@10
|
1542 mova m3, [pb_F8]
|
yading@10
|
1543 mova m1, m2
|
yading@10
|
1544 paddsb m2, [pb_4] ; f1<<3=a+4
|
yading@10
|
1545 paddsb m1, [pb_3] ; f2<<3=a+3
|
yading@10
|
1546 pand m2, m3
|
yading@10
|
1547 pand m1, m3 ; cache f2<<3
|
yading@10
|
1548
|
yading@10
|
1549 pxor m0, m0
|
yading@10
|
1550 pxor m3, m3
|
yading@10
|
1551 pcmpgtb m0, m2 ; which values are <0?
|
yading@10
|
1552 psubb m3, m2 ; -f1<<3
|
yading@10
|
1553 psrlq m2, 3 ; +f1
|
yading@10
|
1554 psrlq m3, 3 ; -f1
|
yading@10
|
1555 pand m3, m0
|
yading@10
|
1556 pandn m0, m2
|
yading@10
|
1557 psubusb m4, m0
|
yading@10
|
1558 paddusb m4, m3 ; q0-f1
|
yading@10
|
1559
|
yading@10
|
1560 pxor m0, m0
|
yading@10
|
1561 pxor m3, m3
|
yading@10
|
1562 pcmpgtb m0, m1 ; which values are <0?
|
yading@10
|
1563 psubb m3, m1 ; -f2<<3
|
yading@10
|
1564 psrlq m1, 3 ; +f2
|
yading@10
|
1565 psrlq m3, 3 ; -f2
|
yading@10
|
1566 pand m3, m0
|
yading@10
|
1567 pandn m0, m1
|
yading@10
|
1568 paddusb m6, m0
|
yading@10
|
1569 psubusb m6, m3 ; p0+f2
|
yading@10
|
1570
|
yading@10
|
1571 ; store
|
yading@10
|
1572 %ifidn %1, v
|
yading@10
|
1573 mova [dst1q], m4
|
yading@10
|
1574 mova [dst1q+mstrideq], m6
|
yading@10
|
1575 %else ; h
|
yading@10
|
1576 inc dst1q
|
yading@10
|
1577 SBUTTERFLY bw, 6, 4, 0
|
yading@10
|
1578
|
yading@10
|
1579 %if mmsize == 16 ; sse2
|
yading@10
|
1580 %if cpuflag(sse4)
|
yading@10
|
1581 inc dst2q
|
yading@10
|
1582 %endif
|
yading@10
|
1583 WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
|
yading@10
|
1584 lea dst2q, [dst3q+mstrideq+1]
|
yading@10
|
1585 %if cpuflag(sse4)
|
yading@10
|
1586 inc dst3q
|
yading@10
|
1587 %endif
|
yading@10
|
1588 WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
|
yading@10
|
1589 %else ; mmx/mmxext
|
yading@10
|
1590 WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
|
yading@10
|
1591 %endif
|
yading@10
|
1592 %endif
|
yading@10
|
1593
|
yading@10
|
1594 %if mmsize == 8 ; mmx/mmxext
|
yading@10
|
1595 ; next 8 pixels
|
yading@10
|
1596 %ifidn %1, v
|
yading@10
|
1597 add dst1q, 8 ; advance 8 cols = pixels
|
yading@10
|
1598 %else ; h
|
yading@10
|
1599 lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
|
yading@10
|
1600 %endif
|
yading@10
|
1601 dec cntrq
|
yading@10
|
1602 jg .next8px
|
yading@10
|
1603 REP_RET
|
yading@10
|
1604 %else ; sse2
|
yading@10
|
1605 RET
|
yading@10
|
1606 %endif
|
yading@10
|
1607 %endmacro
|
yading@10
|
1608
|
yading@10
|
1609 %if ARCH_X86_32
|
yading@10
|
1610 INIT_MMX mmx
|
yading@10
|
1611 SIMPLE_LOOPFILTER v, 4
|
yading@10
|
1612 SIMPLE_LOOPFILTER h, 5
|
yading@10
|
1613 INIT_MMX mmxext
|
yading@10
|
1614 SIMPLE_LOOPFILTER v, 4
|
yading@10
|
1615 SIMPLE_LOOPFILTER h, 5
|
yading@10
|
1616 %endif
|
yading@10
|
1617
|
yading@10
|
1618 INIT_XMM sse2
|
yading@10
|
1619 SIMPLE_LOOPFILTER v, 3
|
yading@10
|
1620 SIMPLE_LOOPFILTER h, 5
|
yading@10
|
1621 INIT_XMM ssse3
|
yading@10
|
1622 SIMPLE_LOOPFILTER v, 3
|
yading@10
|
1623 SIMPLE_LOOPFILTER h, 5
|
yading@10
|
1624 INIT_XMM sse4
|
yading@10
|
1625 SIMPLE_LOOPFILTER h, 5
|
yading@10
|
1626
|
yading@10
|
1627 ;-----------------------------------------------------------------------------
|
yading@10
|
1628 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
|
yading@10
|
1629 ; int flimE, int flimI, int hev_thr);
|
yading@10
|
1630 ;-----------------------------------------------------------------------------
|
yading@10
|
1631
|
yading@10
|
1632 %macro INNER_LOOPFILTER 2
|
yading@10
|
1633 %define stack_size 0
|
yading@10
|
1634 %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
|
yading@10
|
1635 %ifidn %1, v ; [3]=hev() result
|
yading@10
|
1636 %define stack_size mmsize * -4
|
yading@10
|
1637 %else ; h ; extra storage space for transposes
|
yading@10
|
1638 %define stack_size mmsize * -5
|
yading@10
|
1639 %endif
|
yading@10
|
1640 %endif
|
yading@10
|
1641
|
yading@10
|
1642 %if %2 == 8 ; chroma
|
yading@10
|
1643 cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
|
yading@10
|
1644 %else ; luma
|
yading@10
|
1645 cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
|
yading@10
|
1646 %endif
|
yading@10
|
1647
|
yading@10
|
1648 %if cpuflag(ssse3)
|
yading@10
|
1649 pxor m7, m7
|
yading@10
|
1650 %endif
|
yading@10
|
1651
|
yading@10
|
1652 %ifndef m8
|
yading@10
|
1653 ; splat function arguments
|
yading@10
|
1654 SPLATB_REG m0, flimEq, m7 ; E
|
yading@10
|
1655 SPLATB_REG m1, flimIq, m7 ; I
|
yading@10
|
1656 SPLATB_REG m2, hevthrq, m7 ; hev_thresh
|
yading@10
|
1657
|
yading@10
|
1658 %define m_flimE [rsp]
|
yading@10
|
1659 %define m_flimI [rsp+mmsize]
|
yading@10
|
1660 %define m_hevthr [rsp+mmsize*2]
|
yading@10
|
1661 %define m_maskres [rsp+mmsize*3]
|
yading@10
|
1662 %define m_p0backup [rsp+mmsize*3]
|
yading@10
|
1663 %define m_q0backup [rsp+mmsize*4]
|
yading@10
|
1664
|
yading@10
|
1665 mova m_flimE, m0
|
yading@10
|
1666 mova m_flimI, m1
|
yading@10
|
1667 mova m_hevthr, m2
|
yading@10
|
1668 %else
|
yading@10
|
1669 %define m_flimE m9
|
yading@10
|
1670 %define m_flimI m10
|
yading@10
|
1671 %define m_hevthr m11
|
yading@10
|
1672 %define m_maskres m12
|
yading@10
|
1673 %define m_p0backup m12
|
yading@10
|
1674 %define m_q0backup m8
|
yading@10
|
1675
|
yading@10
|
1676 ; splat function arguments
|
yading@10
|
1677 SPLATB_REG m_flimE, flimEq, m7 ; E
|
yading@10
|
1678 SPLATB_REG m_flimI, flimIq, m7 ; I
|
yading@10
|
1679 SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
|
yading@10
|
1680 %endif
|
yading@10
|
1681
|
yading@10
|
1682 %if %2 == 8 ; chroma
|
yading@10
|
1683 DEFINE_ARGS dst1, dst8, mstride, stride, dst2
|
yading@10
|
1684 %elif mmsize == 8
|
yading@10
|
1685 DEFINE_ARGS dst1, mstride, stride, dst2, cntr
|
yading@10
|
1686 mov cntrq, 2
|
yading@10
|
1687 %else
|
yading@10
|
1688 DEFINE_ARGS dst1, mstride, stride, dst2, dst8
|
yading@10
|
1689 %endif
|
yading@10
|
1690 mov strideq, mstrideq
|
yading@10
|
1691 neg mstrideq
|
yading@10
|
1692 %ifidn %1, h
|
yading@10
|
1693 lea dst1q, [dst1q+strideq*4-4]
|
yading@10
|
1694 %if %2 == 8 ; chroma
|
yading@10
|
1695 lea dst8q, [dst8q+strideq*4-4]
|
yading@10
|
1696 %endif
|
yading@10
|
1697 %endif
|
yading@10
|
1698
|
yading@10
|
1699 %if mmsize == 8
|
yading@10
|
1700 .next8px:
|
yading@10
|
1701 %endif
|
yading@10
|
1702 ; read
|
yading@10
|
1703 lea dst2q, [dst1q+strideq]
|
yading@10
|
1704 %ifidn %1, v
|
yading@10
|
1705 %if %2 == 8 && mmsize == 16
|
yading@10
|
1706 %define movrow movh
|
yading@10
|
1707 %else
|
yading@10
|
1708 %define movrow mova
|
yading@10
|
1709 %endif
|
yading@10
|
1710 movrow m0, [dst1q+mstrideq*4] ; p3
|
yading@10
|
1711 movrow m1, [dst2q+mstrideq*4] ; p2
|
yading@10
|
1712 movrow m2, [dst1q+mstrideq*2] ; p1
|
yading@10
|
1713 movrow m5, [dst2q] ; q1
|
yading@10
|
1714 movrow m6, [dst2q+ strideq*1] ; q2
|
yading@10
|
1715 movrow m7, [dst2q+ strideq*2] ; q3
|
yading@10
|
1716 %if mmsize == 16 && %2 == 8
|
yading@10
|
1717 movhps m0, [dst8q+mstrideq*4]
|
yading@10
|
1718 movhps m2, [dst8q+mstrideq*2]
|
yading@10
|
1719 add dst8q, strideq
|
yading@10
|
1720 movhps m1, [dst8q+mstrideq*4]
|
yading@10
|
1721 movhps m5, [dst8q]
|
yading@10
|
1722 movhps m6, [dst8q+ strideq ]
|
yading@10
|
1723 movhps m7, [dst8q+ strideq*2]
|
yading@10
|
1724 add dst8q, mstrideq
|
yading@10
|
1725 %endif
|
yading@10
|
1726 %elif mmsize == 8 ; mmx/mmxext (h)
|
yading@10
|
1727 ; read 8 rows of 8px each
|
yading@10
|
1728 movu m0, [dst1q+mstrideq*4]
|
yading@10
|
1729 movu m1, [dst2q+mstrideq*4]
|
yading@10
|
1730 movu m2, [dst1q+mstrideq*2]
|
yading@10
|
1731 movu m3, [dst1q+mstrideq ]
|
yading@10
|
1732 movu m4, [dst1q]
|
yading@10
|
1733 movu m5, [dst2q]
|
yading@10
|
1734 movu m6, [dst2q+ strideq ]
|
yading@10
|
1735
|
yading@10
|
1736 ; 8x8 transpose
|
yading@10
|
1737 TRANSPOSE4x4B 0, 1, 2, 3, 7
|
yading@10
|
1738 mova m_q0backup, m1
|
yading@10
|
1739 movu m7, [dst2q+ strideq*2]
|
yading@10
|
1740 TRANSPOSE4x4B 4, 5, 6, 7, 1
|
yading@10
|
1741 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
|
yading@10
|
1742 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
|
yading@10
|
1743 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
|
yading@10
|
1744 mova m1, m_q0backup
|
yading@10
|
1745 mova m_q0backup, m2 ; store q0
|
yading@10
|
1746 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
|
yading@10
|
1747 mova m_p0backup, m5 ; store p0
|
yading@10
|
1748 SWAP 1, 4
|
yading@10
|
1749 SWAP 2, 4
|
yading@10
|
1750 SWAP 6, 3
|
yading@10
|
1751 SWAP 5, 3
|
yading@10
|
1752 %else ; sse2 (h)
|
yading@10
|
1753 %if %2 == 16
|
yading@10
|
1754 lea dst8q, [dst1q+ strideq*8]
|
yading@10
|
1755 %endif
|
yading@10
|
1756
|
yading@10
|
1757 ; read 16 rows of 8px each, interleave
|
yading@10
|
1758 movh m0, [dst1q+mstrideq*4]
|
yading@10
|
1759 movh m1, [dst8q+mstrideq*4]
|
yading@10
|
1760 movh m2, [dst1q+mstrideq*2]
|
yading@10
|
1761 movh m5, [dst8q+mstrideq*2]
|
yading@10
|
1762 movh m3, [dst1q+mstrideq ]
|
yading@10
|
1763 movh m6, [dst8q+mstrideq ]
|
yading@10
|
1764 movh m4, [dst1q]
|
yading@10
|
1765 movh m7, [dst8q]
|
yading@10
|
1766 punpcklbw m0, m1 ; A/I
|
yading@10
|
1767 punpcklbw m2, m5 ; C/K
|
yading@10
|
1768 punpcklbw m3, m6 ; D/L
|
yading@10
|
1769 punpcklbw m4, m7 ; E/M
|
yading@10
|
1770
|
yading@10
|
1771 add dst8q, strideq
|
yading@10
|
1772 movh m1, [dst2q+mstrideq*4]
|
yading@10
|
1773 movh m6, [dst8q+mstrideq*4]
|
yading@10
|
1774 movh m5, [dst2q]
|
yading@10
|
1775 movh m7, [dst8q]
|
yading@10
|
1776 punpcklbw m1, m6 ; B/J
|
yading@10
|
1777 punpcklbw m5, m7 ; F/N
|
yading@10
|
1778 movh m6, [dst2q+ strideq ]
|
yading@10
|
1779 movh m7, [dst8q+ strideq ]
|
yading@10
|
1780 punpcklbw m6, m7 ; G/O
|
yading@10
|
1781
|
yading@10
|
1782 ; 8x16 transpose
|
yading@10
|
1783 TRANSPOSE4x4B 0, 1, 2, 3, 7
|
yading@10
|
1784 %ifdef m8
|
yading@10
|
1785 SWAP 1, 8
|
yading@10
|
1786 %else
|
yading@10
|
1787 mova m_q0backup, m1
|
yading@10
|
1788 %endif
|
yading@10
|
1789 movh m7, [dst2q+ strideq*2]
|
yading@10
|
1790 movh m1, [dst8q+ strideq*2]
|
yading@10
|
1791 punpcklbw m7, m1 ; H/P
|
yading@10
|
1792 TRANSPOSE4x4B 4, 5, 6, 7, 1
|
yading@10
|
1793 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
|
yading@10
|
1794 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
|
yading@10
|
1795 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
|
yading@10
|
1796 %ifdef m8
|
yading@10
|
1797 SWAP 1, 8
|
yading@10
|
1798 SWAP 2, 8
|
yading@10
|
1799 %else
|
yading@10
|
1800 mova m1, m_q0backup
|
yading@10
|
1801 mova m_q0backup, m2 ; store q0
|
yading@10
|
1802 %endif
|
yading@10
|
1803 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
|
yading@10
|
1804 %ifdef m12
|
yading@10
|
1805 SWAP 5, 12
|
yading@10
|
1806 %else
|
yading@10
|
1807 mova m_p0backup, m5 ; store p0
|
yading@10
|
1808 %endif
|
yading@10
|
1809 SWAP 1, 4
|
yading@10
|
1810 SWAP 2, 4
|
yading@10
|
1811 SWAP 6, 3
|
yading@10
|
1812 SWAP 5, 3
|
yading@10
|
1813 %endif
|
yading@10
|
1814
|
yading@10
|
1815 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
|
yading@10
|
1816 mova m4, m1
|
yading@10
|
1817 SWAP 4, 1
|
yading@10
|
1818 psubusb m4, m0 ; p2-p3
|
yading@10
|
1819 psubusb m0, m1 ; p3-p2
|
yading@10
|
1820 por m0, m4 ; abs(p3-p2)
|
yading@10
|
1821
|
yading@10
|
1822 mova m4, m2
|
yading@10
|
1823 SWAP 4, 2
|
yading@10
|
1824 psubusb m4, m1 ; p1-p2
|
yading@10
|
1825 psubusb m1, m2 ; p2-p1
|
yading@10
|
1826 por m1, m4 ; abs(p2-p1)
|
yading@10
|
1827
|
yading@10
|
1828 mova m4, m6
|
yading@10
|
1829 SWAP 4, 6
|
yading@10
|
1830 psubusb m4, m7 ; q2-q3
|
yading@10
|
1831 psubusb m7, m6 ; q3-q2
|
yading@10
|
1832 por m7, m4 ; abs(q3-q2)
|
yading@10
|
1833
|
yading@10
|
1834 mova m4, m5
|
yading@10
|
1835 SWAP 4, 5
|
yading@10
|
1836 psubusb m4, m6 ; q1-q2
|
yading@10
|
1837 psubusb m6, m5 ; q2-q1
|
yading@10
|
1838 por m6, m4 ; abs(q2-q1)
|
yading@10
|
1839
|
yading@10
|
1840 %if notcpuflag(mmxext)
|
yading@10
|
1841 mova m4, m_flimI
|
yading@10
|
1842 pxor m3, m3
|
yading@10
|
1843 psubusb m0, m4
|
yading@10
|
1844 psubusb m1, m4
|
yading@10
|
1845 psubusb m7, m4
|
yading@10
|
1846 psubusb m6, m4
|
yading@10
|
1847 pcmpeqb m0, m3 ; abs(p3-p2) <= I
|
yading@10
|
1848 pcmpeqb m1, m3 ; abs(p2-p1) <= I
|
yading@10
|
1849 pcmpeqb m7, m3 ; abs(q3-q2) <= I
|
yading@10
|
1850 pcmpeqb m6, m3 ; abs(q2-q1) <= I
|
yading@10
|
1851 pand m0, m1
|
yading@10
|
1852 pand m7, m6
|
yading@10
|
1853 pand m0, m7
|
yading@10
|
1854 %else ; mmxext/sse2
|
yading@10
|
1855 pmaxub m0, m1
|
yading@10
|
1856 pmaxub m6, m7
|
yading@10
|
1857 pmaxub m0, m6
|
yading@10
|
1858 %endif
|
yading@10
|
1859
|
yading@10
|
1860 ; normal_limit and high_edge_variance for p1-p0, q1-q0
|
yading@10
|
1861 SWAP 7, 3 ; now m7 is zero
|
yading@10
|
1862 %ifidn %1, v
|
yading@10
|
1863 movrow m3, [dst1q+mstrideq ] ; p0
|
yading@10
|
1864 %if mmsize == 16 && %2 == 8
|
yading@10
|
1865 movhps m3, [dst8q+mstrideq ]
|
yading@10
|
1866 %endif
|
yading@10
|
1867 %elifdef m12
|
yading@10
|
1868 SWAP 3, 12
|
yading@10
|
1869 %else
|
yading@10
|
1870 mova m3, m_p0backup
|
yading@10
|
1871 %endif
|
yading@10
|
1872
|
yading@10
|
1873 mova m1, m2
|
yading@10
|
1874 SWAP 1, 2
|
yading@10
|
1875 mova m6, m3
|
yading@10
|
1876 SWAP 3, 6
|
yading@10
|
1877 psubusb m1, m3 ; p1-p0
|
yading@10
|
1878 psubusb m6, m2 ; p0-p1
|
yading@10
|
1879 por m1, m6 ; abs(p1-p0)
|
yading@10
|
1880 %if notcpuflag(mmxext)
|
yading@10
|
1881 mova m6, m1
|
yading@10
|
1882 psubusb m1, m4
|
yading@10
|
1883 psubusb m6, m_hevthr
|
yading@10
|
1884 pcmpeqb m1, m7 ; abs(p1-p0) <= I
|
yading@10
|
1885 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
|
yading@10
|
1886 pand m0, m1
|
yading@10
|
1887 mova m_maskres, m6
|
yading@10
|
1888 %else ; mmxext/sse2
|
yading@10
|
1889 pmaxub m0, m1 ; max_I
|
yading@10
|
1890 SWAP 1, 4 ; max_hev_thresh
|
yading@10
|
1891 %endif
|
yading@10
|
1892
|
yading@10
|
1893 SWAP 6, 4 ; now m6 is I
|
yading@10
|
1894 %ifidn %1, v
|
yading@10
|
1895 movrow m4, [dst1q] ; q0
|
yading@10
|
1896 %if mmsize == 16 && %2 == 8
|
yading@10
|
1897 movhps m4, [dst8q]
|
yading@10
|
1898 %endif
|
yading@10
|
1899 %elifdef m8
|
yading@10
|
1900 SWAP 4, 8
|
yading@10
|
1901 %else
|
yading@10
|
1902 mova m4, m_q0backup
|
yading@10
|
1903 %endif
|
yading@10
|
1904 mova m1, m4
|
yading@10
|
1905 SWAP 1, 4
|
yading@10
|
1906 mova m7, m5
|
yading@10
|
1907 SWAP 7, 5
|
yading@10
|
1908 psubusb m1, m5 ; q0-q1
|
yading@10
|
1909 psubusb m7, m4 ; q1-q0
|
yading@10
|
1910 por m1, m7 ; abs(q1-q0)
|
yading@10
|
1911 %if notcpuflag(mmxext)
|
yading@10
|
1912 mova m7, m1
|
yading@10
|
1913 psubusb m1, m6
|
yading@10
|
1914 psubusb m7, m_hevthr
|
yading@10
|
1915 pxor m6, m6
|
yading@10
|
1916 pcmpeqb m1, m6 ; abs(q1-q0) <= I
|
yading@10
|
1917 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
|
yading@10
|
1918 mova m6, m_maskres
|
yading@10
|
1919 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
|
yading@10
|
1920 pand m6, m7
|
yading@10
|
1921 %else ; mmxext/sse2
|
yading@10
|
1922 pxor m7, m7
|
yading@10
|
1923 pmaxub m0, m1
|
yading@10
|
1924 pmaxub m6, m1
|
yading@10
|
1925 psubusb m0, m_flimI
|
yading@10
|
1926 psubusb m6, m_hevthr
|
yading@10
|
1927 pcmpeqb m0, m7 ; max(abs(..)) <= I
|
yading@10
|
1928 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
|
yading@10
|
1929 %endif
|
yading@10
|
1930 %ifdef m12
|
yading@10
|
1931 SWAP 6, 12
|
yading@10
|
1932 %else
|
yading@10
|
1933 mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
|
yading@10
|
1934 %endif
|
yading@10
|
1935
|
yading@10
|
1936 ; simple_limit
|
yading@10
|
1937 mova m1, m3
|
yading@10
|
1938 SWAP 1, 3
|
yading@10
|
1939 mova m6, m4 ; keep copies of p0/q0 around for later use
|
yading@10
|
1940 SWAP 6, 4
|
yading@10
|
1941 psubusb m1, m4 ; p0-q0
|
yading@10
|
1942 psubusb m6, m3 ; q0-p0
|
yading@10
|
1943 por m1, m6 ; abs(q0-p0)
|
yading@10
|
1944 paddusb m1, m1 ; m1=2*abs(q0-p0)
|
yading@10
|
1945
|
yading@10
|
1946 mova m7, m2
|
yading@10
|
1947 SWAP 7, 2
|
yading@10
|
1948 mova m6, m5
|
yading@10
|
1949 SWAP 6, 5
|
yading@10
|
1950 psubusb m7, m5 ; p1-q1
|
yading@10
|
1951 psubusb m6, m2 ; q1-p1
|
yading@10
|
1952 por m7, m6 ; abs(q1-p1)
|
yading@10
|
1953 pxor m6, m6
|
yading@10
|
1954 pand m7, [pb_FE]
|
yading@10
|
1955 psrlq m7, 1 ; abs(q1-p1)/2
|
yading@10
|
1956 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
|
yading@10
|
1957 psubusb m7, m_flimE
|
yading@10
|
1958 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
|
yading@10
|
1959 pand m0, m7 ; normal_limit result
|
yading@10
|
1960
|
yading@10
|
1961 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
|
yading@10
|
1962 %ifdef m8 ; x86-64 && sse2
|
yading@10
|
1963 mova m8, [pb_80]
|
yading@10
|
1964 %define m_pb_80 m8
|
yading@10
|
1965 %else ; x86-32 or mmx/mmxext
|
yading@10
|
1966 %define m_pb_80 [pb_80]
|
yading@10
|
1967 %endif
|
yading@10
|
1968 mova m1, m4
|
yading@10
|
1969 mova m7, m3
|
yading@10
|
1970 pxor m1, m_pb_80
|
yading@10
|
1971 pxor m7, m_pb_80
|
yading@10
|
1972 psubsb m1, m7 ; (signed) q0-p0
|
yading@10
|
1973 mova m6, m2
|
yading@10
|
1974 mova m7, m5
|
yading@10
|
1975 pxor m6, m_pb_80
|
yading@10
|
1976 pxor m7, m_pb_80
|
yading@10
|
1977 psubsb m6, m7 ; (signed) p1-q1
|
yading@10
|
1978 mova m7, m_maskres
|
yading@10
|
1979 pandn m7, m6
|
yading@10
|
1980 paddsb m7, m1
|
yading@10
|
1981 paddsb m7, m1
|
yading@10
|
1982 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
|
yading@10
|
1983
|
yading@10
|
1984 pand m7, m0
|
yading@10
|
1985 mova m1, [pb_F8]
|
yading@10
|
1986 mova m6, m7
|
yading@10
|
1987 paddsb m7, [pb_3]
|
yading@10
|
1988 paddsb m6, [pb_4]
|
yading@10
|
1989 pand m7, m1
|
yading@10
|
1990 pand m6, m1
|
yading@10
|
1991
|
yading@10
|
1992 pxor m1, m1
|
yading@10
|
1993 pxor m0, m0
|
yading@10
|
1994 pcmpgtb m1, m7
|
yading@10
|
1995 psubb m0, m7
|
yading@10
|
1996 psrlq m7, 3 ; +f2
|
yading@10
|
1997 psrlq m0, 3 ; -f2
|
yading@10
|
1998 pand m0, m1
|
yading@10
|
1999 pandn m1, m7
|
yading@10
|
2000 psubusb m3, m0
|
yading@10
|
2001 paddusb m3, m1 ; p0+f2
|
yading@10
|
2002
|
yading@10
|
2003 pxor m1, m1
|
yading@10
|
2004 pxor m0, m0
|
yading@10
|
2005 pcmpgtb m0, m6
|
yading@10
|
2006 psubb m1, m6
|
yading@10
|
2007 psrlq m6, 3 ; +f1
|
yading@10
|
2008 psrlq m1, 3 ; -f1
|
yading@10
|
2009 pand m1, m0
|
yading@10
|
2010 pandn m0, m6
|
yading@10
|
2011 psubusb m4, m0
|
yading@10
|
2012 paddusb m4, m1 ; q0-f1
|
yading@10
|
2013
|
yading@10
|
2014 %ifdef m12
|
yading@10
|
2015 SWAP 6, 12
|
yading@10
|
2016 %else
|
yading@10
|
2017 mova m6, m_maskres
|
yading@10
|
2018 %endif
|
yading@10
|
2019 %if notcpuflag(mmxext)
|
yading@10
|
2020 mova m7, [pb_1]
|
yading@10
|
2021 %else ; mmxext/sse2
|
yading@10
|
2022 pxor m7, m7
|
yading@10
|
2023 %endif
|
yading@10
|
2024 pand m0, m6
|
yading@10
|
2025 pand m1, m6
|
yading@10
|
2026 %if notcpuflag(mmxext)
|
yading@10
|
2027 paddusb m0, m7
|
yading@10
|
2028 pand m1, [pb_FE]
|
yading@10
|
2029 pandn m7, m0
|
yading@10
|
2030 psrlq m1, 1
|
yading@10
|
2031 psrlq m7, 1
|
yading@10
|
2032 SWAP 0, 7
|
yading@10
|
2033 %else ; mmxext/sse2
|
yading@10
|
2034 psubusb m1, [pb_1]
|
yading@10
|
2035 pavgb m0, m7 ; a
|
yading@10
|
2036 pavgb m1, m7 ; -a
|
yading@10
|
2037 %endif
|
yading@10
|
2038 psubusb m5, m0
|
yading@10
|
2039 psubusb m2, m1
|
yading@10
|
2040 paddusb m5, m1 ; q1-a
|
yading@10
|
2041 paddusb m2, m0 ; p1+a
|
yading@10
|
2042
|
yading@10
|
2043 ; store
|
yading@10
|
2044 %ifidn %1, v
|
yading@10
|
2045 movrow [dst1q+mstrideq*2], m2
|
yading@10
|
2046 movrow [dst1q+mstrideq ], m3
|
yading@10
|
2047 movrow [dst1q], m4
|
yading@10
|
2048 movrow [dst1q+ strideq ], m5
|
yading@10
|
2049 %if mmsize == 16 && %2 == 8
|
yading@10
|
2050 movhps [dst8q+mstrideq*2], m2
|
yading@10
|
2051 movhps [dst8q+mstrideq ], m3
|
yading@10
|
2052 movhps [dst8q], m4
|
yading@10
|
2053 movhps [dst8q+ strideq ], m5
|
yading@10
|
2054 %endif
|
yading@10
|
2055 %else ; h
|
yading@10
|
2056 add dst1q, 2
|
yading@10
|
2057 add dst2q, 2
|
yading@10
|
2058
|
yading@10
|
2059 ; 4x8/16 transpose
|
yading@10
|
2060 TRANSPOSE4x4B 2, 3, 4, 5, 6
|
yading@10
|
2061
|
yading@10
|
2062 %if mmsize == 8 ; mmx/mmxext (h)
|
yading@10
|
2063 WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
|
yading@10
|
2064 %else ; sse2 (h)
|
yading@10
|
2065 lea dst8q, [dst8q+mstrideq +2]
|
yading@10
|
2066 WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
|
yading@10
|
2067 %endif
|
yading@10
|
2068 %endif
|
yading@10
|
2069
|
yading@10
|
2070 %if mmsize == 8
|
yading@10
|
2071 %if %2 == 8 ; chroma
|
yading@10
|
2072 %ifidn %1, h
|
yading@10
|
2073 sub dst1q, 2
|
yading@10
|
2074 %endif
|
yading@10
|
2075 cmp dst1q, dst8q
|
yading@10
|
2076 mov dst1q, dst8q
|
yading@10
|
2077 jnz .next8px
|
yading@10
|
2078 %else
|
yading@10
|
2079 %ifidn %1, h
|
yading@10
|
2080 lea dst1q, [dst1q+ strideq*8-2]
|
yading@10
|
2081 %else ; v
|
yading@10
|
2082 add dst1q, 8
|
yading@10
|
2083 %endif
|
yading@10
|
2084 dec cntrq
|
yading@10
|
2085 jg .next8px
|
yading@10
|
2086 %endif
|
yading@10
|
2087 REP_RET
|
yading@10
|
2088 %else ; mmsize == 16
|
yading@10
|
2089 RET
|
yading@10
|
2090 %endif
|
yading@10
|
2091 %endmacro
|
yading@10
|
2092
|
yading@10
|
2093 %if ARCH_X86_32
|
yading@10
|
2094 INIT_MMX mmx
|
yading@10
|
2095 INNER_LOOPFILTER v, 16
|
yading@10
|
2096 INNER_LOOPFILTER h, 16
|
yading@10
|
2097 INNER_LOOPFILTER v, 8
|
yading@10
|
2098 INNER_LOOPFILTER h, 8
|
yading@10
|
2099
|
yading@10
|
2100 INIT_MMX mmxext
|
yading@10
|
2101 INNER_LOOPFILTER v, 16
|
yading@10
|
2102 INNER_LOOPFILTER h, 16
|
yading@10
|
2103 INNER_LOOPFILTER v, 8
|
yading@10
|
2104 INNER_LOOPFILTER h, 8
|
yading@10
|
2105 %endif
|
yading@10
|
2106
|
yading@10
|
2107 INIT_XMM sse2
|
yading@10
|
2108 INNER_LOOPFILTER v, 16
|
yading@10
|
2109 INNER_LOOPFILTER h, 16
|
yading@10
|
2110 INNER_LOOPFILTER v, 8
|
yading@10
|
2111 INNER_LOOPFILTER h, 8
|
yading@10
|
2112
|
yading@10
|
2113 INIT_XMM ssse3
|
yading@10
|
2114 INNER_LOOPFILTER v, 16
|
yading@10
|
2115 INNER_LOOPFILTER h, 16
|
yading@10
|
2116 INNER_LOOPFILTER v, 8
|
yading@10
|
2117 INNER_LOOPFILTER h, 8
|
yading@10
|
2118
|
yading@10
|
2119 ;-----------------------------------------------------------------------------
|
yading@10
|
2120 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
|
yading@10
|
2121 ; int flimE, int flimI, int hev_thr);
|
yading@10
|
2122 ;-----------------------------------------------------------------------------
|
yading@10
|
2123
|
yading@10
|
2124 %macro MBEDGE_LOOPFILTER 2
|
yading@10
|
2125 %define stack_size 0
|
yading@10
|
2126 %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
|
yading@10
|
2127 %if mmsize == 16 ; [3]=hev() result
|
yading@10
|
2128 ; [4]=filter tmp result
|
yading@10
|
2129 ; [5]/[6] = p2/q2 backup
|
yading@10
|
2130 ; [7]=lim_res sign result
|
yading@10
|
2131 %define stack_size mmsize * -7
|
yading@10
|
2132 %else ; 8 ; extra storage space for transposes
|
yading@10
|
2133 %define stack_size mmsize * -8
|
yading@10
|
2134 %endif
|
yading@10
|
2135 %endif
|
yading@10
|
2136
|
yading@10
|
2137 %if %2 == 8 ; chroma
|
yading@10
|
2138 cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
|
yading@10
|
2139 %else ; luma
|
yading@10
|
2140 cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
|
yading@10
|
2141 %endif
|
yading@10
|
2142
|
yading@10
|
2143 %if cpuflag(ssse3)
|
yading@10
|
2144 pxor m7, m7
|
yading@10
|
2145 %endif
|
yading@10
|
2146
|
yading@10
|
2147 %ifndef m8
|
yading@10
|
2148 ; splat function arguments
|
yading@10
|
2149 SPLATB_REG m0, flimEq, m7 ; E
|
yading@10
|
2150 SPLATB_REG m1, flimIq, m7 ; I
|
yading@10
|
2151 SPLATB_REG m2, hevthrq, m7 ; hev_thresh
|
yading@10
|
2152
|
yading@10
|
2153 %define m_flimE [rsp]
|
yading@10
|
2154 %define m_flimI [rsp+mmsize]
|
yading@10
|
2155 %define m_hevthr [rsp+mmsize*2]
|
yading@10
|
2156 %define m_maskres [rsp+mmsize*3]
|
yading@10
|
2157 %define m_limres [rsp+mmsize*4]
|
yading@10
|
2158 %define m_p0backup [rsp+mmsize*3]
|
yading@10
|
2159 %define m_q0backup [rsp+mmsize*4]
|
yading@10
|
2160 %define m_p2backup [rsp+mmsize*5]
|
yading@10
|
2161 %define m_q2backup [rsp+mmsize*6]
|
yading@10
|
2162 %if mmsize == 16
|
yading@10
|
2163 %define m_limsign [rsp]
|
yading@10
|
2164 %else
|
yading@10
|
2165 %define m_limsign [rsp+mmsize*7]
|
yading@10
|
2166 %endif
|
yading@10
|
2167
|
yading@10
|
2168 mova m_flimE, m0
|
yading@10
|
2169 mova m_flimI, m1
|
yading@10
|
2170 mova m_hevthr, m2
|
yading@10
|
2171 %else ; sse2 on x86-64
|
yading@10
|
2172 %define m_flimE m9
|
yading@10
|
2173 %define m_flimI m10
|
yading@10
|
2174 %define m_hevthr m11
|
yading@10
|
2175 %define m_maskres m12
|
yading@10
|
2176 %define m_limres m8
|
yading@10
|
2177 %define m_p0backup m12
|
yading@10
|
2178 %define m_q0backup m8
|
yading@10
|
2179 %define m_p2backup m13
|
yading@10
|
2180 %define m_q2backup m14
|
yading@10
|
2181 %define m_limsign m9
|
yading@10
|
2182
|
yading@10
|
2183 ; splat function arguments
|
yading@10
|
2184 SPLATB_REG m_flimE, flimEq, m7 ; E
|
yading@10
|
2185 SPLATB_REG m_flimI, flimIq, m7 ; I
|
yading@10
|
2186 SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
|
yading@10
|
2187 %endif
|
yading@10
|
2188
|
yading@10
|
2189 %if %2 == 8 ; chroma
|
yading@10
|
2190 DEFINE_ARGS dst1, dst8, mstride, stride, dst2
|
yading@10
|
2191 %elif mmsize == 8
|
yading@10
|
2192 DEFINE_ARGS dst1, mstride, stride, dst2, cntr
|
yading@10
|
2193 mov cntrq, 2
|
yading@10
|
2194 %else
|
yading@10
|
2195 DEFINE_ARGS dst1, mstride, stride, dst2, dst8
|
yading@10
|
2196 %endif
|
yading@10
|
2197 mov strideq, mstrideq
|
yading@10
|
2198 neg mstrideq
|
yading@10
|
2199 %ifidn %1, h
|
yading@10
|
2200 lea dst1q, [dst1q+strideq*4-4]
|
yading@10
|
2201 %if %2 == 8 ; chroma
|
yading@10
|
2202 lea dst8q, [dst8q+strideq*4-4]
|
yading@10
|
2203 %endif
|
yading@10
|
2204 %endif
|
yading@10
|
2205
|
yading@10
|
2206 %if mmsize == 8
|
yading@10
|
2207 .next8px:
|
yading@10
|
2208 %endif
|
yading@10
|
2209 ; read
|
yading@10
|
2210 lea dst2q, [dst1q+ strideq ]
|
yading@10
|
2211 %ifidn %1, v
|
yading@10
|
2212 %if %2 == 8 && mmsize == 16
|
yading@10
|
2213 %define movrow movh
|
yading@10
|
2214 %else
|
yading@10
|
2215 %define movrow mova
|
yading@10
|
2216 %endif
|
yading@10
|
2217 movrow m0, [dst1q+mstrideq*4] ; p3
|
yading@10
|
2218 movrow m1, [dst2q+mstrideq*4] ; p2
|
yading@10
|
2219 movrow m2, [dst1q+mstrideq*2] ; p1
|
yading@10
|
2220 movrow m5, [dst2q] ; q1
|
yading@10
|
2221 movrow m6, [dst2q+ strideq ] ; q2
|
yading@10
|
2222 movrow m7, [dst2q+ strideq*2] ; q3
|
yading@10
|
2223 %if mmsize == 16 && %2 == 8
|
yading@10
|
2224 movhps m0, [dst8q+mstrideq*4]
|
yading@10
|
2225 movhps m2, [dst8q+mstrideq*2]
|
yading@10
|
2226 add dst8q, strideq
|
yading@10
|
2227 movhps m1, [dst8q+mstrideq*4]
|
yading@10
|
2228 movhps m5, [dst8q]
|
yading@10
|
2229 movhps m6, [dst8q+ strideq ]
|
yading@10
|
2230 movhps m7, [dst8q+ strideq*2]
|
yading@10
|
2231 add dst8q, mstrideq
|
yading@10
|
2232 %endif
|
yading@10
|
2233 %elif mmsize == 8 ; mmx/mmxext (h)
|
yading@10
|
2234 ; read 8 rows of 8px each
|
yading@10
|
2235 movu m0, [dst1q+mstrideq*4]
|
yading@10
|
2236 movu m1, [dst2q+mstrideq*4]
|
yading@10
|
2237 movu m2, [dst1q+mstrideq*2]
|
yading@10
|
2238 movu m3, [dst1q+mstrideq ]
|
yading@10
|
2239 movu m4, [dst1q]
|
yading@10
|
2240 movu m5, [dst2q]
|
yading@10
|
2241 movu m6, [dst2q+ strideq ]
|
yading@10
|
2242
|
yading@10
|
2243 ; 8x8 transpose
|
yading@10
|
2244 TRANSPOSE4x4B 0, 1, 2, 3, 7
|
yading@10
|
2245 mova m_q0backup, m1
|
yading@10
|
2246 movu m7, [dst2q+ strideq*2]
|
yading@10
|
2247 TRANSPOSE4x4B 4, 5, 6, 7, 1
|
yading@10
|
2248 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
|
yading@10
|
2249 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
|
yading@10
|
2250 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
|
yading@10
|
2251 mova m1, m_q0backup
|
yading@10
|
2252 mova m_q0backup, m2 ; store q0
|
yading@10
|
2253 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
|
yading@10
|
2254 mova m_p0backup, m5 ; store p0
|
yading@10
|
2255 SWAP 1, 4
|
yading@10
|
2256 SWAP 2, 4
|
yading@10
|
2257 SWAP 6, 3
|
yading@10
|
2258 SWAP 5, 3
|
yading@10
|
2259 %else ; sse2 (h)
|
yading@10
|
2260 %if %2 == 16
|
yading@10
|
2261 lea dst8q, [dst1q+ strideq*8 ]
|
yading@10
|
2262 %endif
|
yading@10
|
2263
|
yading@10
|
2264 ; read 16 rows of 8px each, interleave
|
yading@10
|
2265 movh m0, [dst1q+mstrideq*4]
|
yading@10
|
2266 movh m1, [dst8q+mstrideq*4]
|
yading@10
|
2267 movh m2, [dst1q+mstrideq*2]
|
yading@10
|
2268 movh m5, [dst8q+mstrideq*2]
|
yading@10
|
2269 movh m3, [dst1q+mstrideq ]
|
yading@10
|
2270 movh m6, [dst8q+mstrideq ]
|
yading@10
|
2271 movh m4, [dst1q]
|
yading@10
|
2272 movh m7, [dst8q]
|
yading@10
|
2273 punpcklbw m0, m1 ; A/I
|
yading@10
|
2274 punpcklbw m2, m5 ; C/K
|
yading@10
|
2275 punpcklbw m3, m6 ; D/L
|
yading@10
|
2276 punpcklbw m4, m7 ; E/M
|
yading@10
|
2277
|
yading@10
|
2278 add dst8q, strideq
|
yading@10
|
2279 movh m1, [dst2q+mstrideq*4]
|
yading@10
|
2280 movh m6, [dst8q+mstrideq*4]
|
yading@10
|
2281 movh m5, [dst2q]
|
yading@10
|
2282 movh m7, [dst8q]
|
yading@10
|
2283 punpcklbw m1, m6 ; B/J
|
yading@10
|
2284 punpcklbw m5, m7 ; F/N
|
yading@10
|
2285 movh m6, [dst2q+ strideq ]
|
yading@10
|
2286 movh m7, [dst8q+ strideq ]
|
yading@10
|
2287 punpcklbw m6, m7 ; G/O
|
yading@10
|
2288
|
yading@10
|
2289 ; 8x16 transpose
|
yading@10
|
2290 TRANSPOSE4x4B 0, 1, 2, 3, 7
|
yading@10
|
2291 %ifdef m8
|
yading@10
|
2292 SWAP 1, 8
|
yading@10
|
2293 %else
|
yading@10
|
2294 mova m_q0backup, m1
|
yading@10
|
2295 %endif
|
yading@10
|
2296 movh m7, [dst2q+ strideq*2]
|
yading@10
|
2297 movh m1, [dst8q+ strideq*2]
|
yading@10
|
2298 punpcklbw m7, m1 ; H/P
|
yading@10
|
2299 TRANSPOSE4x4B 4, 5, 6, 7, 1
|
yading@10
|
2300 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
|
yading@10
|
2301 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
|
yading@10
|
2302 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
|
yading@10
|
2303 %ifdef m8
|
yading@10
|
2304 SWAP 1, 8
|
yading@10
|
2305 SWAP 2, 8
|
yading@10
|
2306 %else
|
yading@10
|
2307 mova m1, m_q0backup
|
yading@10
|
2308 mova m_q0backup, m2 ; store q0
|
yading@10
|
2309 %endif
|
yading@10
|
2310 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
|
yading@10
|
2311 %ifdef m12
|
yading@10
|
2312 SWAP 5, 12
|
yading@10
|
2313 %else
|
yading@10
|
2314 mova m_p0backup, m5 ; store p0
|
yading@10
|
2315 %endif
|
yading@10
|
2316 SWAP 1, 4
|
yading@10
|
2317 SWAP 2, 4
|
yading@10
|
2318 SWAP 6, 3
|
yading@10
|
2319 SWAP 5, 3
|
yading@10
|
2320 %endif
|
yading@10
|
2321
|
yading@10
|
2322 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
|
yading@10
|
2323 mova m4, m1
|
yading@10
|
2324 SWAP 4, 1
|
yading@10
|
2325 psubusb m4, m0 ; p2-p3
|
yading@10
|
2326 psubusb m0, m1 ; p3-p2
|
yading@10
|
2327 por m0, m4 ; abs(p3-p2)
|
yading@10
|
2328
|
yading@10
|
2329 mova m4, m2
|
yading@10
|
2330 SWAP 4, 2
|
yading@10
|
2331 psubusb m4, m1 ; p1-p2
|
yading@10
|
2332 mova m_p2backup, m1
|
yading@10
|
2333 psubusb m1, m2 ; p2-p1
|
yading@10
|
2334 por m1, m4 ; abs(p2-p1)
|
yading@10
|
2335
|
yading@10
|
2336 mova m4, m6
|
yading@10
|
2337 SWAP 4, 6
|
yading@10
|
2338 psubusb m4, m7 ; q2-q3
|
yading@10
|
2339 psubusb m7, m6 ; q3-q2
|
yading@10
|
2340 por m7, m4 ; abs(q3-q2)
|
yading@10
|
2341
|
yading@10
|
2342 mova m4, m5
|
yading@10
|
2343 SWAP 4, 5
|
yading@10
|
2344 psubusb m4, m6 ; q1-q2
|
yading@10
|
2345 mova m_q2backup, m6
|
yading@10
|
2346 psubusb m6, m5 ; q2-q1
|
yading@10
|
2347 por m6, m4 ; abs(q2-q1)
|
yading@10
|
2348
|
yading@10
|
2349 %if notcpuflag(mmxext)
|
yading@10
|
2350 mova m4, m_flimI
|
yading@10
|
2351 pxor m3, m3
|
yading@10
|
2352 psubusb m0, m4
|
yading@10
|
2353 psubusb m1, m4
|
yading@10
|
2354 psubusb m7, m4
|
yading@10
|
2355 psubusb m6, m4
|
yading@10
|
2356 pcmpeqb m0, m3 ; abs(p3-p2) <= I
|
yading@10
|
2357 pcmpeqb m1, m3 ; abs(p2-p1) <= I
|
yading@10
|
2358 pcmpeqb m7, m3 ; abs(q3-q2) <= I
|
yading@10
|
2359 pcmpeqb m6, m3 ; abs(q2-q1) <= I
|
yading@10
|
2360 pand m0, m1
|
yading@10
|
2361 pand m7, m6
|
yading@10
|
2362 pand m0, m7
|
yading@10
|
2363 %else ; mmxext/sse2
|
yading@10
|
2364 pmaxub m0, m1
|
yading@10
|
2365 pmaxub m6, m7
|
yading@10
|
2366 pmaxub m0, m6
|
yading@10
|
2367 %endif
|
yading@10
|
2368
|
yading@10
|
2369 ; normal_limit and high_edge_variance for p1-p0, q1-q0
|
yading@10
|
2370 SWAP 7, 3 ; now m7 is zero
|
yading@10
|
2371 %ifidn %1, v
|
yading@10
|
2372 movrow m3, [dst1q+mstrideq ] ; p0
|
yading@10
|
2373 %if mmsize == 16 && %2 == 8
|
yading@10
|
2374 movhps m3, [dst8q+mstrideq ]
|
yading@10
|
2375 %endif
|
yading@10
|
2376 %elifdef m12
|
yading@10
|
2377 SWAP 3, 12
|
yading@10
|
2378 %else
|
yading@10
|
2379 mova m3, m_p0backup
|
yading@10
|
2380 %endif
|
yading@10
|
2381
|
yading@10
|
2382 mova m1, m2
|
yading@10
|
2383 SWAP 1, 2
|
yading@10
|
2384 mova m6, m3
|
yading@10
|
2385 SWAP 3, 6
|
yading@10
|
2386 psubusb m1, m3 ; p1-p0
|
yading@10
|
2387 psubusb m6, m2 ; p0-p1
|
yading@10
|
2388 por m1, m6 ; abs(p1-p0)
|
yading@10
|
2389 %if notcpuflag(mmxext)
|
yading@10
|
2390 mova m6, m1
|
yading@10
|
2391 psubusb m1, m4
|
yading@10
|
2392 psubusb m6, m_hevthr
|
yading@10
|
2393 pcmpeqb m1, m7 ; abs(p1-p0) <= I
|
yading@10
|
2394 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
|
yading@10
|
2395 pand m0, m1
|
yading@10
|
2396 mova m_maskres, m6
|
yading@10
|
2397 %else ; mmxext/sse2
|
yading@10
|
2398 pmaxub m0, m1 ; max_I
|
yading@10
|
2399 SWAP 1, 4 ; max_hev_thresh
|
yading@10
|
2400 %endif
|
yading@10
|
2401
|
yading@10
|
2402 SWAP 6, 4 ; now m6 is I
|
yading@10
|
2403 %ifidn %1, v
|
yading@10
|
2404 movrow m4, [dst1q] ; q0
|
yading@10
|
2405 %if mmsize == 16 && %2 == 8
|
yading@10
|
2406 movhps m4, [dst8q]
|
yading@10
|
2407 %endif
|
yading@10
|
2408 %elifdef m8
|
yading@10
|
2409 SWAP 4, 8
|
yading@10
|
2410 %else
|
yading@10
|
2411 mova m4, m_q0backup
|
yading@10
|
2412 %endif
|
yading@10
|
2413 mova m1, m4
|
yading@10
|
2414 SWAP 1, 4
|
yading@10
|
2415 mova m7, m5
|
yading@10
|
2416 SWAP 7, 5
|
yading@10
|
2417 psubusb m1, m5 ; q0-q1
|
yading@10
|
2418 psubusb m7, m4 ; q1-q0
|
yading@10
|
2419 por m1, m7 ; abs(q1-q0)
|
yading@10
|
2420 %if notcpuflag(mmxext)
|
yading@10
|
2421 mova m7, m1
|
yading@10
|
2422 psubusb m1, m6
|
yading@10
|
2423 psubusb m7, m_hevthr
|
yading@10
|
2424 pxor m6, m6
|
yading@10
|
2425 pcmpeqb m1, m6 ; abs(q1-q0) <= I
|
yading@10
|
2426 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
|
yading@10
|
2427 mova m6, m_maskres
|
yading@10
|
2428 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
|
yading@10
|
2429 pand m6, m7
|
yading@10
|
2430 %else ; mmxext/sse2
|
yading@10
|
2431 pxor m7, m7
|
yading@10
|
2432 pmaxub m0, m1
|
yading@10
|
2433 pmaxub m6, m1
|
yading@10
|
2434 psubusb m0, m_flimI
|
yading@10
|
2435 psubusb m6, m_hevthr
|
yading@10
|
2436 pcmpeqb m0, m7 ; max(abs(..)) <= I
|
yading@10
|
2437 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
|
yading@10
|
2438 %endif
|
yading@10
|
2439 %ifdef m12
|
yading@10
|
2440 SWAP 6, 12
|
yading@10
|
2441 %else
|
yading@10
|
2442 mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
|
yading@10
|
2443 %endif
|
yading@10
|
2444
|
yading@10
|
2445 ; simple_limit
|
yading@10
|
2446 mova m1, m3
|
yading@10
|
2447 SWAP 1, 3
|
yading@10
|
2448 mova m6, m4 ; keep copies of p0/q0 around for later use
|
yading@10
|
2449 SWAP 6, 4
|
yading@10
|
2450 psubusb m1, m4 ; p0-q0
|
yading@10
|
2451 psubusb m6, m3 ; q0-p0
|
yading@10
|
2452 por m1, m6 ; abs(q0-p0)
|
yading@10
|
2453 paddusb m1, m1 ; m1=2*abs(q0-p0)
|
yading@10
|
2454
|
yading@10
|
2455 mova m7, m2
|
yading@10
|
2456 SWAP 7, 2
|
yading@10
|
2457 mova m6, m5
|
yading@10
|
2458 SWAP 6, 5
|
yading@10
|
2459 psubusb m7, m5 ; p1-q1
|
yading@10
|
2460 psubusb m6, m2 ; q1-p1
|
yading@10
|
2461 por m7, m6 ; abs(q1-p1)
|
yading@10
|
2462 pxor m6, m6
|
yading@10
|
2463 pand m7, [pb_FE]
|
yading@10
|
2464 psrlq m7, 1 ; abs(q1-p1)/2
|
yading@10
|
2465 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
|
yading@10
|
2466 psubusb m7, m_flimE
|
yading@10
|
2467 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
|
yading@10
|
2468 pand m0, m7 ; normal_limit result
|
yading@10
|
2469
|
yading@10
|
2470 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
|
yading@10
|
2471 %ifdef m8 ; x86-64 && sse2
|
yading@10
|
2472 mova m8, [pb_80]
|
yading@10
|
2473 %define m_pb_80 m8
|
yading@10
|
2474 %else ; x86-32 or mmx/mmxext
|
yading@10
|
2475 %define m_pb_80 [pb_80]
|
yading@10
|
2476 %endif
|
yading@10
|
2477 mova m1, m4
|
yading@10
|
2478 mova m7, m3
|
yading@10
|
2479 pxor m1, m_pb_80
|
yading@10
|
2480 pxor m7, m_pb_80
|
yading@10
|
2481 psubsb m1, m7 ; (signed) q0-p0
|
yading@10
|
2482 mova m6, m2
|
yading@10
|
2483 mova m7, m5
|
yading@10
|
2484 pxor m6, m_pb_80
|
yading@10
|
2485 pxor m7, m_pb_80
|
yading@10
|
2486 psubsb m6, m7 ; (signed) p1-q1
|
yading@10
|
2487 mova m7, m_maskres
|
yading@10
|
2488 paddsb m6, m1
|
yading@10
|
2489 paddsb m6, m1
|
yading@10
|
2490 paddsb m6, m1
|
yading@10
|
2491 pand m6, m0
|
yading@10
|
2492 %ifdef m8
|
yading@10
|
2493 mova m_limres, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge
|
yading@10
|
2494 pand m_limres, m7
|
yading@10
|
2495 %else
|
yading@10
|
2496 mova m0, m6
|
yading@10
|
2497 pand m0, m7
|
yading@10
|
2498 mova m_limres, m0
|
yading@10
|
2499 %endif
|
yading@10
|
2500 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common
|
yading@10
|
2501
|
yading@10
|
2502 mova m1, [pb_F8]
|
yading@10
|
2503 mova m6, m7
|
yading@10
|
2504 paddsb m7, [pb_3]
|
yading@10
|
2505 paddsb m6, [pb_4]
|
yading@10
|
2506 pand m7, m1
|
yading@10
|
2507 pand m6, m1
|
yading@10
|
2508
|
yading@10
|
2509 pxor m1, m1
|
yading@10
|
2510 pxor m0, m0
|
yading@10
|
2511 pcmpgtb m1, m7
|
yading@10
|
2512 psubb m0, m7
|
yading@10
|
2513 psrlq m7, 3 ; +f2
|
yading@10
|
2514 psrlq m0, 3 ; -f2
|
yading@10
|
2515 pand m0, m1
|
yading@10
|
2516 pandn m1, m7
|
yading@10
|
2517 psubusb m3, m0
|
yading@10
|
2518 paddusb m3, m1 ; p0+f2
|
yading@10
|
2519
|
yading@10
|
2520 pxor m1, m1
|
yading@10
|
2521 pxor m0, m0
|
yading@10
|
2522 pcmpgtb m0, m6
|
yading@10
|
2523 psubb m1, m6
|
yading@10
|
2524 psrlq m6, 3 ; +f1
|
yading@10
|
2525 psrlq m1, 3 ; -f1
|
yading@10
|
2526 pand m1, m0
|
yading@10
|
2527 pandn m0, m6
|
yading@10
|
2528 psubusb m4, m0
|
yading@10
|
2529 paddusb m4, m1 ; q0-f1
|
yading@10
|
2530
|
yading@10
|
2531 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
|
yading@10
|
2532 %if cpuflag(ssse3)
|
yading@10
|
2533 mova m7, [pb_1]
|
yading@10
|
2534 %else
|
yading@10
|
2535 mova m7, [pw_63]
|
yading@10
|
2536 %endif
|
yading@10
|
2537 %ifdef m8
|
yading@10
|
2538 SWAP 1, 8
|
yading@10
|
2539 %else
|
yading@10
|
2540 mova m1, m_limres
|
yading@10
|
2541 %endif
|
yading@10
|
2542 pxor m0, m0
|
yading@10
|
2543 mova m6, m1
|
yading@10
|
2544 pcmpgtb m0, m1 ; which are negative
|
yading@10
|
2545 %if cpuflag(ssse3)
|
yading@10
|
2546 punpcklbw m6, m7 ; interleave with "1" for rounding
|
yading@10
|
2547 punpckhbw m1, m7
|
yading@10
|
2548 %else
|
yading@10
|
2549 punpcklbw m6, m0 ; signed byte->word
|
yading@10
|
2550 punpckhbw m1, m0
|
yading@10
|
2551 %endif
|
yading@10
|
2552 mova m_limsign, m0
|
yading@10
|
2553 %if cpuflag(ssse3)
|
yading@10
|
2554 mova m7, [pb_27_63]
|
yading@10
|
2555 %ifndef m8
|
yading@10
|
2556 mova m_limres, m1
|
yading@10
|
2557 %endif
|
yading@10
|
2558 %ifdef m10
|
yading@10
|
2559 SWAP 0, 10 ; don't lose lim_sign copy
|
yading@10
|
2560 %endif
|
yading@10
|
2561 mova m0, m7
|
yading@10
|
2562 pmaddubsw m7, m6
|
yading@10
|
2563 SWAP 6, 7
|
yading@10
|
2564 pmaddubsw m0, m1
|
yading@10
|
2565 SWAP 1, 0
|
yading@10
|
2566 %ifdef m10
|
yading@10
|
2567 SWAP 0, 10
|
yading@10
|
2568 %else
|
yading@10
|
2569 mova m0, m_limsign
|
yading@10
|
2570 %endif
|
yading@10
|
2571 %else
|
yading@10
|
2572 mova m_maskres, m6 ; backup for later in filter
|
yading@10
|
2573 mova m_limres, m1
|
yading@10
|
2574 pmullw m6, [pw_27]
|
yading@10
|
2575 pmullw m1, [pw_27]
|
yading@10
|
2576 paddw m6, m7
|
yading@10
|
2577 paddw m1, m7
|
yading@10
|
2578 %endif
|
yading@10
|
2579 psraw m6, 7
|
yading@10
|
2580 psraw m1, 7
|
yading@10
|
2581 packsswb m6, m1 ; a0
|
yading@10
|
2582 pxor m1, m1
|
yading@10
|
2583 psubb m1, m6
|
yading@10
|
2584 pand m1, m0 ; -a0
|
yading@10
|
2585 pandn m0, m6 ; +a0
|
yading@10
|
2586 %if cpuflag(ssse3)
|
yading@10
|
2587 mova m6, [pb_18_63] ; pipelining
|
yading@10
|
2588 %endif
|
yading@10
|
2589 psubusb m3, m1
|
yading@10
|
2590 paddusb m4, m1
|
yading@10
|
2591 paddusb m3, m0 ; p0+a0
|
yading@10
|
2592 psubusb m4, m0 ; q0-a0
|
yading@10
|
2593
|
yading@10
|
2594 %if cpuflag(ssse3)
|
yading@10
|
2595 SWAP 6, 7
|
yading@10
|
2596 %ifdef m10
|
yading@10
|
2597 SWAP 1, 10
|
yading@10
|
2598 %else
|
yading@10
|
2599 mova m1, m_limres
|
yading@10
|
2600 %endif
|
yading@10
|
2601 mova m0, m7
|
yading@10
|
2602 pmaddubsw m7, m6
|
yading@10
|
2603 SWAP 6, 7
|
yading@10
|
2604 pmaddubsw m0, m1
|
yading@10
|
2605 SWAP 1, 0
|
yading@10
|
2606 %ifdef m10
|
yading@10
|
2607 SWAP 0, 10
|
yading@10
|
2608 %endif
|
yading@10
|
2609 mova m0, m_limsign
|
yading@10
|
2610 %else
|
yading@10
|
2611 mova m6, m_maskres
|
yading@10
|
2612 mova m1, m_limres
|
yading@10
|
2613 pmullw m6, [pw_18]
|
yading@10
|
2614 pmullw m1, [pw_18]
|
yading@10
|
2615 paddw m6, m7
|
yading@10
|
2616 paddw m1, m7
|
yading@10
|
2617 %endif
|
yading@10
|
2618 mova m0, m_limsign
|
yading@10
|
2619 psraw m6, 7
|
yading@10
|
2620 psraw m1, 7
|
yading@10
|
2621 packsswb m6, m1 ; a1
|
yading@10
|
2622 pxor m1, m1
|
yading@10
|
2623 psubb m1, m6
|
yading@10
|
2624 pand m1, m0 ; -a1
|
yading@10
|
2625 pandn m0, m6 ; +a1
|
yading@10
|
2626 %if cpuflag(ssse3)
|
yading@10
|
2627 mova m6, [pb_9_63]
|
yading@10
|
2628 %endif
|
yading@10
|
2629 psubusb m2, m1
|
yading@10
|
2630 paddusb m5, m1
|
yading@10
|
2631 paddusb m2, m0 ; p1+a1
|
yading@10
|
2632 psubusb m5, m0 ; q1-a1
|
yading@10
|
2633
|
yading@10
|
2634 %if cpuflag(ssse3)
|
yading@10
|
2635 SWAP 6, 7
|
yading@10
|
2636 %ifdef m10
|
yading@10
|
2637 SWAP 1, 10
|
yading@10
|
2638 %else
|
yading@10
|
2639 mova m1, m_limres
|
yading@10
|
2640 %endif
|
yading@10
|
2641 mova m0, m7
|
yading@10
|
2642 pmaddubsw m7, m6
|
yading@10
|
2643 SWAP 6, 7
|
yading@10
|
2644 pmaddubsw m0, m1
|
yading@10
|
2645 SWAP 1, 0
|
yading@10
|
2646 %else
|
yading@10
|
2647 %ifdef m8
|
yading@10
|
2648 SWAP 6, 12
|
yading@10
|
2649 SWAP 1, 8
|
yading@10
|
2650 %else
|
yading@10
|
2651 mova m6, m_maskres
|
yading@10
|
2652 mova m1, m_limres
|
yading@10
|
2653 %endif
|
yading@10
|
2654 pmullw m6, [pw_9]
|
yading@10
|
2655 pmullw m1, [pw_9]
|
yading@10
|
2656 paddw m6, m7
|
yading@10
|
2657 paddw m1, m7
|
yading@10
|
2658 %endif
|
yading@10
|
2659 %ifdef m9
|
yading@10
|
2660 SWAP 7, 9
|
yading@10
|
2661 %else
|
yading@10
|
2662 mova m7, m_limsign
|
yading@10
|
2663 %endif
|
yading@10
|
2664 psraw m6, 7
|
yading@10
|
2665 psraw m1, 7
|
yading@10
|
2666 packsswb m6, m1 ; a1
|
yading@10
|
2667 pxor m0, m0
|
yading@10
|
2668 psubb m0, m6
|
yading@10
|
2669 pand m0, m7 ; -a1
|
yading@10
|
2670 pandn m7, m6 ; +a1
|
yading@10
|
2671 %ifdef m8
|
yading@10
|
2672 SWAP 1, 13
|
yading@10
|
2673 SWAP 6, 14
|
yading@10
|
2674 %else
|
yading@10
|
2675 mova m1, m_p2backup
|
yading@10
|
2676 mova m6, m_q2backup
|
yading@10
|
2677 %endif
|
yading@10
|
2678 psubusb m1, m0
|
yading@10
|
2679 paddusb m6, m0
|
yading@10
|
2680 paddusb m1, m7 ; p1+a1
|
yading@10
|
2681 psubusb m6, m7 ; q1-a1
|
yading@10
|
2682
|
yading@10
|
2683 ; store
|
yading@10
|
2684 %ifidn %1, v
|
yading@10
|
2685 movrow [dst2q+mstrideq*4], m1
|
yading@10
|
2686 movrow [dst1q+mstrideq*2], m2
|
yading@10
|
2687 movrow [dst1q+mstrideq ], m3
|
yading@10
|
2688 movrow [dst1q], m4
|
yading@10
|
2689 movrow [dst2q], m5
|
yading@10
|
2690 movrow [dst2q+ strideq ], m6
|
yading@10
|
2691 %if mmsize == 16 && %2 == 8
|
yading@10
|
2692 add dst8q, mstrideq
|
yading@10
|
2693 movhps [dst8q+mstrideq*2], m1
|
yading@10
|
2694 movhps [dst8q+mstrideq ], m2
|
yading@10
|
2695 movhps [dst8q], m3
|
yading@10
|
2696 add dst8q, strideq
|
yading@10
|
2697 movhps [dst8q], m4
|
yading@10
|
2698 movhps [dst8q+ strideq ], m5
|
yading@10
|
2699 movhps [dst8q+ strideq*2], m6
|
yading@10
|
2700 %endif
|
yading@10
|
2701 %else ; h
|
yading@10
|
2702 inc dst1q
|
yading@10
|
2703 inc dst2q
|
yading@10
|
2704
|
yading@10
|
2705 ; 4x8/16 transpose
|
yading@10
|
2706 TRANSPOSE4x4B 1, 2, 3, 4, 0
|
yading@10
|
2707 SBUTTERFLY bw, 5, 6, 0
|
yading@10
|
2708
|
yading@10
|
2709 %if mmsize == 8 ; mmx/mmxext (h)
|
yading@10
|
2710 WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
|
yading@10
|
2711 add dst1q, 4
|
yading@10
|
2712 WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq
|
yading@10
|
2713 %else ; sse2 (h)
|
yading@10
|
2714 lea dst8q, [dst8q+mstrideq+1]
|
yading@10
|
2715 WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
|
yading@10
|
2716 lea dst1q, [dst2q+mstrideq+4]
|
yading@10
|
2717 lea dst8q, [dst8q+mstrideq+4]
|
yading@10
|
2718 %if cpuflag(sse4)
|
yading@10
|
2719 add dst2q, 4
|
yading@10
|
2720 %endif
|
yading@10
|
2721 WRITE_8W m5, dst2q, dst1q, mstrideq, strideq
|
yading@10
|
2722 %if cpuflag(sse4)
|
yading@10
|
2723 lea dst2q, [dst8q+ strideq ]
|
yading@10
|
2724 %endif
|
yading@10
|
2725 WRITE_8W m6, dst2q, dst8q, mstrideq, strideq
|
yading@10
|
2726 %endif
|
yading@10
|
2727 %endif
|
yading@10
|
2728
|
yading@10
|
2729 %if mmsize == 8
|
yading@10
|
2730 %if %2 == 8 ; chroma
|
yading@10
|
2731 %ifidn %1, h
|
yading@10
|
2732 sub dst1q, 5
|
yading@10
|
2733 %endif
|
yading@10
|
2734 cmp dst1q, dst8q
|
yading@10
|
2735 mov dst1q, dst8q
|
yading@10
|
2736 jnz .next8px
|
yading@10
|
2737 %else
|
yading@10
|
2738 %ifidn %1, h
|
yading@10
|
2739 lea dst1q, [dst1q+ strideq*8-5]
|
yading@10
|
2740 %else ; v
|
yading@10
|
2741 add dst1q, 8
|
yading@10
|
2742 %endif
|
yading@10
|
2743 dec cntrq
|
yading@10
|
2744 jg .next8px
|
yading@10
|
2745 %endif
|
yading@10
|
2746 REP_RET
|
yading@10
|
2747 %else ; mmsize == 16
|
yading@10
|
2748 RET
|
yading@10
|
2749 %endif
|
yading@10
|
2750 %endmacro
|
yading@10
|
2751
|
yading@10
|
2752 %if ARCH_X86_32
|
yading@10
|
2753 INIT_MMX mmx
|
yading@10
|
2754 MBEDGE_LOOPFILTER v, 16
|
yading@10
|
2755 MBEDGE_LOOPFILTER h, 16
|
yading@10
|
2756 MBEDGE_LOOPFILTER v, 8
|
yading@10
|
2757 MBEDGE_LOOPFILTER h, 8
|
yading@10
|
2758
|
yading@10
|
2759 INIT_MMX mmxext
|
yading@10
|
2760 MBEDGE_LOOPFILTER v, 16
|
yading@10
|
2761 MBEDGE_LOOPFILTER h, 16
|
yading@10
|
2762 MBEDGE_LOOPFILTER v, 8
|
yading@10
|
2763 MBEDGE_LOOPFILTER h, 8
|
yading@10
|
2764 %endif
|
yading@10
|
2765
|
yading@10
|
2766 INIT_XMM sse2
|
yading@10
|
2767 MBEDGE_LOOPFILTER v, 16
|
yading@10
|
2768 MBEDGE_LOOPFILTER h, 16
|
yading@10
|
2769 MBEDGE_LOOPFILTER v, 8
|
yading@10
|
2770 MBEDGE_LOOPFILTER h, 8
|
yading@10
|
2771
|
yading@10
|
2772 INIT_XMM ssse3
|
yading@10
|
2773 MBEDGE_LOOPFILTER v, 16
|
yading@10
|
2774 MBEDGE_LOOPFILTER h, 16
|
yading@10
|
2775 MBEDGE_LOOPFILTER v, 8
|
yading@10
|
2776 MBEDGE_LOOPFILTER h, 8
|
yading@10
|
2777
|
yading@10
|
2778 INIT_XMM sse4
|
yading@10
|
2779 MBEDGE_LOOPFILTER h, 16
|
yading@10
|
2780 MBEDGE_LOOPFILTER h, 8
|