yading@11
|
1 ;******************************************************************************
|
yading@11
|
2 ;* x86-optimized vertical line scaling functions
|
yading@11
|
3 ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
|
yading@11
|
4 ;* Kieran Kunhya <kieran@kunhya.com>
|
yading@11
|
5 ;*
|
yading@11
|
6 ;* This file is part of Libav.
|
yading@11
|
7 ;*
|
yading@11
|
8 ;* Libav is free software; you can redistribute it and/or
|
yading@11
|
9 ;* modify it under the terms of the GNU Lesser General Public
|
yading@11
|
10 ;* License as published by the Free Software Foundation; either
|
yading@11
|
11 ;* version 2.1 of the License, or (at your option) any later version.
|
yading@11
|
12 ;*
|
yading@11
|
13 ;* Libav is distributed in the hope that it will be useful,
|
yading@11
|
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
yading@11
|
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
yading@11
|
16 ;* Lesser General Public License for more details.
|
yading@11
|
17 ;*
|
yading@11
|
18 ;* You should have received a copy of the GNU Lesser General Public
|
yading@11
|
19 ;* License along with Libav; if not, write to the Free Software
|
yading@11
|
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
yading@11
|
21 ;******************************************************************************
|
yading@11
|
22
|
yading@11
|
23 %include "libavutil/x86/x86util.asm"
|
yading@11
|
24
|
yading@11
|
SECTION_RODATA

; Every table below is 16 bytes wide.

; --- yuv2planeX: initial accumulator values -------------------------------
; Each one is the rounding term for the final arithmetic shift of that depth.
yuv2yuvX_9_start:   times 4 dd 0x20000              ; 1 << 17, shift is 27 - 9
yuv2yuvX_10_start:  times 4 dd 0x10000              ; 1 << 16, shift is 27 - 10
yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000  ; 1 << 14 for the shift by 15,
                                                    ; plus a bias worth -0x8000
                                                    ; once shifted

; --- yuv2planeX: upper clamp for 9/10-bit output --------------------------
yuv2yuvX_9_upper:   times 8 dw 0x1ff
yuv2yuvX_10_upper:  times 8 dw 0x3ff

; --- shared: undoes the -0x8000 bias after a signed dword->word pack ------
minshort:           times 8 dw 0x8000

; --- yuv2plane1, 16-bit output: rounding for the shift by 3 ---------------
pd_4:               times 4 dd 4                    ; used with packusdw (sse4/avx)
pd_4min0x40000:     times 4 dd 4 - (0x40000)        ; biased variant for packssdw

; --- yuv2plane1, 9/10-bit output: rounding term and pminsw bound ----------
pw_32:              times 8 dw 32                   ; 9-bit rounding  (shift by 6)
pw_16:              times 8 dw 16                   ; 10-bit rounding (shift by 5)
pw_512:             times 8 dw 512                  ; 9-bit pminsw bound
pw_1024:            times 8 dw 1024                 ; 10-bit pminsw bound
|
yading@11
|
39
|
yading@11
|
40 SECTION .text
|
yading@11
|
41
|
yading@11
|
42 ;-----------------------------------------------------------------------------
|
yading@11
|
43 ; vertical line scaling
|
yading@11
|
44 ;
|
yading@11
|
45 ; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
|
yading@11
|
46 ; const uint8_t *dither, int offset)
|
yading@11
|
47 ; and
|
yading@11
|
48 ; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
|
yading@11
|
49 ; const int16_t **src, uint8_t *dst, int dstW,
|
yading@11
|
50 ; const uint8_t *dither, int offset)
|
yading@11
|
51 ;
|
yading@11
|
52 ; Scale one or $filterSize lines of source data to generate one line of output
|
yading@11
|
53 ; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
|
yading@11
|
54 ; int32_t if $output_size is 16. $filter is 12-bit. $filterSize is a multiple
|
yading@11
|
55 ; of 2. $offset is either 0 or 3. $dither holds 8 values.
|
yading@11
|
56 ;-----------------------------------------------------------------------------
|
yading@11
|
57
|
yading@11
|
;-----------------------------------------------------------------------------
; yuv2planeX_fn <output_size>, <vector_regs>, <args>
;
; Emits yuv2planeX_<output_size> for the instruction set currently selected
; with INIT_MMX / INIT_XMM.
;   %1  output bit depth: 8, 9, 10 or 16
;   %2  vector register count handed to cglobal
;   %3  argument count handed to cglobal
;
; Register roles inside the generated function:
;   r5        index of the first pixel handled by the current step
;   r6        pointer to the source line currently being read
;   cntr_reg  filter tap counter; starts at filterSize, decremented by 2
;   m1, m2    32-bit accumulators (m2 ends up in the low half of the output)
;   m6        zero for 8/9/10-bit output; scratch for 16-bit output
;   m_dith    8-bit output only: expanded dither values
;
; The helper aliases cntr_reg and load_fltsize are removed again at the end
; of the macro so they cannot affect code that follows an instantiation.
;-----------------------------------------------------------------------------
%macro yuv2planeX_fn 3

; load_fltsize is deliberately NOT called "movsx": redefining a real
; instruction mnemonic would silently change any later use of it.
%if ARCH_X86_32
%define cntr_reg fltsizeq
%define load_fltsize mov
%else
%define cntr_reg r7
%define load_fltsize movsxd
%endif

cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
    pxor            m6,  m6                 ; m6 = 0 (unpack helper / lower clamp)
%endif ; %1 == 8/9/10

%if %1 == 8
%if ARCH_X86_32
    ; Stack scratch for the expanded dither values stored below (32 bytes).
    ; NOTE(review): the pad expression folds in stack_offset, presumably so
    ; that rsp is 16-byte aligned for mova - confirm against x86inc.
%assign pad 0x2c - (stack_offset & 15)
    SUB             rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32

    ; create registers holding dither
    movq        m_dith, [ditherq]        ; dither
    test        offsetd, offsetd
    jz              .no_rot
    ; non-zero offset: rotate the 8 dither bytes by 3
%if mmsize == 16
    punpcklqdq  m_dith,  m_dith
%endif ; mmsize == 16
    PALIGNR     m_dith,  m_dith,  3,  m0
.no_rot:
    ; widen dither bytes to dwords and pre-shift them by 12 so they can be
    ; used directly as the initial accumulator values
%if mmsize == 16
    punpcklbw   m_dith,  m6
%if ARCH_X86_64
    punpcklwd       m8,  m_dith,  m6
    pslld           m8,  12
%else ; x86-32
    punpcklwd       m5,  m_dith,  m6
    pslld           m5,  12
%endif ; x86-32/64
    punpckhwd   m_dith,  m6
    pslld       m_dith,  12
%if ARCH_X86_32
    mova      [rsp+ 0],  m5
    mova      [rsp+16],  m_dith
%endif
%else ; mmsize == 8
    punpcklbw       m5,  m_dith,  m6
    punpckhbw   m_dith,  m6
    punpcklwd       m4,  m5,  m6
    punpckhwd       m5,  m6
    punpcklwd       m3,  m_dith,  m6
    punpckhwd   m_dith,  m6
    pslld           m4,  12
    pslld           m5,  12
    pslld           m3,  12
    pslld       m_dith,  12
    mova      [rsp+ 0],  m4
    mova      [rsp+ 8],  m5
    mova      [rsp+16],  m3
    mova      [rsp+24],  m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8

    xor             r5,  r5                 ; pixel index = 0

.pixelloop:
%assign %%i 0
    ; the rep here is for the 8bit output mmx case, where dither covers
    ; 8 pixels but we can only handle 2 pixels per register, and thus 4
    ; pixels per iteration. In order to not have to keep track of where
    ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif

%rep %%repcnt

    ; initialise the two accumulators (dither for 8-bit, rounding otherwise)
%if %1 == 8
%if ARCH_X86_32
    mova            m2, [rsp+mmsize*(0+%%i)]
    mova            m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
    mova            m2,  m8
    mova            m1,  m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
    mova            m1, [yuv2yuvX_%1_start]
    mova            m2,  m1
%endif ; %1 == 8/9/10/16
    load_fltsize cntr_reg, fltsizem         ; reload the tap counter
.filterloop_ %+ %%i:
    ; input pixels: two source lines per iteration
    mov             r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
    mova            m3, [r6+r5*4]
    mova            m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
    mov             r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
    mova            m4, [r6+r5*4]
    mova            m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16

    ; coefficients
    movd            m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
    pshuflw         m7,  m0,  0          ; coeff[0]
    pshuflw         m0,  m0,  0x55       ; coeff[1]
    pmovsxwd        m7,  m7              ; word -> dword
    pmovsxwd        m0,  m0              ; word -> dword

    pmulld          m3,  m7
    pmulld          m5,  m7
    pmulld          m4,  m0
    pmulld          m6,  m0

    paddd           m2,  m3
    paddd           m1,  m5
    paddd           m2,  m4
    paddd           m1,  m6
%else ; %1 == 10/9/8
    ; interleave both lines so one pmaddwd applies both coefficients
    punpcklwd       m5,  m3,  m4
    punpckhwd       m3,  m4
    SPLATD          m0

    pmaddwd         m5,  m0
    pmaddwd         m3,  m0

    paddd           m2,  m5
    paddd           m1,  m3
%endif ; %1 == 8/9/10/16

    sub       cntr_reg,  2
    jg .filterloop_ %+ %%i

    ; scale the accumulators down to the output depth
%if %1 == 16
    psrad           m2,  31 - %1
    psrad           m1,  31 - %1
%else ; %1 == 10/9/8
    psrad           m2,  27 - %1
    psrad           m1,  27 - %1
%endif ; %1 == 8/9/10/16

    ; pack, clamp and store
%if %1 == 8
    packssdw        m2,  m1
    packuswb        m2,  m2
    movh   [dstq+r5*1],  m2
%else ; %1 == 9/10/16
%if %1 == 16
    packssdw        m2,  m1
    paddw           m2, [minshort]          ; undo the bias from the start value
%else ; %1 == 9/10
%if cpuflag(sse4)
    packusdw        m2,  m1
%else ; mmxext/sse2
    packssdw        m2,  m1
    pmaxsw          m2,  m6                 ; clamp below at 0 (m6 == 0)
%endif ; mmxext/sse2/sse4/avx
    pminsw          m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
    mova   [dstq+r5*2],  m2
%endif ; %1 == 8/9/10/16

    add             r5,  mmsize/2
    sub             wd,  mmsize/2           ; flags of the last rep feed the jg below

%assign %%i %%i+2
%endrep
    jg .pixelloop

%if %1 == 8
%if ARCH_X86_32
    ADD             rsp, pad
    RET
%else ; x86-64
    REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
    REP_RET
%endif ; %1 == 8/9/10/16

%undef cntr_reg
%undef load_fltsize
%endmacro
|
yading@11
|
248
|
yading@11
|
; yuv2planeX instantiations.
; Arguments: output bit depth, vector register count, argument count.

%if ARCH_X86_32
INIT_MMX mmxext
yuv2planeX_fn  8,  0, 7
yuv2planeX_fn  9,  0, 5
yuv2planeX_fn 10,  0, 5
%endif ; ARCH_X86_32

INIT_XMM sse2
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5

; the 16-bit variant is only instantiated here
INIT_XMM sse4
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
yuv2planeX_fn 16,  8, 5

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
%endif ; HAVE_AVX_EXTERNAL
|
yading@11
|
273
|
yading@11
|
;-----------------------------------------------------------------------------
; yuv2plane1_mainloop <output_bpc>, <store_suffix>
;   %1  output bits per component: 8, 9, 10 or 16
;   %2  suffix appended to "mov" for the stores: a (aligned) or u (unaligned)
;
; Loops while wq is negative, adding mmsize to it each pass; srcq/dstq are
; indexed with wq.
; Constants expected in registers:
;   8-bit     m2, m3 = words added before the shift by 7
;   9/10-bit  m2 = word added before the shift, m3 = pminsw bound,
;             m4 = pmaxsw bound
;   16-bit    m4 = dword added before the shift by 3,
;             m5 = word added after packssdw (non-sse4 only)
;-----------------------------------------------------------------------------
%macro yuv2plane1_mainloop 2
.next_%2:
%if %1 == 16
    ; four vectors of 32-bit input -> two vectors of 16-bit output
    paddd           m0, m4, [srcq+wq*4+mmsize*0]
    psrad           m0, 3
    paddd           m1, m4, [srcq+wq*4+mmsize*1]
    psrad           m1, 3
    paddd           m2, m4, [srcq+wq*4+mmsize*2]
    psrad           m2, 3
    paddd           m3, m4, [srcq+wq*4+mmsize*3]
    psrad           m3, 3
%if cpuflag(sse4) ; avx/sse4
    packusdw        m0, m1
    packusdw        m2, m3
%else ; mmx/sse2
    packssdw        m0, m1
    paddw           m0, m5
    packssdw        m2, m3
    paddw           m2, m5
%endif ; mmx/sse2/sse4/avx
    mov%2    [dstq+wq*2+mmsize*0], m0
    mov%2    [dstq+wq*2+mmsize*1], m2
%elif %1 == 8
    ; two vectors of 16-bit input -> one vector of bytes
    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
    psraw           m0, 7
    paddsw          m1, m3, [srcq+wq*2+mmsize*1]
    psraw           m1, 7
    packuswb        m0, m1
    mov%2    [dstq+wq], m0
%else ; %1 == 9/10
    ; two vectors of 16-bit input -> two clamped vectors of 16-bit output
    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
    psraw           m0, 15 - %1
    pmaxsw          m0, m4
    pminsw          m0, m3
    paddsw          m1, m2, [srcq+wq*2+mmsize*1]
    psraw           m1, 15 - %1
    pmaxsw          m1, m4
    pminsw          m1, m3
    mov%2    [dstq+wq*2+mmsize*0], m0
    mov%2    [dstq+wq*2+mmsize*1], m1
%endif
    add             wq, mmsize
    jl .next_%2
%endmacro
|
yading@11
|
319
|
yading@11
|
;-----------------------------------------------------------------------------
; yuv2plane1_fn <output_size>, <vector_regs>, <args>
;
; Emits yuv2plane1_<output_size> for the instruction set currently selected
; with INIT_MMX / INIT_XMM.
;   %1  output bit depth: 8, 9, 10 or 16
;   %2  vector register count handed to cglobal
;   %3  argument count handed to cglobal
;-----------------------------------------------------------------------------
%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
    ; wq = width rounded up to a multiple of mmsize
    movsxdifnidn    wq, wd
    add             wq, mmsize - 1
    and             wq, ~(mmsize - 1)

    ; advance both pointers by the rounded width, then negate the width:
    ; the main loop indexes them with a negative wq that climbs to zero
%if %1 == 16
    lea           srcq, [srcq+wq*4]
%else ; %1 != 16
    lea           srcq, [srcq+wq*2]
%endif ; %1 == 16
%if %1 == 8
    add           dstq, wq
%else ; %1 != 8
    lea           dstq, [dstq+wq*2]
%endif ; %1 == 8
    neg             wq

    ; per-depth constants consumed by yuv2plane1_mainloop
%if %1 == 8
    pxor            m4, m4               ; zero

    ; build the dither words in m2/m3
    movq            m3, [ditherq]        ; 8 dither bytes
    test       offsetd, offsetd
    jz              .expand_dither
    ; non-zero offset: rotate the dither bytes by 3
%if mmsize == 16
    punpcklqdq      m3, m3
%endif ; mmsize == 16
    PALIGNR         m3, m3, 3, m2
.expand_dither:
%if mmsize != 8
    punpcklbw       m3, m4               ; byte->word, all 8 fit in one register
    mova            m2, m3
%else ; mmsize == 8
    mova            m2, m3
    punpcklbw       m2, m4               ; byte->word, low half
    punpckhbw       m3, m4               ; byte->word, high half
%endif
%elif %1 == 10
    mova            m2, [pw_16]
    mova            m3, [pw_1024]
    pxor            m4, m4
%elif %1 == 9
    mova            m2, [pw_32]
    mova            m3, [pw_512]
    pxor            m4, m4
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
    mova            m4, [pd_4]
%else ; mmx/sse2
    mova            m4, [pd_4min0x40000]
    mova            m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..

    ; actual pixel scaling
%if mmsize != 8
    ; pick aligned or unaligned stores based on the destination address
    test          dstq, 15
    jnz             .dst_unaligned
    yuv2plane1_mainloop %1, a
    REP_RET
.dst_unaligned:
    yuv2plane1_mainloop %1, u
%else ; mmsize == 8
    yuv2plane1_mainloop %1, a
%endif ; mmsize == 8/16
    REP_RET
%endmacro
|
yading@11
|
387
|
yading@11
|
; yuv2plane1 instantiations.
; Arguments: output bit depth, vector register count, argument count.

%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn  8, 0, 5
yuv2plane1_fn 16, 0, 3

INIT_MMX mmxext
yuv2plane1_fn  9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif ; ARCH_X86_32

INIT_XMM sse2
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3

; only the 16-bit variant gets a dedicated sse4 version
INIT_XMM sse4
yuv2plane1_fn 16, 5, 3

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif ; HAVE_AVX_EXTERNAL
|