;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje
;*                    Kieran Kunhya
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Rounding / bias constants used by the scaling loops below.
minshort:          times 8 dw 0x8000            ; signed->unsigned word bias
yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000 ; 16-bit accum init (round + unbias)
yuv2yuvX_10_start: times 4 dd 0x10000            ; 10-bit accumulator rounding init
yuv2yuvX_9_start:  times 4 dd 0x20000            ; 9-bit accumulator rounding init
yuv2yuvX_10_upper: times 8 dw 0x3ff              ; clip ceiling for 10-bit output
yuv2yuvX_9_upper:  times 8 dw 0x1ff              ; clip ceiling for 9-bit output
pd_4:              times 4 dd 4
pd_4min0x40000:    times 4 dd 4 - (0x40000)
pw_16:             times 8 dw 16
pw_32:             times 8 dw 32
pw_512:            times 8 dw 512
pw_1024:           times 8 dw 1024

SECTION .text

;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------

; %1 = output bit depth (8/9/10/16), %2 = # of xmm registers used,
; %3 = # of gprs used by the function
%macro yuv2planeX_fn 3

%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif

cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
    pxor            m6,  m6         ; zero register, also used as clip floor
%endif ; %1 == 8/9/10

%if %1 == 8
%if ARCH_X86_32
    ; reserve aligned scratch space on the stack for the dither planes
%assign pad 0x2c - (stack_offset & 15)
    SUB             rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32

    ; create registers holding dither
    movq        m_dith, [ditherq]   ; dither
    test       offsetd, offsetd
    jz              .no_rot
%if mmsize == 16
    punpcklqdq  m_dith,  m_dith
%endif ; mmsize == 16
    PALIGNR     m_dith,  m_dith,  3,  m0    ; rotate dither bytes by $offset
.no_rot:
%if mmsize == 16
    punpcklbw   m_dith,  m6
%if ARCH_X86_64
    punpcklwd       m8,  m_dith,  m6
    pslld           m8,  12        ; dither scaled to accumulator precision
%else ; x86-32
    punpcklwd       m5,  m_dith,  m6
    pslld           m5,  12
%endif ; x86-32/64
    punpckhwd   m_dith,  m6
    pslld       m_dith,  12
%if ARCH_X86_32
    mova      [rsp+ 0],  m5        ; spill: not enough registers on x86-32
    mova      [rsp+16],  m_dith
%endif
%else ; mmsize == 8
    punpcklbw       m5,  m_dith,  m6
    punpckhbw   m_dith,  m6
    punpcklwd       m4,  m5,  m6
    punpckhwd       m5,  m6
    punpcklwd       m3,  m_dith,  m6
    punpckhwd   m_dith,  m6
    pslld           m4,  12
    pslld           m5,  12
    pslld           m3,  12
    pslld       m_dith,  12
    mova      [rsp+ 0],  m4
    mova      [rsp+ 8],  m5
    mova      [rsp+16],  m3
    mova      [rsp+24],  m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8

    xor             r5,  r5        ; r5 = output pixel index

.pixelloop:
%assign %%i 0
    ; the rep here is for the 8bit output mmx case, where dither covers
    ; 8 pixels but we can only handle 2 pixels per register, and thus 4
    ; pixels per iteration. In order to not have to keep track of where
    ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif

%rep %%repcnt

%if %1 == 8
%if ARCH_X86_32
    mova            m2, [rsp+mmsize*(0+%%i)]  ; reload spilled dither
    mova            m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
    mova            m2,  m8
    mova            m1,  m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
    mova            m1, [yuv2yuvX_%1_start]   ; init accumulators with rounding bias
    mova            m2,  m1
%endif ; %1 == 8/9/10/16
    movsx     cntr_reg,  fltsizem
.filterloop_ %+ %%i:
    ; input pixels - two source lines per iteration (filterSize is even)
    mov             r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
    mova            m3, [r6+r5*4]
    mova            m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
    mov             r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
    mova            m4, [r6+r5*4]
    mova            m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16

    ; coefficients
    movd            m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
    pshuflw         m7,  m0,  0          ; coeff[0]
    pshuflw         m0,  m0,  0x55       ; coeff[1]
    pmovsxwd        m7,  m7              ; word -> dword
    pmovsxwd        m0,  m0              ; word -> dword

    pmulld          m3,  m7
    pmulld          m5,  m7
    pmulld          m4,  m0
    pmulld          m6,  m0

    paddd           m2,  m3
    paddd           m1,  m5
    paddd           m2,  m4
    paddd           m1,  m6
%else ; %1 == 10/9/8
    ; interleave the two lines so pmaddwd does (px*coeff0 + px*coeff1)
    punpcklwd       m5,  m3,  m4
    punpckhwd       m3,  m4
    SPLATD          m0

    pmaddwd         m5,  m0
    pmaddwd         m3,  m0

    paddd           m2,  m5
    paddd           m1,  m3
%endif ; %1 == 8/9/10/16

    sub       cntr_reg,  2
    jg .filterloop_ %+ %%i

    ; scale accumulators back down to the output bit depth
%if %1 == 16
    psrad           m2,  31 - %1
    psrad           m1,  31 - %1
%else ; %1 == 10/9/8
    psrad           m2,  27 - %1
    psrad           m1,  27 - %1
%endif ; %1 == 8/9/10/16

%if %1 == 8
    packssdw        m2,  m1
    packuswb        m2,  m2              ; clip to [0,255]
    movh   [dstq+r5*1],  m2
%else ; %1 == 9/10/16
%if %1 == 16
    packssdw        m2,  m1
    paddw           m2, [minshort]       ; signed -> unsigned output
%else ; %1 == 9/10
%if cpuflag(sse4)
    packusdw        m2,  m1              ; unsigned saturation in one op
%else ; mmxext/sse2
    packssdw        m2,  m1
    pmaxsw          m2,  m6              ; clip floor at 0
%endif ; mmxext/sse2/sse4/avx
    pminsw          m2, [yuv2yuvX_%1_upper] ; clip ceiling at (1<<%1)-1
%endif ; %1 == 9/10/16
    mova   [dstq+r5*2],  m2
%endif ; %1 == 8/9/10/16

    add             r5,  mmsize/2
    sub             wd,  mmsize/2

%assign %%i %%i+2
%endrep
    jg .pixelloop

%if %1 == 8
%if ARCH_X86_32
    ADD             rsp, pad
    RET
%else ; x86-64
    REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
    REP_RET
%endif ; %1 == 8/9/10/16
%endmacro

%if ARCH_X86_32
INIT_MMX mmxext
yuv2planeX_fn  8,  0, 7
yuv2planeX_fn  9,  0, 5
yuv2planeX_fn 10,  0, 5
%endif

INIT_XMM sse2
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5

INIT_XMM sse4
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
yuv2planeX_fn 16,  8, 5

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
%endif

; Inner loop for yuv2plane1: add dither/rounding, shift down to output
; depth, clip, and store one register-width of pixels per iteration.
; %1=output-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
    paddsw          m0,  m2, [srcq+wq*2+mmsize*0]
    paddsw          m1,  m3, [srcq+wq*2+mmsize*1]
    psraw           m0,  7
    psraw           m1,  7
    packuswb        m0,  m1
    mov%2    [dstq+wq],  m0
%elif %1 == 16
    paddd           m0,  m4, [srcq+wq*4+mmsize*0]
    paddd           m1,  m4, [srcq+wq*4+mmsize*1]
    paddd           m2,  m4, [srcq+wq*4+mmsize*2]
    paddd           m3,  m4, [srcq+wq*4+mmsize*3]
    psrad           m0,  3
    psrad           m1,  3
    psrad           m2,  3
    psrad           m3,  3
%if cpuflag(sse4) ; avx/sse4
    packusdw        m0,  m1
    packusdw        m2,  m3
%else ; mmx/sse2
    packssdw        m0,  m1
    packssdw        m2,  m3
    paddw           m0,  m5          ; undo the signed-pack bias
    paddw           m2,  m5
%endif ; mmx/sse2/sse4/avx
    mov%2 [dstq+wq*2+mmsize*0],  m0
    mov%2 [dstq+wq*2+mmsize*1],  m2
%else ; %1 == 9/10
    paddsw          m0,  m2, [srcq+wq*2+mmsize*0]
    paddsw          m1,  m2, [srcq+wq*2+mmsize*1]
    psraw           m0,  15 - %1
    psraw           m1,  15 - %1
    pmaxsw          m0,  m4          ; clip floor at 0
    pmaxsw          m1,  m4
    pminsw          m0,  m3          ; clip ceiling at (1<<%1)-1
    pminsw          m1,  m3
    mov%2 [dstq+wq*2+mmsize*0],  m0
    mov%2 [dstq+wq*2+mmsize*1],  m1
%endif
    add             wq,  mmsize
    jl .loop_%2
%endmacro

; %1 = output bit depth (8/9/10/16), %2 = # of xmm registers used,
; %3 = # of gprs used by the function
%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
    movsxdifnidn    wq,  wd
    ; round width up to a full register, then walk src/dst backwards
    ; from the end with a negative index so the loop test is a single jl
    add             wq,  mmsize - 1
    and             wq, ~(mmsize - 1)
%if %1 == 8
    add           dstq,  wq
%else ; %1 != 8
    lea           dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
    lea           srcq, [srcq+wq*4]
%else ; %1 != 16
    lea           srcq, [srcq+wq*2]
%endif ; %1 == 16
    neg             wq

%if %1 == 8
    pxor            m4,  m4               ; zero

    ; create registers holding dither
    movq            m3, [ditherq]         ; dither
    test       offsetd, offsetd
    jz              .no_rot
%if mmsize == 16
    punpcklqdq      m3,  m3
%endif ; mmsize == 16
    PALIGNR         m3,  m3,  3,  m2      ; rotate dither bytes by $offset
.no_rot:
%if mmsize == 8
    mova            m2,  m3
    punpckhbw       m3,  m4               ; byte->word
    punpcklbw       m2,  m4               ; byte->word
%else
    punpcklbw       m3,  m4
    mova            m2,  m3
%endif
%elif %1 == 9
    pxor            m4,  m4
    mova            m3, [pw_512]          ; clip ceiling
    mova            m2, [pw_32]           ; rounding constant
%elif %1 == 10
    pxor            m4,  m4
    mova            m3, [pw_1024]         ; clip ceiling
    mova            m2, [pw_16]           ; rounding constant
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
    mova            m4, [pd_4]
%else ; mmx/sse2
    mova            m4, [pd_4min0x40000]  ; rounding + signed-pack bias
    mova            m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..

    ; actual pixel scaling - pick aligned/unaligned stores at run time
%if mmsize == 8
    yuv2plane1_mainloop %1, a
%else ; mmsize == 16
    test          dstq, 15
    jnz .unaligned
    yuv2plane1_mainloop %1, a
    REP_RET
.unaligned:
    yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn  8, 0, 5
yuv2plane1_fn 16, 0, 3

INIT_MMX mmxext
yuv2plane1_fn  9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif

INIT_XMM sse2
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3

INIT_XMM sse4
yuv2plane1_fn 16, 5, 3

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif