;*****************************************************************************
;* x86util.asm
;*****************************************************************************
;* Copyright (C) 2008-2010 x264 project
;*
;* Authors: Loren Merritt
;*          Holger Lubitz
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Symbol prefixes and the mmxext -> mmx2 cpuflag alias must be defined before
; x86inc.asm is included.
%define private_prefix ff
%define public_prefix  avpriv
%define cpuflags_mmxext cpuflags_mmx2

%include "libavutil/x86/x86inc.asm"

; Conventions used by the macros in this file:
;   * "m%N" arguments are register NUMBERS (the macro prepends "m");
;     plain "%N" arguments are complete operands (register or memory).
;   * mova/movh, SWAP, cpuflag(), mmsize and avx_enabled are not defined here;
;     they are expected to come from the x86inc.asm include above.

; Interleave the low and high halves of two registers.
;   %1 = punpck element suffix (bw / wd / dq / qdq)
;   %2 = a, %3 = b, %4 = scratch (register numbers)
; m%2 receives punpckl(a, b); punpckh(a, b) is produced in the scratch
; register, and the final SWAP exchanges the names %3 and %4 so that the high
; half is addressed as m%3 afterwards.
%macro SBUTTERFLY 4
%if avx_enabled == 0
    mova      m%4, m%2
    punpckl%1 m%2, m%3
    punpckh%1 m%4, m%3
%else
    punpckh%1 m%4, m%2, m%3
    punpckl%1 m%2, m%3
%endif
    SWAP %3, %4
%endmacro

; SBUTTERFLY variant written only with three-operand instruction forms.
; Same arguments as SBUTTERFLY: punpckl(a, b) is built in m%4 and
; punpckh(a, b) in m%2, then the three names are rotated with SWAP.
%macro SBUTTERFLY2 4
    punpckl%1 m%4, m%2, m%3
    punpckh%1 m%2, m%2, m%3
    SWAP %2, %4, %3
%endmacro

; Single-precision float counterpart of SBUTTERFLY2.
;   %1 = a, %2 = b, %3 = scratch (register numbers)
; unpcklps(a, b) is built in m%3 and unpckhps(a, b) in m%1 before the names
; are rotated with SWAP.
%macro SBUTTERFLYPS 3
    unpcklps m%3, m%1, m%2
    unpckhps m%1, m%1, m%2
    SWAP %1, %3, %2
%endmacro

; 4x4 transpose of byte elements: a bw round followed by a wd round.
;   %1-%4 = row registers, %5 = scratch (register numbers)
; The final SWAP puts the two middle rows back in order.
%macro TRANSPOSE4x4B 5
    SBUTTERFLY bw, %1, %2, %5
    SBUTTERFLY bw, %3, %4, %5
    SBUTTERFLY wd, %1, %3, %5
    SBUTTERFLY wd, %2, %4, %5
    SWAP %2, %3
%endmacro

; 4x4 transpose of word elements: a wd round followed by a dq round.
;   %1-%4 = row registers, %5 = scratch (register numbers)
%macro TRANSPOSE4x4W 5
    SBUTTERFLY wd, %1, %2, %5
    SBUTTERFLY wd, %3, %4, %5
    SBUTTERFLY dq, %1, %3, %5
    SBUTTERFLY dq, %2, %4, %5
    SWAP %2, %3
%endmacro

; Same wd and dq rounds as TRANSPOSE4x4W followed by an extra qdq round and
; no final SWAP.
;   %1-%4 = row registers, %5 = scratch (register numbers)
; NOTE(review): presumably transposes two 4x4 word blocks packed side by side
; in 128-bit registers -- confirm against callers.
%macro TRANSPOSE2x4x4W 5
    SBUTTERFLY wd,  %1, %2, %5
    SBUTTERFLY wd,  %3, %4, %5
    SBUTTERFLY dq,  %1, %3, %5
    SBUTTERFLY dq,  %2, %4, %5
    SBUTTERFLY qdq, %1, %2, %5
    SBUTTERFLY qdq, %3, %4, %5
%endmacro

; 4x4 transpose of dword elements: a dq round followed by a qdq round.
;   %1-%4 = row registers, %5 = scratch (register numbers)
%macro TRANSPOSE4x4D 5
    SBUTTERFLY dq,  %1, %2, %5
    SBUTTERFLY dq,  %3, %4, %5
    SBUTTERFLY qdq, %1, %3, %5
    SBUTTERFLY qdq, %2, %4, %5
    SWAP %2, %3
%endmacro

; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops
;   %1-%4 = row registers, %5 = scratch (register numbers)
; The movlhps/movhlps pairs take the place of the qdq butterflies.
%macro TRANSPOSE4x4PS 5
    SBUTTERFLYPS %1, %2, %5
    SBUTTERFLYPS %3, %4, %5
    movlhps m%5, m%1, m%3
    movhlps m%3, m%1
    SWAP %5, %1
    movlhps m%5, m%2, m%4
    movhlps m%4, m%2
    SWAP %5, %2, %3
%endmacro

; 8x8 transpose of word elements (wd, dq and qdq butterfly rounds).
;   %1-%8 = row registers (register numbers)
;   x86-64: %9 = scratch register number.
;   x86-32: %9 and %10 are memory operands used as spill slots (no spare
;           register is used); passing an 11th argument selects the
;           alternative in/out placement described in the comments below.
%macro TRANSPOSE8x8W 9-11
%if ARCH_X86_64
    SBUTTERFLY wd,  %1, %2, %9
    SBUTTERFLY wd,  %3, %4, %9
    SBUTTERFLY wd,  %5, %6, %9
    SBUTTERFLY wd,  %7, %8, %9
    SBUTTERFLY dq,  %1, %3, %9
    SBUTTERFLY dq,  %2, %4, %9
    SBUTTERFLY dq,  %5, %7, %9
    SBUTTERFLY dq,  %6, %8, %9
    SBUTTERFLY qdq, %1, %5, %9
    SBUTTERFLY qdq, %2, %6, %9
    SBUTTERFLY qdq, %3, %7, %9
    SBUTTERFLY qdq, %4, %8, %9
    SWAP %2, %5
    SWAP %4, %7
%else
; in:  m0..m7, unless %11 in which case m6 is in %9
; out: m0..m7, unless %11 in which case m4 is in %10
; spills into %9 and %10
%if %0<11
    movdqa %9, m%7
%endif
    SBUTTERFLY wd,  %1, %2, %7
    movdqa %10, m%2
    movdqa m%7, %9
    SBUTTERFLY wd,  %3, %4, %2
    SBUTTERFLY wd,  %5, %6, %2
    SBUTTERFLY wd,  %7, %8, %2
    SBUTTERFLY dq,  %1, %3, %2
    movdqa %9, m%3
    movdqa m%2, %10
    SBUTTERFLY dq,  %2, %4, %3
    SBUTTERFLY dq,  %5, %7, %3
    SBUTTERFLY dq,  %6, %8, %3
    SBUTTERFLY qdq, %1, %5, %3
    SBUTTERFLY qdq, %2, %6, %3
    movdqa %10, m%2
    movdqa m%3, %9
    SBUTTERFLY qdq, %3, %7, %2
    SBUTTERFLY qdq, %4, %8, %2
    SWAP %2, %5
    SWAP %4, %7
%if %0<11
    movdqa m%5, %10
%endif
%endif
%endmacro

; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place
;   %1 = dst, %2 = src (complete operands)
; ssse3:  pabsw.
; mmxext: dst = max(0 - src, src).
; else:   sign mask in %1, then (src xor mask) - mask computed in %2 (src is
;         overwritten) and the two names are exchanged with SWAP.
%macro PABSW 2
%if cpuflag(ssse3)
    pabsw      %1, %2
%elif cpuflag(mmxext)
    pxor    %1, %1
    psubw   %1, %2
    pmaxsw  %1, %2
%else
    pxor       %1, %1
    pcmpgtw    %1, %2
    pxor       %2, %1
    psubw      %2, %1
    SWAP       %1, %2
%endif
%endmacro

; Computes %1 = (%1 xor %2) - %2 on words. When %2 holds per-word masks of
; 0 / -1 this negates exactly the words whose mask is -1.
%macro PSIGNW_MMX 2
    pxor       %1, %2
    psubw      %1, %2
%endmacro

; Thin wrapper around the SSSE3 psignw instruction (same argument order).
%macro PSIGNW_SSSE3 2
    psignw     %1, %2
%endmacro

; In-place absolute value of packed words.
;   %1 = value (in/out), %2 = scratch (not touched on ssse3)
%macro ABS1 2
%if cpuflag(ssse3)
    pabsw   %1, %1
%elif cpuflag(mmxext) ; a, tmp
    pxor    %2, %2
    psubw   %2, %1
    pmaxsw  %1, %2
%else ; a, tmp
    pxor       %2, %2
    pcmpgtw    %2, %1
    pxor       %1, %2
    psubw      %1, %2
%endif
%endmacro

; In-place absolute value of packed words for two registers at once, with the
; two dependency chains interleaved.
;   %1, %2 = values (in/out), %3, %4 = scratch (not touched on ssse3)
%macro ABS2 4
%if cpuflag(ssse3)
    pabsw   %1, %1
    pabsw   %2, %2
%elif cpuflag(mmxext) ; a, b, tmp0, tmp1
    pxor    %3, %3
    pxor    %4, %4
    psubw   %3, %1
    psubw   %4, %2
    pmaxsw  %1, %3
    pmaxsw  %2, %4
%else ; a, b, tmp0, tmp1
    pxor       %3, %3
    pxor       %4, %4
    pcmpgtw    %3, %1
    pcmpgtw    %4, %2
    pxor       %1, %3
    pxor       %2, %4
    psubw      %1, %3
    psubw      %2, %4
%endif
%endmacro

; In-place absolute value of packed bytes.
; The fallback takes the unsigned minimum of x and (0 - x).
%macro ABSB 2 ; source mmreg, temp mmreg (unused for ssse3)
%if cpuflag(ssse3)
    pabsb   %1, %1
%else
    pxor    %2, %2
    psubb   %2, %1
    pminub  %1, %2
%endif
%endmacro

; In-place absolute value of packed bytes for two registers at once.
; The fallback takes the unsigned minimum of x and (0 - x) for each register.
%macro ABSB2 4 ; src1, src2, tmp1, tmp2 (tmp1/2 unused for SSSE3)
%if cpuflag(ssse3)
    pabsb   %1, %1
    pabsb   %2, %2
%else
    pxor    %3, %3
    pxor    %4, %4
    psubb   %3, %1
    psubb   %4, %2
    pminub  %1, %3
    pminub  %2, %4
%endif
%endmacro

; In-place absolute value of packed dwords for two registers, using only
; compare/xor/sub: mask = (0 > x), x = (x xor mask) - mask.
;   %1, %2 = values (in/out), %3, %4 = scratch
%macro ABSD2_MMX 4
    pxor    %3, %3
    pxor    %4, %4
    pcmpgtd %3, %1
    pcmpgtd %4, %2
    pxor    %1, %3
    pxor    %2, %4
    psubd   %1, %3
    psubd   %2, %4
%endmacro

; Absolute value of packed words for four registers, as two ABS2 calls that
; share the same pair of scratch registers.
;   %1-%4 = values (in/out), %5, %6 = scratch
%macro ABS4 6
    ABS2 %1, %2, %5, %6
    ABS2 %3, %4, %5, %6
%endmacro

; Broadcast the byte at address %2 to every byte of %1.
;   %1 = dst, %2 = address expression, %3 = pshufb mask (used on ssse3 only)
; The dword load starts 3 bytes before %2, so the wanted byte is the top byte
; of the loaded dword.
%macro SPLATB_LOAD 3
%if cpuflag(ssse3)
    movd      %1, [%2-3]
    pshufb    %1, %3
%else
    movd      %1, [%2-3] ;to avoid crossing a cacheline
    punpcklbw %1, %1
    SPLATW    %1, %1, 3
%endif
%endmacro

; Broadcast the low byte of a general-purpose register to every byte of %1.
;   %1 = dst, %2 = GPR name (a "d" suffix is appended to it),
;   %3 = pshufb mask (used on ssse3 only)
%macro SPLATB_REG 3
%if cpuflag(ssse3)
    movd      %1, %2d
    pshufb    %1, %3
%else
    movd      %1, %2d
    punpcklbw %1, %1
    SPLATW    %1, %1, 0
%endif
%endmacro

; palignr with an emulation for CPUs without SSSE3.
;   4 args: src1/dst, src2, imm, tmp
;   5 args: dst, src1, src2, imm, tmp
; The emulation copies src2 to tmp (unless they are the same operand), shifts
; dst left and tmp right by complementary byte counts and ORs them together.
; tmp is clobbered on that path and ignored on ssse3.
%macro PALIGNR 4-5
%if cpuflag(ssse3)
%if %0==5
    palignr %1, %2, %3, %4
%else
    palignr %1, %2, %3
%endif
%elif cpuflag(mmx) ; [dst,] src1, src2, imm, tmp
    %define %%dst %1
%if %0==5
%ifnidn %1, %2
    mova    %%dst, %2
%endif
    ; drop the explicit dst so both forms see (src1, src2, imm, tmp) below
    %rotate 1
%endif
%ifnidn %4, %2
    mova    %4, %2
%endif
%if mmsize==8
    psllq   %%dst, (8-%3)*8
    psrlq   %4, %3*8
%else
    pslldq  %%dst, 16-%3
    psrldq  %4, %3
%endif
    por     %%dst, %4
%endif
%endmacro

; Packed unsigned byte average: pavgb on mmxext, pavgusb on 3dnow.
; Emits nothing when neither cpuflag is set.
%macro PAVGB 2
%if cpuflag(mmxext)
    pavgb   %1, %2
%elif cpuflag(3dnow)
    pavgusb %1, %2
%endif
%endmacro

; Forwards its whole operand list to pshufw (mmsize == 8) or pshuflw.
%macro PSHUFLW 1+
    %if mmsize == 8
        pshufw  %1
    %else
        pshuflw %1
    %endif
%endmacro

; Swap the two dwords of a 64-bit value: %1 = dst, %2 = src.
; Emits nothing when none of mmxext / 3dnowext / 3dnow is set.
%macro PSWAPD 2
%if cpuflag(mmxext)
    pshufw    %1, %2, q1032
%elif cpuflag(3dnowext)
    pswapd    %1, %2
%elif cpuflag(3dnow)
    movq      %1, %2
    psrlq     %1, 32
    punpckldq %1, %2
%endif
%endmacro

; Split the bytes of two registers into even and odd bytes (as words).
; On exit m%1 / m%3 hold reg1 / reg2 ANDed with the mask, and m%2 / m%4 hold
; reg1 / reg2 shifted right by 8 bits per word.
; When %5 is a register number it is used directly as the mask; otherwise %5
; is a complete operand that is first loaded into m%1.
%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
%ifnum %5
    pand   m%3, m%5, m%4 ; src .. y6 .. y4
    pand   m%1, m%5, m%2 ; dst .. y6 .. y4
%else
    mova   m%1, %5
    pand   m%3, m%1, m%4 ; src .. y6 .. y4
    pand   m%1, m%1, m%2 ; dst .. y6 .. y4
%endif
    psrlw  m%2, 8        ; dst .. y7 .. y5
    psrlw  m%4, 8        ; src .. y7 .. y5
%endmacro

; Sum/difference butterfly: with a = m%2 and b = m%3 on entry,
; %2 = a + b and %3 = b - a on exit.
;   %1 = element suffix for padd/psub, %2 = a, %3 = b (register numbers)
;   %4 = optional scratch register number; without it the difference is
;        computed as 2*b - (a + b), which needs no extra register.
%macro SUMSUB_BA 3-4
%if %0==3
    padd%1  m%2, m%3
    padd%1  m%3, m%3
    psub%1  m%3, m%2
%else
%if avx_enabled == 0
    mova    m%4, m%2
    padd%1  m%2, m%3
    psub%1  m%3, m%4
%else
    padd%1  m%4, m%2, m%3
    psub%1  m%3, m%2
    SWAP    %2, %4
%endif
%endif
%endmacro

; Two SUMSUB_BA butterflies: on (%2, %3) and on (%4, %5).
;   %1 = element suffix, %6 = optional scratch register number.
; Without a scratch register the two scratch-free butterflies are interleaved.
%macro SUMSUB_BADC 5-6
%if %0==6
    SUMSUB_BA %1, %2, %3, %6
    SUMSUB_BA %1, %4, %5, %6
%else
    padd%1  m%2, m%3
    padd%1  m%4, m%5
    padd%1  m%3, m%3
    padd%1  m%5, m%5
    psub%1  m%3, m%2
    psub%1  m%5, m%4
%endif
%endmacro

; With a = m%2 and b = %3 on entry: m%2 = 2*a + b and m%4 = a - 2*b on exit.
;   %1 = element suffix, %2 = a (register number),
;   %3 = b: register number, or a complete operand when not numeric,
;   %4 = output/scratch register number.
%macro SUMSUB2_AB 4
%ifnum %3
    psub%1  m%4, m%2, m%3
    psub%1  m%4, m%3
    padd%1  m%2, m%2
    padd%1  m%2, m%3
%else
    mova    m%4, m%2
    padd%1  m%2, m%2
    padd%1  m%2, %3
    psub%1  m%4, %3
    psub%1  m%4, %3
%endif
%endmacro

; With a = m%2 and b = m%3 on entry: %2 = a + 2*b and %3 = b - 2*a on exit.
;   %1 = element suffix, %2 = a, %3 = b, %4 = scratch (register numbers)
%macro SUMSUB2_BA 4
%if avx_enabled == 0
    mova    m%4, m%2
    padd%1  m%2, m%3
    padd%1  m%2, m%3
    psub%1  m%3, m%4
    psub%1  m%3, m%4
%else
    padd%1  m%4, m%2, m%3
    padd%1  m%4, m%3
    psub%1  m%3, m%2
    psub%1  m%3, m%2
    SWAP     %2,  %4
%endif
%endmacro

; With a = m%2 and b = m%3 on entry: %2 = (a >> 1) - b and %3 = (b >> 1) + a
; on exit, using arithmetic shifts.
;   %1 = element suffix, %2 = a, %3 = b (register numbers)
;   %4, %5 = scratch: register numbers when %4 is numeric, otherwise complete
;            operands that receive copies of b and a respectively.
%macro SUMSUBD2_AB 5
%ifnum %4
    psra%1  m%5, m%2, 1  ; %3: %3>>1
    psra%1  m%4, m%3, 1  ; %2: %2>>1
    padd%1  m%4, m%2     ; %3: %3>>1+%2
    psub%1  m%5, m%3     ; %2: %2>>1-%3
    SWAP     %2, %5
    SWAP     %3, %4
%else
    mova    %5, m%2
    mova    %4, m%3
    psra%1  m%3, 1  ; %3: %3>>1
    psra%1  m%2, 1  ; %2: %2>>1
    padd%1  m%3, %5 ; %3: %3>>1+%2
    psub%1  m%2, %4 ; %2: %2>>1-%3
%endif
%endmacro

; One 1-D pass of a 4-point DCT on packed words, built from SUMSUB_BADC,
; SUMSUB_BA and SUMSUB2_AB.
;   %1-%4 = register numbers holding the four inputs/outputs
;   %5    = scratch register number, or (when not numeric) an address that is
;           used as a spill slot for m%2
%macro DCT4_1D 5
%ifnum %5
    SUMSUB_BADC w, %4, %1, %3, %2, %5
    SUMSUB_BA   w, %3, %4, %5
    SUMSUB2_AB  w, %1, %2, %5
    SWAP %1, %3, %4, %5, %2
%else
    SUMSUB_BADC w, %4, %1, %3, %2
    SUMSUB_BA   w, %3, %4
    mova     [%5], m%2
    SUMSUB2_AB  w, %1, [%5], %2
    SWAP %1, %3, %4, %2
%endif
%endmacro

; One 1-D pass of a 4-point inverse DCT.
;   %1    = element suffix (w selects a 16-byte second spill slot, anything
;           else a 32-byte offset)
;   %2-%5 = register numbers holding the four inputs/outputs
;   %6    = scratch register number, or (when not numeric) a base address
;           providing two spill slots
;   %7    = second scratch register number (register path only)
%macro IDCT4_1D 6-7
%ifnum %6
    SUMSUBD2_AB %1, %3, %5, %7, %6
    ; %3: %3>>1-%5 %5: %3+%5>>1
    SUMSUB_BA   %1, %4, %2, %7
    ; %4: %2+%4 %2: %2-%4
    SUMSUB_BADC %1, %5, %4, %3, %2, %7
    ; %5: %2+%4 + (%3+%5>>1)
    ; %4: %2+%4 - (%3+%5>>1)
    ; %3: %2-%4 + (%3>>1-%5)
    ; %2: %2-%4 - (%3>>1-%5)
%else
%ifidn %1, w
    SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
%else
    SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
%endif
    SUMSUB_BA   %1, %4, %2
    SUMSUB_BADC %1, %5, %4, %3, %2
%endif
    SWAP %2, %5, %4
    ; %2: %2+%4 + (%3+%5>>1) row0
    ; %3: %2-%4 + (%3>>1-%5) row1
    ; %4: %2-%4 - (%3>>1-%5) row2
    ; %5: %2+%4 - (%3+%5>>1) row3
%endmacro


; Load two rows of bytes and produce their difference as packed words.
;   %1 = dst, %2 = scratch, %3 = register used as the high byte when widening
;   (presumably all zero -- confirm at call sites) or the literal "none",
;   %4 = minuend memory operand, %5 = subtrahend memory operand
; With "none", %1 is interleaved with %2 and %2 with itself, so the unwanted
; high bytes are identical in both and cancel in the subtraction.
%macro LOAD_DIFF 5
%ifidn %3, none
    movh       %1, %4
    movh       %2, %5
    punpcklbw  %1, %2
    punpcklbw  %2, %2
    psubw      %1, %2
%else
    movh       %1, %4
    punpcklbw  %1, %3
    movh       %2, %5
    punpcklbw  %2, %3
    psubw      %1, %2
%endif
%endmacro

; Store m%1..m%4 to memory at [%5+%6]: the low 8 bytes of each register go to
; offsets 0/8/16/24 and the high 8 bytes to offsets 32/40/48/56.
%macro STORE_DCT 6
    movq   [%5+%6+ 0], m%1
    movq   [%5+%6+ 8], m%2
    movq   [%5+%6+16], m%3
    movq   [%5+%6+24], m%4
    movhps [%5+%6+32], m%1
    movhps [%5+%6+40], m%2
    movhps [%5+%6+48], m%3
    movhps [%5+%6+56], m%4
%endmacro

; Four LOAD_DIFF rows from two pointers.
;   %1-%4 = dst register numbers, %5/%6 = scratch register numbers,
;   %7 = register number forwarded as LOAD_DIFF's 3rd argument,
;   %8/%9 = pointers (default r0 / r2), %10 = nonzero to advance both
;   pointers by four rows afterwards (default 0).
; Uses r1 / r3 as the row strides and r4 / r5 as the offset of the fourth row
; (presumably 3*stride -- confirm at call sites).
%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment?
    LOAD_DIFF m%1, m%5, m%7, [%8],      [%9]
    LOAD_DIFF m%2, m%6, m%7, [%8+r1],   [%9+r3]
    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
    LOAD_DIFF m%4, m%6, m%7, [%8+r4],   [%9+r5]
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

; Add two rows of residual words (each shifted right by 6) to two rows of
; pixels and pack both results into %2 (row %2 in the low half, row %1 in the
; high half) with unsigned saturation. Nothing is stored to memory.
;   %1, %2 = residual registers, %3 = scratch, %4 = register used as the high
;   byte when widening (presumably zero), %5, %6 = pixel memory operands.
;   The optional 7th argument is accepted but not used.
%macro DIFFx2 6-7
    movh       %3, %5
    punpcklbw  %3, %4
    psraw      %1, 6
    paddsw     %1, %3
    movh       %3, %6
    punpcklbw  %3, %4
    psraw      %2, 6
    paddsw     %2, %3
    packuswb   %2, %1
%endmacro

; mem = saturate_u8(mem + (%1 >> 6)) for one row.
;   %1 = residual words (clobbered), %2 = scratch, %3 = register used as the
;   high byte when widening (presumably zero), %4 = pixel memory operand.
%macro STORE_DIFF 4
    movh       %2, %4
    punpcklbw  %2, %3
    psraw      %1, 6
    paddsw     %1, %2
    packuswb   %1, %1
    movh       %4, %1
%endmacro

; Two-row version of STORE_DIFF with a caller-chosen shift; rows live at
; [%7] and [%7+%8]. %1 and %2 are shifted in place; %3 and %4 are clobbered.
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
    movh       %3, [%7]
    movh       %4, [%7+%8]
    psraw      %1, %6
    psraw      %2, %6
    punpcklbw  %3, %5
    punpcklbw  %4, %5
    paddw      %3, %1
    paddw      %4, %2
    packuswb   %3, %5
    packuswb   %4, %5
    movh       [%7], %3
    movh    [%7+%8], %4
%endmacro

; Packed unsigned byte minimum. The fallback computes
; dst - saturating_sub(dst, src), which equals min(dst, src).
%macro PMINUB 3 ; dst, src, ignored
%if cpuflag(mmxext)
    pminub   %1, %2
%else ; dst, src, tmp
    mova     %3, %1
    psubusb  %3, %2
    psubb    %1, %3
%endif
%endmacro

; Broadcast word number %3 (default 0) of %2 to every word of %1.
; Without pshufw, two punpck steps select and duplicate the word: bit 1 of the
; index picks the first high/low unpack, bit 0 the second.
%macro SPLATW 2-3 0
%if mmsize == 16
    pshuflw    %1, %2, (%3)*0x55
    punpcklqdq %1, %1
%elif cpuflag(mmxext)
    pshufw     %1, %2, (%3)*0x55
%else
    %ifnidn %1, %2
        mova       %1, %2
    %endif
    %if %3 & 2
        punpckhwd  %1, %1
    %else
        punpcklwd  %1, %1
    %endif
    %if %3 & 1
        punpckhwd  %1, %1
    %else
        punpcklwd  %1, %1
    %endif
%endif
%endmacro

; Broadcast the low dword of %1 to every dword of %1.
; Emits nothing for mmsize != 8 when neither sse2 nor sse is set.
%macro SPLATD 1
%if mmsize == 8
    punpckldq  %1, %1
%elif cpuflag(sse2)
    pshufd  %1, %1, 0
%elif cpuflag(sse)
    shufps  %1, %1, 0
%endif
%endmacro

; Clamp packed signed words to the range [min, max].
%macro CLIPW 3 ;(dst, min, max)
    pmaxsw %1, %2
    pminsw %1, %3
%endmacro

; Packed signed dword minimum from compare and logic ops:
; mask = (src > dst); dst = ((dst xor src) and mask) xor src.
%macro PMINSD_MMX 3 ; dst, src, tmp
    mova      %3, %2
    pcmpgtd   %3, %1
    pxor      %1, %2
    pand      %1, %3
    pxor      %1, %2
%endmacro

; Packed signed dword maximum from compare and logic ops:
; mask = (dst > src); dst = (dst and mask) or (src and not mask).
%macro PMAXSD_MMX 3 ; dst, src, tmp
    mova      %3, %1
    pcmpgtd   %3, %2
    pand      %1, %3
    pandn     %3, %2
    por       %1, %3
%endmacro

; Clamp packed signed dwords to [min, max] using the two emulations above.
%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp
    PMINSD_MMX %1, %3, %4
    PMAXSD_MMX %1, %2, %4
%endmacro

; Clamp packed signed dwords through a round trip to single-precision floats;
; the bounds must therefore be supplied as floats.
%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
    cvtdq2ps  %1, %1
    minps     %1, %3
    maxps     %1, %2
    cvtps2dq  %1, %1
%endmacro

; Clamp packed signed dwords to [min, max] with pminsd / pmaxsd.
%macro CLIPD_SSE41 3-4 ;  src/dst, min, max, unused
    pminsd  %1, %3
    pmaxsd  %1, %2
%endmacro

; Broadcast a 32-bit float from memory to every lane of the destination.
%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
%if cpuflag(avx)
    vbroadcastss %1, %2
%else ; sse
    movss        %1, %2
    shufps       %1, %1, 0
%endif
%endmacro

; Broadcast a 64-bit double from memory to every lane of the destination.
; vbroadcastsd is only used when mmsize == 32; otherwise movddup (sse3) or a
; movsd + movlhps pair is emitted.
%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64
%if cpuflag(avx) && mmsize == 32
    vbroadcastsd %1, %2
%elif cpuflag(sse3)
    movddup      %1, %2
%else ; sse2
    movsd        %1, %2
    movlhps      %1, %1
%endif
%endmacro

; Emit 16 bytes of byte-shuffle mask data from eight word indices.
; An index w below 0x80 expands to the byte pair (2*w, 2*w+1); a value of
; 0x80 or above is emitted unchanged for both bytes.
%macro SHUFFLE_MASK_W 8
    %rep 8
        %if %1>=0x80
            db %1, %1
        %else
            db %1*2
            db %1*2+1
        %endif
        %rotate 1
    %endrep
%endmacro

; Sign-extend the low packed words of src to dwords in dst.
; The fallback duplicates each low word into both halves of a dword and then
; shifts right arithmetically by 16.
%macro PMOVSXWD 2; dst, src
%if cpuflag(sse4)
    pmovsxwd     %1, %2
%else
    %ifnidn %1, %2
    mova         %1, %2
    %endif
    punpcklwd    %1, %1
    psrad        %1, 16
%endif
%endmacro