;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_16
cextern pw_5
cextern pb_0

SECTION .text


%macro op_avgh 3
    movh         %3, %2
    pavgb        %1, %3
    movh         %2, %1
%endmacro

%macro op_avg 2-3
    pavgb        %1, %2
    mova         %2, %1
%endmacro

%macro op_puth 2-3
    movh         %2, %1
%endmacro

%macro op_put 2-3
    mova         %2, %1
%endmacro

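; The qpel*_h/v_lowpass kernels below implement the H.264 6-tap half-pel
; interpolation filter with taps (1, -5, 20, 20, -5, 1). A scalar sketch of
; what each output pixel computes (clip8 clamps to [0,255]):
;
;   dst[x] = clip8((src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
;                   - 5*src[x+2] + src[x+3] + 16) >> 5);
;
; With the symmetric pair sums a = src[x-1]+src[x+2], b = src[x]+src[x+1]
; and c = src[x-2]+src[x+3], this is (5*(4*b - a) + c + 16) >> 5, so each
; row needs only psllw/psubw, one pmullw by [pw_5] and the [pw_16] bias.
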
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor         m7, m7
    mova         m4, [pw_5]
    mova         m5, [pw_16]
    mov         r4d, 4
.loop:
    movh         m1, [r1-1]
    movh         m2, [r1+0]
    movh         m3, [r1+1]
    movh         m0, [r1+2]
    punpcklbw    m1, m7
    punpcklbw    m2, m7
    punpcklbw    m3, m7
    punpcklbw    m0, m7
    paddw        m1, m0
    paddw        m2, m3
    movh         m0, [r1-2]
    movh         m3, [r1+3]
    punpcklbw    m0, m7
    punpcklbw    m3, m7
    paddw        m0, m3
    psllw        m2, 2
    psubw        m2, m1
    pmullw       m2, m4
    paddw        m0, m5
    paddw        m0, m2
    psraw        m0, 5
    packuswb     m0, m0
    op_%1h       m0, [r0], m6
    add          r0, r2
    add          r1, r3
    dec         r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg

%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    mov         r4d, 8
    pxor         m7, m7
    mova         m6, [pw_5]
.loop:
    mova         m0, [r1]
    mova         m2, [r1+1]
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    paddw        m0, m2
    paddw        m1, m3
    psllw        m0, 2
    psllw        m1, 2
    mova         m2, [r1-1]
    mova         m4, [r1+2]
    mova         m3, m2
    mova         m5, m4
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    punpcklbw    m4, m7
    punpckhbw    m5, m7
    paddw        m2, m4
    paddw        m5, m3
    psubw        m0, m2
    psubw        m1, m5
    pmullw       m0, m6
    pmullw       m1, m6
    movd         m2, [r1-2]
    movd         m5, [r1+7]
    punpcklbw    m2, m7
    punpcklbw    m5, m7
    paddw        m2, m3
    paddw        m4, m5
    mova         m5, [pw_16]
    paddw        m2, m5
    paddw        m4, m5
    paddw        m0, m2
    paddw        m1, m4
    psraw        m0, 5
    psraw        m1, 5
    packuswb     m0, m1
    op_%1        m0, [r0], m4
    add          r0, r2
    add          r1, r3
    dec         r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg

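; SSSE3 variant of the 8-wide horizontal filter: rather than reloading the
; source at six offsets, it loads [r1-2] once, unpacks the two 8-byte halves
; to words, and derives the five shifted windows with palignr (2 bytes = one
; pixel step in the word domain). The arithmetic is the same
; (5*(4*b - a) + c + 16) >> 5 as in the MMX version above.
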
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    mov         r4d, 8
    pxor         m7, m7
    mova         m6, [pw_5]
.loop:
    movu         m1, [r1-2]
    mova         m0, m1
    punpckhbw    m1, m7
    punpcklbw    m0, m7
    mova         m2, m1
    mova         m3, m1
    mova         m4, m1
    mova         m5, m1
    palignr      m4, m0, 2
    palignr      m3, m0, 4
    palignr      m2, m0, 6
    palignr      m1, m0, 8
    palignr      m5, m0, 10
    paddw        m0, m5
    paddw        m2, m3
    paddw        m1, m4
    psllw        m2, 2
    psubw        m2, m1
    paddw        m0, [pw_16]
    pmullw       m2, m6
    paddw        m2, m0
    psraw        m2, 5
    packuswb     m2, m2
    op_%1h       m2, [r0], m4
    add          r1, r3
    add          r0, r2
    dec         r4d
    jne .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg

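; The *_l2 variants compute the same horizontal lowpass and then pavgb the
; result with a second prediction read from src2 (r2) before storing; this
; is how the quarter-pel positions that average two candidate planes are
; built. src2 advances by its own stride in r4.
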
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    pxor         m7, m7
    mova         m4, [pw_5]
    mova         m5, [pw_16]
    mov         r5d, 4
.loop:
    movh         m1, [r1-1]
    movh         m2, [r1+0]
    movh         m3, [r1+1]
    movh         m0, [r1+2]
    punpcklbw    m1, m7
    punpcklbw    m2, m7
    punpcklbw    m3, m7
    punpcklbw    m0, m7
    paddw        m1, m0
    paddw        m2, m3
    movh         m0, [r1-2]
    movh         m3, [r1+3]
    punpcklbw    m0, m7
    punpcklbw    m3, m7
    paddw        m0, m3
    psllw        m2, 2
    psubw        m2, m1
    pmullw       m2, m4
    paddw        m0, m5
    paddw        m0, m2
    movh         m3, [r2]
    psraw        m0, 5
    packuswb     m0, m0
    pavgb        m0, m3
    op_%1h       m0, [r0], m6
    add          r0, r3
    add          r1, r3
    add          r2, r4
    dec         r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg


%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov         r5d, 8
    pxor         m7, m7
    mova         m6, [pw_5]
.loop:
    mova         m0, [r1]
    mova         m2, [r1+1]
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    paddw        m0, m2
    paddw        m1, m3
    psllw        m0, 2
    psllw        m1, 2
    mova         m2, [r1-1]
    mova         m4, [r1+2]
    mova         m3, m2
    mova         m5, m4
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    punpcklbw    m4, m7
    punpckhbw    m5, m7
    paddw        m2, m4
    paddw        m5, m3
    psubw        m0, m2
    psubw        m1, m5
    pmullw       m0, m6
    pmullw       m1, m6
    movd         m2, [r1-2]
    movd         m5, [r1+7]
    punpcklbw    m2, m7
    punpcklbw    m5, m7
    paddw        m2, m3
    paddw        m4, m5
    mova         m5, [pw_16]
    paddw        m2, m5
    paddw        m4, m5
    paddw        m0, m2
    paddw        m1, m4
    psraw        m0, 5
    psraw        m1, 5
    mova         m4, [r2]
    packuswb     m0, m1
    pavgb        m0, m4
    op_%1        m0, [r0], m4
    add          r0, r3
    add          r1, r3
    add          r2, r4
    dec         r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg


%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov         r5d, 8
    pxor         m7, m7
    mova         m6, [pw_5]
.loop:
    lddqu        m1, [r1-2]
    mova         m0, m1
    punpckhbw    m1, m7
    punpcklbw    m0, m7
    mova         m2, m1
    mova         m3, m1
    mova         m4, m1
    mova         m5, m1
    palignr      m4, m0, 2
    palignr      m3, m0, 4
    palignr      m2, m0, 6
    palignr      m1, m0, 8
    palignr      m5, m0, 10
    paddw        m0, m5
    paddw        m2, m3
    paddw        m1, m4
    psllw        m2, 2
    movh         m3, [r2]
    psubw        m2, m1
    paddw        m0, [pw_16]
    pmullw       m2, m6
    paddw        m2, m0
    psraw        m2, 5
    packuswb     m2, m2
    pavgb        m2, m3
    op_%1h       m2, [r0], m4
    add          r1, r3
    add          r0, r3
    add          r2, r4
    dec         r5d
    jg .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg

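; FILT_V emits one output row of the vertical 6-tap filter from the six
; source rows held in m0..m5 (m5 is fetched inside the macro), then rotates
; the registers with SWAP so each call slides the window down by one row;
; callers prime m0..m4 and invoke the macro once per output row.
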
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
%macro FILT_V 1
    mova         m6, m2
    movh         m5, [r1]
    paddw        m6, m3
    psllw        m6, 2
    psubw        m6, m1
    psubw        m6, m4
    punpcklbw    m5, m7
    pmullw       m6, [pw_5]
    paddw        m0, [pw_16]
    add          r1, r3
    paddw        m0, m5
    paddw        m6, m0
    psraw        m6, 5
    packuswb     m6, m6
    op_%1h       m6, [r0], m0 ; 1
    add          r0, r2
    SWAP 0, 1, 2, 3, 4, 5
%endmacro

%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    sub          r1, r3
    sub          r1, r3
    pxor         m7, m7
    movh         m0, [r1]
    movh         m1, [r1+r3]
    lea          r1, [r1+2*r3]
    movh         m2, [r1]
    movh         m3, [r1+r3]
    lea          r1, [r1+2*r3]
    movh         m4, [r1]
    add          r1, r3
    punpcklbw    m0, m7
    punpcklbw    m1, m7
    punpcklbw    m2, m7
    punpcklbw    m3, m7
    punpcklbw    m4, m7
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    RET
%endmacro

INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg


%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    sub          r1, r3
    sub          r1, r3
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
%endif
    pxor         m7, m7
    movh         m0, [r1]
    movh         m1, [r1+r3]
    lea          r1, [r1+2*r3]
    movh         m2, [r1]
    movh         m3, [r1+r3]
    lea          r1, [r1+2*r3]
    movh         m4, [r1]
    add          r1, r3
    punpcklbw    m0, m7
    punpcklbw    m1, m7
    punpcklbw    m2, m7
    punpcklbw    m3, m7
    punpcklbw    m4, m7
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    cmp         r4d, 16
    jne .end
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
    FILT_V %1
.end:
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg

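; The hv (2D) cases run in two passes. FILT_HV below is the vertical 6-tap
; pass: it keeps the full 16-bit result (with the +16 rounding bias folded
; in) and stores it to a temp buffer instead of packing back to pixels. The
; horizontal pass then combines the outer, mid and centre pair sums A, B, C
; of each temp row as (((A - B) >> 2 - B + C) >> 2 + C) >> 6, a
; multiply-free reformulation of (A - 5*B + 20*C + 512) >> 10; the +512
; rounding term is already present because the six temp values enter with
; coefficient sum 32 and each carries the +16 bias (16 * 32 = 512).
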
; All functions that use this are required to have args:
; src, tmp, srcSize
%macro FILT_HV 1 ; offset
    mova         m6, m2
    movh         m5, [r0]
    paddw        m6, m3
    psllw        m6, 2
    paddw        m0, [pw_16]
    psubw        m6, m1
    psubw        m6, m4
    punpcklbw    m5, m7
    pmullw       m6, [pw_5]
    paddw        m0, m5
    add          r0, r2
    paddw        m6, m0
    mova         [r1+%1], m6
    SWAP 0, 1, 2, 3, 4, 5
%endmacro

%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
    movsxdifnidn r2, r2d
    pxor         m7, m7
    movh         m0, [r0]
    movh         m1, [r0+r2]
    lea          r0, [r0+2*r2]
    movh         m2, [r0]
    movh         m3, [r0+r2]
    lea          r0, [r0+2*r2]
    movh         m4, [r0]
    add          r0, r2
    punpcklbw    m0, m7
    punpcklbw    m1, m7
    punpcklbw    m2, m7
    punpcklbw    m3, m7
    punpcklbw    m4, m7
    FILT_HV 0*24
    FILT_HV 1*24
    FILT_HV 2*24
    FILT_HV 3*24
    RET

cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
    movsxdifnidn r2, r2d
    mov         r3d, 4
.loop:
    mova         m0, [r0]
    paddw        m0, [r0+10]
    mova         m1, [r0+2]
    paddw        m1, [r0+8]
    mova         m2, [r0+4]
    paddw        m2, [r0+6]
    psubw        m0, m1
    psraw        m0, 2
    psubw        m0, m1
    paddsw       m0, m2
    psraw        m0, 2
    paddw        m0, m2
    psraw        m0, 6
    packuswb     m0, m0
    op_%1h       m0, [r1], m7
    add          r0, 24
    add          r1, r2
    dec         r3d
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg

%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
    movsxdifnidn r2, r2d
    pxor         m7, m7
    movh         m0, [r0]
    movh         m1, [r0+r2]
    lea          r0, [r0+2*r2]
    movh         m2, [r0]
    movh         m3, [r0+r2]
    lea          r0, [r0+2*r2]
    movh         m4, [r0]
    add          r0, r2
    punpcklbw    m0, m7
    punpcklbw    m1, m7
    punpcklbw    m2, m7
    punpcklbw    m3, m7
    punpcklbw    m4, m7
    FILT_HV 0*48
    FILT_HV 1*48
    FILT_HV 2*48
    FILT_HV 3*48
    FILT_HV 4*48
    FILT_HV 5*48
    FILT_HV 6*48
    FILT_HV 7*48
    cmp         r3d, 16
    jne .end
    FILT_HV 8*48
    FILT_HV 9*48
    FILT_HV 10*48
    FILT_HV 11*48
    FILT_HV 12*48
    FILT_HV 13*48
    FILT_HV 14*48
    FILT_HV 15*48
.end:
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV1_LOWPASS_OP put
QPEL8OR16_HV1_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put


%macro QPEL8OR16_HV2_LOWPASS_OP 1
; unused is to match ssse3 and mmxext args
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
    movsxdifnidn r2, r2d
.loop:
    mova         m0, [r1]
    mova         m3, [r1+8]
    mova         m1, [r1+2]
    mova         m4, [r1+10]
    paddw        m0, m4
    paddw        m1, m3
    paddw        m3, [r1+18]
    paddw        m4, [r1+16]
    mova         m2, [r1+4]
    mova         m5, [r1+12]
    paddw        m2, [r1+6]
    paddw        m5, [r1+14]
    psubw        m0, m1
    psubw        m3, m4
    psraw        m0, 2
    psraw        m3, 2
    psubw        m0, m1
    psubw        m3, m4
    paddsw       m0, m2
    paddsw       m3, m5
    psraw        m0, 2
    psraw        m3, 2
    paddw        m0, m2
    paddw        m3, m5
    psraw        m0, 6
    psraw        m3, 6
    packuswb     m0, m3
    op_%1        m0, [r0], m7
    add          r1, 48
    add          r0, r2
    dec         r4d
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg

%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    cmp         r4d, 16
    je .op16
.loop8:
    mova         m1, [r1+16]
    mova         m0, [r1]
    mova         m2, m1
    mova         m3, m1
    mova         m4, m1
    mova         m5, m1
    palignr      m5, m0, 10
    palignr      m4, m0, 8
    palignr      m3, m0, 6
    palignr      m2, m0, 4
    palignr      m1, m0, 2
    paddw        m0, m5
    paddw        m1, m4
    paddw        m2, m3
    psubw        m0, m1
    psraw        m0, 2
    psubw        m0, m1
    paddw        m0, m2
    psraw        m0, 2
    paddw        m0, m2
    psraw        m0, 6
    packuswb     m0, m0
    op_%1h       m0, [r0], m7
    add          r1, 48
    add          r0, r2
    dec         r4d
    jne .loop8
    jmp .done
.op16:
    mova         m4, [r1+32]
    mova         m5, [r1+16]
    mova         m7, [r1]
    mova         m3, m4
    mova         m2, m4
    mova         m1, m4
    mova         m0, m4
    palignr      m0, m5, 10
    palignr      m1, m5, 8
    palignr      m2, m5, 6
    palignr      m3, m5, 4
    palignr      m4, m5, 2
    paddw        m0, m5
    paddw        m1, m4
    paddw        m2, m3
    mova         m6, m5
    mova         m4, m5
    mova         m3, m5
    palignr      m4, m7, 8
    palignr      m6, m7, 2
    palignr      m3, m7, 10
    paddw        m4, m6
    mova         m6, m5
    palignr      m5, m7, 6
    palignr      m6, m7, 4
    paddw        m3, m7
    paddw        m5, m6
    psubw        m0, m1
    psubw        m3, m4
    psraw        m0, 2
    psraw        m3, 2
    psubw        m0, m1
    psubw        m3, m4
    paddw        m0, m2
    paddw        m3, m5
    psraw        m0, 2
    psraw        m3, 2
    paddw        m0, m2
    paddw        m3, m5
    psraw        m0, 6
    psraw        m3, 6
    packuswb     m3, m0
    op_%1        m3, [r0], m7
    add          r1, 48
    add          r0, r2
    dec         r4d
    jne .op16
.done:
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg

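; pixels*_l2_shift5 finish the quarter-pel positions that are built from the
; hv temp buffer: they scale rows of 16-bit first-pass intermediates back to
; pixels (the +16 bias is already folded into the temp data, so a plain
; psraw by 5 suffices), pack to bytes, and pavgb with an 8-bit prediction
; read from src8. Temp rows are 24 bytes apart for width 4, 48 for width 8.
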
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5, 6,6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mova         m0, [r1]
    mova         m1, [r1+24]
    psraw        m0, 5
    psraw        m1, 5
    packuswb     m0, m0
    packuswb     m1, m1
    pavgb        m0, [r2]
    pavgb        m1, [r2+r4]
    op_%1h       m0, [r0], m4
    op_%1h       m1, [r0+r3], m5
    lea          r2, [r2+r4*2]
    lea          r0, [r0+r3*2]
    mova         m0, [r1+48]
    mova         m1, [r1+72]
    psraw        m0, 5
    psraw        m1, 5
    packuswb     m0, m0
    packuswb     m1, m1
    pavgb        m0, [r2]
    pavgb        m1, [r2+r4]
    op_%1h       m0, [r0], m4
    op_%1h       m1, [r0+r3], m5
    RET
%endmacro

INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg


%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6,6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
.loop:
    mova         m0, [r1]
    mova         m1, [r1+8]
    mova         m2, [r1+48]
    mova         m3, [r1+48+8]
    psraw        m0, 5
    psraw        m1, 5
    psraw        m2, 5
    psraw        m3, 5
    packuswb     m0, m1
    packuswb     m2, m3
    pavgb        m0, [r2]
    pavgb        m2, [r2+r4]
    op_%1        m0, [r0], m4
    op_%1        m2, [r0+r3], m5
    lea          r2, [r2+2*r4]
    add          r1, 48*2
    lea          r0, [r0+2*r3]
    sub         r5d, 2
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg


%if ARCH_X86_64
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5,6,16 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov         r5d, 16
    pxor        m15, m15
    mova        m14, [pw_5]
    mova        m13, [pw_16]
.loop:
    lddqu        m1, [r1+6]
    lddqu        m7, [r1-2]
    mova         m0, m1
    punpckhbw    m1, m15
    punpcklbw    m0, m15
    punpcklbw    m7, m15
    mova         m2, m1
    mova         m6, m0
    mova         m3, m1
    mova         m8, m0
    mova         m4, m1
    mova         m9, m0
    mova        m12, m0
    mova        m11, m1
    palignr     m11, m0, 10
    palignr     m12, m7, 10
    palignr      m4, m0, 2
    palignr      m9, m7, 2
    palignr      m3, m0, 4
    palignr      m8, m7, 4
    palignr      m2, m0, 6
    palignr      m6, m7, 6
    paddw       m11, m0
    palignr      m1, m0, 8
    palignr      m0, m7, 8
    paddw        m7, m12
    paddw        m2, m3
    paddw        m6, m8
    paddw        m1, m4
    paddw        m0, m9
    psllw        m2, 2
    psllw        m6, 2
    psubw        m2, m1
    psubw        m6, m0
    paddw       m11, m13
    paddw        m7, m13
    pmullw       m2, m14
    pmullw       m6, m14
    lddqu        m3, [r2]
    paddw        m2, m11
    paddw        m6, m7
    psraw        m2, 5
    psraw        m6, 5
    packuswb     m6, m2
    pavgb        m6, m3
    op_%1        m6, [r0], m11
    add          r1, r3
    add          r0, r3
    add          r2, r4
    dec         r5d
    jg .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif