;*****************************************************************************
;* SSE2-optimized weighted prediction code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; biweight pred:
;
; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
;                            int height, int log2_denom, int weightd,
;                            int weights, int offset);
; and
; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
;                          int log2_denom, int weight, int offset);
;
; Register roles (x86inc argument numbering):
;   weight:   r0 = dst, r1 = stride, r2 = height, r3 = log2_denom,
;             r4 = weight, r5 = offset
;   biweight: r0 = dst, r1 = src, r2 = stride, r3 = height,
;             r4 = log2_denom, r5 = weightd, r6 = weights, r7m = offset
;
; NOTE(review): stride is declared as int above but is added to the pointers
; as a full-width register; on x86-64 this assumes the caller passes it
; already sign-extended -- confirm against the C prototypes.
;-----------------------------------------------------------------------------

; Prepare the constants for the single-reference weight functions.
; Out: m3 = weight in every word lane
;      m5 = low 16 bits of (((2*offset + 1) << log2_denom) >> 1) in every
;           word lane (offset plus rounding term)
;      m6 = log2_denom (shift count)
;      m7 = 0 (used to zero-extend bytes to words)
; Clobbers: r5
%macro WEIGHT_SETUP 0
    add        r5, r5
    inc        r5                   ; r5 = 2*offset + 1
    movd       m3, r4d
    movd       m5, r5d
    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endmacro

; Weight two half-register groups of pixels from dst in place.
; %1, %2: offsets (immediate or register) added to r0 for the two loads.
; Out: m0 = (pixels * m3 + m5) >> m6, packed to bytes with unsigned
;      saturation; low half comes from [r0+%1], high half from [r0+%2].
; Clobbers: m1
%macro WEIGHT_OP 2
    movh       m0, [r0+%1]
    movh       m1, [r0+%2]
    punpcklbw  m0, m7
    punpcklbw  m1, m7
    pmullw     m0, m3
    pmullw     m1, m3
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro

; 16-pixel-wide weight using 8-byte MMX registers: two stores per row.
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP  0,  4
    mova [r0  ], m0
    WEIGHT_OP  8, 12
    mova [r0+8], m0
    add        r0, r1
    dec        r2d
    jnz .nextrow
    REP_RET

; Weight function that handles one full register (mmsize pixels) per row.
; %1: block width used in the function name, %2: number of XMM registers
;     passed to cglobal.
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP  0, mmsize/2
    mova     [r0], m0
    add        r0, r1
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_MM  8, 0
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8

; Weight function for blocks half a register wide: two rows are processed
; per iteration (row 0 in the low half of m0, row 1 in the high half).
; %1: block width used in the function name, %2: number of XMM registers
;     passed to cglobal.
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
    sar        r2d, 1               ; two rows per iteration
    lea        r3, [r1*2]           ; r3 (log2_denom, now in m6) = 2*stride
.nextrow:
    WEIGHT_OP  0, r1
    movh     [r0], m0
%if mmsize == 16
    movhps   [r0+r1], m0
%else
    psrlq      m0, 32               ; bring the second row down to the low half
    movh     [r0+r1], m0
%endif
    add        r0, r3
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8

; Prepare the constants for the bi-directional weight functions.
; The offset is read from r7m into off_regd, which is r7d on x86-64 and r3d
; on x86-32; r3 is also the height argument, which is why every user of this
; macro executes "movifnidn r3d, r3m" right after it.
;
; When weightd == 128 both weights and the offset term are halved and the
; shift is reduced by one to compensate (presumably so the weights stay
; within the signed 8-bit range needed by pmaddubsw in the SSSE3 path --
; confirm).
;
; Out (SSSE3):  m4 = bytes (weightd, weights) interleaved in every word lane
;               m5 = rounding/offset term in every word lane
;               m6 = shift count
;               m0 is clobbered; m7 is left untouched
; Out (other):  m3 = weightd in every word lane
;               m4 = weights in every word lane
;               m5 = rounding/offset term in every word lane
;               m6 = shift count
;               m7 = 0
; Clobbers: r4, r5, r6, off_regd
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
    mov  off_regd, r7m
    add  off_regd, 1
    or   off_regd, 1                ; off = (offset + 1) | 1
    add        r4, 1                ; shift = log2_denom + 1
    cmp        r5d, 128             ; weightd is a 32-bit int: compare only r5d
    jne .normal
    sar        r5, 1
    sar        r6, 1
    sar  off_regd, 1
    sub        r4, 1
.normal:
%if cpuflag(ssse3)
    movd       m4, r5d
    movd       m0, r6d
%else
    movd       m3, r5d
    movd       m4, r6d
%endif
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6
    psrld      m5, 1
%if cpuflag(ssse3)
    punpcklbw  m4, m0               ; (weightd, weights) byte pair
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4
    punpcklqdq m5, m5

%else
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endif
%endmacro

; Load half a register of pixels from dst and src at offset %3 and combine.
; %1: destination register number, %2: scratch register number,
; %3: offset (immediate or register) added to both r0 and r1.
; Out: m%1 = dst_pixels * m3 + src_pixels * m4 (words, saturating add)
; Clobbers: m%2
%macro BIWEIGHT_STEPA 3
    movh       m%1, [r0+%3]
    movh       m%2, [r1+%3]
    punpcklbw  m%1, m7
    punpcklbw  m%2, m7
    pmullw     m%1, m3
    pmullw     m%2, m4
    paddsw     m%1, m%2
%endmacro

; Finish the two sums left in m0 and m1 by BIWEIGHT_STEPA: add the
; rounding/offset term, shift right by m6 and pack to bytes in m0 with
; unsigned saturation.
%macro BIWEIGHT_STEPB 0
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro

; 16-pixel-wide biweight using 8-byte MMX registers: two stores per row.
INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; height (r3d may have held the offset)
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, 4
    BIWEIGHT_STEPB
    mova       [r0], m0
    BIWEIGHT_STEPA 0, 1, 8
    BIWEIGHT_STEPA 1, 2, 12
    BIWEIGHT_STEPB
    mova     [r0+8], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET

; Biweight function that handles one full register (mmsize pixels) per row.
; %1: block width used in the function name, %2: number of XMM registers
;     passed to cglobal.
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; height (r3d may have held the offset)
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, mmsize/2
    BIWEIGHT_STEPB
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_MM  8, 0
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8

; Biweight function for blocks half a register wide: two rows are processed
; per iteration (row 0 in the low half of m0, row 1 in the high half).
; %1: block width used in the function name, %2: number of XMM registers
;     passed to cglobal.
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; height (r3d may have held the offset)
    sar        r3d, 1               ; two rows per iteration; 32-bit shift so
                                    ; undefined upper bits cannot enter r3d
    lea        r4, [r2*2]           ; r4 (shift, now in m6) = 2*stride
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, r2
    BIWEIGHT_STEPB
    movh       [r0], m0
%if mmsize == 16
    movhps     [r0+r2], m0
%else
    psrlq      m0, 32               ; bring the second row down to the low half
    movh       [r0+r2], m0
%endif
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8

; SSSE3 core: m0 and m2 hold interleaved (dst, src) bytes; pmaddubsw with the
; (weightd, weights) byte pairs in m4 yields dst*weightd + src*weights per
; word. The sums are then rounded/offset by m5, shifted by m6 and packed to
; bytes in m0 with unsigned saturation.
; Clobbers: m2
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m0, m4
    pmaddubsw  m2, m4
    paddsw     m0, m5
    paddsw     m2, m5
    psraw      m0, m6
    psraw      m2, m6
    packuswb   m0, m2
%endmacro

; 16-pixel-wide SSSE3 biweight: one row per iteration.
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; height (r3d may have held the offset)

.nextrow:
    movh       m0, [r0]
    movh       m2, [r0+8]
    movh       m3, [r1+8]
    punpcklbw  m0, [r1]             ; interleave dst/src bytes 0..7
    punpcklbw  m2, m3               ; interleave dst/src bytes 8..15
    BIWEIGHT_SSSE3_OP
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET

; 8-pixel-wide SSSE3 biweight: two rows per iteration.
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; height (r3d may have held the offset)
    sar        r3d, 1               ; two rows per iteration; 32-bit shift so
                                    ; undefined upper bits cannot enter r3d
    lea        r4, [r2*2]           ; r4 (shift, now in m6) = 2*stride

.nextrow:
    movh       m0, [r0]
    movh       m1, [r1]
    movh       m2, [r0+r2]
    movh       m3, [r1+r2]
    punpcklbw  m0, m1               ; row 0: interleave dst/src bytes
    punpcklbw  m2, m3               ; row 1: interleave dst/src bytes
    BIWEIGHT_SSSE3_OP
    movh       [r0], m0
    movhps     [r0+r2], m0
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET