;******************************************************************************
;* MMX/SSE2-optimized functions for the VP6 decoder
;* Copyright (C) 2009 Sebastien Lucas
;* Copyright (C) 2009 Zuxy Meng
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern pw_64

SECTION .text

%macro DIAG4 6
%if mmsize == 8
    movq      m0, [%1+%2]
    movq      m1, [%1+%3]
    movq      m3, m0
    movq      m4, m1
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpckhbw m3, m7
    punpckhbw m4, m7
    pmullw    m0, [rsp+8*11] ; src[x-8 ] * biweight [0]
    pmullw    m1, [rsp+8*12] ; src[x   ] * biweight [1]
    pmullw    m3, [rsp+8*11] ; src[x-8 ] * biweight [0]
    pmullw    m4, [rsp+8*12] ; src[x   ] * biweight [1]
    paddw     m0, m1
    paddw     m3, m4
    movq      m1, [%1+%4]
    movq      m2, [%1+%5]
    movq      m4, m1
    movq      m5, m2
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpckhbw m4, m7
    punpckhbw m5, m7
    pmullw    m1, [rsp+8*13] ; src[x+8 ] * biweight [2]
    pmullw    m2, [rsp+8*14] ; src[x+16] * biweight [3]
    pmullw    m4, [rsp+8*13] ; src[x+8 ] * biweight [2]
    pmullw    m5, [rsp+8*14] ; src[x+16] * biweight [3]
    paddw     m1, m2
    paddw     m4, m5
    paddsw    m0, m1
    paddsw    m3, m4
    paddsw    m0, m6         ; Add 64
    paddsw    m3, m6         ; Add 64
    psraw     m0, 7
    psraw     m3, 7
    packuswb  m0, m3
    movq    [%6], m0
%else ; mmsize == 16
    movq      m0, [%1+%2]
    movq      m1, [%1+%3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    pmullw    m0, m4         ; src[x-8 ] * biweight [0]
    pmullw    m1, m5         ; src[x   ] * biweight [1]
    paddw     m0, m1
    movq      m1, [%1+%4]
    movq      m2, [%1+%5]
    punpcklbw m1, m7
    punpcklbw m2, m7
    pmullw    m1, m6         ; src[x+8 ] * biweight [2]
    pmullw    m2, m3         ; src[x+16] * biweight [3]
    paddw     m1, m2
    paddsw    m0, m1
    paddsw    m0, [pw_64]    ; Add 64
    psraw     m0, 7
    packuswb  m0, m0
    movq    [%6], m0
%endif ; mmsize == 8/16
%endmacro

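; Broadcast the four 16-bit filter weights held in m3 so DIAG4 can use them:
; the MMX path splats each weight across a register and spills it to the
; stack slots rsp+8*11 .. rsp+8*14, while the SSE2 path keeps the broadcast
; weights in m4/m5/m6/m3.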
%macro SPLAT4REGS 0
%if mmsize == 8
    movq       m5, m3
    punpcklwd  m3, m3
    movq       m4, m3
    punpckldq  m3, m3
    punpckhdq  m4, m4
    punpckhwd  m5, m5
    movq       m2, m5
    punpckhdq  m2, m2
    punpckldq  m5, m5
    movq [rsp+8*11], m3
    movq [rsp+8*12], m4
    movq [rsp+8*13], m5
    movq [rsp+8*14], m2
%else ; mmsize == 16
    pshuflw    m4, m3, 0x0
    pshuflw    m5, m3, 0x55
    pshuflw    m6, m3, 0xAA
    pshuflw    m3, m3, 0xFF
    punpcklqdq m4, m4
    punpcklqdq m5, m5
    punpcklqdq m6, m6
    punpcklqdq m3, m3
%endif ; mmsize == 8/16
%endmacro

%macro vp6_filter_diag4 0
; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,
;                                const int16_t h_weights[4], const int16_t v_weights[4])
cglobal vp6_filter_diag4, 5, 7, 8
    mov    r5, rsp             ; backup stack pointer
    and    rsp, ~(mmsize-1)    ; align stack
%if mmsize == 16
    sub    rsp, 8*11
%else
    sub    rsp, 8*15
    movq   m6, [pw_64]
%endif
%if ARCH_X86_64
    movsxd r2, r2d
%endif

    sub    r1, r2

    pxor   m7, m7
    movq   m3, [r3]
    SPLAT4REGS

    mov    r3, rsp
    mov    r6, 11
.nextrow:
    DIAG4  r1, -1, 0, 1, 2, r3
    add    r3, 8
    add    r1, r2
    dec    r6
    jnz .nextrow

    movq   m3, [r4]
    SPLAT4REGS

    lea    r3, [rsp+8]
    mov    r6, 8
.nextcol:
    DIAG4  r3, -8, 0, 8, 16, r0
    add    r3, 8
    add    r0, r2
    dec    r6
    jnz .nextcol

    mov    rsp, r5             ; restore stack pointer
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
vp6_filter_diag4
%endif

INIT_XMM sse2
vp6_filter_diag4