;*****************************************************************************
;* x86-optimized functions for yadif filter
;*
;* Copyright (C) 2006 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;* Copyright (c) 2011-2013 James Darnley
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Broadcast constants, sized for a full XMM register.
pw_1:    times 8 dw 1           ; low-bit mask used to undo pavgw's round-up
pw_8000: times 8 dw 0x8000      ; word bias for the pre-SSE4 unsigned pack
pd_1:    times 4 dd 1
pd_8000: times 4 dd 0x8000      ; dword bias for the pre-SSE4 unsigned pack

SECTION .text

;------------------------------------------------------------------------------
; PIXSHIFT1 reg
; Shift the whole register right by one 16-bit pixel (2 bytes).
; The instruction is chosen from the register width (mmsize): psrldq is the
; byte shift for 16-byte registers, psrlq the bit shift for 8-byte registers.
;------------------------------------------------------------------------------
%macro PIXSHIFT1 1
%if mmsize == 16
    psrldq %1, 2
%else
    psrlq  %1, 16
%endif
%endmacro

;------------------------------------------------------------------------------
; PIXSHIFT2 reg
; Shift the whole register right by two 16-bit pixels (4 bytes).
; Same width-based selection as PIXSHIFT1.
;------------------------------------------------------------------------------
%macro PIXSHIFT2 1
%if mmsize == 16
    psrldq %1, 4
%else
    psrlq  %1, 32
%endif
%endmacro

;------------------------------------------------------------------------------
; PABS reg, tmp
; reg = |reg| per signed dword.  tmp is clobbered only on the pre-SSSE3 path,
; which builds a sign mask (0 > reg) and applies (reg ^ mask) - mask.
;------------------------------------------------------------------------------
%macro PABS 2
%if cpuflag(ssse3)
    pabsd   %1, %1
%else
    pxor    %2, %2
    pcmpgtd %2, %1
    pxor    %1, %2
    psubd   %1, %2
%endif
%endmacro

;------------------------------------------------------------------------------
; PACK reg
; Narrow the dwords of reg to unsigned words (reg is packed with itself, so
; the result sits in the low half).  Without packusdw the values are biased
; by -0x8000, packed with signed saturation and biased back as words.
;------------------------------------------------------------------------------
%macro PACK 1
%if cpuflag(sse4)
    packusdw %1, %1
%else
    psubd    %1, [pd_8000]
    packssdw %1, %1
    paddw    %1, [pw_8000]
%endif
%endmacro

;------------------------------------------------------------------------------
; PMINSD dst, src, tmp
; dst = min(dst, src) per signed dword.  tmp is clobbered only on the
; pre-SSE4 path (mask = src > dst, then a masked blend).
;------------------------------------------------------------------------------
%macro PMINSD 3
%if cpuflag(sse4)
    pminsd  %1, %2
%else
    mova    %3, %2
    pcmpgtd %3, %1
    pand    %1, %3
    pandn   %3, %2
    por     %1, %3
%endif
%endmacro

;------------------------------------------------------------------------------
; PMAXSD dst, src, tmp
; dst = max(dst, src) per signed dword.  tmp is clobbered only on the
; pre-SSE4 path (mask = dst > src, then a masked blend).
;------------------------------------------------------------------------------
%macro PMAXSD 3
%if cpuflag(sse4)
    pmaxsd  %1, %2
%else
    mova    %3, %1
    pcmpgtd %3, %2
    pand    %1, %3
    pandn   %3, %2
    por     %1, %3
%endif
%endmacro

;------------------------------------------------------------------------------
; PMAXUW dst, src
; dst = max(dst, src) per unsigned word.  The fallback relies on saturating
; arithmetic: (dst -sat src) +sat src.
;------------------------------------------------------------------------------
%macro PMAXUW 2
%if cpuflag(sse4)
    pmaxuw  %1, %2
%else
    psubusw %1, %2
    paddusw %1, %2
%endif
%endmacro

;------------------------------------------------------------------------------
; CHECK off1, off2
; Evaluate one candidate built from the words at [curq+t1+off1*2] and
; [curq+t0+off2*2] (offsets are in pixels, hence the *2).
;
; In:    m7 = 0 (used to widen words to dwords)
; Out:   m5 = rounded-down average of the two rows, moved down by one pixel
;             and widened to dwords (the candidate value)
;        m2 = sum of three neighbouring absolute word differences, widened
;             to dwords (the candidate score)
; Clobb: m3, m4
;------------------------------------------------------------------------------
%macro CHECK 2
    movu      m2, [curq+t1+%1*2]
    movu      m3, [curq+t0+%2*2]
    mova      m4, m2
    mova      m5, m2
    pxor      m4, m3
    pavgw     m5, m3
    pand      m4, [pw_1]            ; 1 where the two inputs differ in bit 0
    psubusw   m5, m4                ; pavgw rounds up; take the extra 1 off again
    PIXSHIFT1 m5
    punpcklwd m5, m7
    mova      m4, m2
    psubusw   m2, m3
    psubusw   m3, m4
    PMAXUW    m2, m3                ; m2 = unsigned |row1 - row2| per word
    mova      m3, m2
    mova      m4, m2
    PIXSHIFT1 m3
    PIXSHIFT2 m4
    punpcklwd m2, m7
    punpcklwd m3, m7
    punpcklwd m4, m7
    paddd     m2, m3
    paddd     m2, m4
%endmacro

;------------------------------------------------------------------------------
; CHECK1
; Accept the candidate from CHECK wherever it beats the best score so far.
;
; In:    m0 = best score, m1 = current prediction,
;        m2 = candidate score, m5 = candidate value
; Out:   m0 = min(m0, m2)
;        m1 = candidate value where it won, previous m1 elsewhere
;        m6 = per-dword mask, all ones where the candidate won (read by CHECK2)
; Clobb: m3, m5
;------------------------------------------------------------------------------
%macro CHECK1 0
    mova    m3, m0
    pcmpgtd m3, m2
    PMINSD  m0, m2, m6              ; m6 is only scratch here ...
    mova    m6, m3                  ; ... the mask is written afterwards
    pand    m5, m3
    pandn   m3, m1
    por     m3, m5
    mova    m1, m3
%endmacro

;------------------------------------------------------------------------------
; CHECK2
; Same selection as CHECK1, but gated on the mask CHECK1 left in m6.
; (m6 + 1) shl 30 is 0 where the mask is all ones and 1 shl 30 where it is
; zero, so lanes where CHECK1 did not win get that large value added to their
; score before the comparison.
;
; In:    m0, m1, m2, m5 as for CHECK1; m6 = mask from CHECK1
; Out:   m0 = min(m0, penalised m2), m1 = updated prediction
; Clobb: m2, m3, m4, m5, m6
;------------------------------------------------------------------------------
%macro CHECK2 0
    paddd   m6, [pd_1]
    pslld   m6, 30
    paddd   m2, m6
    mova    m3, m0
    pcmpgtd m3, m2
    PMINSD  m0, m2, m4
    pand    m5, m3
    pandn   m3, m1
    por     m3, m5
    mova    m1, m3
%endmacro

; This version of CHECK2 has 3 fewer instructions on sets older than SSE4 but I
; am not sure whether it is any faster.  A rewrite or refactor of the filter
; code should make it possible to eliminate the move instruction at the end.  It
; exists to satisfy the expectation that the "score" values are in m1.

; %macro CHECK2 0
;     mova    m3, m0
;     pcmpgtd m0, m2
;     pand    m0, m6
;     mova    m6, m0
;     pand    m5, m6
;     pand    m2, m0
;     pandn   m6, m1
;     pandn   m0, m3
;     por     m6, m5
;     por     m0, m2
;     mova    m1, m6
; %endmacro

;------------------------------------------------------------------------------
; LOAD reg, mem
; Load half a register of 16-bit words from mem and zero-extend them to
; dwords.  Requires m7 = 0.
;------------------------------------------------------------------------------
%macro LOAD 2
    movh      %1, %2
    punpcklwd %1, m7
%endmacro

;------------------------------------------------------------------------------
; FILTER id, ptrA, ptrB
; Loop body of the line filter.  id makes the local labels unique; ptrA and
; ptrB are the two pointers whose pixels are averaged for the temporal term.
; Each iteration produces mmsize/4 output pixels.
;
; Uses: dstq, prevq, curq, nextq (advanced by mmsize/2 bytes per iteration),
;       t0 / t1 as byte offsets to the two neighbouring lines,
;       r4m as the remaining pixel count, r8m as the mode argument.
;
; Stack slots:
;   [rsp+ 0]  widened pixels at curq+t1
;   [rsp+16]  (ptrA + ptrB) shr 1
;   [rsp+32]  widened pixels at curq+t0
;   [rsp+48]  temporal difference bound
;------------------------------------------------------------------------------
%macro FILTER 3
.loop%1:
    pxor      m7, m7                ; m7 doubles as scratch below, so re-zero it
    LOAD      m0, [curq+t1]
    LOAD      m1, [curq+t0]
    LOAD      m2, [%2]
    LOAD      m3, [%3]
    mova      m4, m3
    paddd     m3, m2
    psrad     m3, 1
    mova      [rsp+ 0], m0
    mova      [rsp+16], m3
    mova      [rsp+32], m1
    psubd     m2, m4
    PABS      m2, m4
    LOAD      m3, [prevq+t1]
    LOAD      m4, [prevq+t0]
    psubd     m3, m0
    psubd     m4, m1
    PABS      m3, m5
    PABS      m4, m5
    paddd     m3, m4
    psrld     m2, 1
    psrld     m3, 1
    PMAXSD    m2, m3, m6
    LOAD      m3, [nextq+t1]
    LOAD      m4, [nextq+t0]
    psubd     m3, m0
    psubd     m4, m1
    PABS      m3, m5
    PABS      m4, m5
    paddd     m3, m4
    psrld     m3, 1
    PMAXSD    m2, m3, m6
    mova      [rsp+48], m2          ; largest of the three halved differences

    ; m1 = average of the two cur lines (initial prediction),
    ; m0 = absolute difference of the two cur lines (start of the score).
    paddd     m1, m0
    paddd     m0, m0
    psubd     m0, m1
    psrld     m1, 1
    PABS      m0, m2

    ; Add the absolute differences one pixel to the left and one to the right.
    movu      m2, [curq+t1-1*2]
    movu      m3, [curq+t0-1*2]
    mova      m4, m2
    psubusw   m2, m3
    psubusw   m3, m4
    PMAXUW    m2, m3
    mova      m3, m2
    PIXSHIFT2 m3
    punpcklwd m2, m7
    punpcklwd m3, m7
    paddd     m0, m2
    paddd     m0, m3
    psubd     m0, [pd_1]

    ; Try the four offset pairs; each CHECK2 only refines its CHECK1.
    CHECK -2, 0
    CHECK1
    CHECK -3, 1
    CHECK2
    CHECK 0, -2
    CHECK1
    CHECK 1, -3
    CHECK2

    mova      m6, [rsp+48]
    cmp       DWORD r8m, 2
    jge       .end%1                ; mode of 2 or more skips the widening below

    ; Widen the bound in m6 using the lines two rows away (t1*2 and t0*2).
    LOAD      m2, [%2+t1*2]
    LOAD      m4, [%3+t1*2]
    LOAD      m3, [%2+t0*2]
    LOAD      m5, [%3+t0*2]
    paddd     m2, m4
    paddd     m3, m5
    psrld     m2, 1
    psrld     m3, 1
    mova      m4, [rsp+ 0]
    mova      m5, [rsp+16]
    mova      m7, [rsp+32]
    psubd     m2, m4
    psubd     m3, m7
    mova      m0, m5
    psubd     m5, m4
    psubd     m0, m7
    mova      m4, m2
    PMINSD    m2, m3, m7
    PMAXSD    m3, m4, m7
    PMAXSD    m2, m5, m7
    PMINSD    m3, m5, m7
    PMAXSD    m2, m0, m7            ; m2 = largest of the three terms
    PMINSD    m3, m0, m7            ; m3 = smallest of the three terms
    pxor      m4, m4
    PMAXSD    m6, m3, m7
    psubd     m4, m2                ; m4 = -m2
    PMAXSD    m6, m4, m7

.end%1:
    ; Clamp the prediction to [rsp+16] plus or minus the bound in m6.
    mova      m2, [rsp+16]
    mova      m3, m2
    psubd     m2, m6
    paddd     m3, m6
    PMAXSD    m1, m2, m7
    PMINSD    m1, m3, m7
    PACK      m1

    movh      [dstq], m1
    add       dstq,  mmsize/2
    add       prevq, mmsize/2
    add       curq,  mmsize/2
    add       nextq, mmsize/2
    sub       DWORD r4m, mmsize/4
    jg        .loop%1
%endmacro

;------------------------------------------------------------------------------
; YADIF
; Emits yadif_filter_line_16bit for the currently selected instruction set.
;
; Named arguments: dst, prev, cur, next, w, prefs, mrefs, parity, mode.
; Only the first four are loaded into registers by cglobal; w, parity and
; mode are read through r4m / paritym / r8m, and prefs / mrefs are fetched
; into the temporaries t0 / t1 below.  80 bytes of stack are reserved for
; the four slots FILTER uses.
;------------------------------------------------------------------------------
%macro YADIF 0
%if ARCH_X86_32
cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
%else
cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
%endif
%if ARCH_X86_32
    mov       r4, r5mp
    mov       r5, r6mp
    DECLARE_REG_TMP 4,5
%else
    movsxd    r5, DWORD r5m         ; sign-extend so the value can be used in addressing
    movsxd    r6, DWORD r6m
    DECLARE_REG_TMP 5,6
%endif

    cmp       DWORD paritym, 0
    je        .parity0
    FILTER    1, prevq, curq
    jmp       .ret

.parity0:
    FILTER    0, curq, nextq

.ret:
    RET
%endmacro

INIT_XMM sse4
YADIF
INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
%if ARCH_X86_32
INIT_MMX mmxext
YADIF
%endif