;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_4
cextern pw_8
cextern pw_32
cextern pw_64

SECTION .text


%macro MV0_PIXELS_MC8 0
    lea           r4, [r2*3   ]
    lea           r5, [r2*4   ]
.next4rows:
    movu          m0, [r1     ]
    movu          m1, [r1+r2  ]
    CHROMAMC_AVG  m0, [r0     ]
    CHROMAMC_AVG  m1, [r0+r2  ]
    mova          [r0     ], m0
    mova          [r0+r2  ], m1
    movu          m0, [r1+r2*2]
    movu          m1, [r1+r4  ]
    CHROMAMC_AVG  m0, [r0+r2*2]
    CHROMAMC_AVG  m1, [r0+r4  ]
    mova          [r0+r2*2], m0
    mova          [r0+r4  ], m1
    add           r1, r5
    add           r0, r5
    sub          r3d, 4
    jne .next4rows
%endmacro

;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC8 1
; put/avg_h264_chroma_mc8_*(pixel *dst /* align 16 */, pixel *src /* align 1 */,
;                           int stride, int h, int mx, int my)
cglobal %1_h264_chroma_mc8_10, 6,7,8
    movsxdifnidn  r2, r2d
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    MV0_PIXELS_MC8
    REP_RET

.at_least_one_non_zero:
    mov          r6d, 2         ; second-tap offset: 2 bytes (one pixel) when mx != 0
    test         r5d, r5d
    je .x_interpolation
    mov           r6, r2        ; only my != 0: second-tap offset = stride
    test         r4d, r4d
    jne .xy_interpolation
.x_interpolation:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y (one of them is zero)
    movd          m5, r4d
    mova          m4, [pw_8]
    mova          m6, [pw_4]    ; mm6 = rnd >> 3
    SPLATW        m5, m5        ; mm5 = B = x
    psubw         m4, m5        ; mm4 = A = 8-x

.next1drow:
    movu          m0, [r1   ]   ; mm0 = src[0..7]
    movu          m2, [r1+r6]   ; mm2 = src[1..8]

    pmullw        m0, m4        ; mm0 = A * src[0..7]
    pmullw        m2, m5        ; mm2 = B * src[1..8]

    paddw         m0, m6
    paddw         m0, m2
    psrlw         m0, 3
    CHROMAMC_AVG  m0, [r0]
    mova        [r0], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add           r0, r2
    add           r1, r2
    dec          r3d
    jne .next1drow
    REP_RET

.xy_interpolation: ; general case, bilinear
    movd          m4, r4m       ; x
    movd          m6, r5m       ; y

    SPLATW        m4, m4        ; mm4 = x words
    SPLATW        m6, m6        ; mm6 = y words
    psllw         m5, m4, 3     ; mm5 = 8x
    pmullw        m4, m6        ; mm4 = x * y
    psllw         m6, 3         ; mm6 = 8y
    paddw         m1, m5, m6    ; mm1 = 8x+8y
    mova          m7, m4        ; DD = x * y
    psubw         m5, m4        ; mm5 = B = 8x - xy
    psubw         m6, m4        ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m1        ; mm4 = A = xy - (8x+8y) + 64

    movu          m0, [r1  ]    ; mm0 = src[0..7]
    movu          m1, [r1+2]    ; mm1 = src[1..8]
.next2drow:
    add           r1, r2

    pmullw        m2, m0, m4
    pmullw        m1, m5
    paddw         m2, m1        ; mm2 = A * src[0..7] + B * src[1..8]

    movu          m0, [r1]
    movu          m1, [r1+2]
    pmullw        m3, m0, m6
    paddw         m2, m3        ; mm2 += C * src[0..7+stride]
    pmullw        m3, m1, m7
    paddw         m2, m3        ; mm2 += D * src[1..8+stride]

    paddw         m2, [pw_32]
    psrlw         m2, 6
    CHROMAMC_AVG  m2, [r0]
    mova        [r0], m2        ; dst[0..7] = (mm2 + 32) >> 6

    add           r0, r2
    dec          r3d
    jne .next2drow
    REP_RET
%endmacro
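
; For reference, a scalar C sketch of the general .xy_interpolation case above.
; This is an illustration only, not part of Libav's API: pixels are 10-bit
; values stored in uint16_t, "stride" is given in pixels here (the assembly
; uses a byte stride), and the name chroma_mc8_ref is purely hypothetical.
;
;   static void chroma_mc8_ref(uint16_t *dst, const uint16_t *src,
;                              int stride, int h, int mx, int my)
;   {
;       const int A = (8 - mx) * (8 - my);  /* == mx*my - 8*(mx+my) + 64 */
;       const int B = mx * (8 - my);        /* == 8*mx - mx*my          */
;       const int C = (8 - mx) * my;        /* == 8*my - mx*my          */
;       const int D = mx * my;
;       for (int i = 0; i < h; i++) {
;           for (int j = 0; j < 8; j++)
;               dst[j] = (A * src[j]          + B * src[j + 1] +
;                         C * src[j + stride] + D * src[j + stride + 1] +
;                         32) >> 6;
;           dst += stride;
;           src += stride;
;       }
;   }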
;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
;TODO: xmm mc4
%macro MC4_OP 2
    movq          %1, [r1  ]
    movq          m1, [r1+2]
    add           r1, r2
    pmullw        %1, m4
    pmullw        m1, m2
    paddw         m1, %1
    mova          %1, m1

    pmullw        %2, m5
    pmullw        m1, m3
    paddw         %2, [pw_32]
    paddw         m1, %2
    psrlw         m1, 6
    CHROMAMC_AVG  m1, %2, [r0]
    movq        [r0], m1
    add           r0, r2
%endmacro

%macro CHROMA_MC4 1
cglobal %1_h264_chroma_mc4_10, 6,6,7
    movsxdifnidn  r2, r2d
    movd          m2, r4m        ; x
    movd          m3, r5m        ; y
    mova          m4, [pw_8]
    mova          m5, m4
    SPLATW        m2, m2
    SPLATW        m3, m3
    psubw         m4, m2
    psubw         m5, m3

    movq          m0, [r1  ]
    movq          m6, [r1+2]
    add           r1, r2
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0

.next2rows:
    MC4_OP m0, m6
    MC4_OP m6, m0
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC2 1
cglobal %1_h264_chroma_mc2_10, 6,7
    movsxdifnidn  r2, r2d
    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d        ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d        ; x*(8-y)<<16 | (8-x)*(8-y)

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5         ; mm5 = {A,B,A,B}
    punpckldq     m6, m6         ; mm6 = {C,D,C,D}
    pxor          m7, m7
    pshufw        m2, [r1], 0x94 ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5         ; mm1 = A * src[0,1] + B * src[1,2]
    pshufw        m0, [r1], 0x94 ; mm0 = src[0,1,1,2]
    movq          m2, m0
    pmaddwd       m0, m6
    paddw         m1, [pw_32]
    paddw         m1, m0         ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    CHROMAMC_AVG  m1, m3, [r0]
    movd        [r0], m1
    add           r0, r2
    dec          r3d
    jnz .nextrow
    REP_RET
%endmacro
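
; Worked example of the coefficient packing in CHROMA_MC2 above, with
; illustrative values mx = 3, my = 5 (the bilinear weights always sum to 64):
;   A = (8-mx)*(8-my) = 15,  B = mx*(8-my) = 9,
;   C = (8-mx)*my     = 25,  D = mx*my     = 15
; The scalar prologue packs them two per 32-bit register:
;   r4d = (mx<<16) - mx + 8  = 0x00030005   ; hi word = mx,     lo word = 8-mx
;   r5d = r4d * my           = 0x000F0019   ; hi word = D = 15, lo word = C = 25
;   r4d = (r4d<<3) - r5d     = 0x0009000F   ; hi word = B = 9,  lo word = A = 15
; After punpckldq, a single pmaddwd against the src[0,1,1,2] word pattern then
; yields A*src[j] + B*src[j+1] (or C/D on the next row) for j = 0 and 1 at once.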
%macro NOTHING 2-3
%endmacro
%macro AVG 2-3
%if %0==3
    movq          %2, %3
%endif
    pavgw         %1, %2
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM sse2
CHROMA_MC8 put
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 put
%endif
INIT_MMX mmxext
CHROMA_MC4 put
CHROMA_MC2 put

%define CHROMAMC_AVG AVG
INIT_XMM sse2
CHROMA_MC8 avg
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 avg
%endif
INIT_MMX mmxext
CHROMA_MC4 avg
CHROMA_MC2 avg
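
; Note (assumed, based on the usual x86inc/cglobal conventions): each
; CHROMA_MC* instantiation above expands to a public symbol carrying the
; "ff_" prefix and the active instruction-set suffix, e.g.
; ff_put_h264_chroma_mc8_10_sse2, ff_avg_h264_chroma_mc8_10_avx and
; ff_put_h264_chroma_mc4_10_mmxext. Redefining CHROMAMC_AVG to NOTHING or AVG
; before each group is what turns the same macro bodies into the "put"
; (plain store) and "avg" (pavgw with the existing destination) variants.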