;******************************************************************************
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; NOTE(review): this file was recovered from a VCS-annotation dump ("yading@10:"
; line prefixes, collapsed line structure). The annotation junk has been
; stripped and conventional formatting restored; every instruction, operand
; and constant is unchanged from the annotated content.

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; Row-transform multipliers for the RV30/40 4x4 IDCT, replicated per lane.
pw_row_coeffs:  times 4 dw 13
                times 4 dw 17
                times 4 dw  7
pd_512:         times 2 dd 0x200        ; rounder for the column pass (>> 10)
; Column-transform multipliers, laid out for pmaddwd pairing.
pw_col_coeffs:  dw 13,  13,  13, -13
                dw 17,   7,   7, -17
                dw 13, -13,  13,  13
                dw -7,  17, -17,  -7

SECTION .text

; DC-only IDCT without final rounding: %1 = (%1 * 13 * 13 * 3) >> 11.
; Clobbers flags.
%macro IDCT_DC_NOROUND 1
    imul   %1, 13*13*3
    sar    %1, 11
%endmacro

; DC-only IDCT with rounding: %1 = (%1 * 13 * 13 + 0x200) >> 10.
; Clobbers flags.
%macro IDCT_DC_ROUND 1
    imul   %1, 13*13
    add    %1, 0x200
    sar    %1, 10
%endmacro

; Emit a DC-only IDCT that broadcasts the transformed DC over all 16
; coefficients of the block in place.
; C prototype: void ff_rv34_idct_<%1>_mmxext(int16_t *block);
; In:  r0 = block (16 int16_t); block[0] holds the DC coefficient.
; Out: all 16 block entries set to IDCT_DC(block[0]).
; IDCT_DC is %define'd to the rounding or non-rounding variant before use.
%macro rv34_idct 1
cglobal rv34_idct_%1, 1, 2, 0
    movsx   r1, word [r0]
    IDCT_DC r1
    movd    m0, r1d
    pshufw  m0, m0, 0               ; broadcast DC to all 4 words
    movq    [r0+ 0], m0
    movq    [r0+ 8], m0
    movq    [r0+16], m0
    movq    [r0+24], m0
    REP_RET
%endmacro

INIT_MMX mmxext
%define IDCT_DC IDCT_DC_ROUND
rv34_idct dc
%define IDCT_DC IDCT_DC_NOROUND
rv34_idct dc_noround

; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
; Adds the rounded DC value to a 4x4 block of pixels with unsigned
; saturation. m0 holds the broadcast positive DC, m1 the broadcast
; negated DC; paddusb/psubusb together implement a signed add with
; clamping to [0, 255].
INIT_MMX mmx
cglobal rv34_idct_dc_add, 3, 3
    ; calculate DC
    IDCT_DC_ROUND r2
    pxor       m1, m1
    movd       m0, r2d
    psubw      m1, m0
    packuswb   m0, m0
    packuswb   m1, m1
    punpcklbw  m0, m0
    punpcklbw  m1, m1
    punpcklwd  m0, m0
    punpcklwd  m1, m1

    ; add DC
    lea        r2, [r0+r1*2]
    movh       m2, [r0]
    movh       m3, [r0+r1]
    movh       m4, [r2]
    movh       m5, [r2+r1]
    paddusb    m2, m0
    paddusb    m3, m0
    paddusb    m4, m0
    paddusb    m5, m0
    psubusb    m2, m1
    psubusb    m3, m1
    psubusb    m4, m1
    psubusb    m5, m1
    movh       [r0],    m2
    movh       [r0+r1], m3
    movh       [r2],    m4
    movh       [r2+r1], m5
    RET

; Load coeffs and perform row transform
; In:     %1 = pointer to 4x4 int16_t coefficient block (zeroed on exit)
; Output: coeffs in mm[0467], rounder in mm5
%macro ROW_TRANSFORM 1
    pxor        mm7, mm7
    mova        mm0, [%1+ 0*8]
    mova        mm1, [%1+ 1*8]
    mova        mm2, [%1+ 2*8]
    mova        mm3, [%1+ 3*8]
    mova  [%1+ 0*8], mm7            ; clear the block for the next use
    mova  [%1+ 1*8], mm7
    mova  [%1+ 2*8], mm7
    mova  [%1+ 3*8], mm7
    mova        mm4, mm0
    mova        mm6, [pw_row_coeffs+ 0]
    paddsw      mm0, mm2            ; b0 + b2
    psubsw      mm4, mm2            ; b0 - b2
    pmullw      mm0, mm6            ; *13 = z0
    pmullw      mm4, mm6            ; *13 = z1
    mova        mm5, mm1
    pmullw      mm1, [pw_row_coeffs+ 8] ; b1*17
    pmullw      mm5, [pw_row_coeffs+16] ; b1* 7
    mova        mm7, mm3
    pmullw      mm3, [pw_row_coeffs+ 8] ; b3*17
    pmullw      mm7, [pw_row_coeffs+16] ; b3* 7
    paddsw      mm1, mm7            ; z3 = b1*17 + b3* 7
    psubsw      mm5, mm3            ; z2 = b1* 7 - b3*17
    mova        mm7, mm0
    mova        mm6, mm4
    paddsw      mm0, mm1            ; z0 + z3
    psubsw      mm7, mm1            ; z0 - z3
    paddsw      mm4, mm5            ; z1 + z2
    psubsw      mm6, mm5            ; z1 - z2
    mova        mm5, [pd_512]       ; 0x200
%endmacro

; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
; Column transform for one row of output pixels.
; In:  %1 = dst memory operand (4 pixels), %2 = row-transformed coeffs,
;      %3/%4 = column coefficient operands, mm5 = rounder (0x200).
; Out: %1 updated with clamped dst + idct result. Clobbers mm1-mm3, %2.
%macro COL_TRANSFORM 4
    pshufw      mm3, %2, 0xDD       ; col. 1,3,1,3
    pshufw      %2,  %2, 0x88       ; col. 0,2,0,2
    pmaddwd     %2,  %3             ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
    pmaddwd     mm3, %4             ; 17*c1+ 7*c3 |  7*c1-17*c3 = z3 | z2
    paddd       %2,  mm5            ; add rounder before the final shift
    pshufw      mm1, %2,  01001110b ;    z1 | z0
    pshufw      mm2, mm3, 01001110b ;    z2 | z3
    paddd       %2,  mm3            ; z0+z3 | z1+z2
    psubd       mm1, mm2            ; z1-z2 | z0-z3
    movd        mm3, %1
    psrad       %2,  10
    pxor        mm2, mm2
    psrad       mm1, 10
    punpcklbw   mm3, mm2            ; widen dst bytes to words
    packssdw    %2,  mm1
    paddw       %2,  mm3            ; add residual to source pixels
    packuswb    %2,  %2             ; clamp to [0, 255]
    movd        %1,  %2
%endmacro

; Full 4x4 IDCT-and-add: row pass on the coefficient block (which is
; zeroed as a side effect), then one column pass per output row.
INIT_MMX mmxext
cglobal rv34_idct_add, 3,3,0, d, s, b
    ROW_TRANSFORM  bq
    COL_TRANSFORM  [dq],    mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
    mova           mm0, [pw_col_coeffs+ 0]
    COL_TRANSFORM  [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
    mova           mm4, [pw_col_coeffs+ 8]
    lea            dq, [dq + 2*sq]
    COL_TRANSFORM  [dq],    mm6, mm0, mm4
    COL_TRANSFORM  [dq+sq], mm7, mm0, mm4
    ret

; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
; SSE4 variant: processes all four 4-pixel rows at once. dst rows are
; packed into xmm halves, widened to words, the broadcast DC is added
; with signed arithmetic, and packuswb provides the unsigned clamp.
INIT_XMM sse4
cglobal rv34_idct_dc_add, 3, 3, 6
    ; load data
    IDCT_DC_ROUND r2
    pxor       m1, m1

    ; calculate DC
    movd       m0, r2d
    lea        r2, [r0+r1*2]
    movd       m2, [r0]
    movd       m3, [r0+r1]
    pshuflw    m0, m0, 0
    movd       m4, [r2]
    movd       m5, [r2+r1]
    punpcklqdq m0, m0               ; broadcast DC to all 8 words
    punpckldq  m2, m3               ; rows 0|1
    punpckldq  m4, m5               ; rows 2|3
    punpcklbw  m2, m1
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4
    movd       [r0],    m2
    pextrd     [r0+r1], m2, 1
    pextrd     [r2],    m2, 2
    pextrd     [r2+r1], m2, 3
    RET