;*****************************************************************************
;* x86-optimized AC-3 DSP utils
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 16777216.0f - used in ff_float_to_fixed24()
pf_1_24:     times 4 dd 0x4B800000

; used in ff_ac3_compute_mantissa_size()
cextern ac3_bap_bits
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7

; used in ff_ac3_extract_exponents()
pd_1:   times 4 dd 1
pd_151: times 4 dd 151

SECTION .text

;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;-----------------------------------------------------------------------------

%macro AC3_EXPONENT_MIN 0
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
    shl  reuse_blksq, 8
    jz .end
    LOOP_ALIGN
.nextexp:
    mov      offsetq, reuse_blksq
    mova          m0, [expq+offsetq]
    sub      offsetq, 256
    LOOP_ALIGN
.nextblk:
    PMINUB        m0, [expq+offsetq], m1
    sub      offsetq, 256
    jae .nextblk
    mova      [expq], m0
    add         expq, mmsize
    sub        expnq, mmsize
    jg .nextexp
.end:
    REP_RET
%endmacro

%define LOOP_ALIGN
INIT_MMX mmx
AC3_EXPONENT_MIN
%if HAVE_MMXEXT_EXTERNAL
%define LOOP_ALIGN ALIGN 16
INIT_MMX mmxext
AC3_EXPONENT_MIN
%endif
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN
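
; For reference, a scalar C sketch of what ff_ac3_exponent_min() does (an
; illustration, not necessarily the exact C fallback): exponents for
; consecutive blocks are stored 256 bytes apart, and each coefficient keeps
; the minimum exponent over the current block and the num_reuse_blocks
; blocks that follow it.
;
;   static void exponent_min_ref(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;   {
;       for (int i = 0; i < nb_coefs; i++) {
;           uint8_t min_exp = exp[i];
;           for (int blk = 1; blk <= num_reuse_blocks; blk++)
;               if (exp[256 * blk + i] < min_exp)
;                   min_exp = exp[256 * blk + i];
;           exp[i] = min_exp;
;       }
;   }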

;-----------------------------------------------------------------------------
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
;
; This function uses 2 different methods to calculate a valid result.
; 1) logical 'or' of abs of each element
;        This is used for ssse3 because of the pabsw instruction.
;        It is also used for mmx because of the lack of min/max instructions.
; 2) calculate min/max for the array, then or(abs(min),abs(max))
;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
;-----------------------------------------------------------------------------

; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
%macro OR_WORDS_HORIZ 2 ; src, tmp
%if cpuflag(sse2)
    movhlps     %2, %1
    por         %1, %2
    pshuflw     %2, %1, q0032
    por         %1, %2
    pshuflw     %2, %1, q0001
    por         %1, %2
%elif cpuflag(mmxext)
    pshufw      %2, %1, q0032
    por         %1, %2
    pshufw      %2, %1, q0001
    por         %1, %2
%else ; mmx
    movq        %2, %1
    psrlq       %2, 32
    por         %1, %2
    movq        %2, %1
    psrlq       %2, 16
    por         %1, %2
%endif
%endmacro

%macro AC3_MAX_MSB_ABS_INT16 1
cglobal ac3_max_msb_abs_int16, 2, 2, 5, src, len
    pxor        m2, m2
    pxor        m3, m3
.loop:
%ifidn %1, min_max
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    pminsw      m2, m0
    pminsw      m2, m1
    pmaxsw      m3, m0
    pmaxsw      m3, m1
%else ; or_abs
%if notcpuflag(ssse3)
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    ABS2        m0, m1, m3, m4
%else ; ssse3
    ; using memory args is faster for ssse3
    pabsw       m0, [srcq]
    pabsw       m1, [srcq+mmsize]
%endif
    por         m2, m0
    por         m2, m1
%endif
    add       srcq, mmsize*2
    sub       lend, mmsize
    ja .loop
%ifidn %1, min_max
    ABS2        m2, m3, m0, m1
    por         m2, m3
%endif
    OR_WORDS_HORIZ m2, m0
    movd       eax, m2
    and        eax, 0xFFFF
    RET
%endmacro

INIT_MMX mmx
AC3_MAX_MSB_ABS_INT16 or_abs
INIT_MMX mmxext
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM sse2
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM ssse3
AC3_MAX_MSB_ABS_INT16 or_abs
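
; For reference, a scalar C sketch of the or_abs method (an illustration, not
; the exact C fallback).  The min_max variant can return a different value,
; but one with the same most significant set bit, which is all the caller
; inspects.  len is assumed to be a multiple of the vector width, as in the
; asm above.
;
;   #include <stdlib.h>
;
;   static int max_msb_abs_int16_ref(const int16_t *src, int len)
;   {
;       int v = 0;
;       for (int i = 0; i < len; i++)
;           v |= abs(src[i]);
;       return v;
;   }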

;-----------------------------------------------------------------------------
; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
;-----------------------------------------------------------------------------

%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction
cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
    movd      m0, shiftd
.loop:
    mova      m1, [srcq         ]
    mova      m2, [srcq+mmsize  ]
    mova      m3, [srcq+mmsize*2]
    mova      m4, [srcq+mmsize*3]
    %3        m1, m0
    %3        m2, m0
    %3        m3, m0
    %3        m4, m0
    mova      [srcq         ], m1
    mova      [srcq+mmsize  ], m2
    mova      [srcq+mmsize*2], m3
    mova      [srcq+mmsize*3], m4
    add     srcq, mmsize*4
    sub     lend, mmsize*32/%2
    ja .loop
.end:
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------

INIT_MMX mmx
AC3_SHIFT l, 16, psllw
INIT_XMM sse2
AC3_SHIFT l, 16, psllw

;-----------------------------------------------------------------------------
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------

INIT_MMX mmx
AC3_SHIFT r, 32, psrad
INIT_XMM sse2
AC3_SHIFT r, 32, psrad

;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;-----------------------------------------------------------------------------

; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
INIT_MMX 3dnow
cglobal float_to_fixed24, 3, 3, 0, dst, src, len
    movq        m0, [pf_1_24]
.loop:
    movq        m1, [srcq   ]
    movq        m2, [srcq+8 ]
    movq        m3, [srcq+16]
    movq        m4, [srcq+24]
    pfmul       m1, m0
    pfmul       m2, m0
    pfmul       m3, m0
    pfmul       m4, m0
    pf2id       m1, m1
    pf2id       m2, m2
    pf2id       m3, m3
    pf2id       m4, m4
    movq  [dstq   ], m1
    movq  [dstq+8 ], m2
    movq  [dstq+16], m3
    movq  [dstq+24], m4
    add       srcq, 32
    add       dstq, 32
    sub       lend, 8
    ja .loop
    femms
    RET

INIT_XMM sse
cglobal float_to_fixed24, 3, 3, 3, dst, src, len
    movaps      m0, [pf_1_24]
.loop:
    movaps      m1, [srcq   ]
    movaps      m2, [srcq+16]
    mulps       m1, m0
    mulps       m2, m0
    cvtps2pi   mm0, m1
    movhlps     m1, m1
    cvtps2pi   mm1, m1
    cvtps2pi   mm2, m2
    movhlps     m2, m2
    cvtps2pi   mm3, m2
    movq  [dstq   ], mm0
    movq  [dstq+ 8], mm1
    movq  [dstq+16], mm2
    movq  [dstq+24], mm3
    add       srcq, 32
    add       dstq, 32
    sub       lend, 8
    ja .loop
    emms
    RET

INIT_XMM sse2
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
    movaps      m0, [pf_1_24]
.loop:
    movaps      m1, [srcq    ]
    movaps      m2, [srcq+16 ]
    movaps      m3, [srcq+32 ]
    movaps      m4, [srcq+48 ]
%ifdef m8
    movaps      m5, [srcq+64 ]
    movaps      m6, [srcq+80 ]
    movaps      m7, [srcq+96 ]
    movaps      m8, [srcq+112]
%endif
    mulps       m1, m0
    mulps       m2, m0
    mulps       m3, m0
    mulps       m4, m0
%ifdef m8
    mulps       m5, m0
    mulps       m6, m0
    mulps       m7, m0
    mulps       m8, m0
%endif
    cvtps2dq    m1, m1
    cvtps2dq    m2, m2
    cvtps2dq    m3, m3
    cvtps2dq    m4, m4
%ifdef m8
    cvtps2dq    m5, m5
    cvtps2dq    m6, m6
    cvtps2dq    m7, m7
    cvtps2dq    m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add       srcq, 128
    add       dstq, 128
    sub       lenq, 32
%else
    add       srcq, 64
    add       dstq, 64
    sub       lenq, 16
%endif
    ja .loop
    REP_RET
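
; For reference, a scalar C sketch of ff_float_to_fixed24() (an illustration;
; len is assumed to be a multiple of the unroll factor and the inputs to fit
; in 24-bit fixed point).  The SSE/SSE2 versions round to nearest; the 3DNow!
; version truncates, as noted above.
;
;   #include <math.h>
;   #include <stdint.h>
;
;   static void float_to_fixed24_ref(int32_t *dst, const float *src, unsigned int len)
;   {
;       for (unsigned int i = 0; i < len; i++)
;           dst[i] = (int32_t)lrintf(src[i] * 16777216.0f);   /* scale by 2^24 */
;   }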

;------------------------------------------------------------------------------
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
;------------------------------------------------------------------------------

%macro PHADDD4 2 ; xmm src, xmm tmp
    movhlps  %2, %1
    paddd    %1, %2
    pshufd   %2, %1, 0x1
    paddd    %1, %2
%endmacro

INIT_XMM sse2
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
    movdqa      m0, [mant_cntq      ]
    movdqa      m1, [mant_cntq+ 1*16]
    paddw       m0, [mant_cntq+ 2*16]
    paddw       m1, [mant_cntq+ 3*16]
    paddw       m0, [mant_cntq+ 4*16]
    paddw       m1, [mant_cntq+ 5*16]
    paddw       m0, [mant_cntq+ 6*16]
    paddw       m1, [mant_cntq+ 7*16]
    paddw       m0, [mant_cntq+ 8*16]
    paddw       m1, [mant_cntq+ 9*16]
    paddw       m0, [mant_cntq+10*16]
    paddw       m1, [mant_cntq+11*16]
    pmaddwd     m0, [ac3_bap_bits   ]
    pmaddwd     m1, [ac3_bap_bits+16]
    paddd       m0, m1
    PHADDD4     m0, m1
    movd      sumd, m0
    movdqa      m3, [pw_bap_mul1]
    movhpd      m0, [mant_cntq     +2]
    movlpd      m0, [mant_cntq+1*32+2]
    movhpd      m1, [mant_cntq+2*32+2]
    movlpd      m1, [mant_cntq+3*32+2]
    movhpd      m2, [mant_cntq+4*32+2]
    movlpd      m2, [mant_cntq+5*32+2]
    pmulhuw     m0, m3
    pmulhuw     m1, m3
    pmulhuw     m2, m3
    paddusw     m0, m1
    paddusw     m0, m2
    pmaddwd     m0, [pw_bap_mul2]
    PHADDD4     m0, m1
    movd       eax, m0
    add        eax, sumd
    RET
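
; For reference, a scalar C sketch of the computation above (an illustration).
; bap_bits stands in for the shared ac3_bap_bits table; the sketch assumes its
; entries give the bits per mantissa for each bap and are zero for the
; group-coded baps 1, 2 and 4, which are costed separately:
;
;   static int compute_mantissa_size_ref(uint16_t mant_cnt[6][16],
;                                        const uint16_t bap_bits[16])
;   {
;       int bits = 0;
;       for (int blk = 0; blk < 6; blk++) {
;           for (int bap = 0; bap < 16; bap++)
;               bits += mant_cnt[blk][bap] * bap_bits[bap];
;           bits += (mant_cnt[blk][1] / 3) * 5    /* bap 1: 3 mantissas in 5 bits */
;                 + (mant_cnt[blk][2] / 3) * 7    /* bap 2: 3 mantissas in 7 bits */
;                 + (mant_cnt[blk][4] / 2) * 7;   /* bap 4: 2 mantissas in 7 bits */
;       }
;       return bits;
;   }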

;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;------------------------------------------------------------------------------

%macro PABSD 1-2 ; src/dst, unused
%if cpuflag(ssse3)
    pabsd    %1, %1
%else ; src/dst, tmp
    pxor     %2, %2
    pcmpgtd  %2, %1
    pxor     %1, %2
    psubd    %1, %2
%endif
%endmacro

%if HAVE_AMD3DNOW_EXTERNAL
INIT_MMX 3dnow
cglobal ac3_extract_exponents, 3, 3, 0, exp, coef, len
    add      expq, lenq
    lea     coefq, [coefq+4*lenq]
    neg      lenq
    movq       m3, [pd_1]
    movq       m4, [pd_151]
.loop:
    movq       m0, [coefq+4*lenq  ]
    movq       m1, [coefq+4*lenq+8]
    PABSD      m0, m2
    PABSD      m1, m2
    pslld      m0, 1
    por        m0, m3
    pi2fd      m2, m0
    psrld      m2, 23
    movq       m0, m4
    psubd      m0, m2
    pslld      m1, 1
    por        m1, m3
    pi2fd      m2, m1
    psrld      m2, 23
    movq       m1, m4
    psubd      m1, m2
    packssdw   m0, m0
    packuswb   m0, m0
    packssdw   m1, m1
    packuswb   m1, m1
    punpcklwd  m0, m1
    movd  [expq+lenq], m0
    add      lenq, 4
    jl .loop
    REP_RET
%endif

%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
    add     expq, lenq
    lea    coefq, [coefq+4*lenq]
    neg     lenq
    mova      m2, [pd_1]
    mova      m3, [pd_151]
.loop:
    ; move 4 32-bit coefs to xmm0
    mova      m0, [coefq+4*lenq]
    ; absolute value
    PABSD     m0, m1
    ; convert to float and extract exponents
    pslld     m0, 1
    por       m0, m2
    cvtdq2ps  m1, m0
    psrld     m1, 23
    mova      m0, m3
    psubd     m0, m1
    ; move the lowest byte in each of 4 dwords to the low dword
    ; NOTE: We cannot just extract the low bytes with pshufb because the dword
    ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
    ;       clips this to 0, which is the correct exponent.
    packssdw  m0, m0
    packuswb  m0, m0
    movd [expq+lenq], m0

    add     lenq, 4
    jl .loop
    REP_RET
%endmacro

%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif
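
; For reference, a scalar C sketch of ff_ac3_extract_exponents() (an
; illustration; coefficients are 24-bit fixed point, so the result is in the
; range 0..24):
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   static void extract_exponents_ref(uint8_t *exp, int32_t *coef, int nb_coefs)
;   {
;       for (int i = 0; i < nb_coefs; i++) {
;           int32_t v = abs(coef[i]);
;           int e = 24;              /* v == 0 maps to exponent 24          */
;           while (v) {              /* otherwise e = 23 - floor(log2(|v|)) */
;               v >>= 1;
;               e--;
;           }
;           exp[i] = e;
;       }
;   }
;
; The vector versions above get floor(log2(2*|v| + 1)) for free from the
; biased exponent field of the float representation and subtract it from
; 151 (= 127 + 24) to obtain the same value.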