;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; NASM syntax, built on the x86inc/x86util macro framework pulled in below:
; - cglobal declares an exported function and maps named registers
;   (dstq/srcq/lenq..., plus lend for the 32-bit view and mulm/lenaddrm for
;   stack-passed args), abstracting over the SysV/Win64/x86-32 ABIs.
; - INIT_XMM / INIT_YMM select the vector width: mmsize = 16 or 32 bytes.
;   Under INIT_XMM, 3-operand forms (mulps a, b, mem) are emulated by the
;   framework; under INIT_YMM they assemble to real AVX instructions.
; - REP_RET / RET emit the correct return sequence for the active ABI
;   (including vzeroupper for YMM variants).
; All vector loads/stores below use aligned moves (mova/movaps), so buffers
; are assumed mmsize-aligned and lengths padded to the loop's block size —
; NOTE(review): implied by the mova usage; confirm against the C callers.

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;
; dst[i] = src0[i] * src1[i] for i in [0, len)
; Walks lenq (a byte offset) from the last 2*mmsize block down to 0,
; processing two vectors per iteration.
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize]  ; byte offset of last block pair
ALIGN 16
.loop:
    mova      m0,   [src0q + lenq]
    mova      m1,   [src0q + lenq + mmsize]
    mulps     m0, m0, [src1q + lenq]
    mulps     m1, m1, [src1q + lenq + mmsize]
    mova      [dstq + lenq], m0
    mova      [dstq + lenq + mmsize], m1

    sub       lenq, 2*mmsize
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;
; dst[i] += src[i] * mul
; 'mul' arrives per-ABI: on the stack on x86-32 (read via mulm), in xmm0 on
; UNIX64, and in xmm2 on WIN64 (3rd argument slot); it is broadcast to every
; lane of m0 before the loop.
;------------------------------------------------------------------------------
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,3, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    mova       xmm0, xmm2          ; WIN64: scalar arg 3 lives in xmm2
%endif
    shufps     xmm0, xmm0, 0       ; broadcast low float to all 4 lanes
%if cpuflag(avx)
    vinsertf128  m0, m0, xmm0, 1   ; replicate into the high 128-bit lane
%endif
%endif
    lea    lenq, [lend*4-2*mmsize]
.loop:
    mulps    m1, m0, [srcq+lenq       ]
    mulps    m2, m0, [srcq+lenq+mmsize]
    addps    m1, m1, [dstq+lenq       ]
    addps    m2, m2, [dstq+lenq+mmsize]
    mova  [dstq+lenq       ], m1
    mova  [dstq+lenq+mmsize], m2
    sub    lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;
; dst[i] = src[i] * mul  (SSE only; one mmsize block per iteration)
;------------------------------------------------------------------------------
%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm              ; x86-32: scalar passed on the stack
%elif WIN64
    SWAP 0, 2                      ; WIN64: scalar arg 3 lives in xmm2
%endif
    shufps   m0, m0, 0             ; broadcast scalar to all 4 lanes
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;
; dst[i] = src[i] * mul, double precision.
; x86-32 has no register args, so 'len' is reloaded from the stack (lenaddrm)
; and 'mul' is broadcast from memory with VBROADCASTSD; 64-bit ABIs duplicate
; the incoming scalar with movlhps (+ vinsertf128 for the YMM variant).
;------------------------------------------------------------------------------
%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD   m0, mulm
%else
%if WIN64
    movlhps      xmm2, xmm2        ; duplicate low double into both halves
%if cpuflag(avx)
    vinsertf128  ymm2, ymm2, xmm2, 1
%endif
    SWAP 0, 2
%else
    movlhps      xmm0, xmm0
%if cpuflag(avx)
    vinsertf128  ymm0, ymm0, xmm0, 1
%endif
%endif
%endif
    lea          lenq, [lend*8-2*mmsize]  ; 8 bytes per double
.loop:
    mulpd          m1, m0, [srcq+lenq       ]
    mulpd          m2, m0, [srcq+lenq+mmsize]
    mova   [dstq+lenq       ], m1
    mova   [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;
; dst[i] = src0[i] * src1[i] + src2[i]
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0,   [src0q + lenq]
    mova    m1,   [src0q + lenq + mmsize]
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

    sub     lenq,   2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;
; dst[i] = src0[i] * src1[len - 1 - i]
; src1 is advanced forward (add src1q) while dst/src0 are indexed backward
; through lenq.  Each src1 vector is lane-reversed with shufps q0123; the AVX
; path must also swap the two 128-bit halves (vmovaps + vinsertf128) because
; vshufps only shuffles within each 128-bit lane.
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]          ; high quad -> low lane
    vinsertf128 m0, m0, [src1q], 1          ; low quad  -> high lane
    vshufps     m0, m0, m0, q0123           ; reverse within each lane
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    mova    [dstq + lenq + mmsize], m0
    mova    [dstq + lenq], m1
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
;
; Returns sum(v1[i] * v2[i]).  offsetq is set to -len*4 and counts up toward
; zero, with v1/v2 re-based past their ends so [vNq+offsetq] scans forward.
; The 4-lane accumulator is reduced horizontally (movhlps/shufps); on x86-32
; the result is moved to the x87 stack, as the cdecl ABI returns float in st0.
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    neg   offsetq
    shl   offsetq, 2               ; element count -> negative byte offset
    sub       v1q, offsetq
    sub       v2q, offsetq
    xorps    xmm0, xmm0            ; accumulator = 0 (dep-breaking idiom)
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js       .loop
    movhlps  xmm1, xmm0            ; fold upper pair onto lower pair
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1            ; final horizontal add -> scalar in xmm0
%if ARCH_X86_64 == 0
    movss     r0m,  xmm0
    fld dword r0m                  ; x86-32: float return goes in st0
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;
; In-place butterfly:
;   (src0[i], src1[i]) = (src0[i] + src1[i], src0[i] - src1[i])
; len == 0 is checked explicitly so the pointer-rebasing loop is skipped.
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
    movsxdifnidn lenq, lend        ; sign-extend len where reg != 64-bit
    test      lenq, lenq
    jz        .end                 ; nothing to do for len == 0
    shl       lenq, 2              ; float count -> byte count
    add      src0q, lenq           ; rebase both pointers past the end...
    add      src1q, lenq
    neg       lenq                 ; ...and scan from -len*4 up to 0
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    mova [src1q + lenq], m2
    mova [src0q + lenq], m0
    add       lenq, mmsize
    jl        .loop
.end:
    REP_RET