annotate ffmpeg/libavutil/x86/float_dsp.asm @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents f445c3017523
children
rev   line source
yading@11 1 ;*****************************************************************************
yading@11 2 ;* x86-optimized Float DSP functions
yading@11 3 ;*
yading@11 4 ;* Copyright 2006 Loren Merritt
yading@11 5 ;*
yading@11 6 ;* This file is part of FFmpeg.
yading@11 7 ;*
yading@11 8 ;* FFmpeg is free software; you can redistribute it and/or
yading@11 9 ;* modify it under the terms of the GNU Lesser General Public
yading@11 10 ;* License as published by the Free Software Foundation; either
yading@11 11 ;* version 2.1 of the License, or (at your option) any later version.
yading@11 12 ;*
yading@11 13 ;* FFmpeg is distributed in the hope that it will be useful,
yading@11 14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@11 15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@11 16 ;* Lesser General Public License for more details.
yading@11 17 ;*
yading@11 18 ;* You should have received a copy of the GNU Lesser General Public
yading@11 19 ;* License along with FFmpeg; if not, write to the Free Software
yading@11 20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@11 21 ;******************************************************************************
yading@11 22
yading@11 23 %include "x86util.asm"
yading@11 24
yading@11 25 SECTION .text
yading@11 26
yading@11 27 ;-----------------------------------------------------------------------------
yading@11 28 ; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
yading@11 29 ;-----------------------------------------------------------------------------
yading@11 30 %macro VECTOR_FMUL 0 ; dst[i] = src0[i] * src1[i]; mova requires all pointers mmsize-aligned; len*4 must be a multiple of 2*mmsize or leading elements are skipped
yading@11 31 cglobal vector_fmul, 4,4,2, dst, src0, src1, len
yading@11 32 lea lenq, [lend*4 - 2*mmsize] ; lenq = byte offset of the last pair of vectors; counts down to 0
yading@11 33 ALIGN 16
yading@11 34 .loop:
yading@11 35 mova m0, [src0q + lenq]
yading@11 36 mova m1, [src0q + lenq + mmsize] ; two vectors per iteration
yading@11 37 mulps m0, m0, [src1q + lenq] ; 3-operand form; x86inc emulates it on plain SSE
yading@11 38 mulps m1, m1, [src1q + lenq + mmsize]
yading@11 39 mova [dstq + lenq], m0
yading@11 40 mova [dstq + lenq + mmsize], m1
yading@11 41
yading@11 42 sub lenq, 2*mmsize
yading@11 43 jge .loop ; loop while lenq >= 0
yading@11 44 REP_RET ; x86inc macro: "rep ret" after a branch target (AMD branch-predictor workaround)
yading@11 45 %endmacro
yading@11 46
yading@11 47 INIT_XMM sse ; 16-byte vector instantiation
yading@11 48 VECTOR_FMUL
yading@11 49 %if HAVE_AVX_EXTERNAL
yading@11 50 INIT_YMM avx ; 32-byte vector instantiation
yading@11 51 VECTOR_FMUL
yading@11 52 %endif
yading@11 53
yading@11 54 ;------------------------------------------------------------------------------
yading@11 55 ; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
yading@11 56 ;------------------------------------------------------------------------------
yading@11 57
yading@11 58 %macro VECTOR_FMAC_SCALAR 0 ; dst[i] += src[i] * mul
yading@11 59 %if UNIX64
yading@11 60 cglobal vector_fmac_scalar, 3,3,3, dst, src, len ; SysV: float arg "mul" already arrives in xmm0
yading@11 61 %else
yading@11 62 cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
yading@11 63 %endif
yading@11 64 %if ARCH_X86_32
yading@11 65 VBROADCASTSS m0, mulm ; x86-32: load mul from the stack and splat to all lanes
yading@11 66 %else
yading@11 67 %if WIN64
yading@11 68 mova xmm0, xmm2 ; Win64: 3rd arg (mul) arrives in xmm2, move to xmm0
yading@11 69 %endif
yading@11 70 shufps xmm0, xmm0, 0 ; broadcast scalar to all 4 float lanes
yading@11 71 %if cpuflag(avx)
yading@11 72 vinsertf128 m0, m0, xmm0, 1 ; duplicate into the high 128-bit lane for YMM
yading@11 73 %endif
yading@11 74 %endif
yading@11 75 lea lenq, [lend*4-2*mmsize] ; byte offset of last vector pair; counts down to 0
yading@11 76 .loop:
yading@11 77 mulps m1, m0, [srcq+lenq ]
yading@11 78 mulps m2, m0, [srcq+lenq+mmsize] ; two vectors per iteration
yading@11 79 addps m1, m1, [dstq+lenq ] ; accumulate into existing dst contents
yading@11 80 addps m2, m2, [dstq+lenq+mmsize]
yading@11 81 mova [dstq+lenq ], m1
yading@11 82 mova [dstq+lenq+mmsize], m2
yading@11 83 sub lenq, 2*mmsize
yading@11 84 jge .loop
yading@11 85 REP_RET
yading@11 86 %endmacro
yading@11 87
yading@11 88 INIT_XMM sse
yading@11 89 VECTOR_FMAC_SCALAR
yading@11 90 %if HAVE_AVX_EXTERNAL
yading@11 91 INIT_YMM avx
yading@11 92 VECTOR_FMAC_SCALAR
yading@11 93 %endif
yading@11 94
yading@11 95 ;------------------------------------------------------------------------------
yading@11 96 ; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
yading@11 97 ;------------------------------------------------------------------------------
yading@11 98
yading@11 99 %macro VECTOR_FMUL_SCALAR 0 ; dst[i] = src[i] * mul
yading@11 100 %if UNIX64
yading@11 101 cglobal vector_fmul_scalar, 3,3,2, dst, src, len ; SysV: float arg "mul" already arrives in xmm0
yading@11 102 %else
yading@11 103 cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
yading@11 104 %endif
yading@11 105 %if ARCH_X86_32
yading@11 106 movss m0, mulm ; x86-32: load mul from the stack
yading@11 107 %elif WIN64
yading@11 108 SWAP 0, 2 ; Win64: 3rd arg (mul) arrives in xmm2; swap register names so m0 holds it
yading@11 109 %endif
yading@11 110 shufps m0, m0, 0 ; broadcast scalar to all 4 float lanes
yading@11 111 lea lenq, [lend*4-mmsize] ; byte offset of last vector; counts down to 0
yading@11 112 .loop:
yading@11 113 mova m1, [srcq+lenq]
yading@11 114 mulps m1, m0
yading@11 115 mova [dstq+lenq], m1
yading@11 116 sub lenq, mmsize
yading@11 117 jge .loop
yading@11 118 REP_RET
yading@11 119 %endmacro
yading@11 120
yading@11 121 INIT_XMM sse ; SSE-only: one vector (4 floats) per iteration
yading@11 122 VECTOR_FMUL_SCALAR
yading@11 123
yading@11 124 ;------------------------------------------------------------------------------
yading@11 125 ; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
yading@11 126 ; int len)
yading@11 127 ;------------------------------------------------------------------------------
yading@11 128
yading@11 129 %macro VECTOR_DMUL_SCALAR 0 ; dst[i] = src[i] * mul, double precision
yading@11 130 %if ARCH_X86_32
yading@11 131 cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr ; presumably the double "mul" spans two stack slots, pushing len one slot later -- hence the extra lenaddr name
yading@11 132 mov lenq, lenaddrm
yading@11 133 %elif UNIX64
yading@11 134 cglobal vector_dmul_scalar, 3,3,3, dst, src, len ; SysV: double arg "mul" already arrives in xmm0
yading@11 135 %else
yading@11 136 cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
yading@11 137 %endif
yading@11 138 %if ARCH_X86_32
yading@11 139 VBROADCASTSD m0, mulm ; x86-32: load double from the stack and splat
yading@11 140 %else
yading@11 141 %if WIN64
yading@11 142 movlhps xmm2, xmm2 ; Win64: mul arrives in xmm2; duplicate low double into high half
yading@11 143 %if cpuflag(avx)
yading@11 144 vinsertf128 ymm2, ymm2, xmm2, 1 ; duplicate into the high 128-bit lane for YMM
yading@11 145 %endif
yading@11 146 SWAP 0, 2 ; make m0 the broadcast register
yading@11 147 %else
yading@11 148 movlhps xmm0, xmm0 ; duplicate low double into high half
yading@11 149 %if cpuflag(avx)
yading@11 150 vinsertf128 ymm0, ymm0, xmm0, 1
yading@11 151 %endif
yading@11 152 %endif
yading@11 153 %endif
yading@11 154 lea lenq, [lend*8-2*mmsize] ; *8: element size is 8 bytes (double); counts down to 0
yading@11 155 .loop:
yading@11 156 mulpd m1, m0, [srcq+lenq ]
yading@11 157 mulpd m2, m0, [srcq+lenq+mmsize] ; two vectors per iteration
yading@11 158 mova [dstq+lenq ], m1
yading@11 159 mova [dstq+lenq+mmsize], m2
yading@11 160 sub lenq, 2*mmsize
yading@11 161 jge .loop
yading@11 162 REP_RET
yading@11 163 %endmacro
yading@11 164
yading@11 165 INIT_XMM sse2 ; SSE2 needed for mulpd
yading@11 166 VECTOR_DMUL_SCALAR
yading@11 167 %if HAVE_AVX_EXTERNAL
yading@11 168 INIT_YMM avx
yading@11 169 VECTOR_DMUL_SCALAR
yading@11 170 %endif
yading@11 171
yading@11 172 ;-----------------------------------------------------------------------------
yading@11 173 ; vector_fmul_add(float *dst, const float *src0, const float *src1,
yading@11 174 ; const float *src2, int len)
yading@11 175 ;-----------------------------------------------------------------------------
yading@11 176 %macro VECTOR_FMUL_ADD 0 ; dst[i] = src0[i] * src1[i] + src2[i]
yading@11 177 cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
yading@11 178 lea lenq, [lend*4 - 2*mmsize] ; byte offset of last vector pair; counts down to 0
yading@11 179 ALIGN 16
yading@11 180 .loop:
yading@11 181 mova m0, [src0q + lenq]
yading@11 182 mova m1, [src0q + lenq + mmsize] ; two vectors per iteration
yading@11 183 mulps m0, m0, [src1q + lenq] ; multiply...
yading@11 184 mulps m1, m1, [src1q + lenq + mmsize]
yading@11 185 addps m0, m0, [src2q + lenq] ; ...then add the third operand
yading@11 186 addps m1, m1, [src2q + lenq + mmsize]
yading@11 187 mova [dstq + lenq], m0
yading@11 188 mova [dstq + lenq + mmsize], m1
yading@11 189
yading@11 190 sub lenq, 2*mmsize
yading@11 191 jge .loop
yading@11 192 REP_RET
yading@11 193 %endmacro
yading@11 194
yading@11 195 INIT_XMM sse
yading@11 196 VECTOR_FMUL_ADD
yading@11 197 %if HAVE_AVX_EXTERNAL
yading@11 198 INIT_YMM avx
yading@11 199 VECTOR_FMUL_ADD
yading@11 200 %endif
yading@11 201
yading@11 202 ;-----------------------------------------------------------------------------
yading@11 203 ; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
yading@11 204 ; int len)
yading@11 205 ;-----------------------------------------------------------------------------
yading@11 206 %macro VECTOR_FMUL_REVERSE 0 ; dst[i] = src0[i] * src1[len-1-i]
yading@11 207 cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
yading@11 208 lea lenq, [lend*4 - 2*mmsize] ; dst/src0 walk down from the end while src1q walks up from the start
yading@11 209 ALIGN 16
yading@11 210 .loop:
yading@11 211 %if cpuflag(avx)
yading@11 212 vmovaps xmm0, [src1q + 16] ; shufps only permutes within 128-bit lanes, so
yading@11 213 vinsertf128 m0, m0, [src1q], 1 ; load the two 16-byte halves swapped first...
yading@11 214 vshufps m0, m0, m0, q0123 ; ...then reverse the 4 floats inside each lane
yading@11 215 vmovaps xmm1, [src1q + mmsize + 16]
yading@11 216 vinsertf128 m1, m1, [src1q + mmsize], 1
yading@11 217 vshufps m1, m1, m1, q0123
yading@11 218 %else
yading@11 219 mova m0, [src1q]
yading@11 220 mova m1, [src1q + mmsize]
yading@11 221 shufps m0, m0, q0123 ; reverse the 4 floats of each vector
yading@11 222 shufps m1, m1, q0123
yading@11 223 %endif
yading@11 224 mulps m0, m0, [src0q + lenq + mmsize] ; reversed src1 vectors pair with mirrored src0 offsets
yading@11 225 mulps m1, m1, [src0q + lenq]
yading@11 226 mova [dstq + lenq + mmsize], m0
yading@11 227 mova [dstq + lenq], m1
yading@11 228 add src1q, 2*mmsize ; advance the forward-walking pointer
yading@11 229 sub lenq, 2*mmsize
yading@11 230 jge .loop
yading@11 231 REP_RET
yading@11 232 %endmacro
yading@11 233
yading@11 234 INIT_XMM sse
yading@11 235 VECTOR_FMUL_REVERSE
yading@11 236 %if HAVE_AVX_EXTERNAL
yading@11 237 INIT_YMM avx
yading@11 238 VECTOR_FMUL_REVERSE
yading@11 239 %endif
yading@11 240
yading@11 241 ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
yading@11 242 INIT_XMM sse
yading@11 243 cglobal scalarproduct_float, 3,3,2, v1, v2, offset ; returns sum(v1[i]*v2[i]); loop steps 16 bytes, so len must be a multiple of 4
yading@11 244 neg offsetq ; offset = -len
yading@11 245 shl offsetq, 2 ; offset = -4*len (bytes)
yading@11 246 sub v1q, offsetq ; point v1/v2 past their ends so the
yading@11 247 sub v2q, offsetq ; negative offset indexes from element 0
yading@11 248 xorps xmm0, xmm0 ; accumulator = 0
yading@11 249 .loop:
yading@11 250 movaps xmm1, [v1q+offsetq]
yading@11 251 mulps xmm1, [v2q+offsetq]
yading@11 252 addps xmm0, xmm1 ; 4 partial sums in parallel
yading@11 253 add offsetq, 16
yading@11 254 js .loop ; loop while offset < 0
yading@11 255 movhlps xmm1, xmm0 ; horizontal sum: fold high pair onto low pair
yading@11 256 addps xmm0, xmm1
yading@11 257 movss xmm1, xmm0 ; save element 0
yading@11 258 shufps xmm0, xmm0, 1 ; bring element 1 down to slot 0
yading@11 259 addss xmm0, xmm1 ; final scalar sum in xmm0[0]
yading@11 260 %if ARCH_X86_64 == 0
yading@11 261 movss r0m, xmm0 ; x86-32 returns floats in st0: bounce the
yading@11 262 fld dword r0m ; result through memory onto the x87 stack
yading@11 263 %endif
yading@11 264 RET
yading@11 265
yading@11 266 ;-----------------------------------------------------------------------------
yading@11 267 ; void ff_butterflies_float(float *src0, float *src1, int len);
yading@11 268 ;-----------------------------------------------------------------------------
yading@11 269 INIT_XMM sse ; in-place butterfly: (src0[i], src1[i]) = (src0[i]+src1[i], src0[i]-src1[i])
yading@11 270 cglobal butterflies_float, 3,3,3, src0, src1, len
yading@11 271 movsxdifnidn lenq, lend ; sign-extend 32-bit len to pointer width where needed
yading@11 272 test lenq, lenq
yading@11 273 jz .end ; nothing to do for len == 0
yading@11 274 shl lenq, 2 ; len in bytes
yading@11 275 add src0q, lenq ; point both at their ends and index with a
yading@11 276 add src1q, lenq ; negative offset that counts up toward 0
yading@11 277 neg lenq
yading@11 278 .loop:
yading@11 279 mova m0, [src0q + lenq]
yading@11 280 mova m1, [src1q + lenq]
yading@11 281 subps m2, m0, m1 ; m2 = src0 - src1 (computed before m0 is overwritten)
yading@11 282 addps m0, m0, m1 ; m0 = src0 + src1
yading@11 283 mova [src1q + lenq], m2
yading@11 284 mova [src0q + lenq], m0
yading@11 285 add lenq, mmsize ; 4 floats per iteration
yading@11 286 jl .loop
yading@11 287 .end:
yading@11 288 REP_RET