annotate ffmpeg/libavcodec/x86/ac3dsp.asm @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 ;*****************************************************************************
yading@10 2 ;* x86-optimized AC-3 DSP utils
yading@10 3 ;* Copyright (c) 2011 Justin Ruggles
yading@10 4 ;*
yading@10 5 ;* This file is part of FFmpeg.
yading@10 6 ;*
yading@10 7 ;* FFmpeg is free software; you can redistribute it and/or
yading@10 8 ;* modify it under the terms of the GNU Lesser General Public
yading@10 9 ;* License as published by the Free Software Foundation; either
yading@10 10 ;* version 2.1 of the License, or (at your option) any later version.
yading@10 11 ;*
yading@10 12 ;* FFmpeg is distributed in the hope that it will be useful,
yading@10 13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 15 ;* Lesser General Public License for more details.
yading@10 16 ;*
yading@10 17 ;* You should have received a copy of the GNU Lesser General Public
yading@10 18 ;* License along with FFmpeg; if not, write to the Free Software
yading@10 19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 20 ;******************************************************************************
yading@10 21
yading@10 22 %include "libavutil/x86/x86util.asm"
yading@10 23
yading@10 24 SECTION_RODATA
yading@10 25
yading@10 26 ; 16777216.0f - used in ff_float_to_fixed24()
yading@10 27 pf_1_24: times 4 dd 0x4B800000 ; four packed copies of the 2^24 scale factor
yading@10 28
yading@10 29 ; used in ff_ac3_compute_mantissa_size()
yading@10 30 cextern ac3_bap_bits ; defined outside this file; read below as two 16-byte rows of words
yading@10 31 pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768 ; pmulhuw factors: 21846 ~ 65536/3, 32768 = 65536/2, 0 discards the lane
yading@10 32 pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7 ; pmaddwd weights applied to the scaled counts (same lane order as pw_bap_mul1)
yading@10 33
yading@10 34 ; used in ff_ac3_extract_exponents()
yading@10 35 pd_1: times 4 dd 1 ; OR-ed into every shifted coefficient so the value converted to float is never zero
yading@10 36 pd_151: times 4 dd 151 ; the biased float exponent field is subtracted from this constant
yading@10 37
yading@10 38 SECTION .text
yading@10 39
yading@10 40 ;-----------------------------------------------------------------------------
yading@10 41 ; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
yading@10 42 ;-----------------------------------------------------------------------------
yading@10 43
yading@10 44 %macro AC3_EXPONENT_MIN 0
yading@10 45 cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
yading@10 46 shl reuse_blksq, 8 ; byte offset of the last block: num_reuse_blocks * 256 (blocks are 256 bytes apart)
yading@10 47 jz .end ; zero reuse blocks: nothing to merge
yading@10 48 LOOP_ALIGN
yading@10 49 .nextexp: ; outer loop: one vector of mmsize exponents per iteration
yading@10 50 mov offsetq, reuse_blksq
yading@10 51 mova m0, [expq+offsetq] ; seed the running minimum with the last block
yading@10 52 sub offsetq, 256
yading@10 53 LOOP_ALIGN
yading@10 54 .nextblk: ; inner loop: walk the blocks backwards down to offset 0
yading@10 55 PMINUB m0, [expq+offsetq], m1 ; per-byte unsigned minimum; m1 is presumably scratch for the helper macro (defined elsewhere)
yading@10 56 sub offsetq, 256
yading@10 57 jae .nextblk ; continue until the subtraction borrows, i.e. after offset 0 was processed
yading@10 58 mova [expq], m0 ; the minimum over all blocks is written back into block 0
yading@10 59 add expq, mmsize
yading@10 60 sub expnq, mmsize ; NOTE(review): nb_coefs and num_reuse_blocks are int but full-width registers are used; confirm upper bits on 64-bit targets
yading@10 61 jg .nextexp ; signed test: loop while coefficients remain
yading@10 62 .end:
yading@10 63 REP_RET
yading@10 64 %endmacro
yading@10 65
yading@10 66 %define LOOP_ALIGN
yading@10 67 INIT_MMX mmx
yading@10 68 AC3_EXPONENT_MIN ; mmx build: LOOP_ALIGN is empty, so loop heads are not aligned
yading@10 69 %if HAVE_MMXEXT_EXTERNAL
yading@10 70 %define LOOP_ALIGN ALIGN 16
yading@10 71 INIT_MMX mmxext
yading@10 72 AC3_EXPONENT_MIN ; mmxext build: loop heads aligned to 16 bytes
yading@10 73 %endif
yading@10 74 %if HAVE_SSE2_EXTERNAL
yading@10 75 INIT_XMM sse2
yading@10 76 AC3_EXPONENT_MIN ; sse2 build: LOOP_ALIGN is whatever the most recent define left it as
yading@10 77 %endif
yading@10 78 %undef LOOP_ALIGN
yading@10 79
yading@10 80 ;-----------------------------------------------------------------------------
yading@10 81 ; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
yading@10 82 ;
yading@10 83 ; This function uses 2 different methods to calculate a valid result.
yading@10 84 ; 1) logical 'or' of abs of each element
yading@10 85 ; This is used for ssse3 because of the pabsw instruction.
yading@10 86 ; It is also used for mmx because of the lack of min/max instructions.
yading@10 87 ; 2) calculate min/max for the array, then or(abs(min),abs(max))
yading@10 88 ; This is used for mmxext and sse2 because they have pminsw/pmaxsw.
yading@10 89 ;-----------------------------------------------------------------------------
yading@10 90
yading@10 91 ; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
yading@10 92 %macro OR_WORDS_HORIZ 2 ; src, tmp
yading@10 93 %if cpuflag(sse2)
yading@10 94 movhlps %2, %1 ; tmp low half = src high 64 bits
yading@10 95 por %1, %2 ; low 4 words now hold the OR of both halves
yading@10 96 pshuflw %2, %1, q0032 ; bring words 2 and 3 down to positions 0 and 1
yading@10 97 por %1, %2 ; low 2 words now hold the OR of all pairs
yading@10 98 pshuflw %2, %1, q0001 ; bring word 1 down to position 0
yading@10 99 por %1, %2 ; low word = OR of all 8 words
yading@10 100 %elif cpuflag(mmxext)
yading@10 101 pshufw %2, %1, q0032 ; bring words 2 and 3 down to positions 0 and 1
yading@10 102 por %1, %2
yading@10 103 pshufw %2, %1, q0001 ; bring word 1 down to position 0
yading@10 104 por %1, %2 ; low word = OR of all 4 words
yading@10 105 %else ; mmx
yading@10 106 movq %2, %1
yading@10 107 psrlq %2, 32 ; no word shuffle available: shift the upper dword down instead
yading@10 108 por %1, %2
yading@10 109 movq %2, %1
yading@10 110 psrlq %2, 16 ; shift word 1 down onto word 0
yading@10 111 por %1, %2 ; low word = OR of all 4 words
yading@10 112 %endif
yading@10 113 %endmacro
yading@10 114
yading@10 115 %macro AC3_MAX_MSB_ABS_INT16 1
yading@10 116 cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
yading@10 117 pxor m2, m2 ; or_abs: OR accumulator; min_max: running minimum, starting at 0
yading@10 118 pxor m3, m3 ; min_max: running maximum, starting at 0 (only a temporary in the or_abs path)
yading@10 119 .loop: ; each iteration consumes two vectors, i.e. mmsize int16 elements
yading@10 120 %ifidn %1, min_max
yading@10 121 mova m0, [srcq]
yading@10 122 mova m1, [srcq+mmsize]
yading@10 123 pminsw m2, m0
yading@10 124 pminsw m2, m1
yading@10 125 pmaxsw m3, m0
yading@10 126 pmaxsw m3, m1
yading@10 127 %else ; or_abs
yading@10 128 %if notcpuflag(ssse3)
yading@10 129 mova m0, [srcq]
yading@10 130 mova m1, [srcq+mmsize]
yading@10 131 ABS2 m0, m1, m3, m4 ; helper macro defined elsewhere; m3 and m4 are presumably its temporaries
yading@10 132 %else ; ssse3
yading@10 133 ; using memory args is faster for ssse3
yading@10 134 pabsw m0, [srcq]
yading@10 135 pabsw m1, [srcq+mmsize]
yading@10 136 %endif
yading@10 137 por m2, m0 ; accumulate the absolute values with a bitwise OR
yading@10 138 por m2, m1
yading@10 139 %endif
yading@10 140 add srcq, mmsize*2
yading@10 141 sub lend, mmsize ; 32-bit counter in elements; the body always runs at least once
yading@10 142 ja .loop ; unsigned test: loop while elements remain
yading@10 143 %ifidn %1, min_max
yading@10 144 ABS2 m2, m3, m0, m1 ; absolute value of the per-lane minimum and maximum
yading@10 145 por m2, m3 ; combine both extremes into one vector
yading@10 146 %endif
yading@10 147 OR_WORDS_HORIZ m2, m0 ; reduce all lanes into the low word
yading@10 148 movd eax, m2
yading@10 149 and eax, 0xFFFF ; return only the low word
yading@10 150 RET
yading@10 151 %endmacro
yading@10 152
yading@10 153 INIT_MMX mmx
yading@10 154 AC3_MAX_MSB_ABS_INT16 or_abs
yading@10 155 INIT_MMX mmxext
yading@10 156 AC3_MAX_MSB_ABS_INT16 min_max
yading@10 157 INIT_XMM sse2
yading@10 158 AC3_MAX_MSB_ABS_INT16 min_max
yading@10 159 INIT_XMM ssse3
yading@10 160 AC3_MAX_MSB_ABS_INT16 or_abs
yading@10 161
yading@10 162 ;-----------------------------------------------------------------------------
yading@10 163 ; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
yading@10 164 ;-----------------------------------------------------------------------------
yading@10 165
yading@10 166 %macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction
yading@10 167 cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
yading@10 168 movd m0, shiftd ; shift count kept in a vector register and reused for every shift below
yading@10 169 .loop: ; in-place: four vectors are loaded, shifted and stored back per iteration
yading@10 170 mova m1, [srcq ]
yading@10 171 mova m2, [srcq+mmsize ]
yading@10 172 mova m3, [srcq+mmsize*2]
yading@10 173 mova m4, [srcq+mmsize*3]
yading@10 174 %3 m1, m0
yading@10 175 %3 m2, m0
yading@10 176 %3 m3, m0
yading@10 177 %3 m4, m0
yading@10 178 mova [srcq ], m1
yading@10 179 mova [srcq+mmsize ], m2
yading@10 180 mova [srcq+mmsize*2], m3
yading@10 181 mova [srcq+mmsize*3], m4
yading@10 182 add srcq, mmsize*4
yading@10 183 sub lend, mmsize*32/%2 ; elements per iteration: mmsize*4 bytes divided by the element size in bytes
yading@10 184 ja .loop ; unsigned test: the body always runs at least once
yading@10 185 .end: ; label is not referenced inside this macro
yading@10 186 REP_RET
yading@10 187 %endmacro
yading@10 188
yading@10 189 ;-----------------------------------------------------------------------------
yading@10 190 ; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
yading@10 191 ;-----------------------------------------------------------------------------
yading@10 192
yading@10 193 INIT_MMX mmx
yading@10 194 AC3_SHIFT l, 16, psllw ; logical left shift of 16-bit lanes
yading@10 195 INIT_XMM sse2
yading@10 196 AC3_SHIFT l, 16, psllw
yading@10 197
yading@10 198 ;-----------------------------------------------------------------------------
yading@10 199 ; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
yading@10 200 ;-----------------------------------------------------------------------------
yading@10 201
yading@10 202 INIT_MMX mmx
yading@10 203 AC3_SHIFT r, 32, psrad ; arithmetic (sign-preserving) right shift of 32-bit lanes
yading@10 204 INIT_XMM sse2
yading@10 205 AC3_SHIFT r, 32, psrad
yading@10 206
yading@10 207 ;-----------------------------------------------------------------------------
yading@10 208 ; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
yading@10 209 ;-----------------------------------------------------------------------------
yading@10 210
yading@10 211 ; The 3DNow! version is not bit-identical because pf2id uses truncation rather
yading@10 212 ; than round-to-nearest.
yading@10 213 INIT_MMX 3dnow
yading@10 214 cglobal float_to_fixed24, 3, 3, 0, dst, src, len
yading@10 215 movq m0, [pf_1_24] ; two packed copies of the 2^24 scale factor
yading@10 216 .loop: ; 8 floats (32 bytes) converted per iteration
yading@10 217 movq m1, [srcq ]
yading@10 218 movq m2, [srcq+8 ]
yading@10 219 movq m3, [srcq+16]
yading@10 220 movq m4, [srcq+24]
yading@10 221 pfmul m1, m0 ; scale by 2^24
yading@10 222 pfmul m2, m0
yading@10 223 pfmul m3, m0
yading@10 224 pfmul m4, m0
yading@10 225 pf2id m1, m1 ; float to int32, truncating (see the note above this function)
yading@10 226 pf2id m2, m2
yading@10 227 pf2id m3, m3
yading@10 228 pf2id m4, m4
yading@10 229 movq [dstq ], m1
yading@10 230 movq [dstq+8 ], m2
yading@10 231 movq [dstq+16], m3
yading@10 232 movq [dstq+24], m4
yading@10 233 add srcq, 32
yading@10 234 add dstq, 32
yading@10 235 sub lend, 8 ; 32-bit counter in elements
yading@10 236 ja .loop ; unsigned test: the body always runs at least once
yading@10 237 femms ; leave the MMX/3DNow! state before returning to the caller
yading@10 238 RET
yading@10 239
yading@10 240 INIT_XMM sse
yading@10 241 cglobal float_to_fixed24, 3, 3, 3, dst, src, len
yading@10 242 movaps m0, [pf_1_24] ; four packed copies of the 2^24 scale factor
yading@10 243 .loop: ; 8 floats (32 bytes) converted per iteration
yading@10 244 movaps m1, [srcq ]
yading@10 245 movaps m2, [srcq+16]
yading@10 246 mulps m1, m0 ; scale by 2^24
yading@10 247 mulps m2, m0
yading@10 248 cvtps2pi mm0, m1 ; convert the low two floats into an MMX register
yading@10 249 movhlps m1, m1 ; move the high two floats down
yading@10 250 cvtps2pi mm1, m1 ; and convert them as well
yading@10 251 cvtps2pi mm2, m2
yading@10 252 movhlps m2, m2
yading@10 253 cvtps2pi mm3, m2
yading@10 254 movq [dstq ], mm0
yading@10 255 movq [dstq+ 8], mm1
yading@10 256 movq [dstq+16], mm2
yading@10 257 movq [dstq+24], mm3
yading@10 258 add srcq, 32
yading@10 259 add dstq, 32
yading@10 260 sub lend, 8 ; 32-bit counter in elements
yading@10 261 ja .loop ; unsigned test: the body always runs at least once
yading@10 262 emms ; MMX registers were used for the results, so leave the MMX state
yading@10 263 RET
yading@10 264
yading@10 265 INIT_XMM sse2
yading@10 266 cglobal float_to_fixed24, 3, 3, 9, dst, src, len
yading@10 267 movaps m0, [pf_1_24] ; four packed copies of the 2^24 scale factor
yading@10 268 .loop: ; 16 floats per iteration, or 32 when m8 is available
yading@10 269 movaps m1, [srcq ]
yading@10 270 movaps m2, [srcq+16 ]
yading@10 271 movaps m3, [srcq+32 ]
yading@10 272 movaps m4, [srcq+48 ]
yading@10 273 %ifdef m8
yading@10 274 movaps m5, [srcq+64 ]
yading@10 275 movaps m6, [srcq+80 ]
yading@10 276 movaps m7, [srcq+96 ]
yading@10 277 movaps m8, [srcq+112]
yading@10 278 %endif
yading@10 279 mulps m1, m0 ; scale by 2^24
yading@10 280 mulps m2, m0
yading@10 281 mulps m3, m0
yading@10 282 mulps m4, m0
yading@10 283 %ifdef m8
yading@10 284 mulps m5, m0
yading@10 285 mulps m6, m0
yading@10 286 mulps m7, m0
yading@10 287 mulps m8, m0
yading@10 288 %endif
yading@10 289 cvtps2dq m1, m1 ; packed float to int32 conversion
yading@10 290 cvtps2dq m2, m2
yading@10 291 cvtps2dq m3, m3
yading@10 292 cvtps2dq m4, m4
yading@10 293 %ifdef m8
yading@10 294 cvtps2dq m5, m5
yading@10 295 cvtps2dq m6, m6
yading@10 296 cvtps2dq m7, m7
yading@10 297 cvtps2dq m8, m8
yading@10 298 %endif
yading@10 299 movdqa [dstq ], m1
yading@10 300 movdqa [dstq+16 ], m2
yading@10 301 movdqa [dstq+32 ], m3
yading@10 302 movdqa [dstq+48 ], m4
yading@10 303 %ifdef m8
yading@10 304 movdqa [dstq+64 ], m5
yading@10 305 movdqa [dstq+80 ], m6
yading@10 306 movdqa [dstq+96 ], m7
yading@10 307 movdqa [dstq+112], m8
yading@10 308 add srcq, 128
yading@10 309 add dstq, 128
yading@10 310 sub lend, 32 ; len is a 32-bit unsigned int: count in the 32-bit register, like the 3dnow and sse versions
yading@10 311 %else
yading@10 312 add srcq, 64
yading@10 313 add dstq, 64
yading@10 314 sub lend, 16 ; len is a 32-bit unsigned int: count in the 32-bit register, like the 3dnow and sse versions
yading@10 315 %endif
yading@10 316 ja .loop ; unsigned test: the body always runs at least once
yading@10 317 REP_RET
yading@10 318
yading@10 319 ;------------------------------------------------------------------------------
yading@10 320 ; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
yading@10 321 ;------------------------------------------------------------------------------
yading@10 322
yading@10 323 %macro PHADDD4 2 ; xmm src, xmm tmp
yading@10 324 movhlps %2, %1 ; tmp low half = src high 64 bits
yading@10 325 paddd %1, %2 ; dwords 0 and 1 now hold the two pairwise sums
yading@10 326 pshufd %2, %1, 0x1 ; bring dword 1 down to position 0
yading@10 327 paddd %1, %2 ; low dword of src = sum of all four dwords
yading@10 328 %endmacro
yading@10 329
yading@10 330 INIT_XMM sse2
yading@10 331 cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
yading@10 332 movdqa m0, [mant_cntq ] ; m0 accumulates words 0-7 of every 32-byte row
yading@10 333 movdqa m1, [mant_cntq+ 1*16] ; m1 accumulates words 8-15 of every 32-byte row
yading@10 334 paddw m0, [mant_cntq+ 2*16]
yading@10 335 paddw m1, [mant_cntq+ 3*16]
yading@10 336 paddw m0, [mant_cntq+ 4*16]
yading@10 337 paddw m1, [mant_cntq+ 5*16]
yading@10 338 paddw m0, [mant_cntq+ 6*16]
yading@10 339 paddw m1, [mant_cntq+ 7*16]
yading@10 340 paddw m0, [mant_cntq+ 8*16]
yading@10 341 paddw m1, [mant_cntq+ 9*16]
yading@10 342 paddw m0, [mant_cntq+10*16]
yading@10 343 paddw m1, [mant_cntq+11*16] ; all six rows are now summed column-wise
yading@10 344 pmaddwd m0, [ac3_bap_bits ] ; weight each column total by the external table and add adjacent pairs
yading@10 345 pmaddwd m1, [ac3_bap_bits+16]
yading@10 346 paddd m0, m1
yading@10 347 PHADDD4 m0, m1 ; horizontal sum into the low dword
yading@10 348 movd sumd, m0 ; first partial result
yading@10 349 movdqa m3, [pw_bap_mul1]
yading@10 350 movhpd m0, [mant_cntq +2] ; words 1-4 of row 0 into the high half
yading@10 351 movlpd m0, [mant_cntq+1*32+2] ; words 1-4 of row 1 into the low half
yading@10 352 movhpd m1, [mant_cntq+2*32+2] ; rows 2 and 3
yading@10 353 movlpd m1, [mant_cntq+3*32+2]
yading@10 354 movhpd m2, [mant_cntq+4*32+2] ; rows 4 and 5
yading@10 355 movlpd m2, [mant_cntq+5*32+2]
yading@10 356 pmulhuw m0, m3 ; per row: high 16 bits of count * factor, i.e. roughly count/3, count/3, 0, count/2
yading@10 357 pmulhuw m1, m3
yading@10 358 pmulhuw m2, m3
yading@10 359 paddusw m0, m1 ; add the per-row results with unsigned saturation
yading@10 360 paddusw m0, m2
yading@10 361 pmaddwd m0, [pw_bap_mul2] ; weight lanes by 5, 7, 0, 7 and add adjacent pairs
yading@10 362 PHADDD4 m0, m1 ; horizontal sum into the low dword
yading@10 363 movd eax, m0
yading@10 364 add eax, sumd ; return value = sum of both partial results
yading@10 365 RET
yading@10 366
yading@10 367 ;------------------------------------------------------------------------------
yading@10 368 ; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
yading@10 369 ;------------------------------------------------------------------------------
yading@10 370
yading@10 371 %macro PABSD 1-2 ; src/dst, unused
yading@10 372 %if cpuflag(ssse3)
yading@10 373 pabsd %1, %1 ; single-instruction absolute value; the second argument is not needed
yading@10 374 %else ; src/dst, tmp
yading@10 375 pxor %2, %2
yading@10 376 pcmpgtd %2, %1 ; tmp = all ones in lanes where the value is negative, else zero
yading@10 377 pxor %1, %2 ; invert the bits of negative lanes
yading@10 378 psubd %1, %2 ; subtract -1 from those lanes, completing the negation
yading@10 379 %endif
yading@10 380 %endmacro
yading@10 381
yading@10 382 %if HAVE_AMD3DNOW_EXTERNAL
yading@10 383 INIT_MMX 3dnow
yading@10 384 cglobal ac3_extract_exponents, 3, 3, 0, exp, coef, len
yading@10 385 add expq, lenq ; point both arrays at their ends ...
yading@10 386 lea coefq, [coefq+4*lenq]
yading@10 387 neg lenq ; ... and index with a negative counter running up to zero
yading@10 388 movq m3, [pd_1] ; NOTE(review): nb_coefs is int but lenq is used full-width above; confirm upper bits on 64-bit targets
yading@10 389 movq m4, [pd_151]
yading@10 390 .loop: ; 4 coefficients per iteration, two per MMX register
yading@10 391 movq m0, [coefq+4*lenq ]
yading@10 392 movq m1, [coefq+4*lenq+8]
yading@10 393 PABSD m0, m2 ; absolute value, m2 is the temporary
yading@10 394 PABSD m1, m2
yading@10 395 pslld m0, 1
yading@10 396 por m0, m3 ; (abs << 1) | 1 so the value is never zero
yading@10 397 pi2fd m2, m0 ; int32 to float
yading@10 398 psrld m2, 23 ; keep the biased exponent field
yading@10 399 movq m0, m4
yading@10 400 psubd m0, m2 ; 151 - biased exponent
yading@10 401 pslld m1, 1 ; same sequence for the second pair
yading@10 402 por m1, m3
yading@10 403 pi2fd m2, m1
yading@10 404 psrld m2, 23
yading@10 405 movq m1, m4
yading@10 406 psubd m1, m2
yading@10 407 packssdw m0, m0 ; narrow dwords to words, then to unsigned-saturated bytes
yading@10 408 packuswb m0, m0
yading@10 409 packssdw m1, m1
yading@10 410 packuswb m1, m1
yading@10 411 punpcklwd m0, m1 ; low dword = two bytes from the first pair followed by two from the second
yading@10 412 movd [expq+lenq], m0 ; store 4 exponent bytes
yading@10 413 add lenq, 4
yading@10 414 jl .loop ; loop while the negative index has not reached zero
yading@10 415 REP_RET
yading@10 416 %endif
yading@10 417
yading@10 418 %macro AC3_EXTRACT_EXPONENTS 0
yading@10 419 cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
yading@10 420 add expq, lenq ; point both arrays at their ends ...
yading@10 421 lea coefq, [coefq+4*lenq]
yading@10 422 neg lenq ; ... and index with a negative counter running up to zero
yading@10 423 mova m2, [pd_1] ; NOTE(review): nb_coefs is int but lenq is used full-width above; confirm upper bits on 64-bit targets
yading@10 424 mova m3, [pd_151]
yading@10 425 .loop:
yading@10 426 ; move 4 32-bit coefs to xmm0
yading@10 427 mova m0, [coefq+4*lenq]
yading@10 428 ; absolute value
yading@10 429 PABSD m0, m1
yading@10 430 ; convert to float and extract exponents
yading@10 431 pslld m0, 1
yading@10 432 por m0, m2 ; (abs << 1) | 1 so the value is never zero
yading@10 433 cvtdq2ps m1, m0
yading@10 434 psrld m1, 23 ; keep the biased exponent field
yading@10 435 mova m0, m3
yading@10 436 psubd m0, m1 ; 151 - biased exponent
yading@10 437 ; move the lowest byte in each of 4 dwords to the low dword
yading@10 438 ; NOTE: We cannot just extract the low bytes with pshufb because the dword
yading@10 439 ; result for 16777215 is -1 due to float inaccuracy. Using packuswb
yading@10 440 ; clips this to 0, which is the correct exponent.
yading@10 441 packssdw m0, m0
yading@10 442 packuswb m0, m0
yading@10 443 movd [expq+lenq], m0 ; store 4 exponent bytes
yading@10 444
yading@10 445 add lenq, 4
yading@10 446 jl .loop ; loop while the negative index has not reached zero
yading@10 447 REP_RET
yading@10 448 %endmacro
yading@10 449
yading@10 450 %if HAVE_SSE2_EXTERNAL
yading@10 451 INIT_XMM sse2
yading@10 452 AC3_EXTRACT_EXPONENTS ; PABSD expands to the pcmpgtd-based sequence here
yading@10 453 %endif
yading@10 454 %if HAVE_SSSE3_EXTERNAL
yading@10 455 INIT_XMM ssse3
yading@10 456 AC3_EXTRACT_EXPONENTS ; PABSD expands to a single pabsd here
yading@10 457 %endif