;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; sign-bit masks: XORing with these is equivalent to multiplying float pairs
; by -1.0, 1.0 (ps_mask) or by 1.0, -1.0 (ps_mask2)
ps_mask         times 2 dd 1<<31, 0
ps_mask2        times 2 dd 0, 1<<31
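; Not from the original source: a minimal C-style sketch of why the XOR works,
; assuming IEEE-754 single precision where bit 31 is the sign bit:
;     uint32_t u;
;     memcpy(&u, &x, sizeof(u));   // reinterpret the float's bits
;     u ^= 1u << 31;               // flip the sign bit
;     memcpy(&x, &u, sizeof(x));   // x is now -x, with no FP multiply
; xorps applies this to all four lanes at once, so masking only selected lanes
; with 1<<31 multiplies just those lanes by -1.0.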
ps_neg          times 4 dd 1<<31
ps_noise0       times 2 dd  1.0,  0.0
ps_noise2       times 2 dd -1.0,  0.0
ps_noise13      dd  0.0,  1.0, 0.0, -1.0
                dd  0.0, -1.0, 0.0,  1.0
                dd  0.0,  1.0, 0.0, -1.0
cextern sbr_noise_table

SECTION_TEXT

INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
    mov         r2, r1
    xorps       m0, m0
    xorps       m1, m1
    sar         r2, 3
    jz          .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7
    sar         r1, 1
    jz          .end
    ; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1
    addps       m0, m2
    jnz         .endloop
.end:
    addps       m0, m1
    movhlps     m2, m0
    addps       m0, m2
    movss       m1, m0
    shufps      m0, m0, 1
    addss       m0, m1
%if ARCH_X86_64 == 0
    movss       r0m, m0
    fld         dword r0m
%endif
    RET
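; Not from the original source: a rough C model of what the routine above
; computes (a sketch; the float return travels in xmm0 on x86-64 and in st0
; on x86-32, hence the movss/fld pair above):
;     static float sbr_sum_square(float (*x)[2], int n)
;     {
;         float sum = 0.0f;
;         for (int i = 0; i < n; i++)
;             sum += x[i][0] * x[i][0] + x[i][1] * x[i][1];
;         return sum;
;     }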

%define STEP 40*4*2
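; STEP is the byte stride of one X_high row: 40 complex samples * 2 floats * 4 bytes.
; Not from the original source: a rough C model of the routine below, assuming
; the usual sbr_hf_g_filt prototype (a sketch, not a definitive reference):
;     static void sbr_hf_g_filt(float (*Y)[2], const float (*X_high)[40][2],
;                               const float *g_filt, int m_max, intptr_t ixh)
;     {
;         for (int m = 0; m < m_max; m++) {
;             Y[m][0] = X_high[m][ixh][0] * g_filt[m];
;             Y[m][1] = X_high[m][ixh][1] * g_filt[m];
;         }
;     }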
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4]     ; offset by ixh elements into X_high
    mov         r5, r3
    and         r3, 0xFC
    lea         r2, [r2 + r3*4]
    lea         r0, [r0 + r3*8]
    neg         r3
    jz          .loop1
.loop4:
    movlps      m0, [r2 + 4*r3 + 0]
    movlps      m1, [r2 + 4*r3 + 8]
    movlps      m2, [r1 + 0*STEP]
    movlps      m3, [r1 + 2*STEP]
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
    unpcklps    m0, m0
    unpcklps    m1, m1
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 +  0], m0
    movu        [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz         .loop4
    and         r5, 3               ; number of single element loops
    jz          .end
.loop1: ; elements 0 and 1 can be computed at the same time
    movss       m0, [r2]
    movlps      m2, [r1]
    unpcklps    m0, m0
    mulps       m2, m0
    movlps      [r0], m2
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec         r5
    jnz         .loop1
.end:
    RET

; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
;                          const float alpha0[2], const float alpha1[2],
;                          float bw, int start, int end)
;
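; Not from the original source: a rough C model of the recurrence implemented
; below (a sketch). Each output sample is the input plus a two-tap complex
; linear prediction, with the alpha coefficients pre-scaled by bw and bw*bw:
;     a[0] = alpha1[0] * bw * bw;   a[1] = alpha1[1] * bw * bw;
;     a[2] = alpha0[0] * bw;        a[3] = alpha0[1] * bw;
;     for (int i = start; i < end; i++) {
;         X_high[i][0] = X_low[i-2][0] * a[0] - X_low[i-2][1] * a[1]
;                      + X_low[i-1][0] * a[2] - X_low[i-1][1] * a[3]
;                      + X_low[i][0];
;         X_high[i][1] = X_low[i-2][1] * a[0] + X_low[i-2][0] * a[1]
;                      + X_low[i-1][1] * a[2] + X_low[i-1][0] * a[3]
;                      + X_low[i][1];
;     }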
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss       bw, BWm
%endif
    movlps      m2, [alpha1q]
    movlps      m1, [alpha0q]
    shufps      bw, bw, 0
    mulps       m2, bw              ; (a1[0] a1[1])*bw
    mulps       m1, bw              ; (a0[0] a0[1])*bw = (a2 a3)
    mulps       m2, bw              ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova        m3, m1
    mova        m4, m2

    ; set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end (the 6th and 7th args) are passed on the stack
    mov         r2d, Sm
    mov         r3d, Em
%define start r2q
%define end   r3q
%else
    ; BW is passed in an xmm register, not a GPR, so the named GPR args are
    ; shifted by one: BWq actually holds start and Sq holds end
%define start BWq
%define end   Sq
%endif
    sub         start, end          ; negative number of loops
    lea         X_highq, [X_highq + end*2*4]
    lea         X_lowq,  [X_lowq  + end*2*4 - 2*2*4]
    shl         start, 3            ; offset from number of loops

    mova        m0, [X_lowq + start]
    shufps      m3, m3, q1111
    shufps      m4, m4, q1111
    xorps       m3, [ps_mask]
    shufps      m1, m1, q0000
    shufps      m2, m2, q0000
    xorps       m4, [ps_mask]
.loop2:
    movu        m7, [X_lowq + start + 8]    ; BbCc
    mova        m6, m0
    mova        m5, m7
    shufps      m0, m0, q2301               ; aAbB
    shufps      m7, m7, q2301               ; bBcC
    mulps       m0, m4
    mulps       m7, m3
    mulps       m6, m2
    mulps       m5, m1
    addps       m7, m0
    mova        m0, [X_lowq + start + 16]   ; CcDd
    addps       m7, m0
    addps       m6, m5
    addps       m7, m6
    mova        [X_highq + start], m7
    add         start, 16
    jnz         .loop2
    RET

cglobal sbr_sum64x5, 1,2,4,z
    lea         r1q, [zq+ 256]
.loop:
    mova        m0, [zq+   0]
    mova        m2, [zq+  16]
    mova        m1, [zq+ 256]
    mova        m3, [zq+ 272]
    addps       m0, [zq+ 512]
    addps       m2, [zq+ 528]
    addps       m1, [zq+ 768]
    addps       m3, [zq+ 784]
    addps       m0, [zq+1024]
    addps       m2, [zq+1040]
    addps       m0, m1
    addps       m2, m3
    mova        [zq], m0
    mova        [zq+16], m2
    add         zq, 32
    cmp         zq, r1q
    jne         .loop
    REP_RET

INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea         r2q, [zq + (64-4)*4]
    mova        m3, [ps_neg]
.loop:
    mova        m1, [zq]
    xorps       m0, m3, [r2q]
    shufps      m0, m0, m0, q0123
    unpcklps    m2, m0, m1
    unpckhps    m0, m0, m1
    mova        [Wq +  0], m2
    mova        [Wq + 16], m0
    add         Wq, 32
    sub         r2q, 16
    add         zq, 16
    cmp         zq, r2q
    jl          .loop
    REP_RET

INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4,z
    lea         r1q, [zq+256]
.loop:
    mova        m0, [zq+ 0]
    mova        m1, [zq+16]
    mova        m2, [zq+32]
    mova        m3, [zq+48]
    xorps       m0, [ps_mask2]
    xorps       m1, [ps_mask2]
    xorps       m2, [ps_mask2]
    xorps       m3, [ps_mask2]
    mova        [zq+ 0], m0
    mova        [zq+16], m1
    mova        [zq+32], m2
    mova        [zq+48], m3
    add         zq, 64
    cmp         zq, r1q
    jne         .loop
    REP_RET

; sbr_qmf_deint_bfly(float *v, const float *src0, const float *src1)
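; Not from the original source: a rough C model of the butterfly below
; (a sketch): src0 is read forward, src1 backward, and the sums and
; differences land in the two halves of v:
;     for (int i = 0; i < 64; i++) {
;         v[      i] = src0[i] - src1[63 - i];
;         v[127 - i] = src0[i] + src1[63 - i];
;     }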
%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    mov         cq, 64*4-2*mmsize
    lea         vrevq, [vq + 64*4]
.loop:
    mova        m0, [src0q+cq]
    mova        m1, [src1q]
    mova        m4, [src0q+cq+mmsize]
    mova        m5, [src1q+mmsize]
%if cpuflag(sse2)
    pshufd      m2, m0, q0123
    pshufd      m3, m1, q0123
    pshufd      m6, m4, q0123
    pshufd      m7, m5, q0123
%else
    shufps      m2, m0, m0, q0123
    shufps      m3, m1, m1, q0123
    shufps      m6, m4, m4, q0123
    shufps      m7, m5, m5, q0123
%endif
    addps       m5, m2
    subps       m0, m7
    addps       m1, m6
    subps       m4, m3
    mova        [vrevq], m1
    mova        [vrevq+mmsize], m5
    mova        [vq+cq], m0
    mova        [vq+cq+mmsize], m4
    add         src1q, 2*mmsize
    add         vrevq, 2*mmsize
    sub         cq, 2*mmsize
    jge         .loop
    REP_RET
%endmacro

INIT_XMM sse
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,7,z
%define OFFSET (32*4-2*mmsize)
    mov         r3q, OFFSET
    lea         r1q, [zq + (32+1)*4]
    lea         r2q, [zq + 64*4]
    mova        m6, [ps_neg]
.loop:
    movu        m0, [r1q]
    movu        m2, [r1q + mmsize]
    movu        m1, [zq + r3q + 4 + mmsize]
    movu        m3, [zq + r3q + 4]

    pxor        m2, m6
    pxor        m0, m6
    pshufd      m2, m2, q0123
    pshufd      m0, m0, q0123
    SBUTTERFLY  dq, 2, 3, 5
    SBUTTERFLY  dq, 0, 1, 4
    mova        [r2q + 2*r3q + 0*mmsize], m2
    mova        [r2q + 2*r3q + 1*mmsize], m3
    mova        [r2q + 2*r3q + 2*mmsize], m0
    mova        [r2q + 2*r3q + 3*mmsize], m1
    add         r1q, 2*mmsize
    sub         r3q, 2*mmsize
    jge         .loop
    mova        m2, [zq]
    movq        [r2q], m2
    REP_RET

%if WIN64
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%else
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif
%endif

%macro LOAD_NST 1
%if NREGS
    lea         NOISE_TABLE, [%1]
    mova        m0, [kxq + NOISE_TABLE]
%else
    mova        m0, [kxq + %1]
%endif
%endmacro

INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
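; The four entry points below differ only in the phi_sign constant loaded into
; m0 (and, for the odd variants, in the kx parity adjustment); they all jump
; into apply_noise_main. Not from the original source: a rough C model of that
; shared loop (a sketch; phi_sign0/phi_sign1 stand for the two lanes of the
; constant held in m0):
;     for (int m = 0; m < m_max; m++) {
;         noise = (noise + 1) & 0x1ff;
;         if (s_m[m]) {
;             Y[m][0] += s_m[m] * phi_sign0;
;             Y[m][1] += s_m[m] * phi_sign1;
;         } else {
;             Y[m][0] += q_filt[m] * sbr_noise_table[noise][0];
;             Y[m][1] += q_filt[m] * sbr_noise_table[noise][1];
;         }
;     }
; In the SIMD code the branch becomes a pcmpeqd mask: the noise term is kept
; only where s_m[m] == 0, and s_m[m] * phi_sign is zero there anyway.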
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova        m0, [ps_noise0]
    jmp         apply_noise_main

; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and         kxq, 1
    shl         kxq, 4
    LOAD_NST    ps_noise13
    jmp         apply_noise_main

; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova        m0, [ps_noise2]
    jmp         apply_noise_main

; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and         kxq, 1
    shl         kxq, 4
    LOAD_NST    ps_noise13+16

apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
    mov         kxd, m_maxm
%define count kxq
%else
%define count m_maxq
%endif
    dec         noiseq
    shl         count, 2
%if NREGS
    lea         NOISE_TABLE, [sbr_noise_table]
%endif
    lea         Yq, [Yq + 2*count]
    add         s_mq, count
    add         q_filtq, count
    shl         noiseq, 3
    pxor        m5, m5
    neg         count
.loop:
    mova        m1, [q_filtq + count]
    movu        m3, [noiseq + NOISE_TABLE + 1*mmsize]
    movu        m4, [noiseq + NOISE_TABLE + 2*mmsize]
    add         noiseq, 2*mmsize
    and         noiseq, 0x1ff<<3
    punpckhdq   m2, m1, m1
    punpckldq   m1, m1
    mulps       m1, m3              ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps       m2, m4              ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova        m3, [s_mq + count]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq   m4, m3, m3
    punpckldq   m3, m3
    pcmpeqd     m6, m3, m5          ; m6 = (s_m[m] == 0) mask
    pcmpeqd     m7, m4, m5          ; m7 = (s_m[m] == 0) mask
    mulps       m3, m0              ; s_m[m] * phi_sign
    mulps       m4, m0              ; s_m[m] * phi_sign
    pand        m1, m6
    pand        m2, m7
    movu        m6, [Yq + 2*count]
    movu        m7, [Yq + 2*count + mmsize]
    addps       m3, m1
    addps       m4, m2
    addps       m6, m3
    addps       m7, m4
    movu        [Yq + 2*count], m6
    movu        [Yq + 2*count + mmsize], m7
    add         count, mmsize
    jl          .loop
    RET