annotate ffmpeg/libavcodec/x86/dct32.asm @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 ;******************************************************************************
yading@10 2 ;* 32 point SSE-optimized DCT transform
yading@10 3 ;* Copyright (c) 2010 Vitor Sessak
yading@10 4 ;*
yading@10 5 ;* This file is part of FFmpeg.
yading@10 6 ;*
yading@10 7 ;* FFmpeg is free software; you can redistribute it and/or
yading@10 8 ;* modify it under the terms of the GNU Lesser General Public
yading@10 9 ;* License as published by the Free Software Foundation; either
yading@10 10 ;* version 2.1 of the License, or (at your option) any later version.
yading@10 11 ;*
yading@10 12 ;* FFmpeg is distributed in the hope that it will be useful,
yading@10 13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 15 ;* Lesser General Public License for more details.
yading@10 16 ;*
yading@10 17 ;* You should have received a copy of the GNU Lesser General Public
yading@10 18 ;* License along with FFmpeg; if not, write to the Free Software
yading@10 19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 20 ;******************************************************************************
yading@10 21
yading@10 22 %include "libavutil/x86/x86util.asm"
yading@10 23
yading@10 24 SECTION_RODATA 32
yading@10 25
; ps_cos_vec: table of single-precision multipliers, four floats (16 bytes)
; per row. The code in this file addresses it by byte offset:
;   +0, +16, +32, +48   pass 1
;   +64, +80            pass 2
;   +96                 pass 3 (the row at +112 repeats it)
;   +128                pass 4 (the row at +144 repeats it)
;   +160                pass 5 (the row at +176 repeats it)
;   +192                BUTTERFLY3V
; The repeated rows appear to exist for the YMM code path, which uses 32-byte
; memory operands at +96, +128 and +160 and so sees the same four values in
; both 128-bit halves.
; NOTE(review): the values are presumably precomputed DCT twiddle factors;
; how they were derived is not shown in this file.
yading@10 26 align 32
yading@10 27 ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
yading@10 28 dd 0.553104, 0.582935, 0.622504, 0.674808
yading@10 29 dd -10.190008, -3.407609, -2.057781, -1.484165
yading@10 30 dd -1.169440, -0.972568, -0.839350, -0.744536
yading@10 31 dd 0.502419, 0.522499, 0.566944, 0.646822
yading@10 32 dd 0.788155, 1.060678, 1.722447, 5.101149
yading@10 33 dd 0.509796, 0.601345, 0.899976, 2.562916
yading@10 34 dd 0.509796, 0.601345, 0.899976, 2.562916
yading@10 35 dd 1.000000, 1.000000, 1.306563, 0.541196
yading@10 36 dd 1.000000, 1.000000, 1.306563, 0.541196
yading@10 37 dd 1.000000, 0.707107, 1.000000, -0.707107
yading@10 38 dd 1.000000, 0.707107, 1.000000, -0.707107
yading@10 39 dd 0.707107, 0.707107, 0.707107, 0.707107
yading@10 40
; ps_p1p1m1m1: XOR mask of eight dwords (32 bytes), i.e. the same four-dword
; pattern twice. 0x80000000 is the sign bit of a single-precision float, so
; XORing with this mask negates lanes 2 and 3 of each group of four and leaves
; lanes 0 and 1 unchanged ("+1 +1 -1 -1").
; Pass 4 loads it as is; pass 5 reshuffles the loaded copy with immediate 0xcc.
yading@10 41 align 32
yading@10 42 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
yading@10 43
; BUTTERFLY a, b, coef, tmp
; Sum/difference butterfly on packed floats:
;   tmp = a - b
;   b   = b + a
;   a   = tmp * coef
; On exit b holds the sums and a the scaled differences; tmp is scratch.
; The instructions are written in three-operand form. For the non-AVX builds
; this presumably relies on the macro layer pulled in by x86util.asm to expand
; them into two-operand SSE instructions (not visible in this file).
yading@10 44 %macro BUTTERFLY 4
yading@10 45 subps %4, %1, %2
yading@10 46 addps %2, %2, %1
yading@10 47 mulps %1, %4, %3
yading@10 48 %endmacro
yading@10 49
; BUTTERFLY0 a, mask, coef, tmp, shuf
; In-register butterfly that combines a with a permuted copy of itself:
;   a = (shuffle(a, shuf) + (a XOR mask)) * coef
; mask is an XOR sign mask, so the XOR negates the lanes whose mask dword has
; the sign bit set. tmp is scratch.
; Both branches leave the same value in a. They differ only in what remains in
; tmp: the shuffled copy in the first branch, the unscaled sum in the second.
yading@10 50 %macro BUTTERFLY0 5
; SSE2 without AVX: pshufd can shuffle into a different destination register.
yading@10 51 %if cpuflag(sse2) && notcpuflag(avx)
yading@10 52 pshufd %4, %1, %5
yading@10 53 xorps %1, %2
yading@10 54 addps %1, %4
yading@10 55 mulps %1, %3
; Every other instruction set: three-operand shufps with a as both sources.
yading@10 56 %else
yading@10 57 shufps %4, %1, %1, %5
yading@10 58 xorps %1, %1, %2
yading@10 59 addps %4, %4, %1
yading@10 60 mulps %1, %4, %3
yading@10 61 %endif
yading@10 62 %endmacro
yading@10 63
; BUTTERFLY2 a, mask, coef, tmp
; BUTTERFLY0 with shuffle immediate 0x1b, which reverses the order of the four
; dwords (lane 0 <- 3, 1 <- 2, 2 <- 1, 3 <- 0). Used by pass 4.
yading@10 64 %macro BUTTERFLY2 4
yading@10 65 BUTTERFLY0 %1, %2, %3, %4, 0x1b
yading@10 66 %endmacro
yading@10 67
; BUTTERFLY3 a, mask, coef, tmp
; BUTTERFLY0 with shuffle immediate 0xb1, which swaps adjacent dwords
; (lane 0 <-> 1 and lane 2 <-> 3). Used by pass 5.
yading@10 68 %macro BUTTERFLY3 4
yading@10 69 BUTTERFLY0 %1, %2, %3, %4, 0xb1
yading@10 70 %endmacro
yading@10 71
; BUTTERFLY3V r1, r2, r3, r4, rtmp
; Arguments are register NUMBERS (the macro prepends "m" itself).
; Performs two register-to-register butterflies scaled by the row at
; ps_cos_vec+192:
;   m<r1> = m<r1> + m<r2>
;   m<r2> = (m<r1> - m<r2>) * coef    computed in m<rtmp>, then renamed via SWAP
;   m<r3> = m<r3> + m<r4>
;   m<r4> = (m<r4> - m<r3>) * coef    operand order is the reverse of the first pair
; (right-hand sides use the values on entry). m<rtmp> is clobbered.
; NOTE(review): SWAP is defined outside this file; it is assumed to exchange
; register-name mappings at assembly time without emitting an instruction.
yading@10 72 %macro BUTTERFLY3V 5
yading@10 73 movaps m%5, m%1
yading@10 74 addps m%1, m%2
yading@10 75 subps m%5, m%2
; From here on "%2" names the register holding the difference.
yading@10 76 SWAP %2, %5
yading@10 77 mulps m%2, [ps_cos_vec+192]
yading@10 78 movaps m%5, m%3
yading@10 79 addps m%3, m%4
yading@10 80 subps m%4, m%5
yading@10 81 mulps m%4, [ps_cos_vec+192]
yading@10 82 %endmacro
yading@10 83
; PASS6_AND_PERMUTE
; Scalar final stage. Works on single floats with movss/addss and rewrites the
; buffer at outq in place, touching byte offsets 4..124 (offset 0 is neither
; read nor written here). A few values are copied unchanged from one slot to
; another through the integer register tmpd.
;
; Register state on entry: the low lanes of m0, m1, m4, m5 and m6 are used as
; addends/accumulators before this macro ever loads them, so the caller must
; have set them up. m2, m3 and m7 are loaded from memory before first use.
;
; Because the buffer is updated in place, a slot must not be overwritten
; before its old value has been read: keep the load/store order as is.
yading@10 84 %macro PASS6_AND_PERMUTE 0
; Save out[+4] early; it is written back to out[+64] further down.
yading@10 85 mov tmpd, [outq+4]
yading@10 86 movss m7, [outq+72]
yading@10 87 addss m7, [outq+76]
yading@10 88 movss m3, [outq+56]
yading@10 89 addss m3, [outq+60]
yading@10 90 addss m4, m3
yading@10 91 movss m2, [outq+52]
yading@10 92 addss m2, m3
yading@10 93 movss m3, [outq+104]
yading@10 94 addss m3, [outq+108]
yading@10 95 addss m1, m3
yading@10 96 addss m5, m4
yading@10 97 movss [outq+ 16], m1
yading@10 98 movss m1, [outq+100]
yading@10 99 addss m1, m3
yading@10 100 movss m3, [outq+40]
yading@10 101 movss [outq+ 48], m1
yading@10 102 addss m3, [outq+44]
yading@10 103 movss m1, [outq+100]
yading@10 104 addss m4, m3
yading@10 105 addss m3, m2
yading@10 106 addss m1, [outq+108]
yading@10 107 movss [outq+ 40], m3
yading@10 108 addss m2, [outq+36]
yading@10 109 movss m3, [outq+8]
yading@10 110 movss [outq+ 56], m2
yading@10 111 addss m3, [outq+12]
yading@10 112 movss [outq+ 32], m3
yading@10 113 movss m3, [outq+80]
yading@10 114 movss [outq+ 8], m5
yading@10 115 movss [outq+ 80], m1
yading@10 116 movss m2, [outq+52]
yading@10 117 movss m5, [outq+120]
yading@10 118 addss m5, [outq+124]
yading@10 119 movss m1, [outq+64]
yading@10 120 addss m2, [outq+60]
yading@10 121 addss m0, m5
yading@10 122 addss m5, [outq+116]
yading@10 123 mov [outq+64], tmpd
yading@10 124 addss m6, m0
yading@10 125 addss m1, m6
; Plain copy out[+12] -> out[+96] through the integer register.
yading@10 126 mov tmpd, [outq+12]
yading@10 127 mov [outq+ 96], tmpd
yading@10 128 movss [outq+ 4], m1
yading@10 129 movss m1, [outq+24]
yading@10 130 movss [outq+ 24], m4
yading@10 131 movss m4, [outq+88]
yading@10 132 addss m4, [outq+92]
yading@10 133 addss m3, m4
yading@10 134 addss m4, [outq+84]
; Read out[+108] now; it is stored to out[+112] a few lines below.
yading@10 135 mov tmpd, [outq+108]
yading@10 136 addss m1, [outq+28]
yading@10 137 addss m0, m1
yading@10 138 addss m1, m5
yading@10 139 addss m6, m3
yading@10 140 addss m3, m0
yading@10 141 addss m0, m7
yading@10 142 addss m5, [outq+20]
yading@10 143 addss m7, m1
yading@10 144 movss [outq+ 12], m6
yading@10 145 mov [outq+112], tmpd
yading@10 146 movss m6, [outq+28]
yading@10 147 movss [outq+ 28], m0
yading@10 148 movss m0, [outq+36]
yading@10 149 movss [outq+ 36], m7
yading@10 150 addss m1, m4
yading@10 151 movss m7, [outq+116]
yading@10 152 addss m0, m2
yading@10 153 addss m7, [outq+124]
yading@10 154 movss [outq+ 72], m0
yading@10 155 movss m0, [outq+44]
yading@10 156 addss m2, m0
yading@10 157 movss [outq+ 44], m1
yading@10 158 movss [outq+ 88], m2
yading@10 159 addss m0, [outq+60]
; Plain copy out[+60] -> out[+120]; out[+60] itself is overwritten below.
yading@10 160 mov tmpd, [outq+60]
yading@10 161 mov [outq+120], tmpd
yading@10 162 movss [outq+104], m0
yading@10 163 addss m4, m5
yading@10 164 addss m5, [outq+68]
yading@10 165 movss [outq+52], m4
yading@10 166 movss [outq+60], m5
yading@10 167 movss m4, [outq+68]
yading@10 168 movss m5, [outq+20]
yading@10 169 movss [outq+ 20], m3
yading@10 170 addss m5, m7
yading@10 171 addss m7, m6
yading@10 172 addss m4, m5
yading@10 173 movss m2, [outq+84]
yading@10 174 addss m2, [outq+92]
yading@10 175 addss m5, m2
yading@10 176 movss [outq+ 68], m4
yading@10 177 addss m2, m7
yading@10 178 movss m4, [outq+76]
yading@10 179 movss [outq+ 84], m2
yading@10 180 movss [outq+ 76], m5
yading@10 181 addss m7, m4
yading@10 182 addss m6, [outq+124]
yading@10 183 addss m4, m6
yading@10 184 addss m6, [outq+92]
yading@10 185 movss [outq+100], m4
yading@10 186 movss [outq+108], m6
yading@10 187 movss m6, [outq+92]
yading@10 188 movss [outq+92], m7
yading@10 189 addss m6, [outq+124]
yading@10 190 movss [outq+116], m6
yading@10 191 %endmacro
yading@10 192
; AVX build of dct32_float. Passes 1-5 run on 256-bit registers. The
; intermediate result is then stored to out, vzeroupper is issued, and the
; final pass is done with the scalar PASS6_AND_PERMUTE macro under INIT_XMM.
; The function body is only assembled when HAVE_AVX_EXTERNAL is set.
; vmovaps is used on [inq+0], [inq+64], [outq] and [outq+96], so both buffers
; are expected to meet its alignment requirement for 32-byte accesses.
yading@10 193 INIT_YMM avx
yading@10 194 SECTION_TEXT
yading@10 195 %if HAVE_AVX_EXTERNAL
yading@10 196 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
yading@10 197 cglobal dct32_float, 2,3,8, out, in, tmp
yading@10 198 ; pass 1
yading@10 199 vmovaps m4, [inq+0]
; Build m5 from the last 32 input bytes with the two 16-byte halves exchanged,
; then reverse the four floats inside each half (0x1b): m5 ends up holding the
; floats at in+96..in+127 in fully reversed order.
yading@10 200 vinsertf128 m5, m5, [inq+96], 1
yading@10 201 vinsertf128 m5, m5, [inq+112], 0
yading@10 202 vshufps m5, m5, m5, 0x1b
yading@10 203 BUTTERFLY m4, m5, [ps_cos_vec], m6
yading@10 204
yading@10 205 vmovaps m2, [inq+64]
; Same construction for in+32..in+63: m6 holds those eight floats reversed.
yading@10 206 vinsertf128 m6, m6, [inq+32], 1
yading@10 207 vinsertf128 m6, m6, [inq+48], 0
yading@10 208 vshufps m6, m6, m6, 0x1b
yading@10 209 BUTTERFLY m2, m6, [ps_cos_vec+32], m0
yading@10 210
yading@10 211 ; pass 2
yading@10 212
yading@10 213 BUTTERFLY m5, m6, [ps_cos_vec+64], m0
yading@10 214 BUTTERFLY m4, m2, [ps_cos_vec+64], m7
yading@10 215
yading@10 216
yading@10 217 ; pass 3
; Regroup 128-bit halves across registers (0x31 = both high halves,
; 0x20 = both low halves), then reverse the floats within each half of the
; second operand before the butterfly.
yading@10 218 vperm2f128 m3, m6, m4, 0x31
yading@10 219 vperm2f128 m1, m6, m4, 0x20
yading@10 220 vshufps m3, m3, m3, 0x1b
yading@10 221
yading@10 222 BUTTERFLY m1, m3, [ps_cos_vec+96], m6
yading@10 223
yading@10 224
yading@10 225 vperm2f128 m4, m5, m2, 0x20
yading@10 226 vperm2f128 m5, m5, m2, 0x31
yading@10 227 vshufps m5, m5, m5, 0x1b
yading@10 228
yading@10 229 BUTTERFLY m4, m5, [ps_cos_vec+96], m6
yading@10 230
yading@10 231 ; pass 4
; m6 = sign mask, m2 = coefficients; m7 is scratch for every butterfly.
yading@10 232 vmovaps m6, [ps_p1p1m1m1+0]
yading@10 233 vmovaps m2, [ps_cos_vec+128]
yading@10 234
yading@10 235 BUTTERFLY2 m5, m6, m2, m7
yading@10 236 BUTTERFLY2 m4, m6, m2, m7
yading@10 237 BUTTERFLY2 m1, m6, m2, m7
yading@10 238 BUTTERFLY2 m3, m6, m2, m7
yading@10 239
yading@10 240
yading@10 241 ; pass 5
; Shuffle 0xcc picks lanes 0,3,0,3 of the mask, turning the "+ + - -" pattern
; into "+ - + -" in each 128-bit half.
yading@10 242 vshufps m6, m6, m6, 0xcc
yading@10 243 vmovaps m2, [ps_cos_vec+160]
yading@10 244
yading@10 245 BUTTERFLY3 m5, m6, m2, m7
yading@10 246 BUTTERFLY3 m4, m6, m2, m7
yading@10 247 BUTTERFLY3 m1, m6, m2, m7
yading@10 248 BUTTERFLY3 m3, m6, m2, m7
yading@10 249
; Store the pass-5 result to out. m6 and m0 receive the upper 128-bit halves
; of m3 and m1: PASS6_AND_PERMUTE reads m0, m1, m4, m5 and m6 as XMM
; registers without loading them first.
yading@10 250 vperm2f128 m6, m3, m3, 0x31
yading@10 251 vmovaps [outq], m3
yading@10 252
; m5 and m4 are written in interleaved 16-byte slots (+32/+64 and +48/+80).
yading@10 253 vextractf128 [outq+64], m5, 1
yading@10 254 vextractf128 [outq+32], m5, 0
yading@10 255
yading@10 256 vextractf128 [outq+80], m4, 1
yading@10 257 vextractf128 [outq+48], m4, 0
yading@10 258
yading@10 259 vperm2f128 m0, m1, m1, 0x31
yading@10 260 vmovaps [outq+96], m1
yading@10 261
; Issued before the scalar pass, which is assembled under INIT_XMM.
yading@10 262 vzeroupper
yading@10 263
yading@10 264 ; pass 6, no SIMD...
yading@10 265 INIT_XMM
yading@10 266 PASS6_AND_PERMUTE
yading@10 267 RET
yading@10 268 %endif
yading@10 269
yading@10 270 %if ARCH_X86_64
; x86-64 only: SPILL and UNSPILL are aliases for SWAP, so DCT32_FUNC's
; "SPILL n, slot" exchanges registers n and slot (slots are 8..15) instead of
; going through memory.
; NOTE(review): SWAP is defined outside this file; it is assumed to rename
; registers at assembly time without emitting code.
yading@10 271 %define SPILL SWAP
yading@10 272 %define UNSPILL SWAP
yading@10 273
; PASS5 (x86-64 variant)
; Renumbers the registers left by pass 4 so the eight vectors sit in m8..m15,
; then, for each group of four (m8..m11 and m12..m15), transposes the group
; with TRANSPOSE4x4PS (m0 as scratch) and applies BUTTERFLY3V, followed by the
; packed additions listed below.
; SWAP, PERMUTE and TRANSPOSE4x4PS are not defined in this file.
yading@10 274 %macro PASS5 0
yading@10 275 nop ; FIXME code alignment
yading@10 276 SWAP 5, 8
yading@10 277 SWAP 4, 12
yading@10 278 SWAP 6, 14
yading@10 279 SWAP 7, 13
yading@10 280 SWAP 0, 15
yading@10 281 PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
yading@10 282 TRANSPOSE4x4PS 8, 9, 10, 11, 0
yading@10 283 BUTTERFLY3V 8, 9, 10, 11, 0
yading@10 284 addps m10, m11
yading@10 285 TRANSPOSE4x4PS 12, 13, 14, 15, 0
yading@10 286 BUTTERFLY3V 12, 13, 14, 15, 0
yading@10 287 addps m14, m15
yading@10 288 addps m12, m14
yading@10 289 addps m14, m13
yading@10 290 addps m13, m15
yading@10 291 %endmacro
yading@10 292
; PASS6 (x86-64 variant)
; Final additions and store of the output buffer, working from m8..m15.
; Stores happen in three groups:
;   1. lane 0 of m8..m14 to out+0x00, 0x10, ... 0x60, and all of m15 to
;      out+0x70..0x7f;
;   2. sums of neighbouring lane-1 values to out+0x08, 0x18, ... 0x78;
;   3. the remaining slots out+4, out+12, ... out+116 in the last %rep loop.
yading@10 293 %macro PASS6 0
yading@10 294 SWAP 9, 12
yading@10 295 SWAP 11, 14
; pshuflw with immediate 0xe puts dword 1 of the source into lane 0 of the
; destination, so m0..m7 receive lane 1 of m8..m15.
yading@10 296 movss [outq+0x00], m8
yading@10 297 pshuflw m0, m8, 0xe
yading@10 298 movss [outq+0x10], m9
yading@10 299 pshuflw m1, m9, 0xe
yading@10 300 movss [outq+0x20], m10
yading@10 301 pshuflw m2, m10, 0xe
yading@10 302 movss [outq+0x30], m11
yading@10 303 pshuflw m3, m11, 0xe
yading@10 304 movss [outq+0x40], m12
yading@10 305 pshuflw m4, m12, 0xe
yading@10 306 movss [outq+0x50], m13
yading@10 307 pshuflw m5, m13, 0xe
yading@10 308 movss [outq+0x60], m14
yading@10 309 pshuflw m6, m14, 0xe
; Full 16-byte store on purpose: it is the only store in this macro that
; writes out+0x7c. out+0x74 and out+0x78 are overwritten later.
yading@10 310 movaps [outq+0x70], m15
yading@10 311 pshuflw m7, m15, 0xe
; Chain of adjacent sums: m<k> += m<k+1> for k = 0..6; m7 is stored unchanged.
yading@10 312 addss m0, m1
yading@10 313 addss m1, m2
yading@10 314 movss [outq+0x08], m0
yading@10 315 addss m2, m3
yading@10 316 movss [outq+0x18], m1
yading@10 317 addss m3, m4
yading@10 318 movss [outq+0x28], m2
yading@10 319 addss m4, m5
yading@10 320 movss [outq+0x38], m3
yading@10 321 addss m5, m6
yading@10 322 movss [outq+0x48], m4
yading@10 323 addss m6, m7
yading@10 324 movss [outq+0x58], m5
yading@10 325 movss [outq+0x68], m6
yading@10 326 movss [outq+0x78], m7
yading@10 327
; Renumber registers, then extract the upper lanes: movhlps copies the high
; 64 bits of m1 into the low half of m0, and pshufd with immediate 3 puts
; lane 3 of m1 into its lane 0. The SWAP chains rotate the register names so
; the same two instructions act on a different register pair each time.
yading@10 328 PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
yading@10 329 movhlps m0, m1
yading@10 330 pshufd m1, m1, 3
yading@10 331 SWAP 0, 2, 4, 6, 8, 10, 12, 14
yading@10 332 SWAP 1, 3, 5, 7, 9, 11, 13, 15
yading@10 333 %rep 7
yading@10 334 movhlps m0, m1
yading@10 335 pshufd m1, m1, 3
yading@10 336 addss m15, m1
yading@10 337 SWAP 0, 2, 4, 6, 8, 10, 12, 14
yading@10 338 SWAP 1, 3, 5, 7, 9, 11, 13, 15
yading@10 339 %endrep
; Fifteen stores at out+4, out+12, ... out+116 (i advances by 8): each is the
; sum of the current m0 and m1, after which all sixteen names rotate by one.
yading@10 340 %assign i 4
yading@10 341 %rep 15
yading@10 342 addss m0, m1
yading@10 343 movss [outq+i], m0
yading@10 344 SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
yading@10 345 %assign i i+8
yading@10 346 %endrep
yading@10 347 %endmacro
yading@10 348
yading@10 349 %else ; ARCH_X86_32
; SPILL reg, slot   (x86-32 variant)
; Stores m<reg> to the output buffer, which doubles as scratch space: slot k
; maps to the 16 bytes at out+(k-8)*16. DCT32_FUNC and PASS5 use slots 8..15,
; i.e. out+0 .. out+127. movaps is used, so outq must be 16-byte aligned.
yading@10 350 %macro SPILL 2 ; xmm#, mempos
yading@10 351 movaps [outq+(%2-8)*16], m%1
yading@10 352 %endmacro
; UNSPILL reg, slot   (x86-32 variant)
; Reloads m<reg> from the 16 bytes at out+(slot-8)*16, the location written by
; the matching SPILL. The arguments are register number, then slot number.
yading@10 353 %macro UNSPILL 2
yading@10 354 movaps m%1, [outq+(%2-8)*16]
yading@10 355 %endmacro
yading@10 356
; x86-32: the last pass is the scalar PASS6_AND_PERMUTE macro.
yading@10 357 %define PASS6 PASS6_AND_PERMUTE
; PASS5 (x86-32 variant)
; Applies BUTTERFLY3 to each of the eight vectors produced by pass 4 and
; spills every result to its slot in the output buffer.
;   m2 = coefficients from ps_cos_vec+160
;   m3 = sign mask left by pass 4, reshuffled with 0xcc (lanes 0,3,0,3)
; On exit m0, m1, m4, m5 and m6 still hold the values last spilled to slots
; 15, 14, 11, 10 and 9. PASS6_AND_PERMUTE reads exactly those five registers
; without loading them first.
yading@10 358 %macro PASS5 0
yading@10 359 movaps m2, [ps_cos_vec+160]
yading@10 360 shufps m3, m3, 0xcc
yading@10 361
yading@10 362 BUTTERFLY3 m5, m3, m2, m1
yading@10 363 SPILL 5, 8
yading@10 364
yading@10 365 UNSPILL 1, 9
yading@10 366 BUTTERFLY3 m1, m3, m2, m5
yading@10 367 SPILL 1, 14
yading@10 368
yading@10 369 BUTTERFLY3 m4, m3, m2, m5
yading@10 370 SPILL 4, 12
yading@10 371
yading@10 372 BUTTERFLY3 m7, m3, m2, m5
yading@10 373 SPILL 7, 13
yading@10 374
yading@10 375 UNSPILL 5, 10
yading@10 376 BUTTERFLY3 m5, m3, m2, m7
yading@10 377 SPILL 5, 10
yading@10 378
yading@10 379 UNSPILL 4, 11
yading@10 380 BUTTERFLY3 m4, m3, m2, m7
yading@10 381 SPILL 4, 11
yading@10 382
yading@10 383 BUTTERFLY3 m6, m3, m2, m7
yading@10 384 SPILL 6, 9
yading@10 385
yading@10 386 BUTTERFLY3 m0, m3, m2, m7
yading@10 387 SPILL 0, 15
yading@10 388 %endmacro
yading@10 389 %endif
yading@10 390
yading@10 391
yading@10 392 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
; DCT32_FUNC
; Emits one dct32_float function for whatever instruction set the preceding
; INIT_XMM selected. Arguments: out, in (plus one scratch GPR, tmp).
; Passes 1-4 are written inline, with passes 1 and 2 interleaved. Passes 5 and
; 6 come from the PASS5/PASS6 macros, which differ between x86-64 and x86-32.
; SPILL/UNSPILL park intermediate vectors in "slots" 8..15: registers on
; x86-64, 16-byte chunks of the out buffer on x86-32.
; Input is read with movaps / LOAD_INV at in+0 .. in+112, so in is expected to
; be 16-byte aligned.
yading@10 393 %macro DCT32_FUNC 0
yading@10 394 cglobal dct32_float, 2, 3, 16, out, in, tmp
yading@10 395 ; pass 1
yading@10 396
; Each pass-1 butterfly pairs four floats loaded in order with four floats
; loaded in reversed order (LOAD_INV).
yading@10 397 movaps m0, [inq+0]
yading@10 398 LOAD_INV m1, [inq+112]
yading@10 399 BUTTERFLY m0, m1, [ps_cos_vec], m3
yading@10 400
yading@10 401 movaps m7, [inq+64]
yading@10 402 LOAD_INV m4, [inq+48]
yading@10 403 BUTTERFLY m7, m4, [ps_cos_vec+32], m3
yading@10 404
yading@10 405 ; pass 2
yading@10 406 movaps m2, [ps_cos_vec+64]
yading@10 407 BUTTERFLY m1, m4, m2, m3
yading@10 408 SPILL 1, 11
yading@10 409 SPILL 4, 8
yading@10 410
; Second half of pass 1: the remaining input quarters
; (in+16, in+96, in+80, in+32).
yading@10 411 ; pass 1
yading@10 412 movaps m1, [inq+16]
yading@10 413 LOAD_INV m6, [inq+96]
yading@10 414 BUTTERFLY m1, m6, [ps_cos_vec+16], m3
yading@10 415
yading@10 416 movaps m4, [inq+80]
yading@10 417 LOAD_INV m5, [inq+32]
yading@10 418 BUTTERFLY m4, m5, [ps_cos_vec+48], m3
yading@10 419
yading@10 420 ; pass 2
; m2 still holds ps_cos_vec+64 for this first butterfly.
yading@10 421 BUTTERFLY m0, m7, m2, m3
yading@10 422
yading@10 423 movaps m2, [ps_cos_vec+80]
yading@10 424 BUTTERFLY m6, m5, m2, m3
yading@10 425
yading@10 426 BUTTERFLY m1, m4, m2, m3
yading@10 427
yading@10 428 ; pass 3
; The second operand of each butterfly is lane-reversed (0x1b) first.
yading@10 429 movaps m2, [ps_cos_vec+96]
yading@10 430 shufps m1, m1, 0x1b
yading@10 431 BUTTERFLY m0, m1, m2, m3
yading@10 432 SPILL 0, 15
yading@10 433 SPILL 1, 14
yading@10 434
yading@10 435 UNSPILL 0, 8
yading@10 436 shufps m5, m5, 0x1b
yading@10 437 BUTTERFLY m0, m5, m2, m3
yading@10 438
yading@10 439 UNSPILL 1, 11
yading@10 440 shufps m6, m6, 0x1b
yading@10 441 BUTTERFLY m1, m6, m2, m3
yading@10 442 SPILL 1, 11
yading@10 443
yading@10 444 shufps m4, m4, 0x1b
yading@10 445 BUTTERFLY m7, m4, m2, m3
yading@10 446
yading@10 447 ; pass 4
; m3 = sign mask, m2 = coefficients, m1 = scratch for all eight butterflies.
yading@10 448 movaps m3, [ps_p1p1m1m1+0]
yading@10 449 movaps m2, [ps_cos_vec+128]
yading@10 450
yading@10 451 BUTTERFLY2 m5, m3, m2, m1
yading@10 452
yading@10 453 BUTTERFLY2 m0, m3, m2, m1
yading@10 454 SPILL 0, 9
yading@10 455
yading@10 456 BUTTERFLY2 m6, m3, m2, m1
yading@10 457 SPILL 6, 10
yading@10 458
yading@10 459 UNSPILL 0, 11
yading@10 460 BUTTERFLY2 m0, m3, m2, m1
yading@10 461 SPILL 0, 11
yading@10 462
yading@10 463 BUTTERFLY2 m4, m3, m2, m1
yading@10 464
yading@10 465 BUTTERFLY2 m7, m3, m2, m1
yading@10 466
yading@10 467 UNSPILL 6, 14
yading@10 468 BUTTERFLY2 m6, m3, m2, m1
yading@10 469
yading@10 470 UNSPILL 0, 15
yading@10 471 BUTTERFLY2 m0, m3, m2, m1
yading@10 472
; Architecture-specific tail; see the PASS5/PASS6 definitions.
yading@10 473 PASS5
yading@10 474 PASS6
yading@10 475 RET
yading@10 476 %endmacro
yading@10 477
; LOAD_INV dst, src
; Loads four floats from src into dst in reversed lane order (shuffle 0x1b).
;   SSE2: one pshufd taking src directly as its source operand.
;   SSE:  movaps followed by an in-register shufps.
; It is defined after the DCT32_FUNC macro body that uses it, but before
; DCT32_FUNC is first invoked.
yading@10 478 %macro LOAD_INV 2
yading@10 479 %if cpuflag(sse2)
yading@10 480 pshufd %1, %2, 0x1b
yading@10 481 %elif cpuflag(sse)
yading@10 482 movaps %1, %2
yading@10 483 shufps %1, %1, 0x1b
yading@10 484 %endif
yading@10 485 %endmacro
yading@10 486
; Instantiate the XMM implementation twice, once per instruction set. The
; cpuflag() tests in LOAD_INV and BUTTERFLY0 pick the matching code paths.
yading@10 487 INIT_XMM sse
yading@10 488 DCT32_FUNC
yading@10 489 INIT_XMM sse2
yading@10 490 DCT32_FUNC