annotate ffmpeg/libavcodec/x86/imdct36.asm @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 ;******************************************************************************
yading@10 2 ;* 36 point SSE-optimized IMDCT transform
yading@10 3 ;* Copyright (c) 2011 Vitor Sessak
yading@10 4 ;*
yading@10 5 ;* This file is part of FFmpeg.
yading@10 6 ;*
yading@10 7 ;* FFmpeg is free software; you can redistribute it and/or
yading@10 8 ;* modify it under the terms of the GNU Lesser General Public
yading@10 9 ;* License as published by the Free Software Foundation; either
yading@10 10 ;* version 2.1 of the License, or (at your option) any later version.
yading@10 11 ;*
yading@10 12 ;* FFmpeg is distributed in the hope that it will be useful,
yading@10 13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 15 ;* Lesser General Public License for more details.
yading@10 16 ;*
yading@10 17 ;* You should have received a copy of the GNU Lesser General Public
yading@10 18 ;* License along with FFmpeg; if not, write to the Free Software
yading@10 19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 20 ;******************************************************************************
yading@10 21
yading@10 22 %include "libavutil/x86/x86util.asm"
yading@10 23
; Read-only constant tables used by the transform macros below.
yading@10 24 SECTION_RODATA
yading@10 25
yading@10 26 align 16
; Lane masks for andps: a ~0 dword keeps that lane, a 0 dword clears it.
; They are used in DEFINE_IMDCT to drop lanes from the shifted copies that
; are added back onto the input.
yading@10 27 ps_mask: dd 0, ~0, ~0, ~0
yading@10 28 ps_mask2: dd 0, ~0, 0, ~0
yading@10 29 ps_mask3: dd 0, 0, 0, ~0
yading@10 30 ps_mask4: dd 0, ~0, 0, 0
yading@10 31
; Coefficient vectors multiplied into the working registers while DEFINE_IMDCT
; populates tmp[]. Each coefficient occupies two adjacent lanes.
yading@10 32 ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038
yading@10 33 ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038
yading@10 34 ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433
yading@10 35 ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038
yading@10 36 ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
yading@10 37 ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097
yading@10 38 ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097
yading@10 39
; IEEE-754 sign-bit masks for xorps: a lane holding 0x80000000 gets negated,
; a lane holding 0 is left alone (p1 = keep sign, m1 = flip sign).
yading@10 40 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
yading@10 41 ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
yading@10 42
; Per-row multipliers for the final butterfly stage. BUTTERF reads the rows at
; byte offsets 0, 16, 32 and 48; DEFINE_IMDCT reads the row at offset 64
; directly for the last value pair.
yading@10 43 ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461
yading@10 44 dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349
yading@10 45 dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
yading@10 46 dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
yading@10 47 dd 1.0, 0.70710678118654752439, 0.0, 0.0
yading@10 48
; Same table with the odd lanes of the first four rows negated. The SSE3 path
; of BUTTERF uses addsubps (subtract in even lanes, add in odd lanes) instead
; of the xorps/addps pair, and the flipped signs compensate for that.
yading@10 49 ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
yading@10 50 dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
yading@10 51 dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
yading@10 52 dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
yading@10 53 dd 1.0, 0.70710678118654752439, 0.0, 0.0
yading@10 54
; Constants for four_imdct36_float. Every value is replicated into all four
; lanes (times 4), and row k is addressed as [costabs + 16*k]. Rows 0-8 are
; used while building the intermediate sums, rows 9-17 as the final per-row
; scale factors (their magnitudes match the non-unit entries of ps_cosh).
yading@10 55 costabs: times 4 dd 0.98480773
yading@10 56 times 4 dd 0.93969262
yading@10 57 times 4 dd 0.86602539
yading@10 58 times 4 dd -0.76604444
yading@10 59 times 4 dd -0.64278764
yading@10 60 times 4 dd 0.50000000
yading@10 61 times 4 dd -0.50000000
yading@10 62 times 4 dd -0.34202015
yading@10 63 times 4 dd -0.17364818
yading@10 64 times 4 dd 0.50190992
yading@10 65 times 4 dd 0.51763808
yading@10 66 times 4 dd 0.55168896
yading@10 67 times 4 dd 0.61038726
yading@10 68 times 4 dd 0.70710677
yading@10 69 times 4 dd 0.87172341
yading@10 70 times 4 dd 1.18310082
yading@10 71 times 4 dd 1.93185163
yading@10 72 times 4 dd 5.73685646
yading@10 73
; SBLIMIT scales the output offsets in imdct36_float: consecutive results are
; stored 4*SBLIMIT bytes (SBLIMIT floats) apart.
yading@10 74 %define SBLIMIT 32
yading@10 75 SECTION_TEXT
yading@10 76
; PSHUFD dst, src, imm8
; Permute the four dwords of src into dst according to imm8.
; Uses the integer pshufd when SSE2 is available and AVX is not; otherwise
; uses shufps with src as both inputs, which yields the same permutation.
; NOTE(review): the AVX exclusion presumably keeps the AVX build on
; float-domain three-operand instructions - confirm against upstream history.
yading@10 77 %macro PSHUFD 3
yading@10 78 %if cpuflag(sse2) && notcpuflag(avx)
yading@10 79 pshufd %1, %2, %3
yading@10 80 %else
yading@10 81 shufps %1, %2, %2, %3
yading@10 82 %endif
yading@10 83 %endmacro
yading@10 84
yading@10 85 ; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
yading@10 86 ; output %1={x3,x4,y1,y2}
; i.e. %1 = high half of %2 followed by low half of %3.
; On the non-AVX path %1 is written in two steps (high half first), so %1
; must not be the same register as %2: the first step would overwrite x3,x4
; before they are read.
yading@10 87 %macro BUILDINVHIGHLOW 3
yading@10 88 %if cpuflag(avx)
yading@10 89 shufps %1, %2, %3, 0x4e
yading@10 90 %else
yading@10 91 movlhps %1, %3
yading@10 92 movhlps %1, %2
yading@10 93 %endif
yading@10 94 %endmacro
yading@10 95
yading@10 96 ; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
yading@10 97 ; output %1={x4,y1,y2,y3}
; i.e. the last element of %2 followed by the first three of %3.
; SSSE3 and later do this with one palignr (byte shift of 12 across %3:%2).
; Older targets first build {x3,x4,y1,y2} and then pick lanes 1,2 from it and
; lanes 1,2 from %3 (shufps 0x99).
yading@10 98 %macro ROTLEFT 3
yading@10 99 %if cpuflag(ssse3)
yading@10 100 palignr %1, %3, %2, 12
yading@10 101 %else
yading@10 102 BUILDINVHIGHLOW %1, %2, %3
yading@10 103 shufps %1, %1, %3, 0x99
yading@10 104 %endif
yading@10 105 %endmacro
yading@10 106
; INVERTHL dst, src
; dst = src with its 64-bit halves swapped: {x1,x2,x3,x4} -> {x3,x4,x1,x2}.
; The plain-SSE fallback writes dst in two steps while still reading src, so
; dst and src must be different registers on that path.
yading@10 107 %macro INVERTHL 2
yading@10 108 %if cpuflag(sse2)
yading@10 109 PSHUFD %1, %2, 0x4e
yading@10 110 %else
yading@10 111 movhlps %1, %2
yading@10 112 movlhps %1, %2
yading@10 113 %endif
yading@10 114 %endmacro
yading@10 115
; BUTTERF reg, scratch, offset
; Two-stage butterfly on %1 = {a0,a1,a2,a3}; %2 is clobbered as scratch.
;   stage 1: %1 = {a0+a2, a1+a3, a0-a2, a1-a3}
;   scale  : %1 *= row at byte offset %3 of the cosh table -> {b0,b1,b2,b3}
;   stage 2: %1 = {b0+b1, b0-b1, b2+b3, b2-b3}
; The SSE3 path gets the same stage-2 result from addsubps together with
; ps_cosh_sse3, whose odd lanes are pre-negated; the other path negates the
; odd lanes with xorps before adding the pair-swapped copy.
yading@10 116 %macro BUTTERF 3
yading@10 117 INVERTHL %2, %1
yading@10 118 xorps %1, [ps_p1p1m1m1]
yading@10 119 addps %1, %2
yading@10 120 %if cpuflag(sse3)
yading@10 121 mulps %1, %1, [ps_cosh_sse3 + %3]
yading@10 122 PSHUFD %2, %1, 0xb1
yading@10 123 addsubps %1, %1, %2
yading@10 124 %else
yading@10 125 mulps %1, [ps_cosh + %3]
yading@10 126 PSHUFD %2, %1, 0xb1
yading@10 127 xorps %1, [ps_p1m1p1m1]
yading@10 128 addps %1, %2
yading@10 129 %endif
yading@10 130 %endmacro
yading@10 131
; STORE src, scratch, base, stride
; Scatter the four floats of %1 to [%3], [%3 + %4], [%3 + 2*%4], [%3 + 3*%4].
; %2 is clobbered, and %1 is left pair-swapped ({a1,a0,a3,a2}) on exit.
; %3 and %4 are substituted textually into the address expressions.
yading@10 132 %macro STORE 4
yading@10 133 movhlps %2, %1
yading@10 134 movss [%3 ], %1
yading@10 135 movss [%3 + 2*%4], %2
yading@10 136 shufps %1, %1, 0xb1
yading@10 137 movss [%3 + %4], %1
yading@10 138 movhlps %2, %1
yading@10 139 movss [%3 + 3*%4], %2
yading@10 140 %endmacro
yading@10 141
; LOAD dst, scratch, base, stride
; Gather into %1 the first float found at each of [%3], [%3 + %4],
; [%3 + 2*%4] and [%3 + 3*%4]. %2 is clobbered.
; Each address is read as 8 bytes (two floats); the second float of every
; pair is discarded by the final shufps 0x88.
yading@10 142 %macro LOAD 4
yading@10 143 movlps %1, [%3 ]
yading@10 144 movhps %1, [%3 + %4]
yading@10 145 movlps %2, [%3 + 2*%4]
yading@10 146 movhps %2, [%3 + 3*%4]
yading@10 147 shufps %1, %2, 0x88
yading@10 148 %endmacro
yading@10 149
; LOADA64 dst, addr
; Load 16 bytes from addr into dst without requiring 16-byte alignment.
; AVX builds use a single unaligned move; other builds use two 8-byte loads.
; NOTE(review): the name suggests addr is only guaranteed 64-bit aligned -
; confirm against the caller's buffer alignment.
yading@10 150 %macro LOADA64 2
yading@10 151 %if cpuflag(avx)
yading@10 152 movu %1, [%2]
yading@10 153 %else
yading@10 154 movlps %1, [%2]
yading@10 155 movhps %1, [%2 + 8]
yading@10 156 %endif
yading@10 157 %endmacro
yading@10 158
; DEFINE_IMDCT
; Emits imdct36_float for the instruction set selected by the preceding
; INIT_XMM. Arguments, as named by cglobal: out, buf, in, win.
;   in  : 18 floats are read (four 16-byte loads plus one 8-byte load).
;   win : read as 16-byte vectors at float offsets 0, 4, ..., 36. They are
;         used as mulps memory operands, which need 16-byte alignment in the
;         non-AVX builds.
;   buf : 18 entries at a stride of 16 bytes (every 4th float). Each entry is
;         read, added to the windowed result, and later overwritten.
;   out : 18 results are stored 4*SBLIMIT bytes apart.
; For k = 0..17, with float indices:
;   out[k*SBLIMIT] = r1[k] * win[k] + buf[4*k]
;   buf[4*k]       = r2[k] * win[20 + k]
; where r1/r2 are the two halves of the transform result.
; Uses m0-m7, plus m8 on x86-64 only.
yading@10 159 %macro DEFINE_IMDCT 0
yading@10 160 cglobal imdct36_float, 4,4,9, out, buf, in, win
yading@10 161
yading@10 162 ; for(i=17;i>=1;i--) in[i] += in[i-1];
yading@10 163 LOADA64 m0, inq
yading@10 164 LOADA64 m1, inq + 16
yading@10 165
yading@10 166 ROTLEFT m5, m0, m1
yading@10 167
yading@10 168 PSHUFD m6, m0, 0x93
yading@10 169 andps m6, m6, [ps_mask]
yading@10 170 addps m0, m0, m6
yading@10 171
yading@10 172 LOADA64 m2, inq + 32
yading@10 173
yading@10 174 ROTLEFT m7, m1, m2
yading@10 175
yading@10 176 addps m1, m1, m5
yading@10 177 LOADA64 m3, inq + 48
yading@10 178
yading@10 179 ROTLEFT m5, m2, m3
yading@10 180
; m4 = {in[16], in[17], 0, 0}: cleared first so the upper two lanes are zero.
yading@10 181 xorps m4, m4, m4
yading@10 182 movlps m4, [inq+64]
yading@10 183 BUILDINVHIGHLOW m6, m3, m4
yading@10 184 shufps m6, m6, m4, 0xa9
yading@10 185
yading@10 186 addps m4, m4, m6
yading@10 187 addps m2, m2, m7
yading@10 188 addps m3, m3, m5
yading@10 189
yading@10 190 ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
yading@10 191 movlhps m5, m5, m0
yading@10 192 andps m5, m5, [ps_mask3]
yading@10 193
yading@10 194 BUILDINVHIGHLOW m7, m0, m1
yading@10 195 andps m7, m7, [ps_mask2]
yading@10 196
yading@10 197 addps m0, m0, m5
yading@10 198
yading@10 199 BUILDINVHIGHLOW m6, m1, m2
yading@10 200 andps m6, m6, [ps_mask2]
yading@10 201
yading@10 202 addps m1, m1, m7
yading@10 203
yading@10 204 BUILDINVHIGHLOW m7, m2, m3
yading@10 205 andps m7, m7, [ps_mask2]
yading@10 206
yading@10 207 addps m2, m2, m6
yading@10 208
yading@10 209 movhlps m6, m6, m3
yading@10 210 andps m6, m6, [ps_mask4]
yading@10 211
yading@10 212 addps m3, m3, m7
yading@10 213 addps m4, m4, m6
yading@10 214
yading@10 215 ; Populate tmp[]
yading@10 216 movlhps m6, m1, m5 ; zero out high values
yading@10 217 subps m6, m6, m4
yading@10 218
yading@10 219 subps m5, m0, m3
yading@10 220
; x86-64 parks m0 - m3 in m8 so it can be reused further down. x86-32 has no
; m8, so that build recomputes the difference in the %else branch below.
yading@10 221 %if ARCH_X86_64
yading@10 222 SWAP m5, m8
yading@10 223 %endif
yading@10 224
yading@10 225 mulps m7, m2, [ps_val1]
yading@10 226
yading@10 227 %if ARCH_X86_64
yading@10 228 mulps m5, m8, [ps_val2]
yading@10 229 %else
yading@10 230 mulps m5, m5, [ps_val2]
yading@10 231 %endif
yading@10 232 addps m7, m7, m5
yading@10 233
yading@10 234 mulps m5, m6, [ps_val1]
yading@10 235 subps m7, m7, m5
yading@10 236
yading@10 237 %if ARCH_X86_64
yading@10 238 SWAP m5, m8
yading@10 239 %else
yading@10 240 subps m5, m0, m3
yading@10 241 %endif
yading@10 242
yading@10 243 subps m5, m5, m6
yading@10 244 addps m5, m5, m2
yading@10 245
yading@10 246 shufps m6, m4, m3, 0xe4
yading@10 247 subps m6, m6, m2
yading@10 248 mulps m6, m6, [ps_val3]
yading@10 249
yading@10 250 addps m4, m4, m1
yading@10 251 mulps m4, m4, [ps_val4]
yading@10 252
yading@10 253 shufps m1, m1, m0, 0xe4
yading@10 254 addps m1, m1, m2
yading@10 255 mulps m1, m1, [ps_val5]
yading@10 256
yading@10 257 mulps m3, m3, [ps_val6]
yading@10 258 mulps m0, m0, [ps_val7]
yading@10 259 addps m0, m0, m3
yading@10 260
yading@10 261 xorps m2, m1, [ps_p1p1m1m1]
yading@10 262 subps m2, m2, m4
yading@10 263 addps m2, m2, m0
yading@10 264
yading@10 265 addps m3, m4, m0
yading@10 266 subps m3, m3, m6
yading@10 267 xorps m3, m3, [ps_p1p1m1m1]
yading@10 268
yading@10 269 shufps m0, m0, m4, 0xe4
yading@10 270 subps m0, m0, m1
yading@10 271 addps m0, m0, m6
yading@10 272
yading@10 273 BUILDINVHIGHLOW m4, m2, m3
yading@10 274 shufps m3, m3, m2, 0x4e
yading@10 275
yading@10 276 ; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5}
yading@10 277
yading@10 278 BUTTERF m0, m1, 0
yading@10 279 BUTTERF m7, m2, 16
yading@10 280 BUTTERF m3, m6, 32
yading@10 281 BUTTERF m4, m1, 48
yading@10 282
; Last pair: only lanes 0 and 1 of m5 carry data, so only the second BUTTERF
; stage (scale by the ps_cosh row at +64, then sum/difference) is applied.
yading@10 283 mulps m5, m5, [ps_cosh + 64]
yading@10 284 PSHUFD m1, m5, 0xe1
yading@10 285 xorps m5, m5, [ps_p1m1p1m1]
yading@10 286 addps m5, m5, m1
yading@10 287
yading@10 288 ; permutates:
yading@10 289 ; m0 0 1 2 3 => 2 6 10 14 m1
yading@10 290 ; m7 4 5 6 7 => 3 7 11 15 m2
yading@10 291 ; m3 8 9 10 11 => 17 13 9 5 m3
yading@10 292 ; m4 12 13 14 15 => 16 12 8 4 m5
yading@10 293 ; m5 16 17 xx xx => 0 1 xx xx m0
yading@10 294
yading@10 295 unpckhps m1, m0, m7
yading@10 296 unpckhps m6, m3, m4
yading@10 297 movhlps m2, m6, m1
yading@10 298 movlhps m1, m1, m6
yading@10 299
yading@10 300 unpcklps m5, m5, m4
yading@10 301 unpcklps m3, m3, m7
yading@10 302 movhlps m4, m3, m5
yading@10 303 movlhps m5, m5, m3
yading@10 304 SWAP m4, m3
yading@10 305 ; permutation done
yading@10 306
; First half: out[k*SBLIMIT] = value * win[k] + buf[4*k].
; Entries k = 16 and 17 are handled with scalar stores first.
yading@10 307 PSHUFD m6, m2, 0xb1
yading@10 308 movss m4, [bufq + 4*68]
yading@10 309 movss m7, [bufq + 4*64]
yading@10 310 unpcklps m7, m7, m4
yading@10 311 mulps m6, m6, [winq + 16*4]
yading@10 312 addps m6, m6, m7
yading@10 313 movss [outq + 64*SBLIMIT], m6
yading@10 314 shufps m6, m6, m6, 0xb1
yading@10 315 movss [outq + 68*SBLIMIT], m6
yading@10 316
yading@10 317 mulps m6, m3, [winq + 4*4]
yading@10 318 LOAD m4, m7, bufq + 4*16, 16
yading@10 319 addps m6, m6, m4
yading@10 320 STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT
yading@10 321
yading@10 322 shufps m4, m0, m3, 0xb5
yading@10 323 mulps m4, m4, [winq + 8*4]
yading@10 324 LOAD m7, m6, bufq + 4*32, 16
yading@10 325 addps m4, m4, m7
yading@10 326 STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT
yading@10 327
yading@10 328 shufps m3, m3, m2, 0xb1
yading@10 329 mulps m3, m3, [winq + 12*4]
yading@10 330 LOAD m7, m6, bufq + 4*48, 16
yading@10 331 addps m3, m3, m7
yading@10 332 STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT
yading@10 333
yading@10 334 mulps m2, m2, [winq]
yading@10 335 LOAD m6, m7, bufq, 16
yading@10 336 addps m2, m2, m6
yading@10 337 STORE m2, m7, outq, 4*SBLIMIT
yading@10 338
; Second half: overwrite buf[4*k] with value * win[20 + k].
; NOTE(review): presumably the overlap data for the next call - confirm.
yading@10 339 mulps m4, m1, [winq + 20*4]
yading@10 340 STORE m4, m7, bufq, 16
yading@10 341
yading@10 342 mulps m3, m5, [winq + 24*4]
yading@10 343 STORE m3, m7, bufq + 4*16, 16
yading@10 344
yading@10 345 shufps m0, m0, m5, 0xb0
yading@10 346 mulps m0, m0, [winq + 28*4]
yading@10 347 STORE m0, m7, bufq + 4*32, 16
yading@10 348
yading@10 349 shufps m5, m5, m1, 0xb1
yading@10 350 mulps m5, m5, [winq + 32*4]
yading@10 351 STORE m5, m7, bufq + 4*48, 16
yading@10 352
yading@10 353 shufps m1, m1, m1, 0xb1
yading@10 354 mulps m1, m1, [winq + 36*4]
yading@10 355 movss [bufq + 4*64], m1
yading@10 356 shufps m1, m1, 0xb1
yading@10 357 movss [bufq + 4*68], m1
yading@10 358 RET
yading@10 359 %endmacro
yading@10 360
; Instantiate imdct36_float once per instruction set. Each INIT_XMM selects
; the cpuflags that the %if cpuflag(...) tests inside the macros above check,
; so every expansion picks its own instruction variants.
; NOTE(review): INIT_XMM comes from the included x86 macro headers, which
; presumably also give each copy a distinct symbol suffix - confirm there.
yading@10 361 INIT_XMM sse
yading@10 362 DEFINE_IMDCT
yading@10 363
yading@10 364 INIT_XMM sse2
yading@10 365 DEFINE_IMDCT
yading@10 366
yading@10 367 INIT_XMM sse3
yading@10 368 DEFINE_IMDCT
yading@10 369
yading@10 370 INIT_XMM ssse3
yading@10 371 DEFINE_IMDCT
yading@10 372
; The AVX copy is only assembled when the build defines HAVE_AVX_EXTERNAL.
yading@10 373 %if HAVE_AVX_EXTERNAL
yading@10 374 INIT_XMM avx
yading@10 375 DEFINE_IMDCT
yading@10 376 %endif
yading@10 377
; Register-spill helpers for four_imdct36_float, which uses logical registers
; 8-15 in addition to m0-m7.
;   x86-64: SPILL and UNSPILL are both SWAP, so a "spill" just exchanges the
;           two register names; SPILLED(x) is the register m<x> itself.
;   x86-32: SPILLED(x) is a 16-byte slot at tmpq + (x-8)*16 + 32*4. For
;           x = 8..15 that covers bytes 128..255 of the tmp buffer. SPILL
;           stores a register to its slot and UNSPILL reloads it, both with
;           aligned moves.
yading@10 378 INIT_XMM sse
yading@10 379
yading@10 380 %if ARCH_X86_64
yading@10 381 %define SPILL SWAP
yading@10 382 %define UNSPILL SWAP
yading@10 383 %define SPILLED(x) m %+ x
yading@10 384 %else
yading@10 385 %define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
yading@10 386 %macro SPILL 2 ; xmm#, mempos
yading@10 387 movaps SPILLED(%2), m%1
yading@10 388 %endmacro
yading@10 389 %macro UNSPILL 2
yading@10 390 movaps m%1, SPILLED(%2)
yading@10 391 %endmacro
yading@10 392 %endif
yading@10 393
; DEFINE_FOUR_IMDCT
; Emits four_imdct36_float for the instruction set selected by the preceding
; INIT_XMM. It handles four input blocks at once, one block per SIMD lane.
; Arguments, as named by cglobal: out, buf, in, win, tmp.
;   in  : four blocks located 72 bytes (18 floats) apart. Blocks at +0 and
;         +2*72 are read with aligned moves, blocks at +72 and +3*72 with
;         unaligned moves (72 is not a multiple of 16).
;   win : read as 16-byte vectors; the first-half window for row r is at
;         float offset 4*r, the second-half window at float offset 80 + 4*r.
;   buf : one 16-byte vector per row at float offset 4*r. It is added to the
;         first-half result, then overwritten with the second-half result.
;   out : one 16-byte vector per row at byte offset 128*r (r = 0..17).
;   tmp : scratch memory. Bytes 0..127 are used directly; on x86-32 the
;         SPILLED slots additionally use bytes 128..255.
; All [tmpq], [bufq] and [outq] accesses use mova or are arithmetic memory
; operands.
; NOTE(review): those buffers are presumably 16-byte aligned by the caller -
; confirm.
yading@10 394 %macro DEFINE_FOUR_IMDCT 0
yading@10 395 cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
; Load in[16] and in[17] of each block, then de-interleave them:
; m0 = in[16] of blocks 0..3, m5 = in[17] of blocks 0..3.
yading@10 396 movlps m0, [inq+64]
yading@10 397 movhps m0, [inq+64 + 72]
yading@10 398 movlps m3, [inq+64 + 2*72]
yading@10 399 movhps m3, [inq+64 + 3*72]
yading@10 400
yading@10 401 shufps m5, m0, m3, 0xdd
yading@10 402 shufps m0, m0, m3, 0x88
yading@10 403
; in[12..15] of the four blocks.
; NOTE(review): TRANSPOSE4x4PS is defined in the included x86util.asm and
; presumably leaves one coefficient index per register with one block per
; lane - confirm.
yading@10 404 mova m1, [inq+48]
yading@10 405 movu m6, [inq+48 + 72]
yading@10 406 mova m7, [inq+48 + 2*72]
yading@10 407 movu m3, [inq+48 + 3*72]
yading@10 408
yading@10 409 TRANSPOSE4x4PS 1, 6, 7, 3, 4
yading@10 410
yading@10 411 addps m4, m6, m7
yading@10 412 mova [tmpq+4*28], m4
yading@10 413
yading@10 414 addps m7, m3
yading@10 415 addps m6, m1
yading@10 416 addps m3, m0
yading@10 417 addps m0, m5
yading@10 418 addps m0, m7
yading@10 419 addps m7, m6
yading@10 420 mova [tmpq+4*12], m7
yading@10 421 SPILL 3, 12
yading@10 422
; in[8..11] of the four blocks.
yading@10 423 mova m4, [inq+32]
yading@10 424 movu m5, [inq+32 + 72]
yading@10 425 mova m2, [inq+32 + 2*72]
yading@10 426 movu m7, [inq+32 + 3*72]
yading@10 427
yading@10 428 TRANSPOSE4x4PS 4, 5, 2, 7, 3
yading@10 429
yading@10 430 addps m1, m7
yading@10 431 SPILL 1, 11
yading@10 432
yading@10 433 addps m3, m5, m2
yading@10 434 SPILL 3, 13
yading@10 435
yading@10 436 addps m7, m2
yading@10 437 addps m5, m4
yading@10 438 addps m6, m7
yading@10 439 mova [tmpq], m6
yading@10 440 addps m7, m5
yading@10 441 mova [tmpq+4*16], m7
yading@10 442
; in[4..7] of the four blocks.
yading@10 443 mova m2, [inq+16]
yading@10 444 movu m7, [inq+16 + 72]
yading@10 445 mova m1, [inq+16 + 2*72]
yading@10 446 movu m6, [inq+16 + 3*72]
yading@10 447
yading@10 448 TRANSPOSE4x4PS 2, 7, 1, 6, 3
yading@10 449
yading@10 450 addps m4, m6
yading@10 451 addps m6, m1
yading@10 452 addps m1, m7
yading@10 453 addps m7, m2
yading@10 454 addps m5, m6
yading@10 455 SPILL 5, 15
yading@10 456 addps m6, m7
yading@10 457 mulps m6, [costabs + 16*2]
yading@10 458 mova [tmpq+4*8], m6
yading@10 459 SPILL 1, 10
yading@10 460 SPILL 0, 14
yading@10 461
; in[0..3] of the four blocks.
yading@10 462 mova m1, [inq]
yading@10 463 movu m6, [inq + 72]
yading@10 464 mova m3, [inq + 2*72]
yading@10 465 movu m5, [inq + 3*72]
yading@10 466
yading@10 467 TRANSPOSE4x4PS 1, 6, 3, 5, 0
yading@10 468
yading@10 469 addps m2, m5
yading@10 470 addps m5, m3
yading@10 471 addps m7, m5
yading@10 472 addps m3, m6
yading@10 473 addps m6, m1
yading@10 474 SPILL 7, 8
yading@10 475 addps m5, m6
yading@10 476 SPILL 6, 9
; All inputs are loaded and pre-summed. The rest combines them with the
; costabs constants; intermediates live in registers, SPILLED slots and tmp.
yading@10 477 addps m6, m4, SPILLED(12)
yading@10 478 subps m6, m2
yading@10 479 UNSPILL 7, 11
yading@10 480 SPILL 5, 11
yading@10 481 subps m5, m1, m7
yading@10 482 mulps m7, [costabs + 16*5]
yading@10 483 addps m7, m1
yading@10 484 mulps m0, m6, [costabs + 16*6]
yading@10 485 addps m0, m5
yading@10 486 mova [tmpq+4*24], m0
yading@10 487 addps m6, m5
yading@10 488 mova [tmpq+4*4], m6
yading@10 489 addps m6, m4, m2
yading@10 490 mulps m6, [costabs + 16*1]
yading@10 491 subps m4, SPILLED(12)
yading@10 492 mulps m4, [costabs + 16*8]
yading@10 493 addps m2, SPILLED(12)
yading@10 494 mulps m2, [costabs + 16*3]
yading@10 495 subps m5, m7, m6
yading@10 496 subps m5, m2
yading@10 497 addps m6, m7
yading@10 498 addps m6, m4
yading@10 499 addps m7, m2
yading@10 500 subps m7, m4
yading@10 501 mova [tmpq+4*20], m7
yading@10 502 mova m2, [tmpq+4*28]
yading@10 503 mova [tmpq+4*28], m5
yading@10 504 UNSPILL 7, 13
yading@10 505 subps m5, m7, m2
yading@10 506 mulps m5, [costabs + 16*7]
yading@10 507 UNSPILL 1, 10
yading@10 508 mulps m1, [costabs + 16*2]
yading@10 509 addps m4, m3, m2
yading@10 510 mulps m4, [costabs + 16*4]
yading@10 511 addps m2, m7
yading@10 512 addps m7, m3
yading@10 513 mulps m7, [costabs]
yading@10 514 subps m3, m2
yading@10 515 mulps m3, [costabs + 16*2]
yading@10 516 addps m2, m7, m5
yading@10 517 addps m2, m1
yading@10 518 SPILL 2, 10
yading@10 519 addps m7, m4
yading@10 520 subps m7, m1
yading@10 521 SPILL 7, 12
yading@10 522 subps m5, m4
yading@10 523 subps m5, m1
yading@10 524 UNSPILL 0, 14
yading@10 525 SPILL 5, 13
yading@10 526 addps m1, m0, SPILLED(15)
yading@10 527 subps m1, SPILLED(8)
yading@10 528 mova m4, [costabs + 16*5]
yading@10 529 mulps m4, [tmpq]
yading@10 530 UNSPILL 2, 9
yading@10 531 addps m4, m2
yading@10 532 subps m2, [tmpq]
yading@10 533 mulps m5, m1, [costabs + 16*6]
yading@10 534 addps m5, m2
yading@10 535 SPILL 5, 9
yading@10 536 addps m2, m1
yading@10 537 SPILL 2, 14
yading@10 538 UNSPILL 5, 15
yading@10 539 subps m7, m5, m0
yading@10 540 addps m5, SPILLED(8)
yading@10 541 mulps m5, [costabs + 16*1]
yading@10 542 mulps m7, [costabs + 16*8]
yading@10 543 addps m0, SPILLED(8)
yading@10 544 mulps m0, [costabs + 16*3]
yading@10 545 subps m2, m4, m5
yading@10 546 subps m2, m0
yading@10 547 SPILL 2, 15
yading@10 548 addps m5, m4
yading@10 549 addps m5, m7
yading@10 550 addps m4, m0
yading@10 551 subps m4, m7
yading@10 552 SPILL 4, 8
yading@10 553 mova m7, [tmpq+4*16]
yading@10 554 mova m2, [tmpq+4*12]
yading@10 555 addps m0, m7, m2
yading@10 556 subps m0, SPILLED(11)
yading@10 557 mulps m0, [costabs + 16*2]
yading@10 558 addps m4, m7, SPILLED(11)
yading@10 559 mulps m4, [costabs]
yading@10 560 subps m7, m2
yading@10 561 mulps m7, [costabs + 16*7]
yading@10 562 addps m2, SPILLED(11)
yading@10 563 mulps m2, [costabs + 16*4]
yading@10 564 addps m1, m7, [tmpq+4*8]
yading@10 565 addps m1, m4
yading@10 566 addps m4, m2
yading@10 567 subps m4, [tmpq+4*8]
yading@10 568 SPILL 4, 11
yading@10 569 subps m7, m2
yading@10 570 subps m7, [tmpq+4*8]
; Output stage. Each group scales by one of costabs rows 9-17, forms a sum and
; a difference, and handles two output rows. For each row the first-half
; window product plus buf goes to out, and the second-half window product
; replaces buf.
yading@10 571 addps m4, m6, SPILLED(10)
yading@10 572 subps m6, SPILLED(10)
yading@10 573 addps m2, m5, m1
yading@10 574 mulps m2, [costabs + 16*9]
yading@10 575 subps m5, m1
yading@10 576 mulps m5, [costabs + 16*17]
yading@10 577 subps m1, m4, m2
yading@10 578 addps m4, m2
yading@10 579 mulps m2, m1, [winq+4*36]
yading@10 580 addps m2, [bufq+4*36]
yading@10 581 mova [outq+1152], m2
yading@10 582 mulps m1, [winq+4*32]
yading@10 583 addps m1, [bufq+4*32]
yading@10 584 mova [outq+1024], m1
yading@10 585 mulps m1, m4, [winq+4*116]
yading@10 586 mova [bufq+4*36], m1
yading@10 587 mulps m4, [winq+4*112]
yading@10 588 mova [bufq+4*32], m4
yading@10 589 addps m2, m6, m5
yading@10 590 subps m6, m5
yading@10 591 mulps m1, m6, [winq+4*68]
yading@10 592 addps m1, [bufq+4*68]
yading@10 593 mova [outq+2176], m1
yading@10 594 mulps m6, [winq]
yading@10 595 addps m6, [bufq]
yading@10 596 mova [outq], m6
yading@10 597 mulps m1, m2, [winq+4*148]
yading@10 598 mova [bufq+4*68], m1
yading@10 599 mulps m2, [winq+4*80]
yading@10 600 mova [bufq], m2
yading@10 601 addps m5, m3, [tmpq+4*24]
yading@10 602 mova m2, [tmpq+4*24]
yading@10 603 subps m2, m3
yading@10 604 mova m1, SPILLED(9)
yading@10 605 subps m1, m0
yading@10 606 mulps m1, [costabs + 16*10]
yading@10 607 addps m0, SPILLED(9)
yading@10 608 mulps m0, [costabs + 16*16]
yading@10 609 addps m6, m5, m1
yading@10 610 subps m5, m1
yading@10 611 mulps m3, m5, [winq+4*40]
yading@10 612 addps m3, [bufq+4*40]
yading@10 613 mova [outq+1280], m3
yading@10 614 mulps m5, [winq+4*28]
yading@10 615 addps m5, [bufq+4*28]
yading@10 616 mova [outq+896], m5
yading@10 617 mulps m1, m6, [winq+4*120]
yading@10 618 mova [bufq+4*40], m1
yading@10 619 mulps m6, [winq+4*108]
yading@10 620 mova [bufq+4*28], m6
yading@10 621 addps m1, m2, m0
yading@10 622 subps m2, m0
yading@10 623 mulps m5, m2, [winq+4*64]
yading@10 624 addps m5, [bufq+4*64]
yading@10 625 mova [outq+2048], m5
yading@10 626 mulps m2, [winq+4*4]
yading@10 627 addps m2, [bufq+4*4]
yading@10 628 mova [outq+128], m2
yading@10 629 mulps m0, m1, [winq+4*144]
yading@10 630 mova [bufq+4*64], m0
yading@10 631 mulps m1, [winq+4*84]
yading@10 632 mova [bufq+4*4], m1
yading@10 633 mova m1, [tmpq+4*28]
yading@10 634 mova m5, m1
yading@10 635 addps m1, SPILLED(13)
yading@10 636 subps m5, SPILLED(13)
yading@10 637 UNSPILL 3, 15
yading@10 638 addps m2, m7, m3
yading@10 639 mulps m2, [costabs + 16*11]
yading@10 640 subps m3, m7
yading@10 641 mulps m3, [costabs + 16*15]
yading@10 642 addps m0, m2, m1
yading@10 643 subps m1, m2
yading@10 644 SWAP m0, m2
yading@10 645 mulps m6, m1, [winq+4*44]
yading@10 646 addps m6, [bufq+4*44]
yading@10 647 mova [outq+1408], m6
yading@10 648 mulps m1, [winq+4*24]
yading@10 649 addps m1, [bufq+4*24]
yading@10 650 mova [outq+768], m1
yading@10 651 mulps m0, m2, [winq+4*124]
yading@10 652 mova [bufq+4*44], m0
yading@10 653 mulps m2, [winq+4*104]
yading@10 654 mova [bufq+4*24], m2
yading@10 655 addps m0, m5, m3
yading@10 656 subps m5, m3
yading@10 657 mulps m1, m5, [winq+4*60]
yading@10 658 addps m1, [bufq+4*60]
yading@10 659 mova [outq+1920], m1
yading@10 660 mulps m5, [winq+4*8]
yading@10 661 addps m5, [bufq+4*8]
yading@10 662 mova [outq+256], m5
yading@10 663 mulps m1, m0, [winq+4*140]
yading@10 664 mova [bufq+4*60], m1
yading@10 665 mulps m0, [winq+4*88]
yading@10 666 mova [bufq+4*8], m0
yading@10 667 mova m1, [tmpq+4*20]
yading@10 668 addps m1, SPILLED(12)
yading@10 669 mova m2, [tmpq+4*20]
yading@10 670 subps m2, SPILLED(12)
yading@10 671 UNSPILL 7, 8
yading@10 672 subps m0, m7, SPILLED(11)
yading@10 673 addps m7, SPILLED(11)
yading@10 674 mulps m4, m7, [costabs + 16*12]
yading@10 675 mulps m0, [costabs + 16*14]
yading@10 676 addps m5, m1, m4
yading@10 677 subps m1, m4
yading@10 678 mulps m7, m1, [winq+4*48]
yading@10 679 addps m7, [bufq+4*48]
yading@10 680 mova [outq+1536], m7
yading@10 681 mulps m1, [winq+4*20]
yading@10 682 addps m1, [bufq+4*20]
yading@10 683 mova [outq+640], m1
yading@10 684 mulps m1, m5, [winq+4*128]
yading@10 685 mova [bufq+4*48], m1
yading@10 686 mulps m5, [winq+4*100]
yading@10 687 mova [bufq+4*20], m5
yading@10 688 addps m6, m2, m0
yading@10 689 subps m2, m0
yading@10 690 mulps m1, m2, [winq+4*56]
yading@10 691 addps m1, [bufq+4*56]
yading@10 692 mova [outq+1792], m1
yading@10 693 mulps m2, [winq+4*12]
yading@10 694 addps m2, [bufq+4*12]
yading@10 695 mova [outq+384], m2
yading@10 696 mulps m0, m6, [winq+4*136]
yading@10 697 mova [bufq+4*56], m0
yading@10 698 mulps m6, [winq+4*92]
yading@10 699 mova [bufq+4*12], m6
yading@10 700 UNSPILL 0, 14
yading@10 701 mulps m0, [costabs + 16*13]
yading@10 702 mova m3, [tmpq+4*4]
yading@10 703 addps m2, m0, m3
yading@10 704 subps m3, m0
yading@10 705 mulps m0, m3, [winq+4*52]
yading@10 706 addps m0, [bufq+4*52]
yading@10 707 mova [outq+1664], m0
yading@10 708 mulps m3, [winq+4*16]
yading@10 709 addps m3, [bufq+4*16]
yading@10 710 mova [outq+512], m3
yading@10 711 mulps m0, m2, [winq+4*132]
yading@10 712 mova [bufq+4*52], m0
yading@10 713 mulps m2, [winq+4*96]
yading@10 714 mova [bufq+4*16], m2
yading@10 715 RET
yading@10 716 %endmacro
yading@10 717
; Instantiate four_imdct36_float for plain SSE and, when the build defines
; HAVE_AVX_EXTERNAL, for AVX. The body contains no cpuflag tests of its own.
; NOTE(review): differences between the two copies presumably come from how
; the included x86 macro layer encodes the instructions - confirm there.
yading@10 718 INIT_XMM sse
yading@10 719 DEFINE_FOUR_IMDCT
yading@10 720
yading@10 721 %if HAVE_AVX_EXTERNAL
yading@10 722 INIT_XMM avx
yading@10 723 DEFINE_FOUR_IMDCT
yading@10 724 %endif