;******************************************************************************
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

align 16

; Lane-select masks, applied with andps in imdct36_float.
; A lane of 0xffffffff keeps the value, a lane of 0 clears it (lanes 0..3).
ps_mask:  dd 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff   ; keep lanes 1,2,3
ps_mask2: dd 0x00000000, 0xffffffff, 0x00000000, 0xffffffff   ; keep lanes 1,3
ps_mask3: dd 0x00000000, 0x00000000, 0x00000000, 0xffffffff   ; keep lane  3
ps_mask4: dd 0x00000000, 0xffffffff, 0x00000000, 0x00000000   ; keep lane  1

; Multiplier vectors for the "Populate tmp[]" stage of imdct36_float.
; Every vector is {a, a, b, b}: one factor for lanes 0-1, another for lanes 2-3.
; The non-trivial magnitudes are cosines of multiples of 10 degrees:
;   0.9848077530 = cos(10)   0.9396926208 = cos(20)   0.8660254038 = cos(30)
;   0.7660444431 = cos(40)   0.6427876097 = cos(50)   0.3420201433 = cos(70)
;   0.1736481777 = cos(80)
ps_val1:  times 2 dd -0.5
          times 2 dd -0.8660254038
ps_val2:  times 2 dd  1.0
          times 2 dd  0.8660254038
ps_val3:  times 2 dd  0.1736481777
          times 2 dd  0.3420201433
ps_val4:  times 2 dd -0.7660444431
          times 2 dd  0.8660254038
ps_val5:  times 2 dd -0.9396926208
          times 2 dd -0.9848077530
ps_val6:  times 2 dd  0.5
          times 2 dd -0.6427876097
ps_val7:  times 2 dd  1.0
          times 2 dd -0.6427876097

; Sign masks for xorps: a lane holding 0x80000000 gets its sign bit flipped,
; a lane holding 0 passes through unchanged (lanes are numbered 0..3).
ps_p1p1m1m1: dd 0,          0, 0x80000000, 0x80000000   ; negate lanes 2 and 3
ps_p1m1p1m1: dd 0, 0x80000000,          0, 0x80000000   ; negate lanes 1 and 3

; Multiplier rows for BUTTERF (path without SSE3). imdct36_float passes the
; byte offsets 0, 16, 32 and 48 to BUTTERF and reads the row at +64 directly
; for the last two values.
ps_cosh:       dd 1.0, 0.50190991877167369479,  1.0,  5.73685662283492756461
               dd 1.0, 0.51763809020504152469,  1.0,  1.93185165257813657349
               dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
               dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
               dd 1.0, 0.70710678118654752439,  0.0,  0.0

; Same rows for the SSE3 path of BUTTERF. Lanes 1 and 3 of the first four rows
; carry the opposite sign, because addsubps provides the alternating
; subtract/add that the other path builds with xorps + addps.
; The fifth row is identical to ps_cosh; BUTTERF is only invoked with offsets
; 0..48 in this file, so that row is not read through this label here.
ps_cosh_sse3:  dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
               dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
               dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
               dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
               dd 1.0,  0.70710678118654752439,  0.0,  0.0

; Constants broadcast to all four lanes, one 16-byte row each.
; four_imdct36_float addresses them as [costabs + 16*k].
costabs:  times 4 dd  0.98480773    ; k =  0
          times 4 dd  0.93969262    ; k =  1
          times 4 dd  0.86602539    ; k =  2
          times 4 dd -0.76604444    ; k =  3
          times 4 dd -0.64278764    ; k =  4
          times 4 dd  0.50000000    ; k =  5
          times 4 dd -0.50000000    ; k =  6
          times 4 dd -0.34202015    ; k =  7
          times 4 dd -0.17364818    ; k =  8
          times 4 dd  0.50190992    ; k =  9
          times 4 dd  0.51763808    ; k = 10
          times 4 dd  0.55168896    ; k = 11
          times 4 dd  0.61038726    ; k = 12
          times 4 dd  0.70710677    ; k = 13
          times 4 dd  0.87172341    ; k = 14
          times 4 dd  1.18310082    ; k = 15
          times 4 dd  1.93185163    ; k = 16
          times 4 dd  5.73685646    ; k = 17

; Output stride factor: imdct36_float writes consecutive output values
; 4*SBLIMIT bytes apart.
%define SBLIMIT 32
SECTION_TEXT

; PSHUFD dst, src, imm8
; dst = lanes of src selected by imm8. Uses pshufd when SSE2 is available and
; AVX is not; otherwise shufps with src supplied as both sources.
%macro PSHUFD 3
%if cpuflag(sse2) && notcpuflag(avx)
    pshufd %1, %2, %3
%else
    shufps %1, %2, %2, %3
%endif
%endmacro

; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x3,x4,y1,y2}
%macro BUILDINVHIGHLOW 3
%if cpuflag(avx)
    shufps %1, %2, %3, 0x4e
%else
    movlhps %1, %3              ; high half of %1 <- low half of %3
    movhlps %1, %2              ; low half of %1  <- high half of %2
%endif
%endmacro

; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x4,y1,y2,y3}
%macro ROTLEFT 3
%if cpuflag(ssse3)
    palignr  %1, %3, %2, 12
%else
    BUILDINVHIGHLOW %1, %2, %3  ; %1 = {x3,x4,y1,y2}
    shufps  %1, %1, %3, 0x99    ; pick %1[1], %1[2], %3[1], %3[2]
%endif
%endmacro

; INVERTHL dst, src
; dst = {src[2], src[3], src[0], src[1]} (the two 64-bit halves exchanged).
; The fallback without SSE2 reads src after dst has been partially written,
; so dst and src must be different registers on that path.
%macro INVERTHL 2
%if cpuflag(sse2)
    PSHUFD  %1, %2, 0x4e
%else
    movhlps %1, %2
    movlhps %1, %2
%endif
%endmacro

; BUTTERF reg, scratch, offset
; With reg = {x0,x1,x2,x3} on entry and {c0,c1,c2,c3} the row at
; [ps_cosh + offset]:
;   y   = {c0*(x0+x2), c1*(x1+x3), c2*(x0-x2), c3*(x1-x3)}
;   reg = {y0+y1, y0-y1, y2+y3, y2-y3}
; The SSE3 path reads the sign-adjusted row from ps_cosh_sse3 and finishes
; with addsubps; both paths produce the same result. scratch is clobbered.
%macro BUTTERF 3
    INVERTHL %2, %1
    xorps    %1, [ps_p1p1m1m1]
    addps    %1, %2
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xb1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xb1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro

; STORE reg, scratch, base, stride
; Scatters the four lanes of reg: lane n goes to [base + n*stride].
; reg is left with its lanes pairwise swapped; scratch is clobbered.
%macro STORE 4
    movhlps %2, %1
    movss   [%3       ], %1     ; lane 0
    movss   [%3 + 2*%4], %2     ; lane 2
    shufps  %1, %1, 0xb1        ; swap lanes 0<->1 and 2<->3
    movss   [%3 +   %4], %1     ; lane 1
    movhlps %2, %1
    movss   [%3 + 3*%4], %2     ; lane 3
%endmacro

; LOAD dst, scratch, base, stride
; Gathers one float from each of [base + n*stride], n = 0..3, into lane n of
; dst. Every access reads 8 bytes; only the first float of each is kept.
; scratch is clobbered.
%macro LOAD 4
    movlps  %1, [%3       ]
    movhps  %1, [%3 +   %4]
    movlps  %2, [%3 + 2*%4]
    movhps  %2, [%3 + 3*%4]
    shufps  %1, %2, 0x88        ; keep lanes 0 and 2 of each register
%endmacro

; LOADA64 dst, addr
; Loads 16 bytes from addr: a single movu on AVX, otherwise two 8-byte halves
; with movlps/movhps.
%macro LOADA64 2
%if cpuflag(avx)
   movu     %1, [%2]
%else
   movlps   %1, [%2]
   movhps   %1, [%2 + 8]
%endif
%endmacro

; DEFINE_IMDCT
; Emits imdct36_float for the instruction set chosen by the preceding INIT_XMM.
; Arguments (cglobal names): out, buf, in, win.
;   in  : 18 floats are read (bytes 0..71). Nothing is stored back to in; the
;         two "in[i] += ..." passes quoted below act on the register copies.
;   out : for k = 0..17 the float at byte offset k*4*SBLIMIT receives
;         value*win[k] + buf[4*k].
;   buf : the floats buf[4*k] (16-byte stride) are read for the sum above and
;         then overwritten with value*win[20 + k].
;   win : read as 16-byte vectors at win[0..19] and win[20..39].
; m8 is used on x86-64 only.
%macro DEFINE_IMDCT 0
cglobal imdct36_float, 4,4,9, out, buf, in, win

    ; for(i=17;i>=1;i--) in[i] += in[i-1];
    ; m0..m3 end up holding in[0..3], in[4..7], in[8..11], in[12..15] and
    ; m4 = {in[16], in[17], 0, 0}
    LOADA64 m0, inq
    LOADA64 m1, inq + 16

    ROTLEFT m5, m0, m1          ; m5 = in[3..6]

    PSHUFD  m6, m0, 0x93        ; m6 = {in[3], in[0], in[1], in[2]}
    andps   m6, m6, [ps_mask]   ; lane 0 cleared: in[0] has no predecessor
    addps   m0, m0, m6

    LOADA64 m2, inq + 32

    ROTLEFT m7, m1, m2          ; m7 = in[7..10]

    addps   m1, m1, m5
    LOADA64 m3, inq + 48

    ROTLEFT m5, m2, m3          ; m5 = in[11..14]

    xorps   m4, m4, m4
    movlps  m4, [inq+64]        ; m4 = {in[16], in[17], 0, 0}
    BUILDINVHIGHLOW m6, m3, m4  ; m6 = {in[14], in[15], in[16], in[17]}
    shufps  m6, m6, m4, 0xa9    ; m6 = {in[15], in[16], 0, 0}

    addps   m4, m4, m6
    addps   m2, m2, m7
    addps   m3, m3, m5

    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
    ; Only odd elements change. Each addend is built from a register that has
    ; not been updated by this pass yet.
    movlhps m5, m5, m0
    andps   m5, m5, [ps_mask3]  ; m5 = {0, 0, 0, in[1]}

    BUILDINVHIGHLOW m7, m0, m1
    andps   m7, m7, [ps_mask2]  ; m7 = {0, in[3], 0, in[5]}

    addps   m0, m0, m5

    BUILDINVHIGHLOW m6, m1, m2
    andps   m6, m6, [ps_mask2]  ; m6 = {0, in[7], 0, in[9]}

    addps  m1, m1, m7

    BUILDINVHIGHLOW m7, m2, m3
    andps   m7, m7, [ps_mask2]  ; m7 = {0, in[11], 0, in[13]}

    addps   m2, m2, m6

    movhlps m6, m6, m3
    andps   m6, m6, [ps_mask4]  ; m6 = {0, in[15], 0, 0}

    addps  m3, m3, m7
    addps  m4, m4, m6

    ; Populate tmp[]
    movlhps m6, m1, m5    ; zero out high values
    subps   m6, m6, m4

    subps   m5, m0, m3

    ; x86-64: park m0 - m3 in m8 so it can be reused below.
    ; x86-32 does not use m8 and recomputes the difference instead.
%if ARCH_X86_64
    SWAP   m5, m8
%endif

    mulps   m7, m2, [ps_val1]

%if ARCH_X86_64
    mulps   m5, m8, [ps_val2]
%else
    mulps   m5, m5, [ps_val2]
%endif
    addps   m7, m7, m5

    mulps   m5, m6, [ps_val1]
    subps   m7, m7, m5

%if ARCH_X86_64
    SWAP   m5, m8
%else
    subps   m5, m0, m3
%endif

    subps   m5, m5, m6
    addps   m5, m5, m2

    shufps  m6, m4, m3, 0xe4    ; m6 = {m4[0], m4[1], m3[2], m3[3]}
    subps   m6, m6, m2
    mulps   m6, m6, [ps_val3]

    addps   m4, m4, m1
    mulps   m4, m4, [ps_val4]

    shufps  m1, m1, m0, 0xe4    ; m1 = {m1[0], m1[1], m0[2], m0[3]}
    addps   m1, m1, m2
    mulps   m1, m1, [ps_val5]

    mulps   m3, m3, [ps_val6]
    mulps   m0, m0, [ps_val7]
    addps   m0, m0, m3

    xorps   m2, m1, [ps_p1p1m1m1]
    subps   m2, m2, m4
    addps   m2, m2, m0

    addps   m3, m4, m0
    subps   m3, m3, m6
    xorps   m3, m3, [ps_p1p1m1m1]

    shufps  m0, m0, m4, 0xe4    ; m0 = {m0[0], m0[1], m4[2], m4[3]}
    subps   m0, m0, m1
    addps   m0, m0, m6

    BUILDINVHIGHLOW m4, m2, m3  ; m4 = {m2[2], m2[3], m3[0], m3[1]}
    shufps  m3, m3, m2, 0x4e    ; m3 = {m3[2], m3[3], m2[0], m2[1]}

    ; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}
    ; (SWAPLH: low and high 64-bit halves exchanged; of m5 only lanes 0-1 are used)

    BUTTERF m0, m1, 0
    BUTTERF m7, m2, 16
    BUTTERF m3, m6, 32
    BUTTERF m4, m1, 48

    ; Last pair: m5 = {a + b, a - b, ...} with {a, b} = m5[0..1] * {1.0, 0.7071...}
    mulps   m5, m5, [ps_cosh + 64]
    PSHUFD  m1, m5, 0xe1        ; swap lanes 0 and 1
    xorps   m5, m5, [ps_p1m1p1m1]
    addps   m5, m5, m1

    ; permutation:
    ; m0    0  1  2  3     =>     2  6 10 14   m1
    ; m7    4  5  6  7     =>     3  7 11 15   m2
    ; m3    8  9 10 11     =>    17 13  9  5   m3
    ; m4   12 13 14 15     =>    16 12  8  4   m5
    ; m5   16 17 xx xx     =>     0  1 xx xx   m0

    unpckhps m1, m0, m7
    unpckhps m6, m3, m4
    movhlps  m2, m6, m1
    movlhps  m1, m1, m6

    unpcklps m5, m5, m4
    unpcklps m3, m3, m7
    movhlps  m4, m3, m5
    movlhps  m5, m5, m3
    SWAP      m4, m3
    ; permutation done

    ; out[16], out[17]: window with win[16..17], add buf[4*16], buf[4*17]
    PSHUFD  m6, m2, 0xb1
    movss   m4, [bufq + 4*68]
    movss   m7, [bufq + 4*64]
    unpcklps  m7, m7, m4
    mulps   m6, m6, [winq + 16*4]
    addps   m6, m6, m7
    movss   [outq + 64*SBLIMIT], m6
    shufps  m6, m6, m6, 0xb1
    movss   [outq + 68*SBLIMIT], m6

    ; out[4..7]
    mulps   m6, m3, [winq + 4*4]
    LOAD    m4, m7, bufq + 4*16, 16
    addps   m6, m6, m4
    STORE   m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT

    ; out[8..11]
    shufps  m4, m0, m3, 0xb5
    mulps   m4, m4, [winq + 8*4]
    LOAD    m7, m6, bufq + 4*32, 16
    addps   m4, m4, m7
    STORE   m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT

    ; out[12..15]
    shufps  m3, m3, m2, 0xb1
    mulps   m3, m3, [winq + 12*4]
    LOAD    m7, m6, bufq + 4*48, 16
    addps   m3, m3, m7
    STORE   m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT

    ; out[0..3]
    mulps   m2, m2, [winq]
    LOAD    m6, m7, bufq, 16
    addps   m2, m2, m6
    STORE   m2, m7, outq, 4*SBLIMIT

    ; From here on buf is overwritten, using win[20..39].
    ; buf[4*0 .. 4*3]
    mulps    m4, m1, [winq + 20*4]
    STORE    m4, m7, bufq, 16

    ; buf[4*4 .. 4*7]
    mulps    m3, m5, [winq + 24*4]
    STORE    m3, m7, bufq + 4*16, 16

    ; buf[4*8 .. 4*11]
    shufps   m0, m0, m5, 0xb0
    mulps    m0, m0, [winq + 28*4]
    STORE    m0, m7, bufq + 4*32, 16

    ; buf[4*12 .. 4*15]
    shufps   m5, m5, m1, 0xb1
    mulps    m5, m5, [winq + 32*4]
    STORE    m5, m7, bufq + 4*48, 16

    ; buf[4*16], buf[4*17]
    shufps   m1, m1, m1, 0xb1
    mulps    m1, m1, [winq + 36*4]
    movss    [bufq + 4*64], m1
    shufps   m1, m1, 0xb1
    movss    [bufq + 4*68], m1
    RET
%endmacro

; One imdct36_float instance per instruction set; the AVX instance is only
; assembled when HAVE_AVX_EXTERNAL is set.
INIT_XMM sse
DEFINE_IMDCT

INIT_XMM sse2
DEFINE_IMDCT

INIT_XMM sse3
DEFINE_IMDCT

INIT_XMM ssse3
DEFINE_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
%endif

INIT_XMM sse

; Spill helpers for four_imdct36_float; slot numbers x are 8..15.
;   x86-64: SPILL and UNSPILL are both SWAP, i.e. the value is exchanged with
;           register m<x>; SPILLED(x) names that register.
;   x86-32: SPILL/UNSPILL are movaps stores/loads to the 16-byte slot
;           [tmpq + (x-8)*16 + 32*4], i.e. bytes 128..255 of tmp.
%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP
%define SPILLED(x) m %+ x
%else
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
%macro SPILL 2 ; xmm#, mempos
    movaps SPILLED(%2), m%1
%endmacro
%macro UNSPILL 2
    movaps m%1, SPILLED(%2)
%endmacro
%endif

; DEFINE_FOUR_IMDCT
; Emits four_imdct36_float for the instruction set chosen by the preceding
; INIT_XMM. Arguments (cglobal names): out, buf, in, win, tmp.
;   in  : four input blocks 72 bytes (18 floats) apart. Rows are loaded with
;         mova for blocks 0 and 2 and movu for blocks 1 and 3, so in is
;         presumably expected to be 16-byte aligned -- confirm against callers.
;         After the transposes each register holds one in[] index, with lane n
;         taken from block n.
;   tmp : scratch. 16-byte slots at tmpq + 4*{0,4,8,...,28} are used on every
;         target; x86-32 additionally uses the SPILLED slots at tmpq + 128..255.
;   out : 16-byte vector for index k is stored at outq + 128*k
;         (128 = 4*SBLIMIT) as value*win[4*k] + buf[4*k].
;   buf : the vector at bufq + 16*k is read for the sum above and then
;         overwritten with value*win[4*(20 + k)].
; All out/buf/win/tmp vector accesses use mova or memory operands of
; mulps/addps/subps.
%macro DEFINE_FOUR_IMDCT 0
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
    ; in[16], in[17] of the four blocks
    movlps  m0, [inq+64]
    movhps  m0, [inq+64 +   72]
    movlps  m3, [inq+64 + 2*72]
    movhps  m3, [inq+64 + 3*72]

    shufps  m5, m0, m3, 0xdd    ; m5 = in[17] of blocks 0..3 (odd lanes)
    shufps  m0, m0, m3, 0x88    ; m0 = in[16] of blocks 0..3 (even lanes)

    ; in[12..15] of the four blocks
    mova     m1, [inq+48]
    movu     m6, [inq+48 +   72]
    mova     m7, [inq+48 + 2*72]
    movu     m3, [inq+48 + 3*72]

    ; TRANSPOSE4x4PS comes from x86util.asm; presumably it leaves in[12],
    ; in[13], in[14], in[15] in m1, m6, m7, m3 (m4 is its scratch register).
    TRANSPOSE4x4PS 1, 6, 7, 3, 4

    addps   m4, m6, m7
    mova    [tmpq+4*28], m4

    addps    m7, m3
    addps    m6, m1
    addps    m3, m0
    addps    m0, m5
    addps    m0, m7
    addps    m7, m6
    mova    [tmpq+4*12], m7
    SPILL   3, 12

    ; in[8..11] of the four blocks
    mova     m4, [inq+32]
    movu     m5, [inq+32 +   72]
    mova     m2, [inq+32 + 2*72]
    movu     m7, [inq+32 + 3*72]

    TRANSPOSE4x4PS 4, 5, 2, 7, 3

    addps   m1, m7
    SPILL   1, 11

    addps   m3, m5, m2
    SPILL   3, 13

    addps    m7, m2
    addps    m5, m4
    addps    m6, m7
    mova    [tmpq], m6
    addps   m7, m5
    mova    [tmpq+4*16], m7

    ; in[4..7] of the four blocks
    mova    m2, [inq+16]
    movu    m7, [inq+16 +   72]
    mova    m1, [inq+16 + 2*72]
    movu    m6, [inq+16 + 3*72]

    TRANSPOSE4x4PS 2, 7, 1, 6, 3

    addps   m4, m6
    addps   m6, m1
    addps   m1, m7
    addps   m7, m2
    addps   m5, m6
    SPILL   5, 15
    addps   m6, m7
    mulps   m6, [costabs + 16*2]
    mova    [tmpq+4*8], m6
    SPILL   1, 10
    SPILL   0, 14

    ; in[0..3] of the four blocks
    mova    m1, [inq]
    movu    m6, [inq +   72]
    mova    m3, [inq + 2*72]
    movu    m5, [inq + 3*72]

    TRANSPOSE4x4PS 1, 6, 3, 5, 0

    addps    m2, m5
    addps    m5, m3
    addps    m7, m5
    addps    m3, m6
    addps    m6, m1
    SPILL    7, 8
    addps    m5, m6
    SPILL    6, 9

    ; Intermediate terms; results are kept in registers, tmp slots and
    ; spill slots until the output stage below.
    addps    m6, m4, SPILLED(12)
    subps    m6, m2
    UNSPILL  7, 11
    SPILL    5, 11
    subps    m5, m1, m7
    mulps    m7, [costabs + 16*5]
    addps    m7, m1
    mulps    m0, m6, [costabs + 16*6]
    addps    m0, m5
    mova     [tmpq+4*24], m0
    addps    m6, m5
    mova     [tmpq+4*4], m6
    addps    m6, m4, m2
    mulps    m6, [costabs + 16*1]
    subps    m4, SPILLED(12)
    mulps    m4, [costabs + 16*8]
    addps    m2, SPILLED(12)
    mulps    m2, [costabs + 16*3]
    subps    m5, m7, m6
    subps    m5, m2
    addps    m6, m7
    addps    m6, m4
    addps    m7, m2
    subps    m7, m4
    mova     [tmpq+4*20], m7
    mova     m2, [tmpq+4*28]
    mova     [tmpq+4*28], m5    ; slot reused once its old value is in m2
    UNSPILL  7, 13
    subps    m5, m7, m2
    mulps    m5, [costabs + 16*7]
    UNSPILL  1, 10
    mulps    m1, [costabs + 16*2]
    addps    m4, m3, m2
    mulps    m4, [costabs + 16*4]
    addps    m2, m7
    addps    m7, m3
    mulps    m7, [costabs]
    subps    m3, m2
    mulps    m3, [costabs + 16*2]
    addps    m2, m7, m5
    addps    m2, m1
    SPILL    2, 10
    addps    m7, m4
    subps    m7, m1
    SPILL    7, 12
    subps    m5, m4
    subps    m5, m1
    UNSPILL  0, 14
    SPILL    5, 13
    addps    m1, m0, SPILLED(15)
    subps    m1, SPILLED(8)
    mova     m4, [costabs + 16*5]
    mulps    m4, [tmpq]
    UNSPILL  2, 9
    addps    m4, m2
    subps    m2, [tmpq]
    mulps    m5, m1, [costabs + 16*6]
    addps    m5, m2
    SPILL    5, 9
    addps    m2, m1
    SPILL    2, 14
    UNSPILL  5, 15
    subps    m7, m5, m0
    addps    m5, SPILLED(8)
    mulps    m5, [costabs + 16*1]
    mulps    m7, [costabs + 16*8]
    addps    m0, SPILLED(8)
    mulps    m0, [costabs + 16*3]
    subps    m2, m4, m5
    subps    m2, m0
    SPILL    2, 15
    addps    m5, m4
    addps    m5, m7
    addps    m4, m0
    subps    m4, m7
    SPILL    4, 8
    mova     m7, [tmpq+4*16]
    mova     m2, [tmpq+4*12]
    addps    m0, m7, m2
    subps    m0, SPILLED(11)
    mulps    m0, [costabs + 16*2]
    addps    m4, m7, SPILLED(11)
    mulps    m4, [costabs]
    subps    m7, m2
    mulps    m7, [costabs + 16*7]
    addps    m2, SPILLED(11)
    mulps    m2, [costabs + 16*4]
    addps    m1, m7, [tmpq+4*8]
    addps    m1, m4
    addps    m4, m2
    subps    m4, [tmpq+4*8]
    SPILL    4, 11
    subps    m7, m2
    subps    m7, [tmpq+4*8]

    ; Output stage. Each group forms a sum and a difference, scales them with
    ; one pair of costabs rows -- (9,17), (10,16), (11,15), (12,14), then 13 --
    ; and writes two out vectors and two buf vectors per half.
    ; In the "k = ..." notes below, out is at 128*k, buf at 16*k, and the
    ; window rows are win[4*k] (for out) and win[4*(20+k)] (for buf).
    addps    m4, m6, SPILLED(10)
    subps    m6, SPILLED(10)
    addps    m2, m5, m1
    mulps    m2, [costabs + 16*9]
    subps    m5, m1
    mulps    m5, [costabs + 16*17]
    subps    m1, m4, m2
    addps    m4, m2
    ; k = 9 and k = 8
    mulps    m2, m1, [winq+4*36]
    addps    m2, [bufq+4*36]
    mova     [outq+1152], m2
    mulps    m1, [winq+4*32]
    addps    m1, [bufq+4*32]
    mova     [outq+1024], m1
    mulps    m1, m4, [winq+4*116]
    mova     [bufq+4*36], m1
    mulps    m4, [winq+4*112]
    mova     [bufq+4*32], m4
    addps    m2, m6, m5
    subps    m6, m5
    ; k = 17 and k = 0
    mulps    m1, m6, [winq+4*68]
    addps    m1, [bufq+4*68]
    mova     [outq+2176], m1
    mulps    m6, [winq]
    addps    m6, [bufq]
    mova     [outq], m6
    mulps    m1, m2, [winq+4*148]
    mova     [bufq+4*68], m1
    mulps    m2, [winq+4*80]
    mova     [bufq], m2
    addps    m5, m3, [tmpq+4*24]
    mova     m2, [tmpq+4*24]
    subps    m2, m3
    mova     m1, SPILLED(9)
    subps    m1, m0
    mulps    m1, [costabs + 16*10]
    addps    m0, SPILLED(9)
    mulps    m0, [costabs + 16*16]
    addps    m6, m5, m1
    subps    m5, m1
    ; k = 10 and k = 7
    mulps    m3, m5, [winq+4*40]
    addps    m3, [bufq+4*40]
    mova     [outq+1280], m3
    mulps    m5, [winq+4*28]
    addps    m5, [bufq+4*28]
    mova     [outq+896], m5
    mulps    m1, m6, [winq+4*120]
    mova     [bufq+4*40], m1
    mulps    m6, [winq+4*108]
    mova     [bufq+4*28], m6
    addps    m1, m2, m0
    subps    m2, m0
    ; k = 16 and k = 1
    mulps    m5, m2, [winq+4*64]
    addps    m5, [bufq+4*64]
    mova     [outq+2048], m5
    mulps    m2, [winq+4*4]
    addps    m2, [bufq+4*4]
    mova     [outq+128], m2
    mulps    m0, m1, [winq+4*144]
    mova     [bufq+4*64], m0
    mulps    m1, [winq+4*84]
    mova     [bufq+4*4], m1
    mova     m1, [tmpq+4*28]
    mova     m5, m1
    addps    m1, SPILLED(13)
    subps    m5, SPILLED(13)
    UNSPILL  3, 15
    addps    m2, m7, m3
    mulps    m2, [costabs + 16*11]
    subps    m3, m7
    mulps    m3, [costabs + 16*15]
    addps    m0, m2, m1
    subps    m1, m2
    SWAP     m0, m2             ; rename: the sum just computed is m2 from here on
    ; k = 11 and k = 6
    mulps    m6, m1, [winq+4*44]
    addps    m6, [bufq+4*44]
    mova     [outq+1408], m6
    mulps    m1, [winq+4*24]
    addps    m1, [bufq+4*24]
    mova     [outq+768], m1
    mulps    m0, m2, [winq+4*124]
    mova     [bufq+4*44], m0
    mulps    m2, [winq+4*104]
    mova     [bufq+4*24], m2
    addps    m0, m5, m3
    subps    m5, m3
    ; k = 15 and k = 2
    mulps    m1, m5, [winq+4*60]
    addps    m1, [bufq+4*60]
    mova     [outq+1920], m1
    mulps    m5, [winq+4*8]
    addps    m5, [bufq+4*8]
    mova     [outq+256], m5
    mulps    m1, m0, [winq+4*140]
    mova     [bufq+4*60], m1
    mulps    m0, [winq+4*88]
    mova     [bufq+4*8], m0
    mova     m1, [tmpq+4*20]
    addps    m1, SPILLED(12)
    mova     m2, [tmpq+4*20]
    subps    m2, SPILLED(12)
    UNSPILL  7, 8
    subps    m0, m7, SPILLED(11)
    addps    m7, SPILLED(11)
    mulps    m4, m7, [costabs + 16*12]
    mulps    m0, [costabs + 16*14]
    addps    m5, m1, m4
    subps    m1, m4
    ; k = 12 and k = 5
    mulps    m7, m1, [winq+4*48]
    addps    m7, [bufq+4*48]
    mova     [outq+1536], m7
    mulps    m1, [winq+4*20]
    addps    m1, [bufq+4*20]
    mova     [outq+640], m1
    mulps    m1, m5, [winq+4*128]
    mova     [bufq+4*48], m1
    mulps    m5, [winq+4*100]
    mova     [bufq+4*20], m5
    addps    m6, m2, m0
    subps    m2, m0
    ; k = 14 and k = 3
    mulps    m1, m2, [winq+4*56]
    addps    m1, [bufq+4*56]
    mova     [outq+1792], m1
    mulps    m2, [winq+4*12]
    addps    m2, [bufq+4*12]
    mova     [outq+384], m2
    mulps    m0, m6, [winq+4*136]
    mova     [bufq+4*56], m0
    mulps    m6, [winq+4*92]
    mova     [bufq+4*12], m6
    UNSPILL  0, 14
    mulps    m0, [costabs + 16*13]
    mova     m3, [tmpq+4*4]
    addps    m2, m0, m3
    subps    m3, m0
    ; k = 13 and k = 4
    mulps    m0, m3, [winq+4*52]
    addps    m0, [bufq+4*52]
    mova     [outq+1664], m0
    mulps    m3, [winq+4*16]
    addps    m3, [bufq+4*16]
    mova     [outq+512], m3
    mulps    m0, m2, [winq+4*132]
    mova     [bufq+4*52], m0
    mulps    m2, [winq+4*96]
    mova     [bufq+4*16], m2
    RET
%endmacro

; four_imdct36_float is instantiated for SSE, and for AVX when
; HAVE_AVX_EXTERNAL is set.
INIT_XMM sse
DEFINE_FOUR_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
%endif