;******************************************************************************
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; All tables below are used as 16-byte memory operands of packed SSE
; instructions.  Only one "align 16" is needed: every table is a whole
; number of 4-dword rows, so each label stays 16-byte aligned.
align 16

; Lane-select masks for andps (~0 keeps the lane, 0 clears it).
ps_mask:  dd 0, ~0, ~0, ~0
ps_mask2: dd 0, ~0,  0, ~0
ps_mask3: dd 0,  0,  0, ~0
ps_mask4: dd 0, ~0,  0,  0

; Twiddle factors for imdct36_float.  The magnitudes are sines/cosines of
; multiples of 10 degrees: 0.1736 = sin 10, 0.3420 = sin 20, 0.5 = sin 30,
; 0.6428 = sin 40, 0.7660 = cos 40, 0.8660 = cos 30, 0.9397 = cos 20,
; 0.9848 = cos 10.
ps_val1:  dd -0.5, -0.5, -0.8660254038, -0.8660254038
ps_val2:  dd  1.0,  1.0,  0.8660254038,  0.8660254038
ps_val3:  dd  0.1736481777,  0.1736481777,  0.3420201433,  0.3420201433
ps_val4:  dd -0.7660444431, -0.7660444431,  0.8660254038,  0.8660254038
ps_val5:  dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
ps_val6:  dd  0.5,  0.5, -0.6427876097, -0.6427876097
ps_val7:  dd  1.0,  1.0, -0.6427876097, -0.6427876097

; IEEE-754 sign-bit masks for xorps: a lane with 0x80000000 is negated.
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000

; Per-row scale factors used by BUTTERF (rows 0..3, selected by a byte
; offset of 0/16/32/48) and by the tail of imdct36_float (row 4, offset 64).
; The non-unit magnitudes equal 0.5/cos(k*pi/36) for odd k
; (0.5019 -> k=1, 5.7369 -> k=17, 0.5176 -> k=3, ..., 0.7071 -> k=9).
ps_cosh:      dd 1.0, 0.50190991877167369479,  1.0,  5.73685662283492756461
              dd 1.0, 0.51763809020504152469,  1.0,  1.93185165257813657349
              dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
              dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
              dd 1.0, 0.70710678118654752439,  0.0,  0.0

; Same table with the odd lanes of rows 0..3 negated.  The SSE3 path of
; BUTTERF uses addsubps (subtract in even lanes, add in odd lanes) instead
; of xorps+addps, so the sign flip is folded into the constants.
ps_cosh_sse3: dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
              dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
              dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
              dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
              dd 1.0,  0.70710678118654752439,  0.0,  0.0

; Broadcast constants for four_imdct36_float: entry n lives at
; [costabs + 16*n] with the value replicated in all four lanes.
; Entries 0..8 are the (signed) 10-degree-step twiddles, entries 9..17 are
; the 0.5/cos(k*pi/36) scale factors for k = 1, 3, ..., 17.
costabs:  times 4 dd  0.98480773
          times 4 dd  0.93969262
          times 4 dd  0.86602539
          times 4 dd -0.76604444
          times 4 dd -0.64278764
          times 4 dd  0.50000000
          times 4 dd -0.50000000
          times 4 dd -0.34202015
          times 4 dd -0.17364818
          times 4 dd  0.50190992
          times 4 dd  0.51763808
          times 4 dd  0.55168896
          times 4 dd  0.61038726
          times 4 dd  0.70710677
          times 4 dd  0.87172341
          times 4 dd  1.18310082
          times 4 dd  1.93185163
          times 4 dd  5.73685646

; Output stride in floats used by imdct36_float (out[] is written every
; SBLIMIT-th float, i.e. every 4*SBLIMIT bytes).
%define SBLIMIT 32
SECTION_TEXT

; PSHUFD dst, src, imm8
; Shuffle the four dwords of src into dst according to imm8.
; pshufd is an SSE2 instruction, so it is only used when SSE2 is available;
; plain SSE and AVX builds use "shufps dst, src, src, imm8", which gives the
; same lane selection when both shufps sources are the same register.
; NOTE(review): the 4-operand shufps form relies on the x86inc 3-operand
; emulation for non-AVX targets -- confirm against x86inc.asm.
%macro PSHUFD 3
%if cpuflag(sse2) && notcpuflag(avx)
    pshufd %1, %2, %3
%else
    shufps %1, %2, %2, %3
%endif
%endmacro

; BUILDINVHIGHLOW dst, x, y
; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x3,x4,y1,y2}
; AVX does it with one shufps (0x4e selects lanes 2,3 of %2 and 0,1 of %3).
; The SSE path first copies the low half of %3 into the high half of %1,
; then the high half of %2 into the low half of %1; %1 must therefore be a
; register distinct from %2 and %3.
%macro BUILDINVHIGHLOW 3
%if cpuflag(avx)
    shufps %1, %2, %3, 0x4e
%else
    movlhps %1, %3
    movhlps %1, %2
%endif
%endmacro

; ROTLEFT dst, x, y
; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x4,y1,y2,y3}
; With SSSE3 this is a single palignr (the %3:%2 pair shifted right by
; 12 bytes).  Otherwise %1 is first built as {x3,x4,y1,y2} and shufps 0x99
; then picks lanes 1,2 of that vector followed by lanes 1,2 of %3.
%macro ROTLEFT 3
%if cpuflag(ssse3)
    palignr %1, %3, %2, 12
%else
    BUILDINVHIGHLOW %1, %2, %3
    shufps %1, %1, %3, 0x99
%endif
%endmacro

; INVERTHL dst, src
; input  %2={a1,a2,a3,a4}
; output %1={a3,a4,a1,a2}   (64-bit halves swapped)
; Without SSE2 the swap is done with two half moves, which only gives the
; stated result when %1 and %2 are different registers.
%macro INVERTHL 2
%if cpuflag(sse2)
    PSHUFD %1, %2, 0x4e
%else
    movhlps %1, %2
    movlhps %1, %2
%endif
%endmacro

; BUTTERF v, tmp, row_offset
;   %1  in/out vector {a0,a1,a2,a3}
;   %2  scratch register (clobbered)
;   %3  byte offset of the scale row in ps_cosh / ps_cosh_sse3 (0,16,32,48)
;
; Step 1: s = {a0+a2, a1+a3, a0-a2, a1-a3}
;         (%2 = halves of %1 swapped; upper two lanes of %1 negated; add)
; Step 2: t = s * scale_row
; Step 3: result = {t0+t1, t0-t1, t2+t3, t2-t3}
;         (%2 = t with neighbouring lanes swapped, then combine)
; The SSE3 path gets the same step-3 result from addsubps because the odd
; lanes of ps_cosh_sse3 are pre-negated.
%macro BUTTERF 3
    INVERTHL %2, %1
    xorps    %1, [ps_p1p1m1m1]
    addps    %1, %2
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xb1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xb1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro

; STORE v, tmp, base, stride
;   %1  vector {a0,a1,a2,a3}; clobbered (neighbouring lanes end up swapped)
;   %2  scratch register (clobbered)
;   %3  base address expression
;   %4  byte stride between consecutive elements
; Scatters the four floats: a0 -> [base], a1 -> [base+stride],
; a2 -> [base+2*stride], a3 -> [base+3*stride].
; %4 is pasted textually after "2*" / "3*", so pass a single term or a
; product (e.g. 4*SBLIMIT), not a sum.
%macro STORE 4
    movhlps %2, %1                  ; %2.lo = {a2,a3}
    movss   [%3       ], %1         ; a0
    movss   [%3 + 2*%4], %2         ; a2
    shufps  %1, %1, 0xb1            ; %1 = {a1,a0,a3,a2}
    movss   [%3 +   %4], %1         ; a1
    movhlps %2, %1                  ; %2.lo = {a3,a2}
    movss   [%3 + 3*%4], %2         ; a3
%endmacro

; LOAD dst, tmp, base, stride
;   %1  destination vector
;   %2  scratch register (clobbered)
;   %3  base address expression
;   %4  byte stride between consecutive elements
; Gathers one float from each of [base], [base+stride], [base+2*stride],
; [base+3*stride] into %1 (in that lane order).
; Each access is a 64-bit load, so the 4 bytes following every gathered
; float are read as well and then dropped by the final shufps (0x88 keeps
; lanes 0 and 2 of each source).
%macro LOAD 4
    movlps  %1, [%3       ]
    movhps  %1, [%3 +   %4]
    movlps  %2, [%3 + 2*%4]
    movhps  %2, [%3 + 3*%4]
    shufps  %1, %2, 0x88
%endmacro

; LOADA64 dst, addr
; Load 16 bytes from addr into %1 without requiring 16-byte alignment.
; AVX uses one unaligned move; the SSE paths use two 64-bit half loads.
; NOTE(review): the name suggests addr is expected to be 8-byte aligned --
; confirm against the callers' buffer alignment.
%macro LOADA64 2
%if cpuflag(avx)
    movu     %1, [%2]
%else
    movlps   %1, [%2]
    movhps   %1, [%2 + 8]
%endif
%endmacro

;------------------------------------------------------------------------------
; DEFINE_IMDCT
; Emits imdct36_float for the instruction set selected by the preceding
; INIT_XMM: a single 36-point IMDCT with windowing and overlap-add.
;
; Arguments (in cglobal order): out, buf, in, win
;   in   18 input floats (bytes 0..71 are read)
;   win  window coefficients, read as 16-byte rows at float offsets 0..36
;        (the last row is read in full; only its first two lanes are used)
;   buf  overlap buffer; one float is read and later rewritten at every
;        4th float position (byte stride 16), 18 positions in total
;   out  18 floats are written, one every SBLIMIT floats
;        (byte stride 4*SBLIMIT)
;   NOTE(review): win is used as a direct mulps memory operand, so on non-AVX
;   builds it presumably has to be 16-byte aligned -- confirm with the caller.
;
; 9 XMM registers are declared; m8 is only touched on x86-64, where it saves
; one recomputation of (m0 - m3).
;------------------------------------------------------------------------------
%macro DEFINE_IMDCT 0
cglobal imdct36_float, 4,4,9, out, buf, in, win

    ; for(i=17;i>=1;i--) in[i] += in[i-1];
    LOADA64 m0, inq
    LOADA64 m1, inq + 16

    ROTLEFT m5, m0, m1

    PSHUFD  m6, m0, 0x93
    andps   m6, m6, [ps_mask]
    addps   m0, m0, m6

    LOADA64 m2, inq + 32

    ROTLEFT m7, m1, m2

    addps   m1, m1, m5
    LOADA64 m3, inq + 48

    ROTLEFT m5, m2, m3

    xorps   m4, m4, m4
    movlps  m4, [inq+64]            ; last two inputs, upper lanes stay zero
    BUILDINVHIGHLOW m6, m3, m4
    shufps  m6, m6, m4, 0xa9

    addps   m4, m4, m6
    addps   m2, m2, m7
    addps   m3, m3, m5

    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
    movlhps m5, m5, m0
    andps   m5, m5, [ps_mask3]

    BUILDINVHIGHLOW m7, m0, m1
    andps   m7, m7, [ps_mask2]

    addps   m0, m0, m5

    BUILDINVHIGHLOW m6, m1, m2
    andps   m6, m6, [ps_mask2]

    addps   m1, m1, m7

    BUILDINVHIGHLOW m7, m2, m3
    andps   m7, m7, [ps_mask2]

    addps   m2, m2, m6

    movhlps m6, m6, m3
    andps   m6, m6, [ps_mask4]

    addps   m3, m3, m7
    addps   m4, m4, m6

    ; Populate tmp[]
    movlhps m6, m1, m5              ; zero out high values
    subps   m6, m6, m4

    subps   m5, m0, m3

%if ARCH_X86_64
    ; Park (m0 - m3) in m8 so it does not have to be recomputed below.
    SWAP    m5, m8
%endif

    mulps   m7, m2, [ps_val1]

%if ARCH_X86_64
    mulps   m5, m8, [ps_val2]
%else
    mulps   m5, m5, [ps_val2]
%endif
    addps   m7, m7, m5

    mulps   m5, m6, [ps_val1]
    subps   m7, m7, m5

%if ARCH_X86_64
    SWAP    m5, m8                  ; m5 = (m0 - m3) again
%else
    subps   m5, m0, m3              ; no m8 on x86-32: recompute
%endif

    subps   m5, m5, m6
    addps   m5, m5, m2

    shufps  m6, m4, m3, 0xe4
    subps   m6, m6, m2
    mulps   m6, m6, [ps_val3]

    addps   m4, m4, m1
    mulps   m4, m4, [ps_val4]

    shufps  m1, m1, m0, 0xe4
    addps   m1, m1, m2
    mulps   m1, m1, [ps_val5]

    mulps   m3, m3, [ps_val6]
    mulps   m0, m0, [ps_val7]
    addps   m0, m0, m3

    xorps   m2, m1, [ps_p1p1m1m1]
    subps   m2, m2, m4
    addps   m2, m2, m0

    addps   m3, m4, m0
    subps   m3, m3, m6
    xorps   m3, m3, [ps_p1p1m1m1]

    shufps  m0, m0, m4, 0xe4
    subps   m0, m0, m1
    addps   m0, m0, m6

    BUILDINVHIGHLOW m4, m2, m3
    shufps  m3, m3, m2, 0x4e

    ; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}

    BUTTERF m0, m1, 0
    BUTTERF m7, m2, 16
    BUTTERF m3, m6, 32
    BUTTERF m4, m1, 48

    ; Last (half-filled) row: same butterfly shape as BUTTERF's final step,
    ; using row 4 of ps_cosh (identical in ps_cosh and ps_cosh_sse3).
    mulps   m5, m5, [ps_cosh + 64]
    PSHUFD  m1, m5, 0xe1
    xorps   m5, m5, [ps_p1m1p1m1]
    addps   m5, m5, m1

    ; permutes:
    ; m0    0  1  2  3     =>     2  6 10 14   m1
    ; m7    4  5  6  7     =>     3  7 11 15   m2
    ; m3    8  9 10 11     =>    17 13  9  5   m3
    ; m4   12 13 14 15     =>    16 12  8  4   m5
    ; m5   16 17 xx xx     =>     0  1 xx xx   m0

    unpckhps m1, m0, m7
    unpckhps m6, m3, m4
    movhlps  m2, m6, m1
    movlhps  m1, m1, m6

    unpcklps m5, m5, m4
    unpcklps m3, m3, m7
    movhlps  m4, m3, m5
    movlhps  m5, m5, m3
    SWAP     m4, m3
    ; permutation done

    ; Windowing + overlap-add: out = value * win + buf (first half), then
    ; buf = value * win (second half).  out is written with a stride of
    ; 4*SBLIMIT bytes, buf is read/written with a stride of 16 bytes.
    PSHUFD   m6, m2, 0xb1
    movss    m4, [bufq + 4*68]
    movss    m7, [bufq + 4*64]
    unpcklps m7, m7, m4
    mulps    m6, m6, [winq + 16*4]
    addps    m6, m6, m7
    movss    [outq + 64*SBLIMIT], m6
    shufps   m6, m6, m6, 0xb1
    movss    [outq + 68*SBLIMIT], m6

    mulps    m6, m3, [winq + 4*4]
    LOAD     m4, m7, bufq + 4*16, 16
    addps    m6, m6, m4
    STORE    m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT

    shufps   m4, m0, m3, 0xb5
    mulps    m4, m4, [winq + 8*4]
    LOAD     m7, m6, bufq + 4*32, 16
    addps    m4, m4, m7
    STORE    m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT

    shufps   m3, m3, m2, 0xb1
    mulps    m3, m3, [winq + 12*4]
    LOAD     m7, m6, bufq + 4*48, 16
    addps    m3, m3, m7
    STORE    m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT

    mulps    m2, m2, [winq]
    LOAD     m6, m7, bufq, 16
    addps    m2, m2, m6
    STORE    m2, m7, outq, 4*SBLIMIT

    ; Second half: refill the overlap buffer for the next call.
    mulps    m4, m1, [winq + 20*4]
    STORE    m4, m7, bufq, 16

    mulps    m3, m5, [winq + 24*4]
    STORE    m3, m7, bufq + 4*16, 16

    shufps   m0, m0, m5, 0xb0
    mulps    m0, m0, [winq + 28*4]
    STORE    m0, m7, bufq + 4*32, 16

    shufps   m5, m5, m1, 0xb1
    mulps    m5, m5, [winq + 32*4]
    STORE    m5, m7, bufq + 4*48, 16

    shufps   m1, m1, m1, 0xb1
    mulps    m1, m1, [winq + 36*4]
    movss    [bufq + 4*64], m1
    shufps   m1, m1, 0xb1
    movss    [bufq + 4*68], m1
    RET
%endmacro

; Instantiate imdct36_float once per instruction set.  INIT_XMM selects the
; cpuflags that the helper macros test (sse2 -> pshufd, sse3 -> addsubps,
; ssse3 -> palignr, avx -> 3-operand forms).
; The AVX variant is only built when the assembler supports AVX
; (HAVE_AVX_EXTERNAL).
INIT_XMM sse
DEFINE_IMDCT

INIT_XMM sse2
DEFINE_IMDCT

INIT_XMM sse3
DEFINE_IMDCT

INIT_XMM ssse3
DEFINE_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
%endif

INIT_XMM sse

; Register-spill helpers for four_imdct36_float, which needs 16 vector
; values live but only has m0..m7 on x86-32.
;
;   SPILL   r, n   park the value of m<r> in slot n      (n = 8..15)
;   UNSPILL r, n   bring slot n back into m<r>
;   SPILLED(n)     operand that names slot n
;
; x86-64: slot n is the real register m<n>; SPILL/UNSPILL are SWAPs, i.e.
;         pure register renames that emit no instructions.  After a SPILL
;         the name m<r> refers to whatever was in slot n before.
; x86-32: slot n is 16 bytes of memory at tmp + (n-8)*16 + 128 and
;         SPILL/UNSPILL are movaps copies; after a SPILL m<r> still holds
;         the value.
; Code using these helpers must therefore not rely on the contents of m<r>
; after SPILL r, n, so that it behaves the same on both targets.
%if ARCH_X86_64
    %define SPILL SWAP
    %define UNSPILL SWAP
    %define SPILLED(x) m %+ x
%else
    %define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
    %macro SPILL 2 ; xmm#, mempos
        movaps SPILLED(%2), m%1
    %endmacro
    %macro UNSPILL 2
        movaps m%1, SPILLED(%2)
    %endmacro
%endif

;------------------------------------------------------------------------------
; DEFINE_FOUR_IMDCT
; Emits four_imdct36_float for the instruction set selected by the preceding
; INIT_XMM: four 36-point IMDCTs computed in parallel, one per SIMD lane.
;
; Arguments (in cglobal order): out, buf, in, win, tmp
;   in   four consecutive blocks of 18 floats (72 bytes apart).  Blocks 0
;        and 2 are loaded with mova, blocks 1 and 3 with movu because 72 is
;        not a multiple of 16.
;   win  window rows of 4 floats; rows 0..17 are used for the values added
;        into out, rows 20..37 for the values stored back to buf
;   buf  overlap buffer, 18 rows of 4 floats (read, then overwritten)
;   out  18 rows of 4 floats, row j at byte offset j*128 (= j*4*SBLIMIT)
;   tmp  scratch area: bytes 0..127 are always used; on x86-32 the spill
;        slots occupy bytes 128..255 as well
; out, buf, win, tmp and in are accessed with mova / direct memory operands,
; so they are presumably required to be 16-byte aligned -- TODO confirm
; against the C caller.
;
; The instruction order is tightly coupled to the SPILL/UNSPILL renaming
; scheme; do not reorder.
;------------------------------------------------------------------------------
%macro DEFINE_FOUR_IMDCT 0
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
    ; in[16], in[17] of all four blocks -> m0 = in[16], m5 = in[17]
    movlps  m0, [inq+64]
    movhps  m0, [inq+64 +   72]
    movlps  m3, [inq+64 + 2*72]
    movhps  m3, [inq+64 + 3*72]

    shufps  m5, m0, m3, 0xdd
    shufps  m0, m0, m3, 0x88

    ; in[12..15] of all four blocks, transposed to one register per index
    mova    m1, [inq+48]
    movu    m6, [inq+48 +   72]
    mova    m7, [inq+48 + 2*72]
    movu    m3, [inq+48 + 3*72]

    TRANSPOSE4x4PS 1, 6, 7, 3, 4

    addps   m4, m6, m7
    mova    [tmpq+4*28], m4

    addps    m7, m3
    addps    m6, m1
    addps    m3, m0
    addps    m0, m5
    addps    m0, m7
    addps    m7, m6
    mova    [tmpq+4*12], m7
    SPILL   3, 12

    ; in[8..11]
    mova    m4, [inq+32]
    movu    m5, [inq+32 +   72]
    mova    m2, [inq+32 + 2*72]
    movu    m7, [inq+32 + 3*72]

    TRANSPOSE4x4PS 4, 5, 2, 7, 3

    addps   m1, m7
    SPILL   1, 11

    addps   m3, m5, m2
    SPILL   3, 13

    addps    m7, m2
    addps    m5, m4
    addps    m6, m7
    mova    [tmpq], m6
    addps   m7, m5
    mova    [tmpq+4*16], m7

    ; in[4..7]
    mova    m2, [inq+16]
    movu    m7, [inq+16 +   72]
    mova    m1, [inq+16 + 2*72]
    movu    m6, [inq+16 + 3*72]

    TRANSPOSE4x4PS 2, 7, 1, 6, 3

    addps   m4, m6
    addps   m6, m1
    addps   m1, m7
    addps   m7, m2
    addps   m5, m6
    SPILL   5, 15
    addps   m6, m7
    mulps   m6, [costabs + 16*2]
    mova    [tmpq+4*8], m6
    SPILL   1, 10
    SPILL   0, 14

    ; in[0..3]
    mova    m1, [inq]
    movu    m6, [inq +   72]
    mova    m3, [inq + 2*72]
    movu    m5, [inq + 3*72]

    TRANSPOSE4x4PS 1, 6, 3, 5, 0

    addps    m2, m5
    addps    m5, m3
    addps    m7, m5
    addps    m3, m6
    addps    m6, m1
    SPILL    7, 8
    addps    m5, m6
    SPILL    6, 9
    addps    m6, m4, SPILLED(12)
    subps    m6, m2
    UNSPILL  7, 11
    SPILL    5, 11
    subps    m5, m1, m7
    mulps    m7, [costabs + 16*5]
    addps    m7, m1
    mulps    m0, m6, [costabs + 16*6]
    addps    m0, m5
    mova     [tmpq+4*24], m0
    addps    m6, m5
    mova     [tmpq+4*4], m6
    addps    m6, m4, m2
    mulps    m6, [costabs + 16*1]
    subps    m4, SPILLED(12)
    mulps    m4, [costabs + 16*8]
    addps    m2, SPILLED(12)
    mulps    m2, [costabs + 16*3]
    subps    m5, m7, m6
    subps    m5, m2
    addps    m6, m7
    addps    m6, m4
    addps    m7, m2
    subps    m7, m4
    mova     [tmpq+4*20], m7
    mova     m2, [tmpq+4*28]
    mova     [tmpq+4*28], m5
    UNSPILL  7, 13
    subps    m5, m7, m2
    mulps    m5, [costabs + 16*7]
    UNSPILL  1, 10
    mulps    m1, [costabs + 16*2]
    addps    m4, m3, m2
    mulps    m4, [costabs + 16*4]
    addps    m2, m7
    addps    m7, m3
    mulps    m7, [costabs]
    subps    m3, m2
    mulps    m3, [costabs + 16*2]
    addps    m2, m7, m5
    addps    m2, m1
    SPILL    2, 10
    addps    m7, m4
    subps    m7, m1
    SPILL    7, 12
    subps    m5, m4
    subps    m5, m1
    UNSPILL  0, 14
    SPILL    5, 13
    addps    m1, m0, SPILLED(15)
    subps    m1, SPILLED(8)
    mova     m4, [costabs + 16*5]
    mulps    m4, [tmpq]
    UNSPILL  2, 9
    addps    m4, m2
    subps    m2, [tmpq]
    mulps    m5, m1, [costabs + 16*6]
    addps    m5, m2
    SPILL    5, 9
    addps    m2, m1
    SPILL    2, 14
    UNSPILL  5, 15
    subps    m7, m5, m0
    addps    m5, SPILLED(8)
    mulps    m5, [costabs + 16*1]
    mulps    m7, [costabs + 16*8]
    addps    m0, SPILLED(8)
    mulps    m0, [costabs + 16*3]
    subps    m2, m4, m5
    subps    m2, m0
    SPILL    2, 15
    addps    m5, m4
    addps    m5, m7
    addps    m4, m0
    subps    m4, m7
    SPILL    4, 8
    mova     m7, [tmpq+4*16]
    mova     m2, [tmpq+4*12]
    addps    m0, m7, m2
    subps    m0, SPILLED(11)
    mulps    m0, [costabs + 16*2]
    addps    m4, m7, SPILLED(11)
    mulps    m4, [costabs]
    subps    m7, m2
    mulps    m7, [costabs + 16*7]
    addps    m2, SPILLED(11)
    mulps    m2, [costabs + 16*4]
    addps    m1, m7, [tmpq+4*8]
    addps    m1, m4
    addps    m4, m2
    subps    m4, [tmpq+4*8]
    SPILL    4, 11
    subps    m7, m2
    subps    m7, [tmpq+4*8]
    addps    m4, m6, SPILLED(10)
    subps    m6, SPILLED(10)
    addps    m2, m5, m1
    mulps    m2, [costabs + 16*9]
    subps    m5, m1
    mulps    m5, [costabs + 16*17]

    ; From here on each group produces two output rows (out = v*win + buf)
    ; and the two matching overlap rows (buf = v*win[row + 20]).
    ; rows 9 and 8
    subps    m1, m4, m2
    addps    m4, m2
    mulps    m2, m1, [winq+4*36]
    addps    m2, [bufq+4*36]
    mova     [outq+1152], m2
    mulps    m1, [winq+4*32]
    addps    m1, [bufq+4*32]
    mova     [outq+1024], m1
    mulps    m1, m4, [winq+4*116]
    mova     [bufq+4*36], m1
    mulps    m4, [winq+4*112]
    mova     [bufq+4*32], m4
    ; rows 17 and 0
    addps    m2, m6, m5
    subps    m6, m5
    mulps    m1, m6, [winq+4*68]
    addps    m1, [bufq+4*68]
    mova     [outq+2176], m1
    mulps    m6, [winq]
    addps    m6, [bufq]
    mova     [outq], m6
    mulps    m1, m2, [winq+4*148]
    mova     [bufq+4*68], m1
    mulps    m2, [winq+4*80]
    mova     [bufq], m2
    ; rows 10 and 7
    addps    m5, m3, [tmpq+4*24]
    mova     m2, [tmpq+4*24]
    subps    m2, m3
    mova     m1, SPILLED(9)
    subps    m1, m0
    mulps    m1, [costabs + 16*10]
    addps    m0, SPILLED(9)
    mulps    m0, [costabs + 16*16]
    addps    m6, m5, m1
    subps    m5, m1
    mulps    m3, m5, [winq+4*40]
    addps    m3, [bufq+4*40]
    mova     [outq+1280], m3
    mulps    m5, [winq+4*28]
    addps    m5, [bufq+4*28]
    mova     [outq+896], m5
    mulps    m1, m6, [winq+4*120]
    mova     [bufq+4*40], m1
    mulps    m6, [winq+4*108]
    mova     [bufq+4*28], m6
    ; rows 16 and 1
    addps    m1, m2, m0
    subps    m2, m0
    mulps    m5, m2, [winq+4*64]
    addps    m5, [bufq+4*64]
    mova     [outq+2048], m5
    mulps    m2, [winq+4*4]
    addps    m2, [bufq+4*4]
    mova     [outq+128], m2
    mulps    m0, m1, [winq+4*144]
    mova     [bufq+4*64], m0
    mulps    m1, [winq+4*84]
    mova     [bufq+4*4], m1
    ; rows 11 and 6
    mova     m1, [tmpq+4*28]
    mova     m5, m1
    addps    m1, SPILLED(13)
    subps    m5, SPILLED(13)
    UNSPILL  3, 15
    addps    m2, m7, m3
    mulps    m2, [costabs + 16*11]
    subps    m3, m7
    mulps    m3, [costabs + 16*15]
    addps    m0, m2, m1
    subps    m1, m2
    SWAP     m0, m2
    mulps    m6, m1, [winq+4*44]
    addps    m6, [bufq+4*44]
    mova     [outq+1408], m6
    mulps    m1, [winq+4*24]
    addps    m1, [bufq+4*24]
    mova     [outq+768], m1
    mulps    m0, m2, [winq+4*124]
    mova     [bufq+4*44], m0
    mulps    m2, [winq+4*104]
    mova     [bufq+4*24], m2
    ; rows 15 and 2
    addps    m0, m5, m3
    subps    m5, m3
    mulps    m1, m5, [winq+4*60]
    addps    m1, [bufq+4*60]
    mova     [outq+1920], m1
    mulps    m5, [winq+4*8]
    addps    m5, [bufq+4*8]
    mova     [outq+256], m5
    mulps    m1, m0, [winq+4*140]
    mova     [bufq+4*60], m1
    mulps    m0, [winq+4*88]
    mova     [bufq+4*8], m0
    ; rows 12 and 5
    mova     m1, [tmpq+4*20]
    addps    m1, SPILLED(12)
    mova     m2, [tmpq+4*20]
    subps    m2, SPILLED(12)
    UNSPILL  7, 8
    subps    m0, m7, SPILLED(11)
    addps    m7, SPILLED(11)
    mulps    m4, m7, [costabs + 16*12]
    mulps    m0, [costabs + 16*14]
    addps    m5, m1, m4
    subps    m1, m4
    mulps    m7, m1, [winq+4*48]
    addps    m7, [bufq+4*48]
    mova     [outq+1536], m7
    mulps    m1, [winq+4*20]
    addps    m1, [bufq+4*20]
    mova     [outq+640], m1
    mulps    m1, m5, [winq+4*128]
    mova     [bufq+4*48], m1
    mulps    m5, [winq+4*100]
    mova     [bufq+4*20], m5
    ; rows 14 and 3
    addps    m6, m2, m0
    subps    m2, m0
    mulps    m1, m2, [winq+4*56]
    addps    m1, [bufq+4*56]
    mova     [outq+1792], m1
    mulps    m2, [winq+4*12]
    addps    m2, [bufq+4*12]
    mova     [outq+384], m2
    mulps    m0, m6, [winq+4*136]
    mova     [bufq+4*56], m0
    mulps    m6, [winq+4*92]
    mova     [bufq+4*12], m6
    ; rows 13 and 4
    UNSPILL  0, 14
    mulps    m0, [costabs + 16*13]
    mova     m3, [tmpq+4*4]
    addps    m2, m0, m3
    subps    m3, m0
    mulps    m0, m3, [winq+4*52]
    addps    m0, [bufq+4*52]
    mova     [outq+1664], m0
    mulps    m3, [winq+4*16]
    addps    m3, [bufq+4*16]
    mova     [outq+512], m3
    mulps    m0, m2, [winq+4*132]
    mova     [bufq+4*52], m0
    mulps    m2, [winq+4*96]
    mova     [bufq+4*16], m2
    RET
%endmacro

; Instantiate four_imdct36_float for plain SSE and, when the assembler
; supports it (HAVE_AVX_EXTERNAL), for AVX.
INIT_XMM sse
DEFINE_FOUR_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
%endif