yading@10
|
1 ;******************************************************************************
|
yading@10
|
2 ;* 32 point SSE-optimized DCT transform
|
yading@10
|
3 ;* Copyright (c) 2010 Vitor Sessak
|
yading@10
|
4 ;*
|
yading@10
|
5 ;* This file is part of FFmpeg.
|
yading@10
|
6 ;*
|
yading@10
|
7 ;* FFmpeg is free software; you can redistribute it and/or
|
yading@10
|
8 ;* modify it under the terms of the GNU Lesser General Public
|
yading@10
|
9 ;* License as published by the Free Software Foundation; either
|
yading@10
|
10 ;* version 2.1 of the License, or (at your option) any later version.
|
yading@10
|
11 ;*
|
yading@10
|
12 ;* FFmpeg is distributed in the hope that it will be useful,
|
yading@10
|
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
yading@10
|
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
yading@10
|
15 ;* Lesser General Public License for more details.
|
yading@10
|
16 ;*
|
yading@10
|
17 ;* You should have received a copy of the GNU Lesser General Public
|
yading@10
|
18 ;* License along with FFmpeg; if not, write to the Free Software
|
yading@10
|
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
yading@10
|
20 ;******************************************************************************
|
yading@10
|
21
|
yading@10
|
22 %include "libavutil/x86/x86util.asm"
|
yading@10
|
23
|
yading@10
|
SECTION_RODATA 32

; Multiplier table for the butterfly passes.  Every row is four floats
; (16 bytes); the byte offset of each row from ps_cos_vec is given on the
; right together with the pass that reads it.  The AVX code loads 32 bytes
; (two consecutive rows) at a time, the SSE code one row at a time.
align 32
ps_cos_vec: dd   0.500603,   0.505471,   0.515447,   0.531043   ; +0    pass 1
            dd   0.553104,   0.582935,   0.622504,   0.674808   ; +16   pass 1
            dd -10.190008,  -3.407609,  -2.057781,  -1.484165   ; +32   pass 1
            dd  -1.169440,  -0.972568,  -0.839350,  -0.744536   ; +48   pass 1
            dd   0.502419,   0.522499,   0.566944,   0.646822   ; +64   pass 2
            dd   0.788155,   1.060678,   1.722447,   5.101149   ; +80   pass 2
            dd   0.509796,   0.601345,   0.899976,   2.562916   ; +96   pass 3
            dd   0.509796,   0.601345,   0.899976,   2.562916   ; +112  (same row again, upper half of the 32-byte load)
            dd   1.000000,   1.000000,   1.306563,   0.541196   ; +128  pass 4
            dd   1.000000,   1.000000,   1.306563,   0.541196   ; +144  (same row again)
            dd   1.000000,   0.707107,   1.000000,  -0.707107   ; +160  pass 5
            dd   1.000000,   0.707107,   1.000000,  -0.707107   ; +176  (same row again)
            dd   0.707107,   0.707107,   0.707107,   0.707107   ; +192  BUTTERFLY3V
|
yading@10
|
40
|
yading@10
|
; XOR mask with the IEEE sign bit set in lanes 2 and 3 of each 128-bit half.
; XORing a float vector with it flips the sign of those two lanes, i.e. it
; multiplies the vector by (+1, +1, -1, -1) -- hence the name.
align 32
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000    ; low  128 bits
             dd 0, 0, 0x80000000, 0x80000000    ; high 128 bits
|
yading@10
|
43
|
yading@10
|
; BUTTERFLY a, b, coef, tmp
;   tmp = a - b
;   b   = b + a
;   a   = tmp * coef
; a, b and tmp are vector registers; coef is used both as a register and as
; a memory operand by the callers in this file.  Written in the
; three-operand (dst, src1, src2) form.
%macro BUTTERFLY 4
    subps           %4, %1, %2      ; tmp = a - b
    addps           %2, %2, %1      ; b  += a
    mulps           %1, %4, %3      ; a   = tmp * coef
%endmacro
|
yading@10
|
49
|
yading@10
|
; BUTTERFLY0 a, signmask, coef, tmp, shuf_imm
; In-register butterfly: the partner of every lane is another lane of the
; same register, selected by shuf_imm.  Both branches leave
;   a = (shuffle(a, shuf_imm) + (a XOR signmask)) * coef
; and clobber tmp (its final content differs between the two branches).
%macro BUTTERFLY0 5
%if cpuflag(sse2) && notcpuflag(avx)
    ; SSE2 without AVX: pshufd copies and shuffles in one instruction,
    ; the remaining operations are done in place on a.
    pshufd          %4, %1, %5      ; tmp = shuffle(a)
    xorps           %1, %2          ; a  ^= signmask
    addps           %1, %4          ; a  += tmp
    mulps           %1, %3          ; a  *= coef
%else
    ; Plain SSE and AVX: three-operand forms, the sum is built in tmp.
    shufps          %4, %1, %1, %5  ; tmp = shuffle(a)
    xorps           %1, %1, %2      ; a  ^= signmask
    addps           %4, %4, %1      ; tmp += a
    mulps           %1, %4, %3      ; a   = tmp * coef
%endif
%endmacro
|
yading@10
|
63
|
yading@10
|
; BUTTERFLY2 a, signmask, coef, tmp
; BUTTERFLY0 with shuffle immediate 0x1b, which reverses the order of the
; four lanes (lane 0 <-> 3, lane 1 <-> 2).
%macro BUTTERFLY2 4
    BUTTERFLY0      %1, %2, %3, %4, 0x1b
%endmacro
|
yading@10
|
67
|
yading@10
|
; BUTTERFLY3 a, signmask, coef, tmp
; BUTTERFLY0 with shuffle immediate 0xb1, which swaps the lanes of each
; adjacent pair (lane 0 <-> 1, lane 2 <-> 3).
%macro BUTTERFLY3 4
    BUTTERFLY0      %1, %2, %3, %4, 0xb1
%endmacro
|
yading@10
|
71
|
yading@10
|
; BUTTERFLY3V r1, r2, r3, r4, rtmp
; Arguments are register *numbers* (the macro prepends the "m" itself).
; Two butterflies between whole registers, both scaled by the row at
; ps_cos_vec+192 (four times 0.707107):
;   m<r1> = m<r1> + m<r2>        m<r2> = (m<r1> - m<r2>) * 0.707107
;   m<r3> = m<r3> + m<r4>        m<r4> = (m<r4> - m<r3>) * 0.707107
; (right-hand sides use the values on entry).  rtmp is scratch; note that
; the SWAP below exchanges the names r2 and rtmp part-way through.
%macro BUTTERFLY3V 5
    ; first pair
    movaps          m%5, m%1
    addps           m%1, m%2
    subps           m%5, m%2
    SWAP            %2, %5          ; the difference is called m%2 from here on
    mulps           m%2, [ps_cos_vec+192]

    ; second pair
    movaps          m%5, m%3
    addps           m%3, m%4
    subps           m%4, m%5
    mulps           m%4, [ps_cos_vec+192]
%endmacro
|
yading@10
|
83
|
yading@10
|
; PASS6_AND_PERMUTE
; Scalar (movss/addss) final pass, done in place on the 32-float buffer at
; outq.  The additions are interleaved with the moves that put every value
; at its final position, so the exact statement order matters: loads must
; see a slot before the store that recycles it.
;
; In:    the low lanes of m0, m1, m4, m5 and m6 are read before they are
;        written, so they must hold live values on entry; everything else
;        comes from the buffer at outq.
; Out:   every float at outq+4 .. outq+120 is rewritten; the floats at
;        outq+0 and outq+124 are not stored to.
; Clobb: m0-m7, tmpd (tmpd carries values that are moved without
;        arithmetic).
%macro PASS6_AND_PERMUTE 0
    mov             tmpd, [outq+4]
    movss           m7, [outq+72]
    addss           m7, [outq+76]
    movss           m3, [outq+56]
    addss           m3, [outq+60]
    addss           m4, m3
    movss           m2, [outq+52]
    addss           m2, m3
    movss           m3, [outq+104]
    addss           m3, [outq+108]
    addss           m1, m3
    addss           m5, m4
    movss           [outq+16], m1
    movss           m1, [outq+100]
    addss           m1, m3
    movss           m3, [outq+40]
    movss           [outq+48], m1
    addss           m3, [outq+44]
    movss           m1, [outq+100]
    addss           m4, m3
    addss           m3, m2
    addss           m1, [outq+108]
    movss           [outq+40], m3
    addss           m2, [outq+36]
    movss           m3, [outq+8]
    movss           [outq+56], m2
    addss           m3, [outq+12]
    movss           [outq+32], m3
    movss           m3, [outq+80]
    movss           [outq+8], m5
    movss           [outq+80], m1
    movss           m2, [outq+52]
    movss           m5, [outq+120]
    addss           m5, [outq+124]
    movss           m1, [outq+64]
    addss           m2, [outq+60]
    addss           m0, m5
    addss           m5, [outq+116]
    mov             [outq+64], tmpd
    addss           m6, m0
    addss           m1, m6
    mov             tmpd, [outq+12]
    mov             [outq+96], tmpd
    movss           [outq+4], m1
    movss           m1, [outq+24]
    movss           [outq+24], m4
    movss           m4, [outq+88]
    addss           m4, [outq+92]
    addss           m3, m4
    addss           m4, [outq+84]
    mov             tmpd, [outq+108]
    addss           m1, [outq+28]
    addss           m0, m1
    addss           m1, m5
    addss           m6, m3
    addss           m3, m0
    addss           m0, m7
    addss           m5, [outq+20]
    addss           m7, m1
    movss           [outq+12], m6
    mov             [outq+112], tmpd
    movss           m6, [outq+28]
    movss           [outq+28], m0
    movss           m0, [outq+36]
    movss           [outq+36], m7
    addss           m1, m4
    movss           m7, [outq+116]
    addss           m0, m2
    addss           m7, [outq+124]
    movss           [outq+72], m0
    movss           m0, [outq+44]
    addss           m2, m0
    movss           [outq+44], m1
    movss           [outq+88], m2
    addss           m0, [outq+60]
    mov             tmpd, [outq+60]
    mov             [outq+120], tmpd
    movss           [outq+104], m0
    addss           m4, m5
    addss           m5, [outq+68]
    movss           [outq+52], m4
    movss           [outq+60], m5
    movss           m4, [outq+68]
    movss           m5, [outq+20]
    movss           [outq+20], m3
    addss           m5, m7
    addss           m7, m6
    addss           m4, m5
    movss           m2, [outq+84]
    addss           m2, [outq+92]
    addss           m5, m2
    movss           [outq+68], m4
    addss           m2, m7
    movss           m4, [outq+76]
    movss           [outq+84], m2
    movss           [outq+76], m5
    addss           m7, m4
    addss           m6, [outq+124]
    addss           m4, m6
    addss           m6, [outq+92]
    movss           [outq+100], m4
    movss           [outq+108], m6
    movss           m6, [outq+92]
    movss           [outq+92], m7
    addss           m6, [outq+124]
    movss           [outq+116], m6
%endmacro
|
yading@10
|
192
|
yading@10
|
INIT_YMM avx
SECTION_TEXT
%if HAVE_AVX_EXTERNAL
;-----------------------------------------------------------------------------
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
;
; AVX build of the 32-point float DCT: passes 1-5 run on 256-bit registers,
; pass 6 is the scalar PASS6_AND_PERMUTE on the low 128 bits.
; in and out are accessed with aligned 32-byte loads/stores (vmovaps), so
; both must be 32-byte aligned.
; cglobal arguments: 2 parameters, 3 GPRs (out, in, tmp), 8 vector registers.
;-----------------------------------------------------------------------------
cglobal dct32_float, 2,3,8, out, in, tmp
    ; pass 1
    ; m5 is assembled from in+96 (upper half) and in+112 (lower half); the
    ; per-half lane reversal (0x1b) then makes it in[31], in[30], ... in[24].
    vmovaps         m4, [inq+0]
    vinsertf128     m5, m5, [inq+96], 1
    vinsertf128     m5, m5, [inq+112], 0
    vshufps         m5, m5, m5, 0x1b
    BUTTERFLY       m4, m5, [ps_cos_vec], m6

    ; same construction for in[16..23] against in[15], in[14], ... in[8]
    vmovaps         m2, [inq+64]
    vinsertf128     m6, m6, [inq+32], 1
    vinsertf128     m6, m6, [inq+48], 0
    vshufps         m6, m6, m6, 0x1b
    BUTTERFLY       m2, m6, [ps_cos_vec+32], m0

    ; pass 2
    BUTTERFLY       m5, m6, [ps_cos_vec+64], m0
    BUTTERFLY       m4, m2, [ps_cos_vec+64], m7

    ; pass 3
    ; vperm2f128 regroups 128-bit halves: 0x31 takes the upper halves of the
    ; two sources, 0x20 the lower halves.
    vperm2f128      m3, m6, m4, 0x31
    vperm2f128      m1, m6, m4, 0x20
    vshufps         m3, m3, m3, 0x1b

    BUTTERFLY       m1, m3, [ps_cos_vec+96], m6

    vperm2f128      m4, m5, m2, 0x20
    vperm2f128      m5, m5, m2, 0x31
    vshufps         m5, m5, m5, 0x1b

    BUTTERFLY       m4, m5, [ps_cos_vec+96], m6

    ; pass 4
    vmovaps         m6, [ps_p1p1m1m1+0]
    vmovaps         m2, [ps_cos_vec+128]

    BUTTERFLY2      m5, m6, m2, m7
    BUTTERFLY2      m4, m6, m2, m7
    BUTTERFLY2      m1, m6, m2, m7
    BUTTERFLY2      m3, m6, m2, m7

    ; pass 5
    ; 0xcc picks lanes 0,3,0,3, turning the (+,+,-,-) sign mask into (+,-,+,-)
    vshufps         m6, m6, m6, 0xcc
    vmovaps         m2, [ps_cos_vec+160]

    BUTTERFLY3      m5, m6, m2, m7
    BUTTERFLY3      m4, m6, m2, m7
    BUTTERFLY3      m1, m6, m2, m7
    BUTTERFLY3      m3, m6, m2, m7

    ; Store the pass-5 results for the scalar pass.  The upper halves of m3
    ; and m1 are also copied down into m6 and m0 so that they remain
    ; reachable as xmm registers after the switch to INIT_XMM below.
    vperm2f128      m6, m3, m3, 0x31
    vmovaps         [outq], m3

    vextractf128    [outq+64], m5, 1
    vextractf128    [outq+32], m5, 0

    vextractf128    [outq+80], m4, 1
    vextractf128    [outq+48], m4, 0

    vperm2f128      m0, m1, m1, 0x31
    vmovaps         [outq+96], m1

    vzeroupper

    ; pass 6, no SIMD...
INIT_XMM
    PASS6_AND_PERMUTE
    RET
%endif
|
yading@10
|
269
|
yading@10
|
270 %if ARCH_X86_64
|
yading@10
|
; x86_64 branch: SPILL/UNSPILL are plain SWAPs, i.e. "spilling" register a
; to slot b just exchanges the two register names instead of touching memory.
%define SPILL   SWAP
%define UNSPILL SWAP
|
yading@10
|
273
|
yading@10
|
; PASS5 (x86_64 variant)
; Renames the pass-4 results into m8-m15, transposes them in two groups of
; four registers and runs the pass-5 butterflies between whole registers
; (BUTTERFLY3V), followed by a few vector additions.  m0 is the scratch
; register for both TRANSPOSE4x4PS and BUTTERFLY3V.
%macro PASS5 0
    nop     ; FIXME code alignment
    ; SWAP / PERMUTE only change which name refers to which register
    SWAP            5, 8
    SWAP            4, 12
    SWAP            6, 14
    SWAP            7, 13
    SWAP            0, 15
    PERMUTE         9,10, 10,12, 11,14, 12,9, 13,11, 14,13

    ; registers 8-11
    TRANSPOSE4x4PS  8, 9, 10, 11, 0
    BUTTERFLY3V     8, 9, 10, 11, 0
    addps           m10, m11

    ; registers 12-15
    TRANSPOSE4x4PS  12, 13, 14, 15, 0
    BUTTERFLY3V     12, 13, 14, 15, 0
    addps           m14, m15
    addps           m12, m14
    addps           m14, m13
    addps           m13, m15
%endmacro
|
yading@10
|
292
|
yading@10
|
; PASS6 (x86_64 variant)
; Final pass working from m8-m15 as left by the x86_64 PASS5.  Stores are
; 4-byte movss except for the single movaps noted below.
;   - lane 0 of each of m8..m15 goes to outq+0x00, 0x10, ... 0x70;
;   - sums of neighbouring lane-1 values go to outq+0x08, 0x18, ... 0x68 and
;     the last lane-1 value to outq+0x78;
;   - the remaining slots outq+4, 12, ... 116 are produced by the two %rep
;     loops at the end.
; Clobbers m0-m15.
%macro PASS6 0
    SWAP            9, 12
    SWAP            11, 14

    ; pshuflw with 0xe moves float lane 1 of the source into lane 0 of the
    ; destination, so m0..m7 receive lane 1 of m8..m15.
    movss           [outq+0x00], m8
    pshuflw         m0, m8, 0xe
    movss           [outq+0x10], m9
    pshuflw         m1, m9, 0xe
    movss           [outq+0x20], m10
    pshuflw         m2, m10, 0xe
    movss           [outq+0x30], m11
    pshuflw         m3, m11, 0xe
    movss           [outq+0x40], m12
    pshuflw         m4, m12, 0xe
    movss           [outq+0x50], m13
    pshuflw         m5, m13, 0xe
    movss           [outq+0x60], m14
    pshuflw         m6, m14, 0xe
    ; Full 16-byte store on purpose: outq+0x7c is never written again, so it
    ; keeps lane 3 of m15 from here.  outq+0x78 is overwritten a few lines
    ; below and outq+0x74 by the last iteration of the final loop.
    movaps          [outq+0x70], m15
    pshuflw         m7, m15, 0xe

    addss           m0, m1
    addss           m1, m2
    movss           [outq+0x08], m0
    addss           m2, m3
    movss           [outq+0x18], m1
    addss           m3, m4
    movss           [outq+0x28], m2
    addss           m4, m5
    movss           [outq+0x38], m3
    addss           m5, m6
    movss           [outq+0x48], m4
    addss           m6, m7
    movss           [outq+0x58], m5
    movss           [outq+0x68], m6
    movss           [outq+0x78], m7

    ; Per register: movhlps copies the upper two floats into the low half of
    ; m0, pshufd with 3 brings lane 3 down to lane 0 of m1.  The SWAP pairs
    ; rotate the register names so that the same two instructions process
    ; the next register on the following round.
    PERMUTE         1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
    movhlps         m0, m1
    pshufd          m1, m1, 3
    SWAP            0, 2, 4, 6, 8, 10, 12, 14
    SWAP            1, 3, 5, 7, 9, 11, 13, 15
%rep 7
    movhlps         m0, m1
    pshufd          m1, m1, 3
    addss           m15, m1
    SWAP            0, 2, 4, 6, 8, 10, 12, 14
    SWAP            1, 3, 5, 7, 9, 11, 13, 15
%endrep

    ; Sum each register with its successor and store the result to
    ; outq+4, outq+12, ... outq+116 (15 stores, i advances by 8 bytes).
%assign i 4
%rep 15
    addss           m0, m1
    movss           [outq+i], m0
    SWAP            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    %assign i i+8
%endrep
%endmacro
|
yading@10
|
348
|
yading@10
|
349 %else ; ARCH_X86_32
|
yading@10
|
; SPILL reg, slot   (x86_32 variant)
; Saves m<reg> into the output buffer, which doubles as scratch space here.
; Slots are numbered 8..15 and map to outq+0, outq+16, ... outq+112.
%macro SPILL 2 ; xmm#, mempos
    movaps          [outq+(%2-8)*16], m%1
%endmacro
|
yading@10
|
; UNSPILL reg, slot   (x86_32 variant)
; Reloads m<reg> from slot 8..15 of the output buffer, i.e. from
; outq+(slot-8)*16 -- the same address mapping that SPILL stores to.
%macro UNSPILL 2 ; xmm#, mempos
    movaps          m%1, [outq+(%2-8)*16]
%endmacro
|
yading@10
|
356
|
yading@10
|
; x86_32 branch: the last pass is the scalar PASS6_AND_PERMUTE macro.
%define PASS6 PASS6_AND_PERMUTE
|
yading@10
|
; PASS5 (x86_32 variant)
; Eight BUTTERFLY3 steps, one per intermediate vector.  Only m0-m7 are
; used, so vectors are reloaded from / written back to the output buffer
; with UNSPILL / SPILL around each step.
; Expects m3 to hold the pass-4 sign mask on entry; the shuffle with 0xcc
; (lanes 0,3,0,3) rearranges it in place.  m2 is loaded with the
; coefficient row at ps_cos_vec+160.
%macro PASS5 0
    movaps          m2, [ps_cos_vec+160]
    shufps          m3, m3, 0xcc

    BUTTERFLY3      m5, m3, m2, m1
    SPILL           5, 8

    UNSPILL         1, 9
    BUTTERFLY3      m1, m3, m2, m5
    SPILL           1, 14

    BUTTERFLY3      m4, m3, m2, m5
    SPILL           4, 12

    BUTTERFLY3      m7, m3, m2, m5
    SPILL           7, 13

    UNSPILL         5, 10
    BUTTERFLY3      m5, m3, m2, m7
    SPILL           5, 10

    UNSPILL         4, 11
    BUTTERFLY3      m4, m3, m2, m7
    SPILL           4, 11

    BUTTERFLY3      m6, m3, m2, m7
    SPILL           6, 9

    BUTTERFLY3      m0, m3, m2, m7
    SPILL           0, 15
%endmacro
|
yading@10
|
389 %endif
|
yading@10
|
390
|
yading@10
|
391
|
yading@10
|
;-----------------------------------------------------------------------------
; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
;
; DCT32_FUNC expands to one dct32_float function for the instruction set
; selected by the preceding INIT_XMM.  Passes 1-4 are shared; PASS5 and
; PASS6 are the architecture-specific macros.
; in and out are accessed with movaps and therefore must be 16-byte aligned.
; SPILL/UNSPILL slot numbers 8-15 are register renames on x86_64 and
; 16-byte slots inside the out buffer on x86_32.
;-----------------------------------------------------------------------------
%macro DCT32_FUNC 0
cglobal dct32_float, 2, 3, 16, out, in, tmp
    ; pass 1 -- LOAD_INV fetches four floats in reversed lane order
    movaps          m0, [inq+0]
    LOAD_INV        m1, [inq+112]
    BUTTERFLY       m0, m1, [ps_cos_vec], m3

    movaps          m7, [inq+64]
    LOAD_INV        m4, [inq+48]
    BUTTERFLY       m7, m4, [ps_cos_vec+32], m3

    ; pass 2
    movaps          m2, [ps_cos_vec+64]
    BUTTERFLY       m1, m4, m2, m3
    SPILL           1, 11
    SPILL           4, 8

    ; pass 1 (remaining input vectors)
    movaps          m1, [inq+16]
    LOAD_INV        m6, [inq+96]
    BUTTERFLY       m1, m6, [ps_cos_vec+16], m3

    movaps          m4, [inq+80]
    LOAD_INV        m5, [inq+32]
    BUTTERFLY       m4, m5, [ps_cos_vec+48], m3

    ; pass 2 (continued; m2 still holds the row at ps_cos_vec+64)
    BUTTERFLY       m0, m7, m2, m3

    movaps          m2, [ps_cos_vec+80]
    BUTTERFLY       m6, m5, m2, m3

    BUTTERFLY       m1, m4, m2, m3

    ; pass 3 -- the second operand of each butterfly is lane-reversed first
    movaps          m2, [ps_cos_vec+96]
    shufps          m1, m1, 0x1b
    BUTTERFLY       m0, m1, m2, m3
    SPILL           0, 15
    SPILL           1, 14

    UNSPILL         0, 8
    shufps          m5, m5, 0x1b
    BUTTERFLY       m0, m5, m2, m3

    UNSPILL         1, 11
    shufps          m6, m6, 0x1b
    BUTTERFLY       m1, m6, m2, m3
    SPILL           1, 11

    shufps          m4, m4, 0x1b
    BUTTERFLY       m7, m4, m2, m3

    ; pass 4 -- m3 = sign mask, m2 = coefficients, m1 = scratch
    movaps          m3, [ps_p1p1m1m1+0]
    movaps          m2, [ps_cos_vec+128]

    BUTTERFLY2      m5, m3, m2, m1

    BUTTERFLY2      m0, m3, m2, m1
    SPILL           0, 9

    BUTTERFLY2      m6, m3, m2, m1
    SPILL           6, 10

    UNSPILL         0, 11
    BUTTERFLY2      m0, m3, m2, m1
    SPILL           0, 11

    BUTTERFLY2      m4, m3, m2, m1

    BUTTERFLY2      m7, m3, m2, m1

    UNSPILL         6, 14
    BUTTERFLY2      m6, m3, m2, m1

    UNSPILL         0, 15
    BUTTERFLY2      m0, m3, m2, m1

    PASS5
    PASS6
    RET
%endmacro
|
yading@10
|
477
|
yading@10
|
; LOAD_INV dst, src
; Loads the four floats of src into dst in reversed lane order (shuffle
; immediate 0x1b).  With SSE2 this is a single pshufd; with plain SSE it is
; a movaps followed by an in-register shufps.
%macro LOAD_INV 2
%if cpuflag(sse2)
    pshufd          %1, %2, 0x1b
%elif cpuflag(sse)
    movaps          %1, %2
    shufps          %1, %1, 0x1b
%endif
%endmacro
|
yading@10
|
486
|
yading@10
|
; Instantiate the function body twice: once with the SSE flag set and once
; with SSE2 (the flag selects the branches taken in BUTTERFLY0 and LOAD_INV).
INIT_XMM sse
DCT32_FUNC

INIT_XMM sse2
DCT32_FUNC
|