;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_TEXT

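; Pick the float->int32 conversion available on the active instruction set:
; cvtps2pi on SSE (rounds according to MXCSR) or pf2id on 3DNow! (which
; truncates), so the two paths are not guaranteed to be bit-identical.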
%macro CVTPS2PI 2
%if cpuflag(sse)
    cvtps2pi %1, %2
%elif cpuflag(3dnow)
    pf2id    %1, %2
%endif
%endmacro

;---------------------------------------------------------------------------------
; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
;---------------------------------------------------------------------------------
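; A minimal C sketch of what the kernel below computes (illustrative only;
; it assumes len is a multiple of 8, since each iteration converts 32 bytes,
; and that dst/src are 16-byte aligned as mova requires):
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] * mul;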
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
%if UNIX64
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
%else
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
%endif
%if WIN64
    SWAP 0, 2
%elif ARCH_X86_32
    movss   m0, mulm
%endif
    SPLATD  m0
    shl     lenq, 2
    add     srcq, lenq
    add     dstq, lenq
    neg     lenq
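; srcq/dstq now point one past the end of their buffers and lenq runs from
; -4*len up to 0, so a single add per iteration advances both streams and
; sets the flags that terminate the loop.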
.loop:
%if cpuflag(sse2)
    cvtdq2ps  m1, [srcq+lenq   ]
    cvtdq2ps  m2, [srcq+lenq+16]
%else
    cvtpi2ps  m1, [srcq+lenq   ]
    cvtpi2ps  m3, [srcq+lenq+ 8]
    cvtpi2ps  m2, [srcq+lenq+16]
    cvtpi2ps  m4, [srcq+lenq+24]
    movlhps   m1, m3
    movlhps   m2, m4
%endif
    mulps     m1, m0
    mulps     m2, m0
    mova      [dstq+lenq   ], m1
    mova      [dstq+lenq+16], m2
    add       lenq, 32
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3


;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------
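; Rough C equivalent of the loop below (a sketch, not the reference code):
; each float is converted to int32 and saturated to int16 by packssdw, 8
; samples per iteration, so len is assumed to be a multiple of 8. Rounding
; follows the conversion instruction (cvtps2dq/cvtps2pi use MXCSR, pf2id
; truncates):
;     for (i = 0; i < len; i++)
;         dst[i] = av_clip_int16(lrintf(src[i]));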
%macro FLOAT_TO_INT16 1
cglobal float_to_int16, 3, 3, %1, dst, src, len
    add       lenq, lenq
    lea       srcq, [srcq+2*lenq]
    add       dstq, lenq
    neg       lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq  m0, [srcq+2*lenq   ]
    cvtps2dq  m1, [srcq+2*lenq+16]
    packssdw  m0, m1
    mova      [dstq+lenq], m0
%else
    CVTPS2PI  m0, [srcq+2*lenq   ]
    CVTPS2PI  m1, [srcq+2*lenq+ 8]
    CVTPS2PI  m2, [srcq+2*lenq+16]
    CVTPS2PI  m3, [srcq+2*lenq+24]
    packssdw  m0, m1
    packssdw  m2, m3
    mova      [dstq+lenq  ], m0
    mova      [dstq+lenq+8], m2
%endif
    add       lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_XMM sse2
FLOAT_TO_INT16 2
INIT_MMX sse
FLOAT_TO_INT16 0
INIT_MMX 3dnow
FLOAT_TO_INT16 0

;------------------------------------------------------------------------------
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
;------------------------------------------------------------------------------
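; Sketch of the strided variant (illustrative only): the same conversion as
; above, but consecutive samples land `step` int16 slots apart instead of
; contiguously:
;     for (i = 0; i < len; i++)
;         dst[i*step] = av_clip_int16(lrintf(src[i]));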
%macro FLOAT_TO_INT16_STEP 1
cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
    add       lenq, lenq
    lea       srcq, [srcq+2*lenq]
    lea       step3q, [stepq*3]
    neg       lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq  m0, [srcq+2*lenq   ]
    cvtps2dq  m1, [srcq+2*lenq+16]
    packssdw  m0, m1
    movd      v1d, m0
    psrldq    m0, 4
    movd      v2d, m0
    psrldq    m0, 4
    mov       [dstq], v1w
    mov       [dstq+stepq*4], v2w
    shr       v1d, 16
    shr       v2d, 16
    mov       [dstq+stepq*2], v1w
    mov       [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
    movd      v1d, m0
    psrldq    m0, 4
    movd      v2d, m0
    mov       [dstq], v1w
    mov       [dstq+stepq*4], v2w
    shr       v1d, 16
    shr       v2d, 16
    mov       [dstq+stepq*2], v1w
    mov       [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
%else
    CVTPS2PI  m0, [srcq+2*lenq   ]
    CVTPS2PI  m1, [srcq+2*lenq+ 8]
    CVTPS2PI  m2, [srcq+2*lenq+16]
    CVTPS2PI  m3, [srcq+2*lenq+24]
    packssdw  m0, m1
    packssdw  m2, m3
    movd      v1d, m0
    psrlq     m0, 32
    movd      v2d, m0
    mov       [dstq], v1w
    mov       [dstq+stepq*4], v2w
    shr       v1d, 16
    shr       v2d, 16
    mov       [dstq+stepq*2], v1w
    mov       [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
    movd      v1d, m2
    psrlq     m2, 32
    movd      v2d, m2
    mov       [dstq], v1w
    mov       [dstq+stepq*4], v2w
    shr       v1d, 16
    shr       v2d, 16
    mov       [dstq+stepq*2], v1w
    mov       [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
%endif
    add       lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_XMM sse2
FLOAT_TO_INT16_STEP 2
INIT_MMX sse
FLOAT_TO_INT16_STEP 0
INIT_MMX 3dnow
FLOAT_TO_INT16_STEP 0

;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
;-------------------------------------------------------------------------------
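; Sketch (illustrative, assuming len is a multiple of the unroll width): the
; two planar inputs are converted and zipped into one interleaved int16
; stream:
;     for (i = 0; i < len; i++) {
;         dst[2*i+0] = av_clip_int16(lrintf(src[0][i]));
;         dst[2*i+1] = av_clip_int16(lrintf(src[1][i]));
;     }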
%macro FLOAT_TO_INT16_INTERLEAVE2 0
cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
    lea       lenq, [4*r2q]
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q]
    add       dstq, lenq
    add       src0q, lenq
    add       src1q, lenq
    neg       lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq  m0, [src0q+lenq]
    cvtps2dq  m1, [src1q+lenq]
    packssdw  m0, m1
    movhlps   m1, m0
    punpcklwd m0, m1
    mova      [dstq+lenq], m0
%else
    CVTPS2PI  m0, [src0q+lenq  ]
    CVTPS2PI  m1, [src0q+lenq+8]
    CVTPS2PI  m2, [src1q+lenq  ]
    CVTPS2PI  m3, [src1q+lenq+8]
    packssdw  m0, m1
    packssdw  m2, m3
    mova      m1, m0
    punpcklwd m0, m2
    punpckhwd m1, m2
    mova      [dstq+lenq  ], m0
    mova      [dstq+lenq+8], m1
%endif
    add       lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE2
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE2
INIT_XMM sse2
FLOAT_TO_INT16_INTERLEAVE2

%macro FLOAT_TO_INT16_INTERLEAVE6 0
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
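; Sketch (illustrative): two frames of six channels are converted and
; interleaved per iteration, so len is assumed to be even:
;     for (i = 0; i < len; i++)
;         for (c = 0; c < 6; c++)
;             dst[6*i+c] = av_clip_int16(lrintf(src[c][i]));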
cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov     src1q, [srcq+1*gprsize]
    mov     src2q, [srcq+2*gprsize]
    mov     src3q, [srcq+3*gprsize]
    mov     src4q, [srcq+4*gprsize]
    mov     src5q, [srcq+5*gprsize]
    mov     srcq,  [srcq]
    sub     src1q, srcq
    sub     src2q, srcq
    sub     src3q, srcq
    sub     src4q, srcq
    sub     src5q, srcq
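; The five secondary channel pointers are kept as byte offsets from srcq,
; so incrementing srcq alone advances all six input streams.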
.loop:
    CVTPS2PI  mm0, [srcq]
    CVTPS2PI  mm1, [srcq+src1q]
    CVTPS2PI  mm2, [srcq+src2q]
    CVTPS2PI  mm3, [srcq+src3q]
    CVTPS2PI  mm4, [srcq+src4q]
    CVTPS2PI  mm5, [srcq+src5q]
    packssdw  mm0, mm3
    packssdw  mm1, mm4
    packssdw  mm2, mm5
    PSWAPD    mm3, mm0
    punpcklwd mm0, mm1
    punpckhwd mm1, mm2
    punpcklwd mm2, mm3
    PSWAPD    mm3, mm0
    punpckldq mm0, mm2
    punpckhdq mm2, mm1
    punpckldq mm1, mm3
    movq      [dstq   ], mm0
    movq      [dstq+16], mm2
    movq      [dstq+ 8], mm1
    add       srcq, 8
    add       dstq, 24
    sub       lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6

INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnowext
FLOAT_TO_INT16_INTERLEAVE6

;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
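; Sketch (illustrative): plain 6-channel interleave with no conversion,
; mmsize/4 frames per iteration:
;     for (i = 0; i < len; i++)
;         for (c = 0; c < 6; c++)
;             dst[6*i+c] = src[c][i];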

%macro FLOAT_INTERLEAVE6 1
cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov     src1q, [srcq+1*gprsize]
    mov     src2q, [srcq+2*gprsize]
    mov     src3q, [srcq+3*gprsize]
    mov     src4q, [srcq+4*gprsize]
    mov     src5q, [srcq+5*gprsize]
    mov     srcq,  [srcq]
    sub     src1q, srcq
    sub     src2q, srcq
    sub     src3q, srcq
    sub     src4q, srcq
    sub     src5q, srcq
.loop:
%if cpuflag(sse)
    movaps    m0, [srcq]
    movaps    m1, [srcq+src1q]
    movaps    m2, [srcq+src2q]
    movaps    m3, [srcq+src3q]
    movaps    m4, [srcq+src4q]
    movaps    m5, [srcq+src5q]

    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    movaps    m6, m4
    shufps    m4, m0, 0xe4
    movlhps   m0, m2
    movhlps   m6, m2
    movaps    [dstq   ], m0
    movaps    [dstq+16], m4
    movaps    [dstq+32], m6

    movaps    m6, m5
    shufps    m5, m1, 0xe4
    movlhps   m1, m3
    movhlps   m6, m3
    movaps    [dstq+48], m1
    movaps    [dstq+64], m5
    movaps    [dstq+80], m6
%else ; mmx
    movq      m0, [srcq]
    movq      m1, [srcq+src1q]
    movq      m2, [srcq+src2q]
    movq      m3, [srcq+src3q]
    movq      m4, [srcq+src4q]
    movq      m5, [srcq+src5q]

    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
    movq      [dstq   ], m0
    movq      [dstq+ 8], m2
    movq      [dstq+16], m4
    movq      [dstq+24], m1
    movq      [dstq+32], m3
    movq      [dstq+40], m5
%endif
    add       srcq, mmsize
    add       dstq, mmsize*6
    sub       lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX mmx
FLOAT_INTERLEAVE6 0
INIT_XMM sse
FLOAT_INTERLEAVE6 7

;-----------------------------------------------------------------------------
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
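; Sketch (illustrative): plain 2-channel interleave:
;     for (i = 0; i < len; i++) {
;         dst[2*i+0] = src[0][i];
;         dst[2*i+1] = src[1][i];
;     }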

%macro FLOAT_INTERLEAVE2 1
cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
    mov       src1q, [srcq+gprsize]
    mov       srcq,  [srcq        ]
    sub       src1q, srcq
.loop:
    mova      m0, [srcq            ]
    mova      m1, [srcq+src1q      ]
    mova      m3, [srcq      +mmsize]
    mova      m4, [srcq+src1q+mmsize]

    mova      m2, m0
    PUNPCKLDQ m0, m1
    PUNPCKHDQ m2, m1

    mova      m1, m3
    PUNPCKLDQ m3, m4
    PUNPCKHDQ m1, m4

    mova      [dstq         ], m0
    mova      [dstq+1*mmsize], m2
    mova      [dstq+2*mmsize], m3
    mova      [dstq+3*mmsize], m1

    add       srcq, mmsize*2
    add       dstq, mmsize*4
    sub       lend, mmsize/2
    jg .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX mmx
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 0
INIT_XMM sse
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 5