;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "util.asm"

SECTION_RODATA 32

; Scale factors as IEEE-754 single-precision bit patterns:
;   0x30000000 = 2^-31  (s32 -> [-1.0,1.0) float)
;   0x4f000000 = 2^31   (float -> s32 range)
;   0x4effffff = largest float strictly below 2^31, used to clip before
;                cvtps2dq so +1.0 does not overflow to INT32_MIN
;   0x38000000 = 2^-15  (s16 -> float)
;   0x47000000 = 2^15   (float -> s16 range)
pf_s32_inv_scale: times 8 dd 0x30000000
pf_s32_scale:     times 8 dd 0x4f000000
pf_s32_clip:      times 8 dd 0x4effffff
pf_s16_inv_scale: times 4 dd 0x38000000
pf_s16_scale:     times 4 dd 0x47000000
; pshufb masks: -1 selects a zero byte, other values are source byte indices
pb_shuf_unpack_even:   db -1, -1,  0,  1, -1, -1,  2,  3, -1, -1,  8,  9, -1, -1, 10, 11
pb_shuf_unpack_odd:    db -1, -1,  4,  5, -1, -1,  6,  7, -1, -1, 12, 13, -1, -1, 14, 15
pb_interleave_words:   SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7
pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7
pw_zero_even: times 4 dw 0x0000, 0xffff
SECTION_TEXT

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
;
; Widens s16 samples to full-scale s32 by placing each 16-bit sample in the
; high word of a zeroed dword (i.e. value << 16).  len is in samples; the
; pointers are advanced to the end and lenq counts up from negative to zero,
; so the loop needs no separate counter register.
;------------------------------------------------------------------------------

INIT_XMM sse2
cglobal conv_s16_to_s32, 3,3,3, dst, src, len
    lea      lenq, [2*lend]             ; lenq = byte length of src
    lea      dstq, [dstq+2*lenq]        ; point both pointers at the end
    add      srcq, lenq
    neg      lenq                       ; negative offset counts up to 0
.loop:
    mova       m2, [srcq+lenq]
    pxor       m0, m0
    pxor       m1, m1
    punpcklwd  m0, m2                   ; zero in low word -> sample << 16
    punpckhwd  m1, m2
    mova  [dstq+2*lenq       ], m0
    mova  [dstq+2*lenq+mmsize], m1
    add      lenq, mmsize
    jl .loop
    REP_RET
yading@11
|
62
|
;------------------------------------------------------------------------------
; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
;
; Sign-extends s16 to s32 (S16_TO_S32_SX, from util.asm), converts to float
; and multiplies by 2^-15 so full-scale input maps to [-1.0,1.0].
;------------------------------------------------------------------------------

%macro CONV_S16_TO_FLT 0
cglobal conv_s16_to_flt, 3,3,3, dst, src, len
    lea      lenq, [2*lend]             ; byte length of src
    add      srcq, lenq
    lea      dstq, [dstq + 2*lenq]      ; dst is 4 bytes/sample = 2*src bytes
    neg      lenq
    mova       m2, [pf_s16_inv_scale]   ; 2^-15 broadcast
    ALIGN 16
.loop:
    mova       m0, [srcq+lenq]
    S16_TO_S32_SX 0, 1                  ; m0/m1 = sign-extended dwords
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    mulps      m0, m2
    mulps      m1, m2
    mova  [dstq+2*lenq       ], m0
    mova  [dstq+2*lenq+mmsize], m1
    add      lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLT
INIT_XMM sse4
CONV_S16_TO_FLT
yading@11
|
93
|
;------------------------------------------------------------------------------
; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
;
; Narrows s32 samples to s16 by arithmetic shift right 16 and signed packing.
; Built for both MMX (mmsize == 8, needs emms) and SSE2.
;------------------------------------------------------------------------------

%macro CONV_S32_TO_S16 0
cglobal conv_s32_to_s16, 3,3,4, dst, src, len
    lea      lenq, [2*lend]             ; byte length of dst
    lea      srcq, [srcq+2*lenq]        ; src is twice as wide as dst
    add      dstq, lenq
    neg      lenq
.loop:
    mova       m0, [srcq+2*lenq         ]
    mova       m1, [srcq+2*lenq+  mmsize]
    mova       m2, [srcq+2*lenq+2*mmsize]
    mova       m3, [srcq+2*lenq+3*mmsize]
    psrad      m0, 16                   ; keep the 16 MSBs of each sample
    psrad      m1, 16
    psrad      m2, 16
    psrad      m3, 16
    packssdw   m0, m1                   ; values already fit; pack to words
    packssdw   m2, m3
    mova  [dstq+lenq       ], m0
    mova  [dstq+lenq+mmsize], m2
    add      lenq, mmsize*2
    jl .loop
%if mmsize == 8
    emms                                ; MMX: restore x87 state before return
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_S32_TO_S16
INIT_XMM sse2
CONV_S32_TO_S16
yading@11
|
131
|
;------------------------------------------------------------------------------
; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
;
; Converts s32 samples to float scaled by 2^-31 into [-1.0,1.0].
; Same element size in and out, so one offset register serves both.
;------------------------------------------------------------------------------

%macro CONV_S32_TO_FLT 0
cglobal conv_s32_to_flt, 3,3,3, dst, src, len
    lea      lenq, [4*lend]             ; byte length
    add      srcq, lenq
    add      dstq, lenq
    neg      lenq
    mova       m0, [pf_s32_inv_scale]   ; 2^-31 broadcast
    ALIGN 16
.loop:
    cvtdq2ps   m1, [srcq+lenq       ]
    cvtdq2ps   m2, [srcq+lenq+mmsize]
    mulps      m1, m1, m0               ; 3-operand form: AVX-compatible
    mulps      m2, m2, m0
    mova  [dstq+lenq       ], m1
    mova  [dstq+lenq+mmsize], m2
    add      lenq, mmsize*2
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S32_TO_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_S32_TO_FLT
%endif
yading@11
|
162
|
;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
;
; Scales floats by 2^15 and converts to s16.  cvtps2dq saturates out-of-range
; values to INT32_MIN/MAX and packssdw then saturates to the s16 range, so no
; explicit clip constant is needed here (unlike the flt->s32 path).
;------------------------------------------------------------------------------

INIT_XMM sse2
cglobal conv_flt_to_s16, 3,3,5, dst, src, len
    lea      lenq, [2*lend]             ; byte length of dst
    lea      srcq, [srcq+2*lenq]        ; src is twice as wide as dst
    add      dstq, lenq
    neg      lenq
    mova       m4, [pf_s16_scale]       ; 2^15 broadcast
.loop:
    mova       m0, [srcq+2*lenq         ]
    mova       m1, [srcq+2*lenq+1*mmsize]
    mova       m2, [srcq+2*lenq+2*mmsize]
    mova       m3, [srcq+2*lenq+3*mmsize]
    mulps      m0, m4
    mulps      m1, m4
    mulps      m2, m4
    mulps      m3, m4
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    packssdw   m0, m1                   ; saturating pack to s16
    packssdw   m2, m3
    mova  [dstq+lenq       ], m0
    mova  [dstq+lenq+mmsize], m2
    add      lenq, mmsize*2
    jl .loop
    REP_RET
yading@11
|
194
|
;------------------------------------------------------------------------------
; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
;
; Scales floats by 2^31 and converts to s32.  The product is clipped against
; pf_s32_clip (largest float < 2^31) first, because +1.0 * 2^31 would wrap to
; INT32_MIN under cvtps2dq.
;------------------------------------------------------------------------------

%macro CONV_FLT_TO_S32 0
cglobal conv_flt_to_s32, 3,3,6, dst, src, len
    lea      lenq, [lend*4]             ; byte length
    add      srcq, lenq
    add      dstq, lenq
    neg      lenq
    mova       m4, [pf_s32_scale]       ; 2^31 broadcast
    mova       m5, [pf_s32_clip]        ; just below 2^31
.loop:
    mulps      m0, m4, [srcq+lenq         ]
    mulps      m1, m4, [srcq+lenq+1*mmsize]
    mulps      m2, m4, [srcq+lenq+2*mmsize]
    mulps      m3, m4, [srcq+lenq+3*mmsize]
    minps      m0, m0, m5               ; clip positive overflow
    minps      m1, m1, m5
    minps      m2, m2, m5
    minps      m3, m3, m5
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    mova  [dstq+lenq         ], m0
    mova  [dstq+lenq+1*mmsize], m1
    mova  [dstq+lenq+2*mmsize], m2
    mova  [dstq+lenq+3*mmsize], m3
    add      lenq, mmsize*4
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S32
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_FLT_TO_S32
%endif
yading@11
|
235
|
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;
; Interleaves two planar s16 channels into packed stereo.  src is an array of
; channel pointers; the channels argument is unused here (always 2).
;------------------------------------------------------------------------------

%macro CONV_S16P_TO_S16_2CH 0
cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
    mov       src1q, [src0q+gprsize]    ; second channel plane
    mov       src0q, [src0q        ]    ; first channel plane
    lea        lenq, [2*lend]           ; byte length per plane
    add       src0q, lenq
    add       src1q, lenq
    lea        dstq, [dstq+2*lenq]      ; dst holds both channels
    neg        lenq
.loop:
    mova         m0, [src0q+lenq       ]
    mova         m1, [src1q+lenq       ]
    mova         m2, [src0q+lenq+mmsize]
    mova         m3, [src1q+lenq+mmsize]
    SBUTTERFLY2  wd, 0, 1, 4            ; interleave words of ch0/ch1
    SBUTTERFLY2  wd, 2, 3, 4
    mova  [dstq+2*lenq+0*mmsize], m0
    mova  [dstq+2*lenq+1*mmsize], m1
    mova  [dstq+2*lenq+2*mmsize], m2
    mova  [dstq+2*lenq+3*mmsize], m3
    add        lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_2CH
%endif
yading@11
|
272
|
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

;------------------------------------------------------------------------------
; NOTE: In the 6-channel functions, len could be used as an index on x86-64
;       instead of just a counter, which would avoid incrementing the
;       pointers, but the extra complexity and amount of code is not worth
;       the small gain. On x86-32 there are not enough registers to use len
;       as an index without keeping two of the pointers on the stack and
;       loading them in each iteration.
;------------------------------------------------------------------------------

%macro CONV_S16P_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov      src1q, [src0q+1*gprsize]
    mov      src2q, [src0q+2*gprsize]
    mov      src3q, [src0q+3*gprsize]
    mov      src4q, [src0q+4*gprsize]
    mov      src5q, [src0q+5*gprsize]
    mov      src0q, [src0q]
    ; keep plane pointers as offsets from src0q so only src0q is incremented
    sub      src1q, src0q
    sub      src2q, src0q
    sub      src3q, src0q
    sub      src4q, src0q
    sub      src5q, src0q
.loop:
%if cpuflag(sse2slow)
    ; half-width loads: full-width unaligned shuffles are slow on these CPUs
    movq        m0, [src0q      ]   ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq        m1, [src0q+src1q]   ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq        m2, [src0q+src2q]   ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq        m3, [src0q+src3q]   ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq        m4, [src0q+src4q]   ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq        m5, [src0q+src5q]   ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
                                    ; unpack words:
    punpcklwd   m0, m1              ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd   m2, m3              ; m2 =  4,  5, 10, 11, 16, 17, 22, 23
    punpcklwd   m4, m5              ; m4 =  2,  3,  8,  9, 14, 15, 20, 21
                                    ; blend dwords
    shufps      m1, m0, m2, q2020   ; m1 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps      m0, m4, q2031       ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps      m2, m4, q3131       ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
                                    ; shuffle dwords
    pshufd      m0, m0, q1302       ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd      m1, m1, q3120       ; m1 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd      m2, m2, q3120       ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    movq   [dstq+0*mmsize/2], m1
    movq   [dstq+1*mmsize/2], m0
    movq   [dstq+2*mmsize/2], m2
    movhps [dstq+3*mmsize/2], m1
    movhps [dstq+4*mmsize/2], m0
    movhps [dstq+5*mmsize/2], m2
    add      src0q, mmsize/2
    add       dstq, mmsize*3
    sub       lend, mmsize/4
%else
    mova        m0, [src0q      ]   ; m0 =  0,  6, 12, 18, 24, 30, 36, 42
    mova        m1, [src0q+src1q]   ; m1 =  1,  7, 13, 19, 25, 31, 37, 43
    mova        m2, [src0q+src2q]   ; m2 =  2,  8, 14, 20, 26, 32, 38, 44
    mova        m3, [src0q+src3q]   ; m3 =  3,  9, 15, 21, 27, 33, 39, 45
    mova        m4, [src0q+src4q]   ; m4 =  4, 10, 16, 22, 28, 34, 40, 46
    mova        m5, [src0q+src5q]   ; m5 =  5, 11, 17, 23, 29, 35, 41, 47
                                    ; unpack words:
    SBUTTERFLY2 wd, 0, 1, 6         ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
                                    ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
    SBUTTERFLY2 wd, 2, 3, 6         ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
                                    ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
    SBUTTERFLY2 wd, 4, 5, 6         ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
                                    ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
                                    ; blend dwords
    shufps      m6, m0, m2, q2020   ; m6 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps      m0, m4, q2031       ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps      m2, m4, q3131       ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    SWAP 4,6                        ; m4 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps      m6, m1, m3, q2020   ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
    shufps      m1, m5, q2031       ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
    shufps      m3, m5, q3131       ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
    SWAP 5,6                        ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
                                    ; shuffle dwords
    pshufd      m0, m0, q1302       ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd      m2, m2, q3120       ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pshufd      m4, m4, q3120       ; m4 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd      m1, m1, q1302       ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
    pshufd      m3, m3, q3120       ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
    pshufd      m5, m5, q3120       ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
                                    ; shuffle qwords
    punpcklqdq  m6, m4, m0          ; m6 =  0,  1,  2,  3,  4,  5,  6,  7
    punpckhqdq  m0, m2              ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    shufps      m2, m4, q3210       ; m2 =  8,  9, 10, 11, 12, 13, 14, 15
    SWAP 4,6                        ; m4 =  0,  1,  2,  3,  4,  5,  6,  7
    punpcklqdq  m6, m5, m1          ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
    punpckhqdq  m1, m3              ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
    shufps      m3, m5, q3210       ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
    SWAP 5,6                        ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
    mova  [dstq+0*mmsize], m4
    mova  [dstq+1*mmsize], m2
    mova  [dstq+2*mmsize], m0
    mova  [dstq+3*mmsize], m5
    mova  [dstq+4*mmsize], m3
    mova  [dstq+5*mmsize], m1
    add      src0q, mmsize
    add       dstq, mmsize*6
    sub       lend, mmsize/2
%endif
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_6CH
INIT_XMM sse2slow
CONV_S16P_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_6CH
%endif
yading@11
|
395
|
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;
; Interleaves two planar s16 channels and converts to packed float scaled by
; 2^-31 (samples are placed in the high word of each dword, so they are
; already scaled by 2^16 relative to the s16 range).
;------------------------------------------------------------------------------

%macro CONV_S16P_TO_FLT_2CH 0
cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
    lea       lenq, [2*lend]            ; byte length per plane
    mov      src1q, [src0q+gprsize]
    mov      src0q, [src0q        ]
    lea       dstq, [dstq+4*lenq]       ; dst: 2 ch * 4 bytes per sample
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
    mova        m5, [pf_s32_inv_scale]  ; 2^-31 broadcast
.loop:
    mova        m2, [src0q+lenq]    ; m2 =  0,  2,  4,  6,  8, 10, 12, 14
    mova        m4, [src1q+lenq]    ; m4 =  1,  3,  5,  7,  9, 11, 13, 15
    SBUTTERFLY2 wd, 2, 4, 3         ; m2 =  0,  1,  2,  3,  4,  5,  6,  7
                                    ; m4 =  8,  9, 10, 11, 12, 13, 14, 15
    pxor        m3, m3              ; zero low words -> sample << 16
    punpcklwd   m0, m3, m2          ; m0 =      0,      1,      2,      3
    punpckhwd   m1, m3, m2          ; m1 =      4,      5,      6,      7
    punpcklwd   m2, m3, m4          ; m2 =      8,      9,     10,     11
    punpckhwd   m3, m4              ; m3 =     12,     13,     14,     15
    cvtdq2ps    m0, m0
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
    cvtdq2ps    m3, m3
    mulps       m0, m5
    mulps       m1, m5
    mulps       m2, m5
    mulps       m3, m5
    mova  [dstq+4*lenq         ], m0
    mova  [dstq+4*lenq+  mmsize], m1
    mova  [dstq+4*lenq+2*mmsize], m2
    mova  [dstq+4*lenq+3*mmsize], m3
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_2CH
%endif
yading@11
|
444
|
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;
; Interleaves six planar s16 channels and converts to packed float in
; [-1.0,1.0].  SSSE3 variant uses pshufb masks to unpack s16 into the high
; word of each dword in one step; SSE2 falls back to punpck with zeros.
;------------------------------------------------------------------------------

%macro CONV_S16P_TO_FLT_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov      src1q, [srcq+1*gprsize]
    mov      src2q, [srcq+2*gprsize]
    mov      src3q, [srcq+3*gprsize]
    mov      src4q, [srcq+4*gprsize]
    mov      src5q, [srcq+5*gprsize]
    mov       srcq, [srcq]
    ; keep plane pointers as offsets from srcq so only srcq is incremented
    sub      src1q, srcq
    sub      src2q, srcq
    sub      src3q, srcq
    sub      src4q, srcq
    sub      src5q, srcq
    mova        m7, [pf_s32_inv_scale]  ; 2^-31 broadcast
%if cpuflag(ssse3)
    %define unpack_even m6
    mova        m6, [pb_shuf_unpack_even]
%if ARCH_X86_64
    %define unpack_odd m8
    mova        m8, [pb_shuf_unpack_odd]
%else
    ; x86-32 has only 8 xmm regs; use the mask as a memory operand
    %define unpack_odd [pb_shuf_unpack_odd]
%endif
%endif
.loop:
    movq        m0, [srcq      ]    ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq        m1, [srcq+src1q]    ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq        m2, [srcq+src2q]    ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq        m3, [srcq+src3q]    ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq        m4, [srcq+src4q]    ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq        m5, [srcq+src5q]    ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
                                    ; unpack words:
    punpcklwd   m0, m1              ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd   m2, m3              ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    punpcklwd   m4, m5              ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
                                    ; blend dwords
    shufps      m1, m4, m0, q3120   ; m1 =  4,  5, 16, 17,  6,  7, 18, 19
    shufps          m0, m2, q2020   ; m0 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps          m2, m4, q3131   ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
%if cpuflag(ssse3)
    pshufb      m3, m0, unpack_odd  ; m3 =  12,     13,     14,     15
    pshufb          m0, unpack_even ; m0 =   0,      1,      2,      3
    pshufb      m4, m1, unpack_odd  ; m4 =  16,     17,     18,     19
    pshufb          m1, unpack_even ; m1 =   4,      5,      6,      7
    pshufb      m5, m2, unpack_odd  ; m5 =  20,     21,     22,     23
    pshufb          m2, unpack_even ; m2 =   8,      9,     10,     11
%else
                                    ; shuffle dwords
    pshufd      m0, m0, q3120       ; m0 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd      m1, m1, q3120       ; m1 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd      m2, m2, q3120       ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pxor        m6, m6              ; convert s16 in m0-m2 to s32 in m0-m5
    punpcklwd   m3, m6, m0          ; m3 =      0,      1,      2,      3
    punpckhwd   m4, m6, m0          ; m4 =     12,     13,     14,     15
    punpcklwd   m0, m6, m1          ; m0 =      4,      5,      6,      7
    punpckhwd   m5, m6, m1          ; m5 =     16,     17,     18,     19
    punpcklwd   m1, m6, m2          ; m1 =      8,      9,     10,     11
    punpckhwd       m6, m2          ; m6 =     20,     21,     22,     23
    SWAP 6,2,1,0,3,4,5              ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
%endif
    cvtdq2ps    m0, m0              ; convert s32 to float
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
    cvtdq2ps    m3, m3
    cvtdq2ps    m4, m4
    cvtdq2ps    m5, m5
    mulps       m0, m7              ; scale float from s32 range to [-1.0,1.0]
    mulps       m1, m7
    mulps       m2, m7
    mulps       m3, m7
    mulps       m4, m7
    mulps       m5, m7
    mova  [dstq         ], m0
    mova  [dstq+  mmsize], m1
    mova  [dstq+2*mmsize], m2
    mova  [dstq+3*mmsize], m3
    mova  [dstq+4*mmsize], m4
    mova  [dstq+5*mmsize], m5
    add       srcq, mmsize/2
    add       dstq, mmsize*6
    sub       lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_6CH
INIT_XMM ssse3
CONV_S16P_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_6CH
%endif
yading@11
|
548
|
;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;
; Converts two planar float channels to packed (interleaved) s16.
; SSSE3 variant interleaves with a single pshufb after one saturating pack;
; SSE2 packs each channel against itself and interleaves with punpcklwd.
;------------------------------------------------------------------------------

%macro CONV_FLTP_TO_S16_2CH 0
cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
    lea       lenq, [4*lend]            ; byte length per plane
    mov      src1q, [src0q+gprsize]
    mov      src0q, [src0q        ]
    add       dstq, lenq                ; dst: 2 ch * 2 bytes = 4 bytes/frame
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
    mova        m2, [pf_s16_scale]      ; 2^15 broadcast
%if cpuflag(ssse3)
    mova        m3, [pb_interleave_words]
%endif
.loop:
    mulps       m0, m2, [src0q+lenq]    ; m0 =    0,    2,    4,    6
    mulps       m1, m2, [src1q+lenq]    ; m1 =    1,    3,    5,    7
    cvtps2dq    m0, m0
    cvtps2dq    m1, m1
%if cpuflag(ssse3)
    packssdw    m0, m1                  ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb      m0, m3                  ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%else
    packssdw    m0, m0                  ; m0 = 0, 2, 4, 6, x, x, x, x
    packssdw    m1, m1                  ; m1 = 1, 3, 5, 7, x, x, x, x
    punpcklwd   m0, m1                  ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%endif
    mova  [dstq+lenq], m0
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLTP_TO_S16_2CH
INIT_XMM ssse3
CONV_FLTP_TO_S16_2CH
yading@11
|
590
|
;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;
; Converts six planar float channels to packed (interleaved) s16.
; The SSE variant converts in xmm regs but packs/interleaves in MMX regs
; (cvtps2pi), hence the emms before returning when mmsize == 8.
;------------------------------------------------------------------------------

%macro CONV_FLTP_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov      src1q, [srcq+1*gprsize]
    mov      src2q, [srcq+2*gprsize]
    mov      src3q, [srcq+3*gprsize]
    mov      src4q, [srcq+4*gprsize]
    mov      src5q, [srcq+5*gprsize]
    mov       srcq, [srcq]
    ; keep plane pointers as offsets from srcq so only srcq is incremented
    sub      src1q, srcq
    sub      src2q, srcq
    sub      src3q, srcq
    sub      src4q, srcq
    sub      src5q, srcq
    movaps    xmm6, [pf_s16_scale]      ; 2^15 broadcast (xmm even for MMX path)
.loop:
%if cpuflag(sse2)
    mulps       m0, m6, [srcq      ]
    mulps       m1, m6, [srcq+src1q]
    mulps       m2, m6, [srcq+src2q]
    mulps       m3, m6, [srcq+src3q]
    mulps       m4, m6, [srcq+src4q]
    mulps       m5, m6, [srcq+src5q]
    cvtps2dq    m0, m0
    cvtps2dq    m1, m1
    cvtps2dq    m2, m2
    cvtps2dq    m3, m3
    cvtps2dq    m4, m4
    cvtps2dq    m5, m5
    packssdw    m0, m3              ; m0 =  0,  6, 12, 18,  3,  9, 15, 21
    packssdw    m1, m4              ; m1 =  1,  7, 13, 19,  4, 10, 16, 22
    packssdw    m2, m5              ; m2 =  2,  8, 14, 20,  5, 11, 17, 23
                                    ; unpack words:
    movhlps     m3, m0              ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    punpcklwd   m0, m1              ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpckhwd   m1, m2              ; m1 =  4,  5, 10, 11, 16, 17, 22, 23
    punpcklwd   m2, m3              ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
                                    ; blend dwords:
    shufps      m3, m0, m2, q2020   ; m3 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps          m0, m1, q2031   ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps          m2, m1, q3131   ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
                                    ; shuffle dwords:
    shufps      m1, m2, m3, q3120   ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
    shufps          m3, m0, q0220   ; m3 =  0,  1,  2,  3,  4,  5,  6,  7
    shufps          m0, m2, q3113   ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    mova  [dstq+0*mmsize], m3
    mova  [dstq+1*mmsize], m1
    mova  [dstq+2*mmsize], m0
%else ; sse
    movlps    xmm0, [srcq      ]
    movlps    xmm1, [srcq+src1q]
    movlps    xmm2, [srcq+src2q]
    movlps    xmm3, [srcq+src3q]
    movlps    xmm4, [srcq+src4q]
    movlps    xmm5, [srcq+src5q]
    mulps     xmm0, xmm6
    mulps     xmm1, xmm6
    mulps     xmm2, xmm6
    mulps     xmm3, xmm6
    mulps     xmm4, xmm6
    mulps     xmm5, xmm6
    cvtps2pi   mm0, xmm0            ; convert 2 floats -> 2 dwords in MMX regs
    cvtps2pi   mm1, xmm1
    cvtps2pi   mm2, xmm2
    cvtps2pi   mm3, xmm3
    cvtps2pi   mm4, xmm4
    cvtps2pi   mm5, xmm5
    packssdw   mm0, mm3             ; m0 =  0,  6,  3,  9
    packssdw   mm1, mm4             ; m1 =  1,  7,  4, 10
    packssdw   mm2, mm5             ; m2 =  2,  8,  5, 11
                                    ; unpack words
    pshufw     mm3, mm0, q1032      ; m3 =  3,  9,  0,  6
    punpcklwd  mm0, mm1             ; m0 =  0,  1,  6,  7
    punpckhwd  mm1, mm2             ; m1 =  4,  5, 10, 11
    punpcklwd  mm2, mm3             ; m2 =  2,  3,  8,  9
                                    ; unpack dwords
    pshufw     mm3, mm0, q1032      ; m3 =  6,  7,  0,  1
    punpckldq  mm0, mm2             ; m0 =  0,  1,  2,  3 (final)
    punpckhdq  mm2, mm1             ; m2 =  8,  9, 10, 11 (final)
    punpckldq  mm1, mm3             ; m1 =  4,  5,  6,  7 (final)
    mova  [dstq+0*mmsize], mm0
    mova  [dstq+1*mmsize], mm1
    mova  [dstq+2*mmsize], mm2
%endif
    add       srcq, mmsize
    add       dstq, mmsize*3
    sub       lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms                            ; MMX path: restore x87 state
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX sse
CONV_FLTP_TO_S16_6CH
INIT_XMM sse2
CONV_FLTP_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_S16_6CH
%endif
yading@11
|
704
|
yading@11
|
705 ;------------------------------------------------------------------------------
|
yading@11
|
706 ; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
|
yading@11
|
707 ; int channels);
|
yading@11
|
708 ;------------------------------------------------------------------------------
|
yading@11
|
709
|
yading@11
|
710 %macro CONV_FLTP_TO_FLT_2CH 0
|
yading@11
|
711 cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
|
yading@11
|
712 mov src1q, [src0q+gprsize]
|
yading@11
|
713 mov src0q, [src0q]
|
yading@11
|
714 lea lenq, [4*lend]
|
yading@11
|
715 add src0q, lenq
|
yading@11
|
716 add src1q, lenq
|
yading@11
|
717 lea dstq, [dstq+2*lenq]
|
yading@11
|
718 neg lenq
|
yading@11
|
719 .loop:
|
yading@11
|
720 mova m0, [src0q+lenq ]
|
yading@11
|
721 mova m1, [src1q+lenq ]
|
yading@11
|
722 mova m2, [src0q+lenq+mmsize]
|
yading@11
|
723 mova m3, [src1q+lenq+mmsize]
|
yading@11
|
724 SBUTTERFLYPS 0, 1, 4
|
yading@11
|
725 SBUTTERFLYPS 2, 3, 4
|
yading@11
|
726 mova [dstq+2*lenq+0*mmsize], m0
|
yading@11
|
727 mova [dstq+2*lenq+1*mmsize], m1
|
yading@11
|
728 mova [dstq+2*lenq+2*mmsize], m2
|
yading@11
|
729 mova [dstq+2*lenq+3*mmsize], m3
|
yading@11
|
730 add lenq, 2*mmsize
|
yading@11
|
731 jl .loop
|
yading@11
|
732 REP_RET
|
yading@11
|
733 %endmacro
|
yading@11
|
734
|
yading@11
|
735 INIT_XMM sse
|
yading@11
|
736 CONV_FLTP_TO_FLT_2CH
|
yading@11
|
737 %if HAVE_AVX_EXTERNAL
|
yading@11
|
738 INIT_XMM avx
|
yading@11
|
739 CONV_FLTP_TO_FLT_2CH
|
yading@11
|
740 %endif
|
yading@11
|
741
|
yading@11
|
742 ;-----------------------------------------------------------------------------
|
yading@11
|
743 ; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
|
yading@11
|
744 ; int channels);
|
yading@11
|
745 ;-----------------------------------------------------------------------------
|
yading@11
|
746
|
yading@11
|
747 %macro CONV_FLTP_TO_FLT_6CH 0
|
yading@11
|
748 cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
|
yading@11
|
749 %if ARCH_X86_64
|
yading@11
|
750 mov lend, r2d
|
yading@11
|
751 %else
|
yading@11
|
752 %define lend dword r2m
|
yading@11
|
753 %endif
|
yading@11
|
754 mov src1q, [srcq+1*gprsize]
|
yading@11
|
755 mov src2q, [srcq+2*gprsize]
|
yading@11
|
756 mov src3q, [srcq+3*gprsize]
|
yading@11
|
757 mov src4q, [srcq+4*gprsize]
|
yading@11
|
758 mov src5q, [srcq+5*gprsize]
|
yading@11
|
759 mov srcq, [srcq]
|
yading@11
|
760 sub src1q, srcq
|
yading@11
|
761 sub src2q, srcq
|
yading@11
|
762 sub src3q, srcq
|
yading@11
|
763 sub src4q, srcq
|
yading@11
|
764 sub src5q, srcq
|
yading@11
|
765 .loop:
|
yading@11
|
766 mova m0, [srcq ]
|
yading@11
|
767 mova m1, [srcq+src1q]
|
yading@11
|
768 mova m2, [srcq+src2q]
|
yading@11
|
769 mova m3, [srcq+src3q]
|
yading@11
|
770 mova m4, [srcq+src4q]
|
yading@11
|
771 mova m5, [srcq+src5q]
|
yading@11
|
772 %if cpuflag(sse4)
|
yading@11
|
773 SBUTTERFLYPS 0, 1, 6
|
yading@11
|
774 SBUTTERFLYPS 2, 3, 6
|
yading@11
|
775 SBUTTERFLYPS 4, 5, 6
|
yading@11
|
776
|
yading@11
|
777 blendps m6, m4, m0, 1100b
|
yading@11
|
778 movlhps m0, m2
|
yading@11
|
779 movhlps m4, m2
|
yading@11
|
780 blendps m2, m5, m1, 1100b
|
yading@11
|
781 movlhps m1, m3
|
yading@11
|
782 movhlps m5, m3
|
yading@11
|
783
|
yading@11
|
784 movaps [dstq ], m0
|
yading@11
|
785 movaps [dstq+16], m6
|
yading@11
|
786 movaps [dstq+32], m4
|
yading@11
|
787 movaps [dstq+48], m1
|
yading@11
|
788 movaps [dstq+64], m2
|
yading@11
|
789 movaps [dstq+80], m5
|
yading@11
|
790 %else ; mmx
|
yading@11
|
791 SBUTTERFLY dq, 0, 1, 6
|
yading@11
|
792 SBUTTERFLY dq, 2, 3, 6
|
yading@11
|
793 SBUTTERFLY dq, 4, 5, 6
|
yading@11
|
794
|
yading@11
|
795 movq [dstq ], m0
|
yading@11
|
796 movq [dstq+ 8], m2
|
yading@11
|
797 movq [dstq+16], m4
|
yading@11
|
798 movq [dstq+24], m1
|
yading@11
|
799 movq [dstq+32], m3
|
yading@11
|
800 movq [dstq+40], m5
|
yading@11
|
801 %endif
|
yading@11
|
802 add srcq, mmsize
|
yading@11
|
803 add dstq, mmsize*6
|
yading@11
|
804 sub lend, mmsize/4
|
yading@11
|
805 jg .loop
|
yading@11
|
806 %if mmsize == 8
|
yading@11
|
807 emms
|
yading@11
|
808 RET
|
yading@11
|
809 %else
|
yading@11
|
810 REP_RET
|
yading@11
|
811 %endif
|
yading@11
|
812 %endmacro
|
yading@11
|
813
|
yading@11
|
814 INIT_MMX mmx
|
yading@11
|
815 CONV_FLTP_TO_FLT_6CH
|
yading@11
|
816 INIT_XMM sse4
|
yading@11
|
817 CONV_FLTP_TO_FLT_6CH
|
yading@11
|
818 %if HAVE_AVX_EXTERNAL
|
yading@11
|
819 INIT_XMM avx
|
yading@11
|
820 CONV_FLTP_TO_FLT_6CH
|
yading@11
|
821 %endif
|
yading@11
|
822
|
yading@11
|
823 ;------------------------------------------------------------------------------
|
yading@11
|
824 ; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
|
yading@11
|
825 ; int channels);
|
yading@11
|
826 ;------------------------------------------------------------------------------
|
yading@11
|
827
|
yading@11
|
828 %macro CONV_S16_TO_S16P_2CH 0
|
yading@11
|
829 cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1
|
yading@11
|
830 lea lenq, [2*lend]
|
yading@11
|
831 mov dst1q, [dst0q+gprsize]
|
yading@11
|
832 mov dst0q, [dst0q ]
|
yading@11
|
833 lea srcq, [srcq+2*lenq]
|
yading@11
|
834 add dst0q, lenq
|
yading@11
|
835 add dst1q, lenq
|
yading@11
|
836 neg lenq
|
yading@11
|
837 %if cpuflag(ssse3)
|
yading@11
|
838 mova m3, [pb_deinterleave_words]
|
yading@11
|
839 %endif
|
yading@11
|
840 .loop:
|
yading@11
|
841 mova m0, [srcq+2*lenq ] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
|
yading@11
|
842 mova m1, [srcq+2*lenq+mmsize] ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
|
yading@11
|
843 %if cpuflag(ssse3)
|
yading@11
|
844 pshufb m0, m3 ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
|
yading@11
|
845 pshufb m1, m3 ; m1 = 8, 10, 12, 14, 9, 11, 13, 15
|
yading@11
|
846 SBUTTERFLY2 qdq, 0, 1, 2 ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
|
yading@11
|
847 ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
|
yading@11
|
848 %else ; sse2
|
yading@11
|
849 pshuflw m0, m0, q3120 ; m0 = 0, 2, 1, 3, 4, 5, 6, 7
|
yading@11
|
850 pshufhw m0, m0, q3120 ; m0 = 0, 2, 1, 3, 4, 6, 5, 7
|
yading@11
|
851 pshuflw m1, m1, q3120 ; m1 = 8, 10, 9, 11, 12, 13, 14, 15
|
yading@11
|
852 pshufhw m1, m1, q3120 ; m1 = 8, 10, 9, 11, 12, 14, 13, 15
|
yading@11
|
853 DEINT2_PS 0, 1, 2 ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
|
yading@11
|
854 ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
|
yading@11
|
855 %endif
|
yading@11
|
856 mova [dst0q+lenq], m0
|
yading@11
|
857 mova [dst1q+lenq], m1
|
yading@11
|
858 add lenq, mmsize
|
yading@11
|
859 jl .loop
|
yading@11
|
860 REP_RET
|
yading@11
|
861 %endmacro
|
yading@11
|
862
|
yading@11
|
863 INIT_XMM sse2
|
yading@11
|
864 CONV_S16_TO_S16P_2CH
|
yading@11
|
865 INIT_XMM ssse3
|
yading@11
|
866 CONV_S16_TO_S16P_2CH
|
yading@11
|
867 %if HAVE_AVX_EXTERNAL
|
yading@11
|
868 INIT_XMM avx
|
yading@11
|
869 CONV_S16_TO_S16P_2CH
|
yading@11
|
870 %endif
|
yading@11
|
871
|
yading@11
|
872 ;------------------------------------------------------------------------------
|
yading@11
|
873 ; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len,
|
yading@11
|
874 ; int channels);
|
yading@11
|
875 ;------------------------------------------------------------------------------
|
yading@11
|
876
|
yading@11
|
877 %macro CONV_S16_TO_S16P_6CH 0
|
yading@11
|
878 %if ARCH_X86_64
|
yading@11
|
879 cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5
|
yading@11
|
880 %else
|
yading@11
|
881 cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
|
yading@11
|
882 %define lend dword r2m
|
yading@11
|
883 %endif
|
yading@11
|
884 mov dst1q, [dstq+ gprsize]
|
yading@11
|
885 mov dst2q, [dstq+2*gprsize]
|
yading@11
|
886 mov dst3q, [dstq+3*gprsize]
|
yading@11
|
887 mov dst4q, [dstq+4*gprsize]
|
yading@11
|
888 mov dst5q, [dstq+5*gprsize]
|
yading@11
|
889 mov dstq, [dstq ]
|
yading@11
|
890 sub dst1q, dstq
|
yading@11
|
891 sub dst2q, dstq
|
yading@11
|
892 sub dst3q, dstq
|
yading@11
|
893 sub dst4q, dstq
|
yading@11
|
894 sub dst5q, dstq
|
yading@11
|
895 .loop:
|
yading@11
|
896 mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
|
yading@11
|
897 mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
|
yading@11
|
898 mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
|
yading@11
|
899 PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x
|
yading@11
|
900 shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
|
yading@11
|
901 psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
|
yading@11
|
902 SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
|
yading@11
|
903 ; m1 = 4, 10, 5, 11, x, x, x, x
|
yading@11
|
904 SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
|
yading@11
|
905 ; m2 = 16, 22, 17, 23, x, x, x, x
|
yading@11
|
906 SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
|
yading@11
|
907 ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
|
yading@11
|
908 punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
|
yading@11
|
909 movq [dstq ], m0
|
yading@11
|
910 movhps [dstq+dst1q], m0
|
yading@11
|
911 movq [dstq+dst2q], m3
|
yading@11
|
912 movhps [dstq+dst3q], m3
|
yading@11
|
913 movq [dstq+dst4q], m1
|
yading@11
|
914 movhps [dstq+dst5q], m1
|
yading@11
|
915 add srcq, mmsize*3
|
yading@11
|
916 add dstq, mmsize/2
|
yading@11
|
917 sub lend, mmsize/4
|
yading@11
|
918 jg .loop
|
yading@11
|
919 REP_RET
|
yading@11
|
920 %endmacro
|
yading@11
|
921
|
yading@11
|
922 INIT_XMM sse2
|
yading@11
|
923 CONV_S16_TO_S16P_6CH
|
yading@11
|
924 INIT_XMM ssse3
|
yading@11
|
925 CONV_S16_TO_S16P_6CH
|
yading@11
|
926 %if HAVE_AVX_EXTERNAL
|
yading@11
|
927 INIT_XMM avx
|
yading@11
|
928 CONV_S16_TO_S16P_6CH
|
yading@11
|
929 %endif
|
yading@11
|
930
|
yading@11
|
931 ;------------------------------------------------------------------------------
|
yading@11
|
932 ; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
|
yading@11
|
933 ; int channels);
|
yading@11
|
934 ;------------------------------------------------------------------------------
|
yading@11
|
935
|
yading@11
|
936 %macro CONV_S16_TO_FLTP_2CH 0
|
yading@11
|
937 cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
|
yading@11
|
938 lea lenq, [4*lend]
|
yading@11
|
939 mov dst1q, [dst0q+gprsize]
|
yading@11
|
940 mov dst0q, [dst0q ]
|
yading@11
|
941 add srcq, lenq
|
yading@11
|
942 add dst0q, lenq
|
yading@11
|
943 add dst1q, lenq
|
yading@11
|
944 neg lenq
|
yading@11
|
945 mova m3, [pf_s32_inv_scale]
|
yading@11
|
946 mova m4, [pw_zero_even]
|
yading@11
|
947 .loop:
|
yading@11
|
948 mova m1, [srcq+lenq]
|
yading@11
|
949 pslld m0, m1, 16
|
yading@11
|
950 pand m1, m4
|
yading@11
|
951 cvtdq2ps m0, m0
|
yading@11
|
952 cvtdq2ps m1, m1
|
yading@11
|
953 mulps m0, m0, m3
|
yading@11
|
954 mulps m1, m1, m3
|
yading@11
|
955 mova [dst0q+lenq], m0
|
yading@11
|
956 mova [dst1q+lenq], m1
|
yading@11
|
957 add lenq, mmsize
|
yading@11
|
958 jl .loop
|
yading@11
|
959 REP_RET
|
yading@11
|
960 %endmacro
|
yading@11
|
961
|
yading@11
|
962 INIT_XMM sse2
|
yading@11
|
963 CONV_S16_TO_FLTP_2CH
|
yading@11
|
964 %if HAVE_AVX_EXTERNAL
|
yading@11
|
965 INIT_XMM avx
|
yading@11
|
966 CONV_S16_TO_FLTP_2CH
|
yading@11
|
967 %endif
|
yading@11
|
968
|
yading@11
|
969 ;------------------------------------------------------------------------------
|
yading@11
|
970 ; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len,
|
yading@11
|
971 ; int channels);
|
yading@11
|
972 ;------------------------------------------------------------------------------
|
yading@11
|
973
|
yading@11
|
974 %macro CONV_S16_TO_FLTP_6CH 0
|
yading@11
|
975 %if ARCH_X86_64
|
yading@11
|
976 cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
|
yading@11
|
977 %else
|
yading@11
|
978 cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
|
yading@11
|
979 %define lend dword r2m
|
yading@11
|
980 %endif
|
yading@11
|
981 mov dst1q, [dstq+ gprsize]
|
yading@11
|
982 mov dst2q, [dstq+2*gprsize]
|
yading@11
|
983 mov dst3q, [dstq+3*gprsize]
|
yading@11
|
984 mov dst4q, [dstq+4*gprsize]
|
yading@11
|
985 mov dst5q, [dstq+5*gprsize]
|
yading@11
|
986 mov dstq, [dstq ]
|
yading@11
|
987 sub dst1q, dstq
|
yading@11
|
988 sub dst2q, dstq
|
yading@11
|
989 sub dst3q, dstq
|
yading@11
|
990 sub dst4q, dstq
|
yading@11
|
991 sub dst5q, dstq
|
yading@11
|
992 mova m6, [pf_s16_inv_scale]
|
yading@11
|
993 .loop:
|
yading@11
|
994 mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
|
yading@11
|
995 mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
|
yading@11
|
996 mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
|
yading@11
|
997 PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x
|
yading@11
|
998 shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
|
yading@11
|
999 psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
|
yading@11
|
1000 SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
|
yading@11
|
1001 ; m1 = 4, 10, 5, 11, x, x, x, x
|
yading@11
|
1002 SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
|
yading@11
|
1003 ; m2 = 16, 22, 17, 23, x, x, x, x
|
yading@11
|
1004 SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
|
yading@11
|
1005 ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
|
yading@11
|
1006 punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
|
yading@11
|
1007 S16_TO_S32_SX 0, 2 ; m0 = 0, 6, 12, 18
|
yading@11
|
1008 ; m2 = 1, 7, 13, 19
|
yading@11
|
1009 S16_TO_S32_SX 3, 4 ; m3 = 2, 8, 14, 20
|
yading@11
|
1010 ; m4 = 3, 9, 15, 21
|
yading@11
|
1011 S16_TO_S32_SX 1, 5 ; m1 = 4, 10, 16, 22
|
yading@11
|
1012 ; m5 = 5, 11, 17, 23
|
yading@11
|
1013 SWAP 1,2,3,4
|
yading@11
|
1014 cvtdq2ps m0, m0
|
yading@11
|
1015 cvtdq2ps m1, m1
|
yading@11
|
1016 cvtdq2ps m2, m2
|
yading@11
|
1017 cvtdq2ps m3, m3
|
yading@11
|
1018 cvtdq2ps m4, m4
|
yading@11
|
1019 cvtdq2ps m5, m5
|
yading@11
|
1020 mulps m0, m6
|
yading@11
|
1021 mulps m1, m6
|
yading@11
|
1022 mulps m2, m6
|
yading@11
|
1023 mulps m3, m6
|
yading@11
|
1024 mulps m4, m6
|
yading@11
|
1025 mulps m5, m6
|
yading@11
|
1026 mova [dstq ], m0
|
yading@11
|
1027 mova [dstq+dst1q], m1
|
yading@11
|
1028 mova [dstq+dst2q], m2
|
yading@11
|
1029 mova [dstq+dst3q], m3
|
yading@11
|
1030 mova [dstq+dst4q], m4
|
yading@11
|
1031 mova [dstq+dst5q], m5
|
yading@11
|
1032 add srcq, mmsize*3
|
yading@11
|
1033 add dstq, mmsize
|
yading@11
|
1034 sub lend, mmsize/4
|
yading@11
|
1035 jg .loop
|
yading@11
|
1036 REP_RET
|
yading@11
|
1037 %endmacro
|
yading@11
|
1038
|
yading@11
|
1039 INIT_XMM sse2
|
yading@11
|
1040 CONV_S16_TO_FLTP_6CH
|
yading@11
|
1041 INIT_XMM ssse3
|
yading@11
|
1042 CONV_S16_TO_FLTP_6CH
|
yading@11
|
1043 INIT_XMM sse4
|
yading@11
|
1044 CONV_S16_TO_FLTP_6CH
|
yading@11
|
1045 %if HAVE_AVX_EXTERNAL
|
yading@11
|
1046 INIT_XMM avx
|
yading@11
|
1047 CONV_S16_TO_FLTP_6CH
|
yading@11
|
1048 %endif
|
yading@11
|
1049
|
yading@11
|
1050 ;------------------------------------------------------------------------------
|
yading@11
|
1051 ; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len,
|
yading@11
|
1052 ; int channels);
|
yading@11
|
1053 ;------------------------------------------------------------------------------
|
yading@11
|
1054
|
yading@11
|
1055 %macro CONV_FLT_TO_S16P_2CH 0
|
yading@11
|
1056 cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1
|
yading@11
|
1057 lea lenq, [2*lend]
|
yading@11
|
1058 mov dst1q, [dst0q+gprsize]
|
yading@11
|
1059 mov dst0q, [dst0q ]
|
yading@11
|
1060 lea srcq, [srcq+4*lenq]
|
yading@11
|
1061 add dst0q, lenq
|
yading@11
|
1062 add dst1q, lenq
|
yading@11
|
1063 neg lenq
|
yading@11
|
1064 mova m5, [pf_s16_scale]
|
yading@11
|
1065 .loop:
|
yading@11
|
1066 mova m0, [srcq+4*lenq ]
|
yading@11
|
1067 mova m1, [srcq+4*lenq+ mmsize]
|
yading@11
|
1068 mova m2, [srcq+4*lenq+2*mmsize]
|
yading@11
|
1069 mova m3, [srcq+4*lenq+3*mmsize]
|
yading@11
|
1070 DEINT2_PS 0, 1, 4
|
yading@11
|
1071 DEINT2_PS 2, 3, 4
|
yading@11
|
1072 mulps m0, m0, m5
|
yading@11
|
1073 mulps m1, m1, m5
|
yading@11
|
1074 mulps m2, m2, m5
|
yading@11
|
1075 mulps m3, m3, m5
|
yading@11
|
1076 cvtps2dq m0, m0
|
yading@11
|
1077 cvtps2dq m1, m1
|
yading@11
|
1078 cvtps2dq m2, m2
|
yading@11
|
1079 cvtps2dq m3, m3
|
yading@11
|
1080 packssdw m0, m2
|
yading@11
|
1081 packssdw m1, m3
|
yading@11
|
1082 mova [dst0q+lenq], m0
|
yading@11
|
1083 mova [dst1q+lenq], m1
|
yading@11
|
1084 add lenq, mmsize
|
yading@11
|
1085 jl .loop
|
yading@11
|
1086 REP_RET
|
yading@11
|
1087 %endmacro
|
yading@11
|
1088
|
yading@11
|
1089 INIT_XMM sse2
|
yading@11
|
1090 CONV_FLT_TO_S16P_2CH
|
yading@11
|
1091 %if HAVE_AVX_EXTERNAL
|
yading@11
|
1092 INIT_XMM avx
|
yading@11
|
1093 CONV_FLT_TO_S16P_2CH
|
yading@11
|
1094 %endif
|
yading@11
|
1095
|
yading@11
|
1096 ;------------------------------------------------------------------------------
|
yading@11
|
1097 ; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len,
|
yading@11
|
1098 ; int channels);
|
yading@11
|
1099 ;------------------------------------------------------------------------------
|
yading@11
|
1100
|
yading@11
|
1101 %macro CONV_FLT_TO_S16P_6CH 0
|
yading@11
|
1102 %if ARCH_X86_64
|
yading@11
|
1103 cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
|
yading@11
|
1104 %else
|
yading@11
|
1105 cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
|
yading@11
|
1106 %define lend dword r2m
|
yading@11
|
1107 %endif
|
yading@11
|
1108 mov dst1q, [dstq+ gprsize]
|
yading@11
|
1109 mov dst2q, [dstq+2*gprsize]
|
yading@11
|
1110 mov dst3q, [dstq+3*gprsize]
|
yading@11
|
1111 mov dst4q, [dstq+4*gprsize]
|
yading@11
|
1112 mov dst5q, [dstq+5*gprsize]
|
yading@11
|
1113 mov dstq, [dstq ]
|
yading@11
|
1114 sub dst1q, dstq
|
yading@11
|
1115 sub dst2q, dstq
|
yading@11
|
1116 sub dst3q, dstq
|
yading@11
|
1117 sub dst4q, dstq
|
yading@11
|
1118 sub dst5q, dstq
|
yading@11
|
1119 mova m6, [pf_s16_scale]
|
yading@11
|
1120 .loop:
|
yading@11
|
1121 mulps m0, m6, [srcq+0*mmsize]
|
yading@11
|
1122 mulps m3, m6, [srcq+1*mmsize]
|
yading@11
|
1123 mulps m1, m6, [srcq+2*mmsize]
|
yading@11
|
1124 mulps m4, m6, [srcq+3*mmsize]
|
yading@11
|
1125 mulps m2, m6, [srcq+4*mmsize]
|
yading@11
|
1126 mulps m5, m6, [srcq+5*mmsize]
|
yading@11
|
1127 cvtps2dq m0, m0
|
yading@11
|
1128 cvtps2dq m1, m1
|
yading@11
|
1129 cvtps2dq m2, m2
|
yading@11
|
1130 cvtps2dq m3, m3
|
yading@11
|
1131 cvtps2dq m4, m4
|
yading@11
|
1132 cvtps2dq m5, m5
|
yading@11
|
1133 packssdw m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
|
yading@11
|
1134 packssdw m1, m4 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
|
yading@11
|
1135 packssdw m2, m5 ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
|
yading@11
|
1136 PALIGNR m3, m1, m0, 12, m4 ; m3 = 6, 7, 8, 9, 10, 11, x, x
|
yading@11
|
1137 shufps m1, m2, q1032 ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
|
yading@11
|
1138 psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
|
yading@11
|
1139 SBUTTERFLY2 wd, 0, 3, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
|
yading@11
|
1140 ; m3 = 4, 10, 5, 11, x, x, x, x
|
yading@11
|
1141 SBUTTERFLY2 wd, 1, 2, 4 ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
|
yading@11
|
1142 ; m2 = 16, 22, 17, 23, x, x, x, x
|
yading@11
|
1143 SBUTTERFLY2 dq, 0, 1, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
|
yading@11
|
1144 ; m1 = 2, 8, 14, 20, 3, 9, 15, 21
|
yading@11
|
1145 punpckldq m3, m2 ; m3 = 4, 10, 16, 22, 5, 11, 17, 23
|
yading@11
|
1146 movq [dstq ], m0
|
yading@11
|
1147 movhps [dstq+dst1q], m0
|
yading@11
|
1148 movq [dstq+dst2q], m1
|
yading@11
|
1149 movhps [dstq+dst3q], m1
|
yading@11
|
1150 movq [dstq+dst4q], m3
|
yading@11
|
1151 movhps [dstq+dst5q], m3
|
yading@11
|
1152 add srcq, mmsize*6
|
yading@11
|
1153 add dstq, mmsize/2
|
yading@11
|
1154 sub lend, mmsize/4
|
yading@11
|
1155 jg .loop
|
yading@11
|
1156 REP_RET
|
yading@11
|
1157 %endmacro
|
yading@11
|
1158
|
yading@11
|
1159 INIT_XMM sse2
|
yading@11
|
1160 CONV_FLT_TO_S16P_6CH
|
yading@11
|
1161 INIT_XMM ssse3
|
yading@11
|
1162 CONV_FLT_TO_S16P_6CH
|
yading@11
|
1163 %if HAVE_AVX_EXTERNAL
|
yading@11
|
1164 INIT_XMM avx
|
yading@11
|
1165 CONV_FLT_TO_S16P_6CH
|
yading@11
|
1166 %endif
|
yading@11
|
1167
|
yading@11
|
1168 ;------------------------------------------------------------------------------
|
yading@11
|
1169 ; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len,
|
yading@11
|
1170 ; int channels);
|
yading@11
|
1171 ;------------------------------------------------------------------------------
|
yading@11
|
1172
|
yading@11
|
1173 %macro CONV_FLT_TO_FLTP_2CH 0
|
yading@11
|
1174 cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1
|
yading@11
|
1175 lea lenq, [4*lend]
|
yading@11
|
1176 mov dst1q, [dst0q+gprsize]
|
yading@11
|
1177 mov dst0q, [dst0q ]
|
yading@11
|
1178 lea srcq, [srcq+2*lenq]
|
yading@11
|
1179 add dst0q, lenq
|
yading@11
|
1180 add dst1q, lenq
|
yading@11
|
1181 neg lenq
|
yading@11
|
1182 .loop:
|
yading@11
|
1183 mova m0, [srcq+2*lenq ]
|
yading@11
|
1184 mova m1, [srcq+2*lenq+mmsize]
|
yading@11
|
1185 DEINT2_PS 0, 1, 2
|
yading@11
|
1186 mova [dst0q+lenq], m0
|
yading@11
|
1187 mova [dst1q+lenq], m1
|
yading@11
|
1188 add lenq, mmsize
|
yading@11
|
1189 jl .loop
|
yading@11
|
1190 REP_RET
|
yading@11
|
1191 %endmacro
|
yading@11
|
1192
|
yading@11
|
1193 INIT_XMM sse
|
yading@11
|
1194 CONV_FLT_TO_FLTP_2CH
|
yading@11
|
1195 %if HAVE_AVX_EXTERNAL
|
yading@11
|
1196 INIT_XMM avx
|
yading@11
|
1197 CONV_FLT_TO_FLTP_2CH
|
yading@11
|
1198 %endif
|
yading@11
|
1199
|
yading@11
|
1200 ;------------------------------------------------------------------------------
|
yading@11
|
1201 ; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len,
|
yading@11
|
1202 ; int channels);
|
yading@11
|
1203 ;------------------------------------------------------------------------------
|
yading@11
|
1204
|
yading@11
|
1205 %macro CONV_FLT_TO_FLTP_6CH 0
|
yading@11
|
1206 %if ARCH_X86_64
|
yading@11
|
1207 cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
|
yading@11
|
1208 %else
|
yading@11
|
1209 cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
|
yading@11
|
1210 %define lend dword r2m
|
yading@11
|
1211 %endif
|
yading@11
|
1212 mov dst1q, [dstq+ gprsize]
|
yading@11
|
1213 mov dst2q, [dstq+2*gprsize]
|
yading@11
|
1214 mov dst3q, [dstq+3*gprsize]
|
yading@11
|
1215 mov dst4q, [dstq+4*gprsize]
|
yading@11
|
1216 mov dst5q, [dstq+5*gprsize]
|
yading@11
|
1217 mov dstq, [dstq ]
|
yading@11
|
1218 sub dst1q, dstq
|
yading@11
|
1219 sub dst2q, dstq
|
yading@11
|
1220 sub dst3q, dstq
|
yading@11
|
1221 sub dst4q, dstq
|
yading@11
|
1222 sub dst5q, dstq
|
yading@11
|
1223 .loop:
|
yading@11
|
1224 mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3
|
yading@11
|
1225 mova m1, [srcq+1*mmsize] ; m1 = 4, 5, 6, 7
|
yading@11
|
1226 mova m2, [srcq+2*mmsize] ; m2 = 8, 9, 10, 11
|
yading@11
|
1227 mova m3, [srcq+3*mmsize] ; m3 = 12, 13, 14, 15
|
yading@11
|
1228 mova m4, [srcq+4*mmsize] ; m4 = 16, 17, 18, 19
|
yading@11
|
1229 mova m5, [srcq+5*mmsize] ; m5 = 20, 21, 22, 23
|
yading@11
|
1230
|
yading@11
|
1231 SBUTTERFLY2 dq, 0, 3, 6 ; m0 = 0, 12, 1, 13
|
yading@11
|
1232 ; m3 = 2, 14, 3, 15
|
yading@11
|
1233 SBUTTERFLY2 dq, 1, 4, 6 ; m1 = 4, 16, 5, 17
|
yading@11
|
1234 ; m4 = 6, 18, 7, 19
|
yading@11
|
1235 SBUTTERFLY2 dq, 2, 5, 6 ; m2 = 8, 20, 9, 21
|
yading@11
|
1236 ; m5 = 10, 22, 11, 23
|
yading@11
|
1237 SBUTTERFLY2 dq, 0, 4, 6 ; m0 = 0, 6, 12, 18
|
yading@11
|
1238 ; m4 = 1, 7, 13, 19
|
yading@11
|
1239 SBUTTERFLY2 dq, 3, 2, 6 ; m3 = 2, 8, 14, 20
|
yading@11
|
1240 ; m2 = 3, 9, 15, 21
|
yading@11
|
1241 SBUTTERFLY2 dq, 1, 5, 6 ; m1 = 4, 10, 16, 22
|
yading@11
|
1242 ; m5 = 5, 11, 17, 23
|
yading@11
|
1243 mova [dstq ], m0
|
yading@11
|
1244 mova [dstq+dst1q], m4
|
yading@11
|
1245 mova [dstq+dst2q], m3
|
yading@11
|
1246 mova [dstq+dst3q], m2
|
yading@11
|
1247 mova [dstq+dst4q], m1
|
yading@11
|
1248 mova [dstq+dst5q], m5
|
yading@11
|
1249 add srcq, mmsize*6
|
yading@11
|
1250 add dstq, mmsize
|
yading@11
|
1251 sub lend, mmsize/4
|
yading@11
|
1252 jg .loop
|
yading@11
|
1253 REP_RET
|
yading@11
|
1254 %endmacro
|
yading@11
|
1255
|
yading@11
|
1256 INIT_XMM sse2
|
yading@11
|
1257 CONV_FLT_TO_FLTP_6CH
|
yading@11
|
1258 %if HAVE_AVX_EXTERNAL
|
yading@11
|
1259 INIT_XMM avx
|
yading@11
|
1260 CONV_FLT_TO_FLTP_6CH
|
yading@11
|
1261 %endif
|