;******************************************************************************
;* x86 optimized channel mixing
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "util.asm"

SECTION_TEXT

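; All functions below mix in place: output channel 0 is written back over
; src[0] (and channel 1 over src[1] for stereo output), so there is no
; separate dst argument. Only the first three arguments are ever loaded
; (cglobal ..., 3, ...); out_ch and in_ch appear only to match the common
; mix-function signature. Each loop counts len down in whole-register steps
; and exits on jg, effectively rounding len up, so callers are assumed to
; provide suitably padded buffers.
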
;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_fltp_flt(float **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------

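; Per sample i, the loop below computes, in place:
;   src[0][i] = matrix[0][0]*src[0][i] + matrix[0][1]*src[1][i]
; src1q is pre-converted to the byte offset src[1]-src[0], so a single
; pointer increment advances both channels. The loop is unrolled 2x: each
; iteration handles 2*mmsize bytes, i.e. 8 float samples with SSE and 16
; with AVX.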
%macro MIX_2_TO_1_FLTP_FLT 0
cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
    mov       src1q, [srcq+gprsize]
    mov        srcq, [srcq        ]
    sub       src1q, srcq
    mov     matrixq, [matrixq  ]
    VBROADCASTSS m4, [matrixq  ]
    VBROADCASTSS m5, [matrixq+4]
    ALIGN 16
.loop:
    mulps        m0, m4, [srcq             ]
    mulps        m1, m5, [srcq+src1q       ]
    mulps        m2, m4, [srcq+      mmsize]
    mulps        m3, m5, [srcq+src1q+mmsize]
    addps        m0, m0, m1
    addps        m2, m2, m3
    mova  [srcq       ], m0
    mova  [srcq+mmsize], m2
    add        srcq, mmsize*2
    sub        lend, mmsize*2/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse
MIX_2_TO_1_FLTP_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX_2_TO_1_FLTP_FLT
%endif

;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_flt(int16_t **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------

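; Same 2-to-1 mix, but with s16p input and float coefficients: each iteration
; loads mmsize bytes (8 s16 samples per xmm register), sign-extends to s32 in
; two halves via S16_TO_S32_SX, converts to float, scales and sums in float,
; then converts back with cvtps2dq (current rounding mode, nearest by
; default) and repacks with signed saturation via packssdw.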
%macro MIX_2_TO_1_S16P_FLT 0
cglobal mix_2_to_1_s16p_flt, 3,4,6, src, matrix, len, src1
    mov       src1q, [srcq+gprsize]
    mov        srcq, [srcq]
    sub       src1q, srcq
    mov     matrixq, [matrixq  ]
    VBROADCASTSS m4, [matrixq  ]
    VBROADCASTSS m5, [matrixq+4]
    ALIGN 16
.loop:
    mova         m0, [srcq      ]
    mova         m2, [srcq+src1q]
    S16_TO_S32_SX 0, 1
    S16_TO_S32_SX 2, 3
    cvtdq2ps     m0, m0
    cvtdq2ps     m1, m1
    cvtdq2ps     m2, m2
    cvtdq2ps     m3, m3
    mulps        m0, m4
    mulps        m1, m4
    mulps        m2, m5
    mulps        m3, m5
    addps        m0, m2
    addps        m1, m3
    cvtps2dq     m0, m0
    cvtps2dq     m1, m1
    packssdw     m0, m1
    mova     [srcq], m0
    add        srcq, mmsize
    sub        lend, mmsize/2
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
MIX_2_TO_1_S16P_FLT
INIT_XMM sse4
MIX_2_TO_1_S16P_FLT

;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len,
;                            int out_ch, int in_ch);
;-----------------------------------------------------------------------------

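; Fixed-point variant: the matrix coefficients are signed Q8 s16 values
; (8 fractional bits), so the mix can stay in integer SIMD. The setup
; broadcasts coefficient 0 into m4 and coefficient 1 into m5, then
; interleaves each with zero words, giving dword lanes of the form [c, 0].
; In the loop each sample word is duplicated to [s, s], so pmaddwd computes
; s*c + s*0, a full 32-bit product per lane with no overflow. After summing
; both channel contributions, psrad by 8 drops the Q8 fraction and packssdw
; saturates back to s16.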
INIT_XMM sse2
cglobal mix_2_to_1_s16p_q8, 3,4,6, src, matrix, len, src1
    mov       src1q, [srcq+gprsize]
    mov        srcq, [srcq]
    sub       src1q, srcq
    mov     matrixq, [matrixq]
    movd         m4, [matrixq]
    movd         m5, [matrixq]
    SPLATW       m4, m4, 0
    SPLATW       m5, m5, 1
    pxor         m0, m0
    punpcklwd    m4, m0
    punpcklwd    m5, m0
    ALIGN 16
.loop:
    mova         m0, [srcq      ]
    mova         m2, [srcq+src1q]
    punpckhwd    m1, m0, m0
    punpcklwd    m0, m0
    punpckhwd    m3, m2, m2
    punpcklwd    m2, m2
    pmaddwd      m0, m4
    pmaddwd      m1, m4
    pmaddwd      m2, m5
    pmaddwd      m3, m5
    paddd        m0, m2
    paddd        m1, m3
    psrad        m0, 8
    psrad        m1, 8
    packssdw     m0, m1
    mova     [srcq], m0
    add        srcq, mmsize
    sub        lend, mmsize/2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_mix_1_to_2_fltp_flt(float **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------

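; 1-to-2 upmix: one input channel is scaled by two different coefficients
; (one per output matrix row) and written to two output channels. src1q again
; holds the byte offset of the second channel relative to the first, and the
; first output overwrites the input in place.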
%macro MIX_1_TO_2_FLTP_FLT 0
cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q]
    sub       src1q, src0q
    mov    matrix1q, [matrix0q+gprsize]
    mov    matrix0q, [matrix0q]
    VBROADCASTSS m2, [matrix0q]
    VBROADCASTSS m3, [matrix1q]
    ALIGN 16
.loop:
    mova         m0, [src0q]
    mulps        m1, m0, m3
    mulps        m0, m0, m2
    mova  [src0q      ], m0
    mova  [src0q+src1q], m1
    add       src0q, mmsize
    sub        lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse
MIX_1_TO_2_FLTP_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX_1_TO_2_FLTP_FLT
%endif

;-----------------------------------------------------------------------------
; void ff_mix_1_to_2_s16p_flt(int16_t **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------

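; s16p version of the 1-to-2 upmix: sign-extend to s32, convert to float,
; apply both coefficients, convert back, repack. The packssdw pairing
; (m0 with m2, m1 with m3) restores sample order, because S16_TO_S32_SX
; split the input into its low half (m0) and high half (m2).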
%macro MIX_1_TO_2_S16P_FLT 0
cglobal mix_1_to_2_s16p_flt, 3,5,6, src0, matrix0, len, src1, matrix1
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q]
    sub       src1q, src0q
    mov    matrix1q, [matrix0q+gprsize]
    mov    matrix0q, [matrix0q]
    VBROADCASTSS m4, [matrix0q]
    VBROADCASTSS m5, [matrix1q]
    ALIGN 16
.loop:
    mova         m0, [src0q]
    S16_TO_S32_SX 0, 2
    cvtdq2ps     m0, m0
    cvtdq2ps     m2, m2
    mulps        m1, m0, m5
    mulps        m0, m0, m4
    mulps        m3, m2, m5
    mulps        m2, m2, m4
    cvtps2dq     m0, m0
    cvtps2dq     m1, m1
    cvtps2dq     m2, m2
    cvtps2dq     m3, m3
    packssdw     m0, m2
    packssdw     m1, m3
    mova  [src0q      ], m0
    mova  [src0q+src1q], m1
    add       src0q, mmsize
    sub        lend, mmsize/2
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
MIX_1_TO_2_S16P_FLT
INIT_XMM sse4
MIX_1_TO_2_S16P_FLT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
MIX_1_TO_2_S16P_FLT
%endif

;-----------------------------------------------------------------------------
; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix,
;                                      int len, int out_ch, int in_ch);
;-----------------------------------------------------------------------------

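; Generic n-to-1/2 mix for 3-8 input channels. The macro specializes at
; assembly time: in_channels, out_channels and the sample format are macro
; parameters, so every combination becomes its own flat loop with the matrix
; row(s) pre-broadcast. Coefficients live in SIMD registers when enough are
; free after reserving scratch registers (needed_mmregs); any overflow is
; spilled to the stack, and on x86-32 with 7-8 inputs some src pointers are
; spilled as well, since too few GPRs are available. fmaddps is a util.asm
; helper that presumably emits a true fused multiply-add on FMA4 and a
; mulps+addps pair otherwise, with its fifth operand as scratch.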
%macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp
; define some names to make the code clearer
%assign  in_channels %1
%assign out_channels %2
%assign stereo out_channels - 1
%ifidn %3, s16p
    %assign is_s16 1
%else
    %assign is_s16 0
%endif

; determine how many matrix elements must go on the stack vs. mmregs
%assign matrix_elements in_channels * out_channels
%if is_s16
    %if stereo
        %assign needed_mmregs 7
    %else
        %assign needed_mmregs 5
    %endif
%else
    %if stereo
        %assign needed_mmregs 4
    %else
        %assign needed_mmregs 3
    %endif
%endif
%assign matrix_elements_mm num_mmregs - needed_mmregs
%if matrix_elements < matrix_elements_mm
    %assign matrix_elements_mm matrix_elements
%endif
%if matrix_elements_mm < matrix_elements
    %assign matrix_elements_stack matrix_elements - matrix_elements_mm
%else
    %assign matrix_elements_stack 0
%endif
%assign matrix_stack_size matrix_elements_stack * mmsize

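; Worked example of the budgeting above (a sketch, assuming x86-64 AVX, where
; x86inc defines num_mmregs = 16): 8-channel to stereo fltp has
; matrix_elements = 16 and needed_mmregs = 4, so matrix_elements_mm = 12
; coefficients stay in m4-m15 and matrix_elements_stack = 4 spill to the
; stack (matrix_stack_size = 4*32 bytes with ymm registers).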
%assign needed_stack_size -1 * matrix_stack_size
%if ARCH_X86_32 && in_channels >= 7
    %assign needed_stack_size needed_stack_size - 16
%endif

cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, needed_stack_size, src0, src1, len, src2, src3, src4, src5, src6, src7

; define src pointers on stack if needed
%if matrix_elements_stack > 0 && ARCH_X86_32 && in_channels >= 7
    %define src5m [rsp+matrix_stack_size+0]
    %define src6m [rsp+matrix_stack_size+4]
    %define src7m [rsp+matrix_stack_size+8]
%endif

; load matrix pointers
%define matrix0q r1q
%define matrix1q r3q
%if stereo
    mov      matrix1q, [matrix0q+gprsize]
%endif
    mov      matrix0q, [matrix0q]

; define matrix coeff names
%assign %%i 0
%assign %%j needed_mmregs
%rep in_channels
    %if %%i >= matrix_elements_mm
        CAT_XDEFINE mx_stack_0_, %%i, 1
        CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
    %else
        CAT_XDEFINE mx_stack_0_, %%i, 0
        CAT_XDEFINE mx_0_, %%i, m %+ %%j
        %assign %%j %%j+1
    %endif
    %assign %%i %%i+1
%endrep
%if stereo
    %assign %%i 0
    %rep in_channels
        %if in_channels + %%i >= matrix_elements_mm
            CAT_XDEFINE mx_stack_1_, %%i, 1
            CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
        %else
            CAT_XDEFINE mx_stack_1_, %%i, 0
            CAT_XDEFINE mx_1_, %%i, m %+ %%j
            %assign %%j %%j+1
        %endif
        %assign %%i %%i+1
    %endrep
%endif

; load/splat matrix coeffs
%assign %%i 0
%rep in_channels
    %if mx_stack_0_ %+ %%i
        VBROADCASTSS m0, [matrix0q+4*%%i]
        mova  mx_0_ %+ %%i, m0
    %else
        VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
    %endif
    %if stereo
        %if mx_stack_1_ %+ %%i
            VBROADCASTSS m0, [matrix1q+4*%%i]
            mova  mx_1_ %+ %%i, m0
        %else
            VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
        %endif
    %endif
    %assign %%i %%i+1
%endrep

; load channel pointers to registers as offsets from the first channel pointer
%if ARCH_X86_64
    movsxd       lenq, r2d
%endif
    shl          lenq, 2-is_s16
%assign %%i 1
%rep (in_channels - 1)
    %if ARCH_X86_32 && in_channels >= 7 && %%i >= 5
    mov         src5q, [src0q+%%i*gprsize]
    add         src5q, lenq
    mov          src %+ %%i %+ m, src5q
    %else
    mov          src %+ %%i %+ q, [src0q+%%i*gprsize]
    add          src %+ %%i %+ q, lenq
    %endif
    %assign %%i %%i+1
%endrep
    mov         src0q, [src0q]
    add         src0q, lenq
    neg          lenq
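; Each pointer now points one full buffer past its channel's data and lenq is
; the negative byte length, so [srcNq+lenq] walks forward as lenq rises
; toward zero; the add/jl pair at the bottom of the loop is both the index
; update and the loop test.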
.loop:
; for x86-32 with 7-8 channels we do not have enough gp registers for all src
; pointers, so we have to load some of them from the stack each time
%define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
%if is_s16
    ; mix with s16p input
    mova         m0, [src0q+lenq]
    S16_TO_S32_SX 0, 1
    cvtdq2ps     m0, m0
    cvtdq2ps     m1, m1
    %if stereo
    mulps        m2, m0, mx_1_0
    mulps        m3, m1, mx_1_0
    %endif
    mulps        m0, m0, mx_0_0
    mulps        m1, m1, mx_0_0
%assign %%i 1
%rep (in_channels - 1)
    %if copy_src_from_stack
        %define src_ptr src5q
    %else
        %define src_ptr src %+ %%i %+ q
    %endif
    %if stereo
    %if copy_src_from_stack
    mov     src_ptr, src %+ %%i %+ m
    %endif
    mova         m4, [src_ptr+lenq]
    S16_TO_S32_SX 4, 5
    cvtdq2ps     m4, m4
    cvtdq2ps     m5, m5
    fmaddps      m2, m4, mx_1_ %+ %%i, m2, m6
    fmaddps      m3, m5, mx_1_ %+ %%i, m3, m6
    fmaddps      m0, m4, mx_0_ %+ %%i, m0, m4
    fmaddps      m1, m5, mx_0_ %+ %%i, m1, m5
    %else
    %if copy_src_from_stack
    mov     src_ptr, src %+ %%i %+ m
    %endif
    mova         m2, [src_ptr+lenq]
    S16_TO_S32_SX 2, 3
    cvtdq2ps     m2, m2
    cvtdq2ps     m3, m3
    fmaddps      m0, m2, mx_0_ %+ %%i, m0, m4
    fmaddps      m1, m3, mx_0_ %+ %%i, m1, m4
    %endif
    %assign %%i %%i+1
%endrep
    %if stereo
    cvtps2dq     m2, m2
    cvtps2dq     m3, m3
    packssdw     m2, m3
    mova [src1q+lenq], m2
    %endif
    cvtps2dq     m0, m0
    cvtps2dq     m1, m1
    packssdw     m0, m1
    mova [src0q+lenq], m0
%else
    ; mix with fltp input
    %if stereo || mx_stack_0_0
    mova         m0, [src0q+lenq]
    %endif
    %if stereo
    mulps        m1, m0, mx_1_0
    %endif
    %if stereo || mx_stack_0_0
    mulps        m0, m0, mx_0_0
    %else
    mulps        m0, [src0q+lenq], mx_0_0
    %endif
%assign %%i 1
%rep (in_channels - 1)
    %if copy_src_from_stack
        %define src_ptr src5q
    mov     src_ptr, src %+ %%i %+ m
    %else
        %define src_ptr src %+ %%i %+ q
    %endif
    ; avoid extra load for mono if matrix is in a mm register
    %if stereo || mx_stack_0_ %+ %%i
    mova         m2, [src_ptr+lenq]
    %endif
    %if stereo
    fmaddps      m1, m2, mx_1_ %+ %%i, m1, m3
    %endif
    %if stereo || mx_stack_0_ %+ %%i
    fmaddps      m0, m2, mx_0_ %+ %%i, m0, m2
    %else
    fmaddps      m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
    %endif
    %assign %%i %%i+1
%endrep
    mova [src0q+lenq], m0
    %if stereo
    mova [src1q+lenq], m1
    %endif
%endif

    add        lenq, mmsize
    jl .loop
; zero ymm high halves
%if mmsize == 32
    vzeroupper
%endif
    RET
%endmacro
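
; Instantiate the whole matrix of functions: for each input channel count
; from 3 to 8, mono and stereo variants are built for fltp (sse, avx, fma4)
; and s16p (sse2, sse4, avx, fma4). The avx/fma4 fltp versions use ymm
; registers, except on x86-32 with 6 or more channels, where stack alignment
; for spilled 32-byte coefficients cannot be guaranteed and they fall back to
; xmm; the s16p versions always use xmm.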

%macro MIX_3_8_TO_1_2_FLT_FUNCS 0
%assign %%i 3
%rep 6
    INIT_XMM sse
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM sse2
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    INIT_XMM sse4
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    ; do not use ymm AVX or FMA4 in x86-32 for 6 or more channels due to stack alignment issues
    %if HAVE_AVX_EXTERNAL
    %if ARCH_X86_64 || %%i < 6
        INIT_YMM avx
    %else
        INIT_XMM avx
    %endif
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM avx
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    %endif
    %if HAVE_FMA4_EXTERNAL
    %if ARCH_X86_64 || %%i < 6
        INIT_YMM fma4
    %else
        INIT_XMM fma4
    %endif
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM fma4
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    %endif
    %assign %%i %%i+1
%endrep
%endmacro

MIX_3_8_TO_1_2_FLT_FUNCS