annotate ffmpeg/libavresample/x86/audio_convert.asm @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents f445c3017523
children
rev   line source
yading@11 1 ;******************************************************************************
yading@11 2 ;* x86 optimized Format Conversion Utils
yading@11 3 ;* Copyright (c) 2008 Loren Merritt
yading@11 4 ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
yading@11 5 ;*
yading@11 6 ;* This file is part of Libav.
yading@11 7 ;*
yading@11 8 ;* Libav is free software; you can redistribute it and/or
yading@11 9 ;* modify it under the terms of the GNU Lesser General Public
yading@11 10 ;* License as published by the Free Software Foundation; either
yading@11 11 ;* version 2.1 of the License, or (at your option) any later version.
yading@11 12 ;*
yading@11 13 ;* Libav is distributed in the hope that it will be useful,
yading@11 14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@11 15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@11 16 ;* Lesser General Public License for more details.
yading@11 17 ;*
yading@11 18 ;* You should have received a copy of the GNU Lesser General Public
yading@11 19 ;* License along with Libav; if not, write to the Free Software
yading@11 20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@11 21 ;******************************************************************************
yading@11 22
yading@11 23 %include "libavutil/x86/x86util.asm"
yading@11 24 %include "util.asm"
yading@11 25
yading@11 26 SECTION_RODATA 32
yading@11 27 
; Scale factors, written as raw IEEE-754 single-precision bit patterns:
;   pf_s32_inv_scale = 0x30000000 = 2^-31   (s32 sample -> float in [-1.0,1.0))
;   pf_s32_scale     = 0x4f000000 = 2^31    (float -> s32 range)
;   pf_s32_clip      = 0x4effffff ~ 2147483520.0, the largest float < 2^31;
;                      used to clamp before cvtps2dq so positive overflow does
;                      not wrap to 0x80000000 (INT32_MIN)
;   pf_s16_inv_scale = 0x38000000 = 2^-15   (s16 sample -> float)
;   pf_s16_scale     = 0x47000000 = 2^15    (float -> s16 range)
yading@11 28 pf_s32_inv_scale: times 8 dd 0x30000000
yading@11 29 pf_s32_scale: times 8 dd 0x4f000000
yading@11 30 pf_s32_clip: times 8 dd 0x4effffff
yading@11 31 pf_s16_inv_scale: times 4 dd 0x38000000
yading@11 32 pf_s16_scale: times 4 dd 0x47000000
; pshufb masks that move packed s16 words into the high halves of dwords
; (the -1 bytes select zero), used by the ssse3 s16p->flt conversions below.
yading@11 33 pb_shuf_unpack_even: db -1, -1, 0, 1, -1, -1, 2, 3, -1, -1, 8, 9, -1, -1, 10, 11
yading@11 34 pb_shuf_unpack_odd: db -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 12, 13, -1, -1, 14, 15
; Word-granularity pshufb masks (SHUFFLE_MASK_W expands word indices to a
; byte-shuffle mask); pb_interleave_words is used by the ssse3
; fltp->s16 2-channel conversion below.
yading@11 35 pb_interleave_words: SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7
yading@11 36 pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7
; Mask with even words zero / odd words all-ones.
; NOTE(review): not referenced in this chunk - presumably used by code
; later in the file; confirm before removing.
yading@11 37 pw_zero_even: times 4 dw 0x0000, 0xffff
yading@11 38 
yading@11 39 SECTION_TEXT
yading@11 40
yading@11 41 ;------------------------------------------------------------------------------
yading@11 42 ; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
yading@11 43 ;------------------------------------------------------------------------------
yading@11 44 
; Widen packed s16 samples to s32.  Interleaving zeros *below* each word
; (pxor + punpck[lh]wd with the zeroed register as the destination) yields
; sample << 16, i.e. the s16 value rescaled to full s32 range.
; Both pointers are pre-advanced to the end of their arrays and lenq is
; negated, so a single add/jl pair both indexes and terminates the loop.
yading@11 45 INIT_XMM sse2
yading@11 46 cglobal conv_s16_to_s32, 3,3,3, dst, src, len
yading@11 47 lea lenq, [2*lend]
yading@11 48 lea dstq, [dstq+2*lenq]
yading@11 49 add srcq, lenq
yading@11 50 neg lenq
yading@11 51 .loop:
yading@11 52 mova m2, [srcq+lenq]
yading@11 53 pxor m0, m0
yading@11 54 pxor m1, m1
yading@11 55 punpcklwd m0, m2
yading@11 56 punpckhwd m1, m2
yading@11 57 mova [dstq+2*lenq ], m0
yading@11 58 mova [dstq+2*lenq+mmsize], m1
yading@11 59 add lenq, mmsize
yading@11 60 jl .loop
yading@11 61 REP_RET
yading@11 62
yading@11 63 ;------------------------------------------------------------------------------
yading@11 64 ; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
yading@11 65 ;------------------------------------------------------------------------------
yading@11 66 
; s16 -> float in [-1.0,1.0): S16_TO_S32_SX (defined in util.asm; the SX
; suffix and the 0,1 outputs suggest it sign-extends the 8 words of m0
; into the two dword vectors m0/m1 - confirm against util.asm) followed by
; int->float conversion and scaling by pf_s16_inv_scale (2^-15 = 1/32768).
yading@11 67 %macro CONV_S16_TO_FLT 0
yading@11 68 cglobal conv_s16_to_flt, 3,3,3, dst, src, len
yading@11 69 lea lenq, [2*lend]
yading@11 70 add srcq, lenq
yading@11 71 lea dstq, [dstq + 2*lenq]
yading@11 72 neg lenq
yading@11 73 mova m2, [pf_s16_inv_scale]
yading@11 74 ALIGN 16
yading@11 75 .loop:
yading@11 76 mova m0, [srcq+lenq]
yading@11 77 S16_TO_S32_SX 0, 1
yading@11 78 cvtdq2ps m0, m0
yading@11 79 cvtdq2ps m1, m1
yading@11 80 mulps m0, m2
yading@11 81 mulps m1, m2
yading@11 82 mova [dstq+2*lenq ], m0
yading@11 83 mova [dstq+2*lenq+mmsize], m1
yading@11 84 add lenq, mmsize
yading@11 85 jl .loop
yading@11 86 REP_RET
yading@11 87 %endmacro
yading@11 88 
yading@11 89 INIT_XMM sse2
yading@11 90 CONV_S16_TO_FLT
yading@11 91 INIT_XMM sse4
yading@11 92 CONV_S16_TO_FLT
yading@11 93
yading@11 94 ;------------------------------------------------------------------------------
yading@11 95 ; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
yading@11 96 ;------------------------------------------------------------------------------
yading@11 97 
; s32 -> s16 by keeping the high 16 bits of each sample (psrad 16).  After
; the arithmetic shift every value already fits in 16 bits, so the
; packssdw never saturates here - it is used purely to narrow and merge
; two dword vectors into one word vector, preserving sample order.
yading@11 98 %macro CONV_S32_TO_S16 0
yading@11 99 cglobal conv_s32_to_s16, 3,3,4, dst, src, len
yading@11 100 lea lenq, [2*lend]
yading@11 101 lea srcq, [srcq+2*lenq]
yading@11 102 add dstq, lenq
yading@11 103 neg lenq
yading@11 104 .loop:
yading@11 105 mova m0, [srcq+2*lenq ]
yading@11 106 mova m1, [srcq+2*lenq+ mmsize]
yading@11 107 mova m2, [srcq+2*lenq+2*mmsize]
yading@11 108 mova m3, [srcq+2*lenq+3*mmsize]
yading@11 109 psrad m0, 16
yading@11 110 psrad m1, 16
yading@11 111 psrad m2, 16
yading@11 112 psrad m3, 16
yading@11 113 packssdw m0, m1
yading@11 114 packssdw m2, m3
yading@11 115 mova [dstq+lenq ], m0
yading@11 116 mova [dstq+lenq+mmsize], m2
yading@11 117 add lenq, mmsize*2
yading@11 118 jl .loop
yading@11 119 %if mmsize == 8
; MMX build: clear the x87/MMX state before returning to C code.
yading@11 120 emms
yading@11 121 RET
yading@11 122 %else
yading@11 123 REP_RET
yading@11 124 %endif
yading@11 125 %endmacro
yading@11 126 
yading@11 127 INIT_MMX mmx
yading@11 128 CONV_S32_TO_S16
yading@11 129 INIT_XMM sse2
yading@11 130 CONV_S32_TO_S16
yading@11 131
yading@11 132 ;------------------------------------------------------------------------------
yading@11 133 ; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
yading@11 134 ;------------------------------------------------------------------------------
yading@11 135 
; s32 -> float in [-1.0,1.0): cvtdq2ps then multiply by 2^-31.  The
; three-operand mulps is valid for the sse2 build as well: x86inc emits
; the two-operand SSE form when destination and first source match.
yading@11 136 %macro CONV_S32_TO_FLT 0
yading@11 137 cglobal conv_s32_to_flt, 3,3,3, dst, src, len
yading@11 138 lea lenq, [4*lend]
yading@11 139 add srcq, lenq
yading@11 140 add dstq, lenq
yading@11 141 neg lenq
yading@11 142 mova m0, [pf_s32_inv_scale]
yading@11 143 ALIGN 16
yading@11 144 .loop:
yading@11 145 cvtdq2ps m1, [srcq+lenq ]
yading@11 146 cvtdq2ps m2, [srcq+lenq+mmsize]
yading@11 147 mulps m1, m1, m0
yading@11 148 mulps m2, m2, m0
yading@11 149 mova [dstq+lenq ], m1
yading@11 150 mova [dstq+lenq+mmsize], m2
yading@11 151 add lenq, mmsize*2
yading@11 152 jl .loop
yading@11 153 REP_RET
yading@11 154 %endmacro
yading@11 155 
yading@11 156 INIT_XMM sse2
yading@11 157 CONV_S32_TO_FLT
yading@11 158 %if HAVE_AVX_EXTERNAL
yading@11 159 INIT_YMM avx
yading@11 160 CONV_S32_TO_FLT
yading@11 161 %endif
yading@11 162
yading@11 163 ;------------------------------------------------------------------------------
yading@11 164 ; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
yading@11 165 ;------------------------------------------------------------------------------
yading@11 166 
; float -> s16: scale by 2^15, cvtps2dq (rounds per the current MXCSR
; mode, round-to-nearest-even by default), then packssdw saturates any
; out-of-range result to [-32768,32767] - no explicit clip is needed.
yading@11 167 INIT_XMM sse2
yading@11 168 cglobal conv_flt_to_s16, 3,3,5, dst, src, len
yading@11 169 lea lenq, [2*lend]
yading@11 170 lea srcq, [srcq+2*lenq]
yading@11 171 add dstq, lenq
yading@11 172 neg lenq
yading@11 173 mova m4, [pf_s16_scale]
yading@11 174 .loop:
yading@11 175 mova m0, [srcq+2*lenq ]
yading@11 176 mova m1, [srcq+2*lenq+1*mmsize]
yading@11 177 mova m2, [srcq+2*lenq+2*mmsize]
yading@11 178 mova m3, [srcq+2*lenq+3*mmsize]
yading@11 179 mulps m0, m4
yading@11 180 mulps m1, m4
yading@11 181 mulps m2, m4
yading@11 182 mulps m3, m4
yading@11 183 cvtps2dq m0, m0
yading@11 184 cvtps2dq m1, m1
yading@11 185 cvtps2dq m2, m2
yading@11 186 cvtps2dq m3, m3
yading@11 187 packssdw m0, m1
yading@11 188 packssdw m2, m3
yading@11 189 mova [dstq+lenq ], m0
yading@11 190 mova [dstq+lenq+mmsize], m2
yading@11 191 add lenq, mmsize*2
yading@11 192 jl .loop
yading@11 193 REP_RET
yading@11 194
yading@11 195 ;------------------------------------------------------------------------------
yading@11 196 ; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
yading@11 197 ;------------------------------------------------------------------------------
yading@11 198 
; float -> s32: scale by 2^31, clamp against pf_s32_clip (the largest
; float below 2^31), then convert.  Without the minps, any scaled value
; >= 2^31 would be turned by cvtps2dq into 0x80000000 (INT32_MIN, i.e.
; the wrong sign); negative overflow already saturates correctly to
; INT32_MIN, so only the positive side needs the explicit clip.
yading@11 199 %macro CONV_FLT_TO_S32 0
yading@11 200 cglobal conv_flt_to_s32, 3,3,6, dst, src, len
yading@11 201 lea lenq, [lend*4]
yading@11 202 add srcq, lenq
yading@11 203 add dstq, lenq
yading@11 204 neg lenq
yading@11 205 mova m4, [pf_s32_scale]
yading@11 206 mova m5, [pf_s32_clip]
yading@11 207 .loop:
yading@11 208 mulps m0, m4, [srcq+lenq ]
yading@11 209 mulps m1, m4, [srcq+lenq+1*mmsize]
yading@11 210 mulps m2, m4, [srcq+lenq+2*mmsize]
yading@11 211 mulps m3, m4, [srcq+lenq+3*mmsize]
yading@11 212 minps m0, m0, m5
yading@11 213 minps m1, m1, m5
yading@11 214 minps m2, m2, m5
yading@11 215 minps m3, m3, m5
yading@11 216 cvtps2dq m0, m0
yading@11 217 cvtps2dq m1, m1
yading@11 218 cvtps2dq m2, m2
yading@11 219 cvtps2dq m3, m3
yading@11 220 mova [dstq+lenq ], m0
yading@11 221 mova [dstq+lenq+1*mmsize], m1
yading@11 222 mova [dstq+lenq+2*mmsize], m2
yading@11 223 mova [dstq+lenq+3*mmsize], m3
yading@11 224 add lenq, mmsize*4
yading@11 225 jl .loop
yading@11 226 REP_RET
yading@11 227 %endmacro
yading@11 228 
yading@11 229 INIT_XMM sse2
yading@11 230 CONV_FLT_TO_S32
yading@11 231 %if HAVE_AVX_EXTERNAL
yading@11 232 INIT_YMM avx
yading@11 233 CONV_FLT_TO_S32
yading@11 234 %endif
yading@11 235
yading@11 236 ;------------------------------------------------------------------------------
yading@11 237 ; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
yading@11 238 ; int channels);
yading@11 239 ;------------------------------------------------------------------------------
yading@11 240 
; Interleave two s16 planes into packed stereo.  SBUTTERFLY2 (x86util)
; word-interleaves each plane-register pair; dst advances twice as fast
; as each source plane, hence the [dstq+2*lenq] addressing.
yading@11 241 %macro CONV_S16P_TO_S16_2CH 0
yading@11 242 cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
yading@11 243 mov src1q, [src0q+gprsize]
yading@11 244 mov src0q, [src0q ]
yading@11 245 lea lenq, [2*lend]
yading@11 246 add src0q, lenq
yading@11 247 add src1q, lenq
yading@11 248 lea dstq, [dstq+2*lenq]
yading@11 249 neg lenq
yading@11 250 .loop:
yading@11 251 mova m0, [src0q+lenq ]
yading@11 252 mova m1, [src1q+lenq ]
yading@11 253 mova m2, [src0q+lenq+mmsize]
yading@11 254 mova m3, [src1q+lenq+mmsize]
yading@11 255 SBUTTERFLY2 wd, 0, 1, 4
yading@11 256 SBUTTERFLY2 wd, 2, 3, 4
yading@11 257 mova [dstq+2*lenq+0*mmsize], m0
yading@11 258 mova [dstq+2*lenq+1*mmsize], m1
yading@11 259 mova [dstq+2*lenq+2*mmsize], m2
yading@11 260 mova [dstq+2*lenq+3*mmsize], m3
yading@11 261 add lenq, 2*mmsize
yading@11 262 jl .loop
yading@11 263 REP_RET
yading@11 264 %endmacro
yading@11 265 
yading@11 266 INIT_XMM sse2
yading@11 267 CONV_S16P_TO_S16_2CH
yading@11 268 %if HAVE_AVX_EXTERNAL
yading@11 269 INIT_XMM avx
yading@11 270 CONV_S16P_TO_S16_2CH
yading@11 271 %endif
yading@11 272
yading@11 273 ;------------------------------------------------------------------------------
yading@11 274 ; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
yading@11 275 ; int channels);
yading@11 276 ;------------------------------------------------------------------------------
yading@11 277 
yading@11 278 ;------------------------------------------------------------------------------
yading@11 279 ; NOTE: In the 6-channel functions, len could be used as an index on x86-64
yading@11 280 ; instead of just a counter, which would avoid incrementing the
yading@11 281 ; pointers, but the extra complexity and amount of code is not worth
yading@11 282 ; the small gain. On x86-32 there are not enough registers to use len
yading@11 283 ; as an index without keeping two of the pointers on the stack and
yading@11 284 ; loading them in each iteration.
yading@11 285 ;------------------------------------------------------------------------------
yading@11 286 
; Interleave 6 s16 planes into packed 6-channel output.  src1..src5 are
; converted to byte offsets relative to src0, so only src0q and dstq are
; advanced in the loop; on x86-32 the sample counter stays in memory
; (lend = dword r2m).  The sse2slow variant works on half registers
; (movq loads) to avoid the shuffles that are slow on those CPUs; the
; sample positions are traced element-by-element in the comments below.
yading@11 287 %macro CONV_S16P_TO_S16_6CH 0
yading@11 288 %if ARCH_X86_64
yading@11 289 cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
yading@11 290 %else
yading@11 291 cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
yading@11 292 %define lend dword r2m
yading@11 293 %endif
yading@11 294 mov src1q, [src0q+1*gprsize]
yading@11 295 mov src2q, [src0q+2*gprsize]
yading@11 296 mov src3q, [src0q+3*gprsize]
yading@11 297 mov src4q, [src0q+4*gprsize]
yading@11 298 mov src5q, [src0q+5*gprsize]
yading@11 299 mov src0q, [src0q]
yading@11 300 sub src1q, src0q
yading@11 301 sub src2q, src0q
yading@11 302 sub src3q, src0q
yading@11 303 sub src4q, src0q
yading@11 304 sub src5q, src0q
yading@11 305 .loop:
yading@11 306 %if cpuflag(sse2slow)
yading@11 307 movq m0, [src0q ] ; m0 = 0, 6, 12, 18, x, x, x, x
yading@11 308 movq m1, [src0q+src1q] ; m1 = 1, 7, 13, 19, x, x, x, x
yading@11 309 movq m2, [src0q+src2q] ; m2 = 2, 8, 14, 20, x, x, x, x
yading@11 310 movq m3, [src0q+src3q] ; m3 = 3, 9, 15, 21, x, x, x, x
yading@11 311 movq m4, [src0q+src4q] ; m4 = 4, 10, 16, 22, x, x, x, x
yading@11 312 movq m5, [src0q+src5q] ; m5 = 5, 11, 17, 23, x, x, x, x
yading@11 313 ; unpack words:
yading@11 314 punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
yading@11 315 punpcklwd m2, m3 ; m2 = 4, 5, 10, 11, 16, 17, 22, 23
yading@11 316 punpcklwd m4, m5 ; m4 = 2, 3, 8, 9, 14, 15, 20, 21
yading@11 317 ; blend dwords
yading@11 318 shufps m1, m0, m2, q2020 ; m1 = 0, 1, 12, 13, 2, 3, 14, 15
yading@11 319 shufps m0, m4, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
yading@11 320 shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
yading@11 321 ; shuffle dwords
yading@11 322 pshufd m0, m0, q1302 ; m0 = 4, 5, 6, 7, 16, 17, 18, 19
yading@11 323 pshufd m1, m1, q3120 ; m1 = 0, 1, 2, 3, 12, 13, 14, 15
yading@11 324 pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
yading@11 325 movq [dstq+0*mmsize/2], m1
yading@11 326 movq [dstq+1*mmsize/2], m0
yading@11 327 movq [dstq+2*mmsize/2], m2
yading@11 328 movhps [dstq+3*mmsize/2], m1
yading@11 329 movhps [dstq+4*mmsize/2], m0
yading@11 330 movhps [dstq+5*mmsize/2], m2
yading@11 331 add src0q, mmsize/2
yading@11 332 add dstq, mmsize*3
yading@11 333 sub lend, mmsize/4
yading@11 334 %else
yading@11 335 mova m0, [src0q ] ; m0 = 0, 6, 12, 18, 24, 30, 36, 42
yading@11 336 mova m1, [src0q+src1q] ; m1 = 1, 7, 13, 19, 25, 31, 37, 43
yading@11 337 mova m2, [src0q+src2q] ; m2 = 2, 8, 14, 20, 26, 32, 38, 44
yading@11 338 mova m3, [src0q+src3q] ; m3 = 3, 9, 15, 21, 27, 33, 39, 45
yading@11 339 mova m4, [src0q+src4q] ; m4 = 4, 10, 16, 22, 28, 34, 40, 46
yading@11 340 mova m5, [src0q+src5q] ; m5 = 5, 11, 17, 23, 29, 35, 41, 47
yading@11 341 ; unpack words:
yading@11 342 SBUTTERFLY2 wd, 0, 1, 6 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
yading@11 343 ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
yading@11 344 SBUTTERFLY2 wd, 2, 3, 6 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
yading@11 345 ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
yading@11 346 SBUTTERFLY2 wd, 4, 5, 6 ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
yading@11 347 ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
yading@11 348 ; blend dwords
yading@11 349 shufps m6, m0, m2, q2020 ; m6 = 0, 1, 12, 13, 2, 3, 14, 15
yading@11 350 shufps m0, m4, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
yading@11 351 shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
yading@11 352 SWAP 4,6 ; m4 = 0, 1, 12, 13, 2, 3, 14, 15
yading@11 353 shufps m6, m1, m3, q2020 ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
yading@11 354 shufps m1, m5, q2031 ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
yading@11 355 shufps m3, m5, q3131 ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
yading@11 356 SWAP 5,6 ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
yading@11 357 ; shuffle dwords
yading@11 358 pshufd m0, m0, q1302 ; m0 = 4, 5, 6, 7, 16, 17, 18, 19
yading@11 359 pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
yading@11 360 pshufd m4, m4, q3120 ; m4 = 0, 1, 2, 3, 12, 13, 14, 15
yading@11 361 pshufd m1, m1, q1302 ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
yading@11 362 pshufd m3, m3, q3120 ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
yading@11 363 pshufd m5, m5, q3120 ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
yading@11 364 ; shuffle qwords
yading@11 365 punpcklqdq m6, m4, m0 ; m6 = 0, 1, 2, 3, 4, 5, 6, 7
yading@11 366 punpckhqdq m0, m2 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
yading@11 367 shufps m2, m4, q3210 ; m2 = 8, 9, 10, 11, 12, 13, 14, 15
yading@11 368 SWAP 4,6 ; m4 = 0, 1, 2, 3, 4, 5, 6, 7
yading@11 369 punpcklqdq m6, m5, m1 ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
yading@11 370 punpckhqdq m1, m3 ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
yading@11 371 shufps m3, m5, q3210 ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
yading@11 372 SWAP 5,6 ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
yading@11 373 mova [dstq+0*mmsize], m4
yading@11 374 mova [dstq+1*mmsize], m2
yading@11 375 mova [dstq+2*mmsize], m0
yading@11 376 mova [dstq+3*mmsize], m5
yading@11 377 mova [dstq+4*mmsize], m3
yading@11 378 mova [dstq+5*mmsize], m1
yading@11 379 add src0q, mmsize
yading@11 380 add dstq, mmsize*6
yading@11 381 sub lend, mmsize/2
yading@11 382 %endif
yading@11 383 jg .loop
yading@11 384 REP_RET
yading@11 385 %endmacro
yading@11 386 
yading@11 387 INIT_XMM sse2
yading@11 388 CONV_S16P_TO_S16_6CH
yading@11 389 INIT_XMM sse2slow
yading@11 390 CONV_S16P_TO_S16_6CH
yading@11 391 %if HAVE_AVX_EXTERNAL
yading@11 392 INIT_XMM avx
yading@11 393 CONV_S16P_TO_S16_6CH
yading@11 394 %endif
yading@11 395
yading@11 396 ;------------------------------------------------------------------------------
yading@11 397 ; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
yading@11 398 ; int channels);
yading@11 399 ;------------------------------------------------------------------------------
yading@11 400 
; 2-channel s16p -> packed float.  The planes are word-interleaved, then
; each word is placed in the *high* half of a dword (punpck with zeros in
; the low half), i.e. sample << 16; scaling by pf_s32_inv_scale (2^-31)
; is therefore equivalent to dividing the s16 sample by 2^15.
yading@11 401 %macro CONV_S16P_TO_FLT_2CH 0
yading@11 402 cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
yading@11 403 lea lenq, [2*lend]
yading@11 404 mov src1q, [src0q+gprsize]
yading@11 405 mov src0q, [src0q ]
yading@11 406 lea dstq, [dstq+4*lenq]
yading@11 407 add src0q, lenq
yading@11 408 add src1q, lenq
yading@11 409 neg lenq
yading@11 410 mova m5, [pf_s32_inv_scale]
yading@11 411 .loop:
yading@11 412 mova m2, [src0q+lenq] ; m2 = 0, 2, 4, 6, 8, 10, 12, 14
yading@11 413 mova m4, [src1q+lenq] ; m4 = 1, 3, 5, 7, 9, 11, 13, 15
yading@11 414 SBUTTERFLY2 wd, 2, 4, 3 ; m2 = 0, 1, 2, 3, 4, 5, 6, 7
yading@11 415 ; m4 = 8, 9, 10, 11, 12, 13, 14, 15
yading@11 416 pxor m3, m3
yading@11 417 punpcklwd m0, m3, m2 ; m0 = 0, 1, 2, 3
yading@11 418 punpckhwd m1, m3, m2 ; m1 = 4, 5, 6, 7
yading@11 419 punpcklwd m2, m3, m4 ; m2 = 8, 9, 10, 11
yading@11 420 punpckhwd m3, m4 ; m3 = 12, 13, 14, 15
yading@11 421 cvtdq2ps m0, m0
yading@11 422 cvtdq2ps m1, m1
yading@11 423 cvtdq2ps m2, m2
yading@11 424 cvtdq2ps m3, m3
yading@11 425 mulps m0, m5
yading@11 426 mulps m1, m5
yading@11 427 mulps m2, m5
yading@11 428 mulps m3, m5
yading@11 429 mova [dstq+4*lenq ], m0
yading@11 430 mova [dstq+4*lenq+ mmsize], m1
yading@11 431 mova [dstq+4*lenq+2*mmsize], m2
yading@11 432 mova [dstq+4*lenq+3*mmsize], m3
yading@11 433 add lenq, mmsize
yading@11 434 jl .loop
yading@11 435 REP_RET
yading@11 436 %endmacro
yading@11 437 
yading@11 438 INIT_XMM sse2
yading@11 439 CONV_S16P_TO_FLT_2CH
yading@11 440 %if HAVE_AVX_EXTERNAL
yading@11 441 INIT_XMM avx
yading@11 442 CONV_S16P_TO_FLT_2CH
yading@11 443 %endif
yading@11 444
yading@11 445 ;------------------------------------------------------------------------------
yading@11 446 ; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
yading@11 447 ; int channels);
yading@11 448 ;------------------------------------------------------------------------------
yading@11 449 
; 6-channel s16p -> packed float.  src1..src5 are converted to byte
; offsets relative to srcq so only srcq and dstq advance in the loop.
; The ssse3 path uses the pb_shuf_unpack_* pshufb masks to expand the
; interleaved words straight into dword lanes; the sse2 path does the
; same with punpck against a zero register.
;
; FIX: declare 9 xmm registers on x86-64, not 8 - the ssse3 path keeps
; the odd-unpack mask in m8 (see %define unpack_odd below), and the
; declared count is what makes x86inc save xmm8 in the prologue on
; Win64, where xmm6-xmm15 are callee-saved.  With a count of 8, xmm8
; was silently clobbered on Win64.
yading@11 450 %macro CONV_S16P_TO_FLT_6CH 0
yading@11 451 %if ARCH_X86_64
yading@11 452 cglobal conv_s16p_to_flt_6ch, 3,8,9, dst, src, len, src1, src2, src3, src4, src5
yading@11 453 %else
yading@11 454 cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
yading@11 455 %define lend dword r2m
yading@11 456 %endif
yading@11 457 mov src1q, [srcq+1*gprsize]
yading@11 458 mov src2q, [srcq+2*gprsize]
yading@11 459 mov src3q, [srcq+3*gprsize]
yading@11 460 mov src4q, [srcq+4*gprsize]
yading@11 461 mov src5q, [srcq+5*gprsize]
yading@11 462 mov srcq, [srcq]
yading@11 463 sub src1q, srcq
yading@11 464 sub src2q, srcq
yading@11 465 sub src3q, srcq
yading@11 466 sub src4q, srcq
yading@11 467 sub src5q, srcq
yading@11 468 mova m7, [pf_s32_inv_scale]
yading@11 469 %if cpuflag(ssse3)
yading@11 470 %define unpack_even m6
yading@11 471 mova m6, [pb_shuf_unpack_even]
yading@11 472 %if ARCH_X86_64
; On x86-64 the odd mask can live in a register (m8); on x86-32 only 8
; xmm registers exist, so it is used as a memory operand instead.
yading@11 473 %define unpack_odd m8
yading@11 474 mova m8, [pb_shuf_unpack_odd]
yading@11 475 %else
yading@11 476 %define unpack_odd [pb_shuf_unpack_odd]
yading@11 477 %endif
yading@11 478 %endif
yading@11 479 .loop:
yading@11 480 movq m0, [srcq ] ; m0 = 0, 6, 12, 18, x, x, x, x
yading@11 481 movq m1, [srcq+src1q] ; m1 = 1, 7, 13, 19, x, x, x, x
yading@11 482 movq m2, [srcq+src2q] ; m2 = 2, 8, 14, 20, x, x, x, x
yading@11 483 movq m3, [srcq+src3q] ; m3 = 3, 9, 15, 21, x, x, x, x
yading@11 484 movq m4, [srcq+src4q] ; m4 = 4, 10, 16, 22, x, x, x, x
yading@11 485 movq m5, [srcq+src5q] ; m5 = 5, 11, 17, 23, x, x, x, x
yading@11 486 ; unpack words:
yading@11 487 punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
yading@11 488 punpcklwd m2, m3 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
yading@11 489 punpcklwd m4, m5 ; m4 = 4, 5, 10, 11, 16, 17, 22, 23
yading@11 490 ; blend dwords
yading@11 491 shufps m1, m4, m0, q3120 ; m1 = 4, 5, 16, 17, 6, 7, 18, 19
yading@11 492 shufps m0, m2, q2020 ; m0 = 0, 1, 12, 13, 2, 3, 14, 15
yading@11 493 shufps m2, m4, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
yading@11 494 %if cpuflag(ssse3)
yading@11 495 pshufb m3, m0, unpack_odd ; m3 = 12, 13, 14, 15
yading@11 496 pshufb m0, unpack_even ; m0 = 0, 1, 2, 3
yading@11 497 pshufb m4, m1, unpack_odd ; m4 = 16, 17, 18, 19
yading@11 498 pshufb m1, unpack_even ; m1 = 4, 5, 6, 7
yading@11 499 pshufb m5, m2, unpack_odd ; m5 = 20, 21, 22, 23
yading@11 500 pshufb m2, unpack_even ; m2 = 8, 9, 10, 11
yading@11 501 %else
yading@11 502 ; shuffle dwords
yading@11 503 pshufd m0, m0, q3120 ; m0 = 0, 1, 2, 3, 12, 13, 14, 15
yading@11 504 pshufd m1, m1, q3120 ; m1 = 4, 5, 6, 7, 16, 17, 18, 19
yading@11 505 pshufd m2, m2, q3120 ; m2 = 8, 9, 10, 11, 20, 21, 22, 23
yading@11 506 pxor m6, m6 ; convert s16 in m0-m2 to s32 in m0-m5
yading@11 507 punpcklwd m3, m6, m0 ; m3 = 0, 1, 2, 3
yading@11 508 punpckhwd m4, m6, m0 ; m4 = 12, 13, 14, 15
yading@11 509 punpcklwd m0, m6, m1 ; m0 = 4, 5, 6, 7
yading@11 510 punpckhwd m5, m6, m1 ; m5 = 16, 17, 18, 19
yading@11 511 punpcklwd m1, m6, m2 ; m1 = 8, 9, 10, 11
yading@11 512 punpckhwd m6, m2 ; m6 = 20, 21, 22, 23
yading@11 513 SWAP 6,2,1,0,3,4,5 ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
yading@11 514 %endif
yading@11 515 cvtdq2ps m0, m0 ; convert s32 to float
yading@11 516 cvtdq2ps m1, m1
yading@11 517 cvtdq2ps m2, m2
yading@11 518 cvtdq2ps m3, m3
yading@11 519 cvtdq2ps m4, m4
yading@11 520 cvtdq2ps m5, m5
yading@11 521 mulps m0, m7 ; scale float from s32 range to [-1.0,1.0]
yading@11 522 mulps m1, m7
yading@11 523 mulps m2, m7
yading@11 524 mulps m3, m7
yading@11 525 mulps m4, m7
yading@11 526 mulps m5, m7
yading@11 527 mova [dstq ], m0
yading@11 528 mova [dstq+ mmsize], m1
yading@11 529 mova [dstq+2*mmsize], m2
yading@11 530 mova [dstq+3*mmsize], m3
yading@11 531 mova [dstq+4*mmsize], m4
yading@11 532 mova [dstq+5*mmsize], m5
yading@11 533 add srcq, mmsize/2
yading@11 534 add dstq, mmsize*6
yading@11 535 sub lend, mmsize/4
yading@11 536 jg .loop
yading@11 537 REP_RET
yading@11 538 %endmacro
yading@11 539 
yading@11 540 INIT_XMM sse2
yading@11 541 CONV_S16P_TO_FLT_6CH
yading@11 542 INIT_XMM ssse3
yading@11 543 CONV_S16P_TO_FLT_6CH
yading@11 544 %if HAVE_AVX_EXTERNAL
yading@11 545 INIT_XMM avx
yading@11 546 CONV_S16P_TO_FLT_6CH
yading@11 547 %endif
yading@11 548
yading@11 549 ;------------------------------------------------------------------------------
yading@11 550 ; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
yading@11 551 ; int channels);
yading@11 552 ;------------------------------------------------------------------------------
yading@11 553 
; 2-channel fltp -> packed s16: scale by 2^15, convert (packssdw
; saturates out-of-range samples), then word-interleave the two channels.
; The ssse3 variant touches m3 even though only 3 xmm registers are
; declared in cglobal; xmm3 is volatile in both SysV and Win64 ABIs, so
; no callee-saved register is affected.
yading@11 554 %macro CONV_FLTP_TO_S16_2CH 0
yading@11 555 cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
yading@11 556 lea lenq, [4*lend]
yading@11 557 mov src1q, [src0q+gprsize]
yading@11 558 mov src0q, [src0q ]
yading@11 559 add dstq, lenq
yading@11 560 add src0q, lenq
yading@11 561 add src1q, lenq
yading@11 562 neg lenq
yading@11 563 mova m2, [pf_s16_scale]
yading@11 564 %if cpuflag(ssse3)
yading@11 565 mova m3, [pb_interleave_words]
yading@11 566 %endif
yading@11 567 .loop:
yading@11 568 mulps m0, m2, [src0q+lenq] ; m0 = 0, 2, 4, 6
yading@11 569 mulps m1, m2, [src1q+lenq] ; m1 = 1, 3, 5, 7
yading@11 570 cvtps2dq m0, m0
yading@11 571 cvtps2dq m1, m1
yading@11 572 %if cpuflag(ssse3)
yading@11 573 packssdw m0, m1 ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
yading@11 574 pshufb m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
yading@11 575 %else
yading@11 576 packssdw m0, m0 ; m0 = 0, 2, 4, 6, x, x, x, x
yading@11 577 packssdw m1, m1 ; m1 = 1, 3, 5, 7, x, x, x, x
yading@11 578 punpcklwd m0, m1 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
yading@11 579 %endif
yading@11 580 mova [dstq+lenq], m0
yading@11 581 add lenq, mmsize
yading@11 582 jl .loop
yading@11 583 REP_RET
yading@11 584 %endmacro
yading@11 585 
yading@11 586 INIT_XMM sse2
yading@11 587 CONV_FLTP_TO_S16_2CH
yading@11 588 INIT_XMM ssse3
yading@11 589 CONV_FLTP_TO_S16_2CH
yading@11 590
yading@11 591 ;------------------------------------------------------------------------------
yading@11 592 ; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
yading@11 593 ; int channels);
yading@11 594 ;------------------------------------------------------------------------------
yading@11 595 
; 6-channel fltp -> packed s16.  src1..src5 are byte offsets relative to
; srcq.  The sse variant is an SSE/MMX hybrid: it loads four floats per
; channel with movlps, converts with cvtps2pi into mmx registers (hence
; the emms before returning), while the sse2+ variant works on full xmm
; registers.  xmm6 holds the 2^15 scale constant in both paths;
; packssdw provides the saturation to the s16 range.
yading@11 596 %macro CONV_FLTP_TO_S16_6CH 0
yading@11 597 %if ARCH_X86_64
yading@11 598 cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
yading@11 599 %else
yading@11 600 cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
yading@11 601 %define lend dword r2m
yading@11 602 %endif
yading@11 603 mov src1q, [srcq+1*gprsize]
yading@11 604 mov src2q, [srcq+2*gprsize]
yading@11 605 mov src3q, [srcq+3*gprsize]
yading@11 606 mov src4q, [srcq+4*gprsize]
yading@11 607 mov src5q, [srcq+5*gprsize]
yading@11 608 mov srcq, [srcq]
yading@11 609 sub src1q, srcq
yading@11 610 sub src2q, srcq
yading@11 611 sub src3q, srcq
yading@11 612 sub src4q, srcq
yading@11 613 sub src5q, srcq
yading@11 614 movaps xmm6, [pf_s16_scale]
yading@11 615 .loop:
yading@11 616 %if cpuflag(sse2)
yading@11 617 mulps m0, m6, [srcq ]
yading@11 618 mulps m1, m6, [srcq+src1q]
yading@11 619 mulps m2, m6, [srcq+src2q]
yading@11 620 mulps m3, m6, [srcq+src3q]
yading@11 621 mulps m4, m6, [srcq+src4q]
yading@11 622 mulps m5, m6, [srcq+src5q]
yading@11 623 cvtps2dq m0, m0
yading@11 624 cvtps2dq m1, m1
yading@11 625 cvtps2dq m2, m2
yading@11 626 cvtps2dq m3, m3
yading@11 627 cvtps2dq m4, m4
yading@11 628 cvtps2dq m5, m5
yading@11 629 packssdw m0, m3 ; m0 = 0, 6, 12, 18, 3, 9, 15, 21
yading@11 630 packssdw m1, m4 ; m1 = 1, 7, 13, 19, 4, 10, 16, 22
yading@11 631 packssdw m2, m5 ; m2 = 2, 8, 14, 20, 5, 11, 17, 23
yading@11 632 ; unpack words:
yading@11 633 movhlps m3, m0 ; m3 = 3, 9, 15, 21, x, x, x, x
yading@11 634 punpcklwd m0, m1 ; m0 = 0, 1, 6, 7, 12, 13, 18, 19
yading@11 635 punpckhwd m1, m2 ; m1 = 4, 5, 10, 11, 16, 17, 22, 23
yading@11 636 punpcklwd m2, m3 ; m2 = 2, 3, 8, 9, 14, 15, 20, 21
yading@11 637 ; blend dwords:
yading@11 638 shufps m3, m0, m2, q2020 ; m3 = 0, 1, 12, 13, 2, 3, 14, 15
yading@11 639 shufps m0, m1, q2031 ; m0 = 6, 7, 18, 19, 4, 5, 16, 17
yading@11 640 shufps m2, m1, q3131 ; m2 = 8, 9, 20, 21, 10, 11, 22, 23
yading@11 641 ; shuffle dwords:
yading@11 642 shufps m1, m2, m3, q3120 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
yading@11 643 shufps m3, m0, q0220 ; m3 = 0, 1, 2, 3, 4, 5, 6, 7
yading@11 644 shufps m0, m2, q3113 ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
yading@11 645 mova [dstq+0*mmsize], m3
yading@11 646 mova [dstq+1*mmsize], m1
yading@11 647 mova [dstq+2*mmsize], m0
yading@11 648 %else ; sse
yading@11 649 movlps xmm0, [srcq ]
yading@11 650 movlps xmm1, [srcq+src1q]
yading@11 651 movlps xmm2, [srcq+src2q]
yading@11 652 movlps xmm3, [srcq+src3q]
yading@11 653 movlps xmm4, [srcq+src4q]
yading@11 654 movlps xmm5, [srcq+src5q]
yading@11 655 mulps xmm0, xmm6
yading@11 656 mulps xmm1, xmm6
yading@11 657 mulps xmm2, xmm6
yading@11 658 mulps xmm3, xmm6
yading@11 659 mulps xmm4, xmm6
yading@11 660 mulps xmm5, xmm6
yading@11 661 cvtps2pi mm0, xmm0
yading@11 662 cvtps2pi mm1, xmm1
yading@11 663 cvtps2pi mm2, xmm2
yading@11 664 cvtps2pi mm3, xmm3
yading@11 665 cvtps2pi mm4, xmm4
yading@11 666 cvtps2pi mm5, xmm5
yading@11 667 packssdw mm0, mm3 ; m0 = 0, 6, 3, 9
yading@11 668 packssdw mm1, mm4 ; m1 = 1, 7, 4, 10
yading@11 669 packssdw mm2, mm5 ; m2 = 2, 8, 5, 11
yading@11 670 ; unpack words
yading@11 671 pshufw mm3, mm0, q1032 ; m3 = 3, 9, 0, 6
yading@11 672 punpcklwd mm0, mm1 ; m0 = 0, 1, 6, 7
yading@11 673 punpckhwd mm1, mm2 ; m1 = 4, 5, 10, 11
yading@11 674 punpcklwd mm2, mm3 ; m2 = 2, 3, 8, 9
yading@11 675 ; unpack dwords
yading@11 676 pshufw mm3, mm0, q1032 ; m3 = 6, 7, 0, 1
yading@11 677 punpckldq mm0, mm2 ; m0 = 0, 1, 2, 3 (final)
yading@11 678 punpckhdq mm2, mm1 ; m2 = 8, 9, 10, 11 (final)
yading@11 679 punpckldq mm1, mm3 ; m1 = 4, 5, 6, 7 (final)
yading@11 680 mova [dstq+0*mmsize], mm0
yading@11 681 mova [dstq+1*mmsize], mm1
yading@11 682 mova [dstq+2*mmsize], mm2
yading@11 683 %endif
yading@11 684 add srcq, mmsize
yading@11 685 add dstq, mmsize*3
yading@11 686 sub lend, mmsize/4
yading@11 687 jg .loop
yading@11 688 %if mmsize == 8
; MMX-hybrid build: clear the x87/MMX state before returning to C code.
yading@11 689 emms
yading@11 690 RET
yading@11 691 %else
yading@11 692 REP_RET
yading@11 693 %endif
yading@11 694 %endmacro
yading@11 695 
yading@11 696 INIT_MMX sse
yading@11 697 CONV_FLTP_TO_S16_6CH
yading@11 698 INIT_XMM sse2
yading@11 699 CONV_FLTP_TO_S16_6CH
yading@11 700 %if HAVE_AVX_EXTERNAL
yading@11 701 INIT_XMM avx
yading@11 702 CONV_FLTP_TO_S16_6CH
yading@11 703 %endif
yading@11 704
yading@11 705 ;------------------------------------------------------------------------------
yading@11 706 ; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
yading@11 707 ; int channels);
yading@11 708 ;------------------------------------------------------------------------------
yading@11 709 
; 2-channel fltp -> packed float: a pure dword interleave of the two
; planes via SBUTTERFLYPS (x86util); no value conversion is performed.
yading@11 710 %macro CONV_FLTP_TO_FLT_2CH 0
yading@11 711 cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
yading@11 712 mov src1q, [src0q+gprsize]
yading@11 713 mov src0q, [src0q]
yading@11 714 lea lenq, [4*lend]
yading@11 715 add src0q, lenq
yading@11 716 add src1q, lenq
yading@11 717 lea dstq, [dstq+2*lenq]
yading@11 718 neg lenq
yading@11 719 .loop:
yading@11 720 mova m0, [src0q+lenq ]
yading@11 721 mova m1, [src1q+lenq ]
yading@11 722 mova m2, [src0q+lenq+mmsize]
yading@11 723 mova m3, [src1q+lenq+mmsize]
yading@11 724 SBUTTERFLYPS 0, 1, 4
yading@11 725 SBUTTERFLYPS 2, 3, 4
yading@11 726 mova [dstq+2*lenq+0*mmsize], m0
yading@11 727 mova [dstq+2*lenq+1*mmsize], m1
yading@11 728 mova [dstq+2*lenq+2*mmsize], m2
yading@11 729 mova [dstq+2*lenq+3*mmsize], m3
yading@11 730 add lenq, 2*mmsize
yading@11 731 jl .loop
yading@11 732 REP_RET
yading@11 733 %endmacro
yading@11 734 
yading@11 735 INIT_XMM sse
yading@11 736 CONV_FLTP_TO_FLT_2CH
yading@11 737 %if HAVE_AVX_EXTERNAL
yading@11 738 INIT_XMM avx
yading@11 739 CONV_FLTP_TO_FLT_2CH
yading@11 740 %endif
yading@11 741
yading@11 742 ;-----------------------------------------------------------------------------
yading@11 743 ; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
yading@11 744 ; int channels);
yading@11 745 ;-----------------------------------------------------------------------------
yading@11 746 
; 6-channel fltp -> packed float (pure interleave, no value conversion).
; src1..src5 are byte offsets relative to srcq.  The sse4 path uses
; blendps (hence the cpuflag gate) plus movlhps/movhlps to produce the
; six packed output vectors; the mmx path interleaves 64-bit halves
; with SBUTTERFLY dq and must execute emms before returning.
yading@11 747 %macro CONV_FLTP_TO_FLT_6CH 0
yading@11 748 cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
yading@11 749 %if ARCH_X86_64
yading@11 750 mov lend, r2d
yading@11 751 %else
yading@11 752 %define lend dword r2m
yading@11 753 %endif
yading@11 754 mov src1q, [srcq+1*gprsize]
yading@11 755 mov src2q, [srcq+2*gprsize]
yading@11 756 mov src3q, [srcq+3*gprsize]
yading@11 757 mov src4q, [srcq+4*gprsize]
yading@11 758 mov src5q, [srcq+5*gprsize]
yading@11 759 mov srcq, [srcq]
yading@11 760 sub src1q, srcq
yading@11 761 sub src2q, srcq
yading@11 762 sub src3q, srcq
yading@11 763 sub src4q, srcq
yading@11 764 sub src5q, srcq
yading@11 765 .loop:
yading@11 766 mova m0, [srcq ]
yading@11 767 mova m1, [srcq+src1q]
yading@11 768 mova m2, [srcq+src2q]
yading@11 769 mova m3, [srcq+src3q]
yading@11 770 mova m4, [srcq+src4q]
yading@11 771 mova m5, [srcq+src5q]
yading@11 772 %if cpuflag(sse4)
yading@11 773 SBUTTERFLYPS 0, 1, 6
yading@11 774 SBUTTERFLYPS 2, 3, 6
yading@11 775 SBUTTERFLYPS 4, 5, 6
yading@11 776 
yading@11 777 blendps m6, m4, m0, 1100b
yading@11 778 movlhps m0, m2
yading@11 779 movhlps m4, m2
yading@11 780 blendps m2, m5, m1, 1100b
yading@11 781 movlhps m1, m3
yading@11 782 movhlps m5, m3
yading@11 783 
yading@11 784 movaps [dstq ], m0
yading@11 785 movaps [dstq+16], m6
yading@11 786 movaps [dstq+32], m4
yading@11 787 movaps [dstq+48], m1
yading@11 788 movaps [dstq+64], m2
yading@11 789 movaps [dstq+80], m5
yading@11 790 %else ; mmx
yading@11 791 SBUTTERFLY dq, 0, 1, 6
yading@11 792 SBUTTERFLY dq, 2, 3, 6
yading@11 793 SBUTTERFLY dq, 4, 5, 6
yading@11 794 
yading@11 795 movq [dstq ], m0
yading@11 796 movq [dstq+ 8], m2
yading@11 797 movq [dstq+16], m4
yading@11 798 movq [dstq+24], m1
yading@11 799 movq [dstq+32], m3
yading@11 800 movq [dstq+40], m5
yading@11 801 %endif
yading@11 802 add srcq, mmsize
yading@11 803 add dstq, mmsize*6
yading@11 804 sub lend, mmsize/4
yading@11 805 jg .loop
yading@11 806 %if mmsize == 8
; MMX build: clear the x87/MMX state before returning to C code.
yading@11 807 emms
yading@11 808 RET
yading@11 809 %else
yading@11 810 REP_RET
yading@11 811 %endif
yading@11 812 %endmacro
yading@11 813 
yading@11 814 INIT_MMX mmx
yading@11 815 CONV_FLTP_TO_FLT_6CH
yading@11 816 INIT_XMM sse4
yading@11 817 CONV_FLTP_TO_FLT_6CH
yading@11 818 %if HAVE_AVX_EXTERNAL
yading@11 819 INIT_XMM avx
yading@11 820 CONV_FLTP_TO_FLT_6CH
yading@11 821 %endif
yading@11 822
yading@11 823 ;------------------------------------------------------------------------------
yading@11 824 ; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
yading@11 825 ; int channels);
yading@11 826 ;------------------------------------------------------------------------------
yading@11 827
yading@11 828 %macro CONV_S16_TO_S16P_2CH 0
yading@11 829 cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1 ; deinterleave stereo s16 into 2 planar buffers
yading@11 830 lea lenq, [2*lend] ; len in bytes per channel (2 bytes per sample)
yading@11 831 mov dst1q, [dst0q+gprsize] ; dst1 = dst[1]
yading@11 832 mov dst0q, [dst0q ] ; dst0 = dst[0]
yading@11 833 lea srcq, [srcq+2*lenq] ; src is interleaved: 2x the bytes of one channel
yading@11 834 add dst0q, lenq
yading@11 835 add dst1q, lenq
yading@11 836 neg lenq ; count lenq up from -len to 0
yading@11 837 %if cpuflag(ssse3)
yading@11 838 mova m3, [pb_deinterleave_words] ; pshufb mask: even words to low half, odd words to high half
yading@11 839 %endif
yading@11 840 .loop:
yading@11 841 mova m0, [srcq+2*lenq ] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
yading@11 842 mova m1, [srcq+2*lenq+mmsize] ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
yading@11 843 %if cpuflag(ssse3)
yading@11 844 pshufb m0, m3 ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
yading@11 845 pshufb m1, m3 ; m1 = 8, 10, 12, 14, 9, 11, 13, 15
yading@11 846 SBUTTERFLY2 qdq, 0, 1, 2 ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
yading@11 847 ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
yading@11 848 %else ; sse2 ; no pshufb: split even/odd words with pshuflw/pshufhw
yading@11 849 pshuflw m0, m0, q3120 ; m0 = 0, 2, 1, 3, 4, 5, 6, 7
yading@11 850 pshufhw m0, m0, q3120 ; m0 = 0, 2, 1, 3, 4, 6, 5, 7
yading@11 851 pshuflw m1, m1, q3120 ; m1 = 8, 10, 9, 11, 12, 13, 14, 15
yading@11 852 pshufhw m1, m1, q3120 ; m1 = 8, 10, 9, 11, 12, 14, 13, 15
yading@11 853 DEINT2_PS 0, 1, 2 ; m0 = 0, 2, 4, 6, 8, 10, 12, 14
yading@11 854 ; m1 = 1, 3, 5, 7, 9, 11, 13, 15
yading@11 855 %endif
yading@11 856 mova [dst0q+lenq], m0 ; m0 = channel 0 samples, m1 = channel 1 samples
yading@11 857 mova [dst1q+lenq], m1
yading@11 858 add lenq, mmsize
yading@11 859 jl .loop
yading@11 860 REP_RET
yading@11 861 %endmacro
yading@11 862 
yading@11 863 INIT_XMM sse2
yading@11 864 CONV_S16_TO_S16P_2CH
yading@11 865 INIT_XMM ssse3
yading@11 866 CONV_S16_TO_S16P_2CH
yading@11 867 %if HAVE_AVX_EXTERNAL
yading@11 868 INIT_XMM avx
yading@11 869 CONV_S16_TO_S16P_2CH
yading@11 870 %endif
yading@11 871
yading@11 872 ;------------------------------------------------------------------------------
yading@11 873 ; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len,
yading@11 874 ; int channels);
yading@11 875 ;------------------------------------------------------------------------------
yading@11 876
yading@11 877 %macro CONV_S16_TO_S16P_6CH 0
yading@11 878 %if ARCH_X86_64
yading@11 879 cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5 ; deinterleave 6-ch s16 to planar
yading@11 880 %else
yading@11 881 cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
yading@11 882 %define lend dword r2m ; x86-32: not enough registers, read len from the stack
yading@11 883 %endif
yading@11 884 mov dst1q, [dstq+ gprsize]
yading@11 885 mov dst2q, [dstq+2*gprsize]
yading@11 886 mov dst3q, [dstq+3*gprsize]
yading@11 887 mov dst4q, [dstq+4*gprsize]
yading@11 888 mov dst5q, [dstq+5*gprsize]
yading@11 889 mov dstq, [dstq ]
yading@11 890 sub dst1q, dstq ; destinations as offsets from dst[0],
yading@11 891 sub dst2q, dstq ; so only dstq needs incrementing in the loop
yading@11 892 sub dst3q, dstq
yading@11 893 sub dst4q, dstq
yading@11 894 sub dst5q, dstq
yading@11 895 .loop: ; transpose 24 interleaved samples to 4 samples per channel
yading@11 896 mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
yading@11 897 mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
yading@11 898 mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
yading@11 899 PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x
yading@11 900 shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
yading@11 901 psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
yading@11 902 SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
yading@11 903 ; m1 = 4, 10, 5, 11, x, x, x, x
yading@11 904 SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
yading@11 905 ; m2 = 16, 22, 17, 23, x, x, x, x
yading@11 906 SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
yading@11 907 ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
yading@11 908 punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
yading@11 909 movq [dstq ], m0 ; 4 s16 samples (8 bytes) per channel:
yading@11 910 movhps [dstq+dst1q], m0 ; low half = ch0, high half = ch1
yading@11 911 movq [dstq+dst2q], m3 ; ch2
yading@11 912 movhps [dstq+dst3q], m3 ; ch3
yading@11 913 movq [dstq+dst4q], m1 ; ch4
yading@11 914 movhps [dstq+dst5q], m1 ; ch5
yading@11 915 add srcq, mmsize*3
yading@11 916 add dstq, mmsize/2 ; each channel advances by 8 bytes (4 samples)
yading@11 917 sub lend, mmsize/4 ; 4 frames consumed per iteration
yading@11 918 jg .loop
yading@11 919 REP_RET
yading@11 920 %endmacro
yading@11 921 
yading@11 922 INIT_XMM sse2
yading@11 923 CONV_S16_TO_S16P_6CH
yading@11 924 INIT_XMM ssse3
yading@11 925 CONV_S16_TO_S16P_6CH
yading@11 926 %if HAVE_AVX_EXTERNAL
yading@11 927 INIT_XMM avx
yading@11 928 CONV_S16_TO_S16P_6CH
yading@11 929 %endif
yading@11 930
yading@11 931 ;------------------------------------------------------------------------------
yading@11 932 ; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
yading@11 933 ; int channels);
yading@11 934 ;------------------------------------------------------------------------------
yading@11 935
yading@11 936 %macro CONV_S16_TO_FLTP_2CH 0
yading@11 937 cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1 ; deinterleave stereo s16 and convert to planar float
yading@11 938 lea lenq, [4*lend] ; len in bytes per float output channel
yading@11 939 mov dst1q, [dst0q+gprsize] ; dst1 = dst[1]
yading@11 940 mov dst0q, [dst0q ] ; dst0 = dst[0]
yading@11 941 add srcq, lenq ; same byte rate: 2 ch * 2 bytes in == 1 ch * 4 bytes out
yading@11 942 add dst0q, lenq
yading@11 943 add dst1q, lenq
yading@11 944 neg lenq ; count lenq up from -len to 0
yading@11 945 mova m3, [pf_s32_inv_scale] ; 1/2^31: samples below are positioned as s32 (high words)
yading@11 946 mova m4, [pw_zero_even]
yading@11 947 .loop:
yading@11 948 mova m1, [srcq+lenq]
yading@11 949 pslld m0, m1, 16 ; ch0 samples shifted into the high 16 bits of each dword
yading@11 950 pand m1, m4 ; mask out low words, keeping ch1 samples (already in high words)
yading@11 951 cvtdq2ps m0, m0
yading@11 952 cvtdq2ps m1, m1
yading@11 953 mulps m0, m0, m3 ; s32-positioned samples scaled to the [-1.0,1.0) float range
yading@11 954 mulps m1, m1, m3
yading@11 955 mova [dst0q+lenq], m0
yading@11 956 mova [dst1q+lenq], m1
yading@11 957 add lenq, mmsize
yading@11 958 jl .loop
yading@11 959 REP_RET
yading@11 960 %endmacro
yading@11 961 
yading@11 962 INIT_XMM sse2
yading@11 963 CONV_S16_TO_FLTP_2CH
yading@11 964 %if HAVE_AVX_EXTERNAL
yading@11 965 INIT_XMM avx
yading@11 966 CONV_S16_TO_FLTP_2CH
yading@11 967 %endif
yading@11 968
yading@11 969 ;------------------------------------------------------------------------------
yading@11 970 ; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len,
yading@11 971 ; int channels);
yading@11 972 ;------------------------------------------------------------------------------
yading@11 973
yading@11 974 %macro CONV_S16_TO_FLTP_6CH 0
yading@11 975 %if ARCH_X86_64
yading@11 976 cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5 ; deinterleave 6-ch s16, convert to planar float
yading@11 977 %else
yading@11 978 cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
yading@11 979 %define lend dword r2m ; x86-32: not enough registers, read len from the stack
yading@11 980 %endif
yading@11 981 mov dst1q, [dstq+ gprsize]
yading@11 982 mov dst2q, [dstq+2*gprsize]
yading@11 983 mov dst3q, [dstq+3*gprsize]
yading@11 984 mov dst4q, [dstq+4*gprsize]
yading@11 985 mov dst5q, [dstq+5*gprsize]
yading@11 986 mov dstq, [dstq ]
yading@11 987 sub dst1q, dstq ; destinations as offsets from dst[0],
yading@11 988 sub dst2q, dstq ; so only dstq needs incrementing in the loop
yading@11 989 sub dst3q, dstq
yading@11 990 sub dst4q, dstq
yading@11 991 sub dst5q, dstq
yading@11 992 mova m6, [pf_s16_inv_scale] ; 1/2^15: scales s16 range to [-1.0,1.0)
yading@11 993 .loop: ; transpose 24 interleaved samples to 4 samples per channel
yading@11 994 mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
yading@11 995 mova m3, [srcq+1*mmsize] ; m3 = 8, 9, 10, 11, 12, 13, 14, 15
yading@11 996 mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
yading@11 997 PALIGNR m1, m3, m0, 12, m4 ; m1 = 6, 7, 8, 9, 10, 11, x, x
yading@11 998 shufps m3, m2, q1032 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
yading@11 999 psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
yading@11 1000 SBUTTERFLY2 wd, 0, 1, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
yading@11 1001 ; m1 = 4, 10, 5, 11, x, x, x, x
yading@11 1002 SBUTTERFLY2 wd, 3, 2, 4 ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
yading@11 1003 ; m2 = 16, 22, 17, 23, x, x, x, x
yading@11 1004 SBUTTERFLY2 dq, 0, 3, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
yading@11 1005 ; m3 = 2, 8, 14, 20, 3, 9, 15, 21
yading@11 1006 punpckldq m1, m2 ; m1 = 4, 10, 16, 22, 5, 11, 17, 23
yading@11 1007 S16_TO_S32_SX 0, 2 ; m0 = 0, 6, 12, 18
yading@11 1008 ; m2 = 1, 7, 13, 19
yading@11 1009 S16_TO_S32_SX 3, 4 ; m3 = 2, 8, 14, 20
yading@11 1010 ; m4 = 3, 9, 15, 21
yading@11 1011 S16_TO_S32_SX 1, 5 ; m1 = 4, 10, 16, 22
yading@11 1012 ; m5 = 5, 11, 17, 23
yading@11 1013 SWAP 1,2,3,4 ; rotate so m0..m5 hold channels 0..5 in order
yading@11 1014 cvtdq2ps m0, m0
yading@11 1015 cvtdq2ps m1, m1
yading@11 1016 cvtdq2ps m2, m2
yading@11 1017 cvtdq2ps m3, m3
yading@11 1018 cvtdq2ps m4, m4
yading@11 1019 cvtdq2ps m5, m5
yading@11 1020 mulps m0, m6 ; normalize each channel to [-1.0,1.0)
yading@11 1021 mulps m1, m6
yading@11 1022 mulps m2, m6
yading@11 1023 mulps m3, m6
yading@11 1024 mulps m4, m6
yading@11 1025 mulps m5, m6
yading@11 1026 mova [dstq ], m0 ; ch0
yading@11 1027 mova [dstq+dst1q], m1 ; ch1
yading@11 1028 mova [dstq+dst2q], m2 ; ch2
yading@11 1029 mova [dstq+dst3q], m3 ; ch3
yading@11 1030 mova [dstq+dst4q], m4 ; ch4
yading@11 1031 mova [dstq+dst5q], m5 ; ch5
yading@11 1032 add srcq, mmsize*3
yading@11 1033 add dstq, mmsize
yading@11 1034 sub lend, mmsize/4 ; 4 frames consumed per iteration
yading@11 1035 jg .loop
yading@11 1036 REP_RET
yading@11 1037 %endmacro
yading@11 1038 
yading@11 1039 INIT_XMM sse2
yading@11 1040 CONV_S16_TO_FLTP_6CH
yading@11 1041 INIT_XMM ssse3
yading@11 1042 CONV_S16_TO_FLTP_6CH
yading@11 1043 INIT_XMM sse4
yading@11 1044 CONV_S16_TO_FLTP_6CH
yading@11 1045 %if HAVE_AVX_EXTERNAL
yading@11 1046 INIT_XMM avx
yading@11 1047 CONV_S16_TO_FLTP_6CH
yading@11 1048 %endif
yading@11 1049
yading@11 1050 ;------------------------------------------------------------------------------
yading@11 1051 ; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len,
yading@11 1052 ; int channels);
yading@11 1053 ;------------------------------------------------------------------------------
yading@11 1054
yading@11 1055 %macro CONV_FLT_TO_S16P_2CH 0
yading@11 1056 cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1 ; deinterleave stereo float, convert to planar s16
yading@11 1057 lea lenq, [2*lend] ; len in bytes per s16 output channel
yading@11 1058 mov dst1q, [dst0q+gprsize] ; dst1 = dst[1]
yading@11 1059 mov dst0q, [dst0q ] ; dst0 = dst[0]
yading@11 1060 lea srcq, [srcq+4*lenq] ; src bytes = 2 ch * 4 bytes per 2-byte output sample
yading@11 1061 add dst0q, lenq
yading@11 1062 add dst1q, lenq
yading@11 1063 neg lenq ; count lenq up from -len to 0
yading@11 1064 mova m5, [pf_s16_scale] ; 2^15: scales [-1.0,1.0) floats to s16 range
yading@11 1065 .loop:
yading@11 1066 mova m0, [srcq+4*lenq ]
yading@11 1067 mova m1, [srcq+4*lenq+ mmsize]
yading@11 1068 mova m2, [srcq+4*lenq+2*mmsize]
yading@11 1069 mova m3, [srcq+4*lenq+3*mmsize]
yading@11 1070 DEINT2_PS 0, 1, 4 ; m0/m2 = ch0 samples, m1/m3 = ch1 samples (m4 = scratch)
yading@11 1071 DEINT2_PS 2, 3, 4
yading@11 1072 mulps m0, m0, m5
yading@11 1073 mulps m1, m1, m5
yading@11 1074 mulps m2, m2, m5
yading@11 1075 mulps m3, m3, m5
yading@11 1076 cvtps2dq m0, m0 ; float -> s32
yading@11 1077 cvtps2dq m1, m1
yading@11 1078 cvtps2dq m2, m2
yading@11 1079 cvtps2dq m3, m3
yading@11 1080 packssdw m0, m2 ; s32 -> s16 with signed saturation; m0 = 8 ch0 samples
yading@11 1081 packssdw m1, m3 ; m1 = 8 ch1 samples
yading@11 1082 mova [dst0q+lenq], m0
yading@11 1083 mova [dst1q+lenq], m1
yading@11 1084 add lenq, mmsize
yading@11 1085 jl .loop
yading@11 1086 REP_RET
yading@11 1087 %endmacro
yading@11 1088 
yading@11 1089 INIT_XMM sse2
yading@11 1090 CONV_FLT_TO_S16P_2CH
yading@11 1091 %if HAVE_AVX_EXTERNAL
yading@11 1092 INIT_XMM avx
yading@11 1093 CONV_FLT_TO_S16P_2CH
yading@11 1094 %endif
yading@11 1095
yading@11 1096 ;------------------------------------------------------------------------------
yading@11 1097 ; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len,
yading@11 1098 ; int channels);
yading@11 1099 ;------------------------------------------------------------------------------
yading@11 1100
yading@11 1101 %macro CONV_FLT_TO_S16P_6CH 0
yading@11 1102 %if ARCH_X86_64
yading@11 1103 cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5 ; deinterleave 6-ch float, convert to planar s16
yading@11 1104 %else
yading@11 1105 cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
yading@11 1106 %define lend dword r2m ; x86-32: not enough registers, read len from the stack
yading@11 1107 %endif
yading@11 1108 mov dst1q, [dstq+ gprsize]
yading@11 1109 mov dst2q, [dstq+2*gprsize]
yading@11 1110 mov dst3q, [dstq+3*gprsize]
yading@11 1111 mov dst4q, [dstq+4*gprsize]
yading@11 1112 mov dst5q, [dstq+5*gprsize]
yading@11 1113 mov dstq, [dstq ]
yading@11 1114 sub dst1q, dstq ; destinations as offsets from dst[0],
yading@11 1115 sub dst2q, dstq ; so only dstq needs incrementing in the loop
yading@11 1116 sub dst3q, dstq
yading@11 1117 sub dst4q, dstq
yading@11 1118 sub dst5q, dstq
yading@11 1119 mova m6, [pf_s16_scale] ; 2^15: scales [-1.0,1.0) floats to s16 range
yading@11 1120 .loop:
yading@11 1121 mulps m0, m6, [srcq+0*mmsize] ; scale while loading (24 interleaved samples)
yading@11 1122 mulps m3, m6, [srcq+1*mmsize]
yading@11 1123 mulps m1, m6, [srcq+2*mmsize]
yading@11 1124 mulps m4, m6, [srcq+3*mmsize]
yading@11 1125 mulps m2, m6, [srcq+4*mmsize]
yading@11 1126 mulps m5, m6, [srcq+5*mmsize]
yading@11 1127 cvtps2dq m0, m0 ; float -> s32
yading@11 1128 cvtps2dq m1, m1
yading@11 1129 cvtps2dq m2, m2
yading@11 1130 cvtps2dq m3, m3
yading@11 1131 cvtps2dq m4, m4
yading@11 1132 cvtps2dq m5, m5
yading@11 1133 packssdw m0, m3 ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 (s16, saturated)
yading@11 1134 packssdw m1, m4 ; m1 = 8, 9, 10, 11, 12, 13, 14, 15
yading@11 1135 packssdw m2, m5 ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
yading@11 1136 PALIGNR m3, m1, m0, 12, m4 ; m3 = 6, 7, 8, 9, 10, 11, x, x
yading@11 1137 shufps m1, m2, q1032 ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
yading@11 1138 psrldq m2, 4 ; m2 = 18, 19, 20, 21, 22, 23, x, x
yading@11 1139 SBUTTERFLY2 wd, 0, 3, 4 ; m0 = 0, 6, 1, 7, 2, 8, 3, 9
yading@11 1140 ; m3 = 4, 10, 5, 11, x, x, x, x
yading@11 1141 SBUTTERFLY2 wd, 1, 2, 4 ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
yading@11 1142 ; m2 = 16, 22, 17, 23, x, x, x, x
yading@11 1143 SBUTTERFLY2 dq, 0, 1, 4 ; m0 = 0, 6, 12, 18, 1, 7, 13, 19
yading@11 1144 ; m1 = 2, 8, 14, 20, 3, 9, 15, 21
yading@11 1145 punpckldq m3, m2 ; m3 = 4, 10, 16, 22, 5, 11, 17, 23
yading@11 1146 movq [dstq ], m0 ; 4 s16 samples (8 bytes) per channel:
yading@11 1147 movhps [dstq+dst1q], m0 ; low half = ch0, high half = ch1
yading@11 1148 movq [dstq+dst2q], m1 ; ch2
yading@11 1149 movhps [dstq+dst3q], m1 ; ch3
yading@11 1150 movq [dstq+dst4q], m3 ; ch4
yading@11 1151 movhps [dstq+dst5q], m3 ; ch5
yading@11 1152 add srcq, mmsize*6
yading@11 1153 add dstq, mmsize/2 ; each channel advances by 8 bytes (4 samples)
yading@11 1154 sub lend, mmsize/4 ; 4 frames consumed per iteration
yading@11 1155 jg .loop
yading@11 1156 REP_RET
yading@11 1157 %endmacro
yading@11 1158 
yading@11 1159 INIT_XMM sse2
yading@11 1160 CONV_FLT_TO_S16P_6CH
yading@11 1161 INIT_XMM ssse3
yading@11 1162 CONV_FLT_TO_S16P_6CH
yading@11 1163 %if HAVE_AVX_EXTERNAL
yading@11 1164 INIT_XMM avx
yading@11 1165 CONV_FLT_TO_S16P_6CH
yading@11 1166 %endif
yading@11 1167
yading@11 1168 ;------------------------------------------------------------------------------
yading@11 1169 ; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len,
yading@11 1170 ; int channels);
yading@11 1171 ;------------------------------------------------------------------------------
yading@11 1172
yading@11 1173 %macro CONV_FLT_TO_FLTP_2CH 0
yading@11 1174 cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1 ; deinterleave stereo float into 2 planar buffers
yading@11 1175 lea lenq, [4*lend] ; len in bytes per channel (4 bytes per float)
yading@11 1176 mov dst1q, [dst0q+gprsize] ; dst1 = dst[1]
yading@11 1177 mov dst0q, [dst0q ] ; dst0 = dst[0]
yading@11 1178 lea srcq, [srcq+2*lenq] ; src is interleaved: 2x the bytes of one channel
yading@11 1179 add dst0q, lenq
yading@11 1180 add dst1q, lenq
yading@11 1181 neg lenq ; count lenq up from -len to 0
yading@11 1182 .loop:
yading@11 1183 mova m0, [srcq+2*lenq ]
yading@11 1184 mova m1, [srcq+2*lenq+mmsize]
yading@11 1185 DEINT2_PS 0, 1, 2 ; m0 = ch0 samples, m1 = ch1 samples (m2 = scratch)
yading@11 1186 mova [dst0q+lenq], m0
yading@11 1187 mova [dst1q+lenq], m1
yading@11 1188 add lenq, mmsize
yading@11 1189 jl .loop
yading@11 1190 REP_RET
yading@11 1191 %endmacro
yading@11 1192 
yading@11 1193 INIT_XMM sse
yading@11 1194 CONV_FLT_TO_FLTP_2CH
yading@11 1195 %if HAVE_AVX_EXTERNAL
yading@11 1196 INIT_XMM avx
yading@11 1197 CONV_FLT_TO_FLTP_2CH
yading@11 1198 %endif
yading@11 1199
yading@11 1200 ;------------------------------------------------------------------------------
yading@11 1201 ; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len,
yading@11 1202 ; int channels);
yading@11 1203 ;------------------------------------------------------------------------------
yading@11 1204
yading@11 1205 %macro CONV_FLT_TO_FLTP_6CH 0
yading@11 1206 %if ARCH_X86_64
yading@11 1207 cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5 ; deinterleave 6-ch float to planar
yading@11 1208 %else
yading@11 1209 cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
yading@11 1210 %define lend dword r2m ; x86-32: not enough registers, read len from the stack
yading@11 1211 %endif
yading@11 1212 mov dst1q, [dstq+ gprsize]
yading@11 1213 mov dst2q, [dstq+2*gprsize]
yading@11 1214 mov dst3q, [dstq+3*gprsize]
yading@11 1215 mov dst4q, [dstq+4*gprsize]
yading@11 1216 mov dst5q, [dstq+5*gprsize]
yading@11 1217 mov dstq, [dstq ]
yading@11 1218 sub dst1q, dstq ; destinations as offsets from dst[0],
yading@11 1219 sub dst2q, dstq ; so only dstq needs incrementing in the loop
yading@11 1220 sub dst3q, dstq
yading@11 1221 sub dst4q, dstq
yading@11 1222 sub dst5q, dstq
yading@11 1223 .loop: ; 6x4 transpose of 24 interleaved samples
yading@11 1224 mova m0, [srcq+0*mmsize] ; m0 = 0, 1, 2, 3
yading@11 1225 mova m1, [srcq+1*mmsize] ; m1 = 4, 5, 6, 7
yading@11 1226 mova m2, [srcq+2*mmsize] ; m2 = 8, 9, 10, 11
yading@11 1227 mova m3, [srcq+3*mmsize] ; m3 = 12, 13, 14, 15
yading@11 1228 mova m4, [srcq+4*mmsize] ; m4 = 16, 17, 18, 19
yading@11 1229 mova m5, [srcq+5*mmsize] ; m5 = 20, 21, 22, 23
yading@11 1230 
yading@11 1231 SBUTTERFLY2 dq, 0, 3, 6 ; m0 = 0, 12, 1, 13
yading@11 1232 ; m3 = 2, 14, 3, 15
yading@11 1233 SBUTTERFLY2 dq, 1, 4, 6 ; m1 = 4, 16, 5, 17
yading@11 1234 ; m4 = 6, 18, 7, 19
yading@11 1235 SBUTTERFLY2 dq, 2, 5, 6 ; m2 = 8, 20, 9, 21
yading@11 1236 ; m5 = 10, 22, 11, 23
yading@11 1237 SBUTTERFLY2 dq, 0, 4, 6 ; m0 = 0, 6, 12, 18
yading@11 1238 ; m4 = 1, 7, 13, 19
yading@11 1239 SBUTTERFLY2 dq, 3, 2, 6 ; m3 = 2, 8, 14, 20
yading@11 1240 ; m2 = 3, 9, 15, 21
yading@11 1241 SBUTTERFLY2 dq, 1, 5, 6 ; m1 = 4, 10, 16, 22
yading@11 1242 ; m5 = 5, 11, 17, 23
yading@11 1243 mova [dstq ], m0 ; ch0; register->channel order comes from the transpose above
yading@11 1244 mova [dstq+dst1q], m4 ; ch1
yading@11 1245 mova [dstq+dst2q], m3 ; ch2
yading@11 1246 mova [dstq+dst3q], m2 ; ch3
yading@11 1247 mova [dstq+dst4q], m1 ; ch4
yading@11 1248 mova [dstq+dst5q], m5 ; ch5
yading@11 1249 add srcq, mmsize*6
yading@11 1250 add dstq, mmsize
yading@11 1251 sub lend, mmsize/4 ; 4 frames consumed per iteration
yading@11 1252 jg .loop
yading@11 1253 REP_RET
yading@11 1254 %endmacro
yading@11 1255 
yading@11 1256 INIT_XMM sse2
yading@11 1257 CONV_FLT_TO_FLTP_6CH
yading@11 1258 %if HAVE_AVX_EXTERNAL
yading@11 1259 INIT_XMM avx
yading@11 1260 CONV_FLT_TO_FLTP_6CH
yading@11 1261 %endif