;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_TEXT

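; CVTPS2PI abstracts the float -> int32 conversion used by the MMX-register
; code paths below: with SSE it emits cvtps2pi (two packed floats to two
; packed int32 in an MMX register), with 3DNow! the matching pf2id (which
; truncates instead of using the current rounding mode).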
%macro CVTPS2PI 2
%if cpuflag(sse)
    cvtps2pi %1, %2
%elif cpuflag(3dnow)
    pf2id    %1, %2
%endif
%endmacro

;------------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
;------------------------------------------------------------------------------------
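; The scalar multiplier arrives differently per ABI: on UNIX64 it is already
; in xmm0 (so only three GPR args are declared), on WIN64 it is the third
; argument and thus lands in xmm2 (hence the SWAP), and on x86_32 it is
; loaded from the stack via mulm.  SPLATD broadcasts it to every lane of m0.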
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
%if UNIX64
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
%else
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
%endif
%if WIN64
    SWAP 0, 2
%elif ARCH_X86_32
    movss   m0, mulm
%endif
    SPLATD  m0
    shl     lenq, 2
    add     srcq, lenq
    add     dstq, lenq
    neg     lenq
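; Usual end-pointer idiom: advance src/dst to the end of the buffers and
; negate the byte count, so the loop indexes with a negative offset that
; counts up toward zero and a single add/jl pair closes the loop.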
.loop:
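; SSE2 converts four int32 at once with cvtdq2ps; plain SSE only has
; cvtpi2ps, which fills the low two floats of an xmm register from a 64-bit
; load, so two conversions are merged into one register with movlhps.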
%if cpuflag(sse2)
    cvtdq2ps  m1, [srcq+lenq   ]
    cvtdq2ps  m2, [srcq+lenq+16]
%else
    cvtpi2ps  m1, [srcq+lenq   ]
    cvtpi2ps  m3, [srcq+lenq+ 8]
    cvtpi2ps  m2, [srcq+lenq+16]
    cvtpi2ps  m4, [srcq+lenq+24]
    movlhps   m1, m3
    movlhps   m2, m4
%endif
    mulps     m1, m0
    mulps     m2, m0
    mova      [dstq+lenq   ], m1
    mova      [dstq+lenq+16], m2
    add       lenq, 32
    jl .loop
    REP_RET
%endmacro

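; The macro argument is the XMM register count handed to cglobal (WIN64
; prologues use it to know which xmm registers to preserve): the SSE body
; touches m0-m4, the SSE2 body only m0-m2.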
INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3


;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------
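; lenq is doubled up front so it counts output bytes (two per int16 sample);
; src is then indexed at 2*lenq since each input float is twice as wide
; again.  packssdw performs the saturating int32 -> int16 narrowing.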
%macro FLOAT_TO_INT16 1
cglobal float_to_int16, 3, 3, %1, dst, src, len
    add       lenq, lenq
    lea       srcq, [srcq+2*lenq]
    add       dstq, lenq
    neg       lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq  m0, [srcq+2*lenq   ]
    cvtps2dq  m1, [srcq+2*lenq+16]
    packssdw  m0, m1
    mova      [dstq+lenq], m0
%else
    CVTPS2PI  m0, [srcq+2*lenq   ]
    CVTPS2PI  m1, [srcq+2*lenq+ 8]
    CVTPS2PI  m2, [srcq+2*lenq+16]
    CVTPS2PI  m3, [srcq+2*lenq+24]
    packssdw  m0, m1
    packssdw  m2, m3
    mova      [dstq+lenq  ], m0
    mova      [dstq+lenq+8], m2
%endif
    add       lenq, 16
    js .loop
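; The MMX-sized variants leave the FPU in MMX state; emms restores it for
; the caller.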
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_XMM sse2
FLOAT_TO_INT16 2
INIT_MMX sse
FLOAT_TO_INT16 0
INIT_MMX 3dnow
FLOAT_TO_INT16 0

;-----------------------------------------------------------------------------------
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
;-----------------------------------------------------------------------------------
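; Strided variant: the four samples of each group are stored at int16
; offsets 0, step, 2*step and 3*step from dstq, which then advances by
; 4*step samples (stepq*8 bytes).  step3 caches step*3 so each store is a
; single addressing mode.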
%macro FLOAT_TO_INT16_STEP 1
cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
    add       lenq, lenq
    lea       srcq, [srcq+2*lenq]
    lea       step3q, [stepq*3]
    neg       lenq
.loop:
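; Words are extracted pairwise: movd copies 32 bits to a GPR, the low word
; is stored, shr 16 exposes the high word, and psrldq/psrlq shifts the next
; dword of the packed register down for the following movd.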
%if cpuflag(sse2)
    cvtps2dq  m0, [srcq+2*lenq   ]
    cvtps2dq  m1, [srcq+2*lenq+16]
    packssdw  m0, m1
    movd      v1d, m0
    psrldq    m0, 4
    movd      v2d, m0
    psrldq    m0, 4
    mov       [dstq], v1w
    mov       [dstq+stepq*4], v2w
    shr       v1d, 16
    shr       v2d, 16
    mov       [dstq+stepq*2], v1w
    mov       [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
    movd      v1d, m0
    psrldq    m0, 4
    movd      v2d, m0
    mov       [dstq], v1w
    mov       [dstq+stepq*4], v2w
    shr       v1d, 16
    shr       v2d, 16
    mov       [dstq+stepq*2], v1w
    mov       [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
%else
    CVTPS2PI  m0, [srcq+2*lenq   ]
    CVTPS2PI  m1, [srcq+2*lenq+ 8]
    CVTPS2PI  m2, [srcq+2*lenq+16]
    CVTPS2PI  m3, [srcq+2*lenq+24]
    packssdw  m0, m1
    packssdw  m2, m3
    movd      v1d, m0
    psrlq     m0, 32
    movd      v2d, m0
    mov       [dstq], v1w
    mov       [dstq+stepq*4], v2w
    shr       v1d, 16
    shr       v2d, 16
    mov       [dstq+stepq*2], v1w
    mov       [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
    movd      v1d, m2
    psrlq     m2, 32
    movd      v2d, m2
    mov       [dstq], v1w
    mov       [dstq+stepq*4], v2w
    shr       v1d, 16
    shr       v2d, 16
    mov       [dstq+stepq*2], v1w
    mov       [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
%endif
    add       lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_XMM sse2
FLOAT_TO_INT16_STEP 2
INIT_MMX sse
FLOAT_TO_INT16_STEP 0
INIT_MMX 3dnow
FLOAT_TO_INT16_STEP 0

;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
;-------------------------------------------------------------------------------
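; src holds two channel pointers.  Note that 4 GPR names cover 3 C args:
; len arrives in r2, is scaled into lenq, and r2 is then reloaded with the
; second channel pointer.  4*len bytes is both the per-channel input size
; and the total output size, so one negated counter indexes all buffers.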
%macro FLOAT_TO_INT16_INTERLEAVE2 0
cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
    lea       lenq, [4*r2q]
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q]
    add       dstq, lenq
    add       src0q, lenq
    add       src1q, lenq
    neg       lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq  m0, [src0q+lenq]
    cvtps2dq  m1, [src1q+lenq]
    packssdw  m0, m1
    movhlps   m1, m0
    punpcklwd m0, m1
    mova      [dstq+lenq], m0
%else
    CVTPS2PI  m0, [src0q+lenq  ]
    CVTPS2PI  m1, [src0q+lenq+8]
    CVTPS2PI  m2, [src1q+lenq  ]
    CVTPS2PI  m3, [src1q+lenq+8]
    packssdw  m0, m1
    packssdw  m2, m3
    mova      m1, m0
    punpcklwd m0, m2
    punpckhwd m1, m2
    mova      [dstq+lenq  ], m0
    mova      [dstq+lenq+8], m1
%endif
    add       lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE2
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE2
INIT_XMM sse2
FLOAT_TO_INT16_INTERLEAVE2

;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave6(int16_t *dst, const float **src, int len);
;-------------------------------------------------------------------------------
%macro FLOAT_TO_INT16_INTERLEAVE6 0
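; The five secondary channel pointers are turned into offsets relative to
; channel 0, so a single srcq increment steps all six streams.  Each
; iteration converts two samples per channel and shuffles them with
; packssdw/PSWAPD/punpck* into twelve interleaved int16s (24 bytes).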
cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
%define lend dword r2m
%endif
    mov     src1q, [srcq+1*gprsize]
    mov     src2q, [srcq+2*gprsize]
    mov     src3q, [srcq+3*gprsize]
    mov     src4q, [srcq+4*gprsize]
    mov     src5q, [srcq+5*gprsize]
    mov     srcq,  [srcq]
    sub     src1q, srcq
    sub     src2q, srcq
    sub     src3q, srcq
    sub     src4q, srcq
    sub     src5q, srcq
.loop:
    CVTPS2PI   mm0, [srcq]
    CVTPS2PI   mm1, [srcq+src1q]
    CVTPS2PI   mm2, [srcq+src2q]
    CVTPS2PI   mm3, [srcq+src3q]
    CVTPS2PI   mm4, [srcq+src4q]
    CVTPS2PI   mm5, [srcq+src5q]
    packssdw   mm0, mm3
    packssdw   mm1, mm4
    packssdw   mm2, mm5
    PSWAPD     mm3, mm0
    punpcklwd  mm0, mm1
    punpckhwd  mm1, mm2
    punpcklwd  mm2, mm3
    PSWAPD     mm3, mm0
    punpckldq  mm0, mm2
    punpckhdq  mm2, mm1
    punpckldq  mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add     srcq, 8
    add     dstq, 24
    sub     lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6

INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnowext
FLOAT_TO_INT16_INTERLEAVE6

;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------

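; Float counterpart of the above, with the same channel-offset setup.  The
; SSE path loads four samples per channel and interleaves the 24 floats via
; SBUTTERFLYPS plus movlhps/movhlps/shufps; the MMX path moves two samples
; per channel using dword SBUTTERFLY unpacks.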
%macro FLOAT_INTERLEAVE6 1
cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
%define lend dword r2m
%endif
    mov     src1q, [srcq+1*gprsize]
    mov     src2q, [srcq+2*gprsize]
    mov     src3q, [srcq+3*gprsize]
    mov     src4q, [srcq+4*gprsize]
    mov     src5q, [srcq+5*gprsize]
    mov     srcq,  [srcq]
    sub     src1q, srcq
    sub     src2q, srcq
    sub     src3q, srcq
    sub     src4q, srcq
    sub     src5q, srcq
.loop:
%if cpuflag(sse)
    movaps    m0, [srcq]
    movaps    m1, [srcq+src1q]
    movaps    m2, [srcq+src2q]
    movaps    m3, [srcq+src3q]
    movaps    m4, [srcq+src4q]
    movaps    m5, [srcq+src5q]

    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    movaps    m6, m4
    shufps    m4, m0, 0xe4
    movlhps   m0, m2
    movhlps   m6, m2
    movaps  [dstq   ], m0
    movaps  [dstq+16], m4
    movaps  [dstq+32], m6

    movaps    m6, m5
    shufps    m5, m1, 0xe4
    movlhps   m1, m3
    movhlps   m6, m3
    movaps  [dstq+48], m1
    movaps  [dstq+64], m5
    movaps  [dstq+80], m6
%else ; mmx
    movq      m0, [srcq]
    movq      m1, [srcq+src1q]
    movq      m2, [srcq+src2q]
    movq      m3, [srcq+src3q]
    movq      m4, [srcq+src4q]
    movq      m5, [srcq+src5q]

    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
    movq    [dstq   ], m0
    movq    [dstq+ 8], m2
    movq    [dstq+16], m4
    movq    [dstq+24], m1
    movq    [dstq+32], m3
    movq    [dstq+40], m5
%endif
    add     srcq, mmsize
    add     dstq, mmsize*6
    sub     lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX mmx
FLOAT_INTERLEAVE6 0
INIT_XMM sse
FLOAT_INTERLEAVE6 7

;-----------------------------------------------------------------------------
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------

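; PUNPCKLDQ/PUNPCKHDQ are %defined below (punpckldq/punpckhdq for MMX,
; unpcklps/unpckhps for SSE) so one macro body serves both register widths.
; Each iteration interleaves 2*mmsize bytes from each of the two channels.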
%macro FLOAT_INTERLEAVE2 1
cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
    mov     src1q, [srcq+gprsize]
    mov     srcq,  [srcq        ]
    sub     src1q, srcq
.loop:
    mova      m0, [srcq             ]
    mova      m1, [srcq+src1q       ]
    mova      m3, [srcq      +mmsize]
    mova      m4, [srcq+src1q+mmsize]

    mova      m2, m0
    PUNPCKLDQ m0, m1
    PUNPCKHDQ m2, m1

    mova      m1, m3
    PUNPCKLDQ m3, m4
    PUNPCKHDQ m1, m4

    mova  [dstq         ], m0
    mova  [dstq+1*mmsize], m2
    mova  [dstq+2*mmsize], m3
    mova  [dstq+3*mmsize], m1

    add     srcq, mmsize*2
    add     dstq, mmsize*4
    sub     lend, mmsize/2
    jg .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX mmx
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 0
INIT_XMM sse
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 5