;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384: times 4 dd 16384
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION_TEXT

%macro SCALARPRODUCT 0
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
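; Roughly equivalent C (a sketch): sum = 0; for (i = 0; i < order; i++) sum += v1[i] * v2[i]; return sum;
; Both pointers are advanced to the end of the arrays and indexed with a negative
; offset, so a single add on orderq serves as both pointer bump and loop counter.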
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    pxor m2, m2
.loop:
    movu m0, [v1q + orderq]
    movu m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd m2, m0
    paddd m2, m1
    add orderq, mmsize*2
    jl .loop
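; horizontally add the per-lane dword sums in m2; the total ends up in the low dword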
%if mmsize == 16
    movhlps m0, m2
    paddd m2, m0
    pshuflw m0, m2, 0x4e
%else
    pshufw m0, m2, 0x4e
%endif
    paddd m2, m0
    movd eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
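; Roughly equivalent C (a sketch, 16-bit wraparound on the madd):
;   for (i = 0; i < order; i++) { sum += v1[i] * v2[i]; v1[i] += mul * v3[i]; }
;   return sum;
; mul is broadcast to every word lane of m7; dword products accumulate in m6.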
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw m7, m7, 0
%endif
    pxor m6, m6
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu m0, [v2q + orderq]
    movu m1, [v2q + orderq + mmsize]
    mova m4, [v1q + orderq]
    mova m5, [v1q + orderq + mmsize]
    movu m2, [v3q + orderq]
    movu m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw m2, m7
    pmullw m3, m7
    paddd m6, m0
    paddd m6, m1
    paddw m2, m4
    paddw m3, m5
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    add orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw m0, m6, 0x4e
%endif
    paddd m6, m0
    movd eax, m6
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

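; Loop body for the ssse3 scalarproduct_and_madd_int16 below. %1 is the byte
; offset of v2/v3 from 16-byte alignment: unaligned inputs are reconstructed
; from aligned loads with palignr, and m4/m5 carry each iteration's low blocks
; into the next (lower-addressed) iteration so every block is loaded only once.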
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub orderq, mmsize*2
%if %1
    mova m1, m4
    mova m4, [v2q + orderq]
    mova m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova m3, m5
    mova m5, [v3q + orderq]
    mova m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova m0, [v2q + orderq]
    mova m1, [v2q + orderq + mmsize]
    mova m2, [v3q + orderq]
    mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova m8, t0
    mova m9, t1
%define t0 m8
%define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw m2, m7
    pmullw m3, m7
    paddw m2, t0
    paddw m3, t1
    paddd m6, m0
    paddd m6, m1
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
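; v1 must be 16-byte aligned, and v2 and v3 are assumed to share the same
; misalignment: both are rounded down to an aligned address and the residual
; offset in r4d (always even for int16 data) selects one of the eight
; palignr-specialized SCALARPRODUCT_LOOP copies below.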
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor m6, m6
    mov r4d, v2d
    and r4d, 15
    and v2q, ~15
    and v3q, ~15
    mova m4, [v2q + orderq]
    mova m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp r4d, 0
    je .loop0
    cmp r4d, 2
    je .loop2
    cmp r4d, 4
    je .loop4
    cmp r4d, 6
    je .loop6
    cmp r4d, 8
    je .loop8
    cmp r4d, 10
    je .loop10
    cmp r4d, 12
    je .loop12
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10
    SCALARPRODUCT_LOOP 8
    SCALARPRODUCT_LOOP 6
    SCALARPRODUCT_LOOP 4
    SCALARPRODUCT_LOOP 2
    SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
    paddd m6, m0
    movd eax, m6
    RET


;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------

%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
    pshufb %1, %2
%elif cpuflag(sse2)
    pshuflw %1, %1, 0x1B
    pshufhw %1, %1, 0x1B
    pshufd %1, %1, 0x4E
%elif cpuflag(mmxext)
    pshufw %1, %1, 0x1B
%endif
%endmacro

%macro MUL16FIXED 3
%if cpuflag(ssse3) ; dst, src, unused
; dst = ((dst * src) + (1<<14)) >> 15
    pmulhrsw %1, %2
%elif cpuflag(mmxext) ; dst, src, temp
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
    mova %3, %1
    pmulhw %1, %2
    pmullw %3, %2
    psrlw %3, 15
    psllw %1, 1
    por %1, %3
%endif
%endmacro

%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
%if %1
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
%else
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
%endif
    lea offset2q, [offsetq-mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
    mova m5, [pb_revwords]
ALIGN 16
%elif %1
    mova m5, [pd_16384]
%endif
.loop:
%if cpuflag(ssse3)
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
    mova m0, [windowq+offset2q]
    mova m1, [ inputq+offset2q]
    pmulhrsw m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw m0, [ inputq+offsetq ]
    mova [outputq+offset2q], m1
    mova [outputq+offsetq ], m0
%elif %1
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
    mova m3, [windowq+offset2q]
    mova m4, [ inputq+offset2q]
    pxor m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd m0, m1
    paddd m0, m5
    psrad m0, 15
    pxor m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd m2, m1
    paddd m2, m5
    psrad m2, 15
    packssdw m0, m2
    mova [outputq+offset2q], m0
    REVERSE_WORDS m3
    mova m4, [ inputq+offsetq]
    pxor m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd m0, m1
    paddd m0, m5
    psrad m0, 15
    pxor m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd m2, m1
    paddd m2, m5
    psrad m2, 15
    packssdw m0, m2
    mova [outputq+offsetq], m0
%else
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
    mova m0, [windowq+offset2q]
    mova m1, [ inputq+offset2q]
    mova m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED m2, m0, m3
    mova [outputq+offset2q], m1
    mova [outputq+offsetq ], m2
%endif
    add offsetd, mmsize
    sub offset2d, mmsize
    jae .loop
    REP_RET
%endmacro

INIT_MMX mmxext
APPLY_WINDOW_INT16 0
INIT_XMM sse2
APPLY_WINDOW_INT16 0

INIT_MMX mmxext
APPLY_WINDOW_INT16 1
INIT_XMM sse2
APPLY_WINDOW_INT16 1
INIT_XMM ssse3
APPLY_WINDOW_INT16 1
INIT_XMM ssse3, atom
APPLY_WINDOW_INT16 1


; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
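; Roughly (a sketch of the HuffYUV median predictor):
;   for (i = 0; i < w; i++) {
;       l      = mid_pred(l, top[i], l + top[i] - lt) + diff[i];
;       lt     = top[i];
;       dst[i] = l;
;   }
;   *left = l; *left_top = lt;
; The median of (l, t, l + t - tl) is computed per byte with pmaxub/pminub;
; the %rep 8 below serializes over the 8 bytes of each mm register because
; each prediction depends on the previous output byte.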
INIT_MMX mmxext
cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
    movq mm0, [topq]
    movq mm2, mm0
    movd mm4, [left_topq]
    psllq mm2, 8
    movq mm1, mm0
    por mm4, mm2
    movd mm3, [leftq]
    psubb mm0, mm4 ; t-tl
    add dstq, wq
    add topq, wq
    add diffq, wq
    neg wq
    jmp .skip
.loop:
    movq mm4, [topq+wq]
    movq mm0, mm4
    psllq mm4, 8
    por mm4, mm1
    movq mm1, mm0 ; t
    psubb mm0, mm4 ; t-tl
.skip:
    movq mm2, [diffq+wq]
%assign i 0
%rep 8
    movq mm4, mm0
    paddb mm4, mm3 ; t-tl+l
    movq mm5, mm3
    pmaxub mm3, mm1
    pminub mm5, mm1
    pminub mm3, mm4
    pmaxub mm3, mm5 ; median
    paddb mm3, mm2 ; +residual
%if i==0
    movq mm7, mm3
    psllq mm7, 56
%else
    movq mm6, mm3
    psrlq mm7, 8
    psllq mm6, 56
    por mm7, mm6
%endif
%if i<7
    psrlq mm0, 8
    psrlq mm1, 8
    psrlq mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add wq, 8
    jl .loop
    movzx r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx r2d, byte [topq-1]
    mov [left_topq], r2d
    RET


%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
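; Running byte-wise prefix sum: dst[i] = left += src[i] (mod 256). Doubling
; steps (psllw by 8, then pshufb with the pb_zz* masks above) build the prefix
; sum within each vector; m0 carries the previous vector's last byte (the
; running total) into the next iteration, and the tail code extracts the byte
; belonging to the final element as the return value.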
    add srcq, wq
    add dstq, wq
    neg wq
%%.loop:
%if %2
    mova m1, [srcq+wq]
%else
    movu m1, [srcq+wq]
%endif
    mova m2, m1
    psllw m1, 8
    paddb m1, m2
    mova m2, m1
    pshufb m1, m3
    paddb m1, m2
    pshufb m0, m5
    mova m2, m1
    pshufb m1, m4
    paddb m1, m2
%if mmsize == 16
    mova m2, m1
    pshufb m1, m6
    paddb m1, m2
%endif
    paddb m0, m1
%if %1
    mova [dstq+wq], m0
%else
    movq [dstq+wq], m0
    movhps [dstq+wq+8], m0
%endif
    add wq, mmsize
    jl %%.loop
    mov eax, mmsize-1
    sub eax, wd
    movd m1, eax
    pshufb m0, m1
    movd eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX ssse3
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
.skip_prologue:
    mova m5, [pb_7]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    psllq m0, 56
    ADD_HFYU_LEFT_LOOP 1, 1

INIT_XMM sse4
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova m5, [pb_f]
    mova m6, [pb_zzzzzzzz77777777]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    pslldq m0, 15
    test srcq, 15
    jnz .src_unaligned
    test dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 0

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
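; i.e. dst[i] = clip(src[i], min, max) for len int32 values; min and max are
; broadcast to all lanes with SPLATD (converted to float first on the
; CLIPD_SSE2 path).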

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss m4, minm
    cvtsi2ss m5, maxm
%else
    movd m4, minm
    movd m5, maxm
%endif
    SPLATD m4
    SPLATD m5
.loop:
%assign %%i 1
%rep %2
    mova m0, [srcq+mmsize*0*%%i]
    mova m1, [srcq+mmsize*1*%%i]
    mova m2, [srcq+mmsize*2*%%i]
    mova m3, [srcq+mmsize*3*%%i]
%if %3
    mova m7, [srcq+mmsize*4*%%i]
    mova m8, [srcq+mmsize*5*%%i]
    mova m9, [srcq+mmsize*6*%%i]
    mova m10, [srcq+mmsize*7*%%i]
%endif
    CLIPD m0, m4, m5, m6
    CLIPD m1, m4, m5, m6
    CLIPD m2, m4, m5, m6
    CLIPD m3, m4, m5, m6
%if %3
    CLIPD m7, m4, m5, m6
    CLIPD m8, m4, m5, m6
    CLIPD m9, m4, m5, m6
    CLIPD m10, m4, m5, m6
%endif
    mova [dstq+mmsize*0*%%i], m0
    mova [dstq+mmsize*1*%%i], m1
    mova [dstq+mmsize*2*%%i], m2
    mova [dstq+mmsize*3*%%i], m3
%if %3
    mova [dstq+mmsize*4*%%i], m7
    mova [dstq+mmsize*5*%%i], m8
    mova [dstq+mmsize*6*%%i], m9
    mova [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
    add srcq, mmsize*4*(%2+%3)
    add dstq, mmsize*4*(%2+%3)
    sub lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

; %1 = aligned/unaligned
%macro BSWAP_LOOPS 1
    mov r3, r2
    sar r2, 3
    jz .left4_%1
.loop8_%1:
    mov%1 m0, [r1 + 0]
    mov%1 m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb m0, m2
    pshufb m1, m2
    mov%1 [r0 + 0], m0
    mov%1 [r0 + 16], m1
%else
    pshuflw m0, m0, 10110001b
    pshuflw m1, m1, 10110001b
    pshufhw m0, m0, 10110001b
    pshufhw m1, m1, 10110001b
    mova m2, m0
    mova m3, m1
    psllw m0, 8
    psllw m1, 8
    psrlw m2, 8
    psrlw m3, 8
    por m2, m0
    por m3, m1
    mov%1 [r0 + 0], m2
    mov%1 [r0 + 16], m3
%endif
    add r0, 32
    add r1, 32
    dec r2
    jnz .loop8_%1
.left4_%1:
    mov r2, r3
    and r3, 4
    jz .left
    mov%1 m0, [r1]
%if cpuflag(ssse3)
    pshufb m0, m2
    mov%1 [r0], m0
%else
    pshuflw m0, m0, 10110001b
    pshufhw m0, m0, 10110001b
    mova m2, m0
    psllw m0, 8
    psrlw m2, 8
    por m2, m0
    mov%1 [r0], m2
%endif
    add r1, 16
    add r0, 16
%endmacro

; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
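; Reverses the byte order of every 32-bit word. The ssse3 path does it with a
; single pshufb using pb_bswap32; the sse2 path swaps the two 16-bit halves of
; each dword with pshuflw/pshufhw and then swaps the bytes within each word
; using shifts and por. The remaining 1-3 words are handled with a scalar bswap.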
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mov r3, r1
    mova m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov r3, r1
%endif
    or r3, r0
    and r3, 15
    jz .start_align
    BSWAP_LOOPS u
    jmp .left
.start_align:
    BSWAP_LOOPS a
.left:
%if cpuflag(ssse3)
    mov r3, r2
    and r2, 2
    jz .left1
    movq m0, [r1]
    pshufb m0, m2
    movq [r0], m0
    add r1, 8
    add r0, 8
.left1:
    and r3, 1
    jz .end
    mov r2d, [r1]
    bswap r2d
    mov [r0], r2d
%else
    and r2, 3
    jz .end
.loop2:
    mov r3d, [r1]
    bswap r3d
    mov [r0], r3d
    add r1, 4
    add r0, 4
    dec r2
    jnz .loop2
%endif
.end:
    RET
%endmacro

INIT_XMM sse2
BSWAP32_BUF

INIT_XMM ssse3
BSWAP32_BUF