;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

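; RV40 rounding-constant tables: one packed 4-word (8-byte) entry per
; index, loaded below as [tbl + rnd_bias*8]. The rv40 code computes
; rnd_bias = ((my & 6) * 4 + mx) >> 1 from the fractional motion vector,
; so the bias varies with the subpel position.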
rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3

cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64

SECTION .text

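; mx == my == 0: no interpolation, just copy (put) or average (avg)
; 8-byte rows from src to dst. CHROMAMC_AVG expands to nothing for the
; put functions and to pavgb for the avg ones; the loop is unrolled to
; four rows per iteration, so h (r3d) must be a multiple of 4.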
%macro mv0_pixels_mc8 0
    lea           r4, [r2*2 ]
.next4rows:
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq    [r0   ], mm0
    movq    [r0+r2], mm1
    add           r0, r4
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq    [r0   ], mm0
    movq    [r0+r2], mm1
    add           r0, r4
    sub          r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
;                           int stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
    mov           r7, r5
    and           r7, 6         ; &~1 for mx/my=[0,7]
    lea           r7, [r7*4+r4]
    sar          r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov           r0, r5
    and           r0, 6         ; &~1 for mx/my=[0,7]
    lea           r0, [r0*4+r4]
    sar          r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif

    test         r5d, r5d
    mov           r6, 1
    je .my_is_zero
    test         r4d, r4d
    mov           r6, r2        ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif

    movd          m5, r4d
    movq          m4, [pw_8]
    movq          m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd     m5, m5
    punpckldq     m5, m5        ; mm5 = B = x
    pxor          m7, m7
    psubw         m4, m5        ; mm4 = A = 8-x

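    ; 1-D filter: B = x+y (only one is non-zero), A = 8-B, and each
    ; iteration computes dst[i] = (A*src[i] + B*src[i+dxy] + rnd) >> 3,
    ; where dxy (r6) is 1 for horizontal filtering and stride for vertical.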
.next1drow:
    movq          m0, [r1   ]  ; mm0 = src[0..7]
    movq          m2, [r1+r6]  ; mm2 = src[1..8]

    movq          m1, m0
    movq          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, m4        ; [mm0,mm1] = A * src[0..7]
    pmullw        m1, m4
    pmullw        m2, m5        ; [mm2,mm3] = B * src[1..8]
    pmullw        m3, m5

    paddw         m0, m6
    paddw         m1, m6
    paddw         m0, m2
    paddw         m1, m3
    psrlw         m0, 3
    psrlw         m1, 3
    packuswb      m0, m1
    CHROMAMC_AVG  m0, [dest_reg]
    movq  [dest_reg], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add     dest_reg, r2
    add           r1, r2
    dec          r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd          m4, r4d       ; x
    movd          m6, r5d       ; y
%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif
    mov           r6, rsp       ; backup stack pointer
    and          rsp, ~(mmsize-1) ; align stack
    sub          rsp, 16        ; AA and DD

    punpcklwd     m4, m4
    punpcklwd     m6, m6
    punpckldq     m4, m4        ; mm4 = x words
    punpckldq     m6, m6        ; mm6 = y words
    movq          m5, m4
    pmullw        m4, m6        ; mm4 = x * y
    psllw         m5, 3
    psllw         m6, 3
    movq          m7, m5
    paddw         m7, m6
    movq     [rsp+8], m4        ; DD = x * y
    psubw         m5, m4        ; mm5 = B = 8x - xy
    psubw         m6, m4        ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m7        ; mm4 = A = xy - (8x+8y) + 64
    pxor          m7, m7
    movq      [rsp ], m4

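    ; Bilinear weights now in place (0 <= x, y <= 8):
    ;   [rsp]   A = (8-x)*(8-y) = x*y - 8*(x+y) + 64
    ;   m5      B =    x *(8-y) = 8*x - x*y
    ;   m6      C = (8-x)*  y   = 8*y - x*y
    ;   [rsp+8] D =    x *  y
    ; The loop computes, as a scalar model of the SIMD below:
    ;   dst[i] = (A*src[i] + B*src[i+1]
    ;             + C*src[i+stride] + D*src[i+stride+1] + rnd) >> 6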
    movq          m0, [r1  ]    ; mm0 = src[0..7]
    movq          m1, [r1+1]    ; mm1 = src[1..8]
.next2drow:
    add           r1, r2

    movq          m2, m0
    movq          m3, m1
    punpckhbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, [rsp]
    pmullw        m2, [rsp]
    pmullw        m1, m5
    pmullw        m3, m5
    paddw         m2, m1        ; mm2 = A * src[0..3] + B * src[1..4]
    paddw         m3, m0        ; mm3 = A * src[4..7] + B * src[5..8]

    movq          m0, [r1]
    movq          m1, m0
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    pmullw        m0, m6
    pmullw        m1, m6
    paddw         m2, m0
    paddw         m3, m1        ; [mm2,mm3] += C * src[0..7]

    movq          m1, [r1+1]
    movq          m0, m1
    movq          m4, m1
    punpcklbw     m0, m7
    punpckhbw     m4, m7
    pmullw        m0, [rsp+8]
    pmullw        m4, [rsp+8]
    paddw         m2, m0
    paddw         m3, m4        ; [mm2,mm3] += D * src[1..8]
    movq          m0, [r1]

    paddw         m2, [rnd_2d_%2+rnd_bias*8]
    paddw         m3, [rnd_2d_%2+rnd_bias*8]
    psrlw         m2, 6
    psrlw         m3, 6
    packuswb      m2, m3
    CHROMAMC_AVG  m2, [dest_reg]
    movq  [dest_reg], m2        ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add     dest_reg, r2
    dec          r3d
    jne .next2drow
    mov          rsp, r6        ; restore stack pointer
    RET
%endmacro

%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    pxor          m7, m7
    movd          m2, r4d       ; x
    movd          m3, r5d       ; y
    movq          m4, [pw_8]
    movq          m5, [pw_8]
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    psubw         m4, m2
    psubw         m5, m3

%ifidn %2, rv40
%ifdef PIC
    lea           r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and           r5, 6         ; &~1 for mx/my=[0,7]
    lea           r5, [r5*4+r4]
    sar          r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

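    ; The loop is software-pipelined: the horizontal interpolation
    ; (8-x)*src[i] + x*src[i+1] of the previous row stays in m6 (resp. m0)
    ; across iterations, so each of the two rows handled per iteration
    ; filters one new source row and then applies the vertical weights
    ; 8-y (m5) and y (m3).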
    movd          m0, [r1  ]
    movd          m6, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m6, m7
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0

.next2rows:
    movd          m0, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4
    pmullw        m1, m2
    paddw         m1, m0
    movq          m0, m1

    pmullw        m6, m5
    pmullw        m1, m3
    paddw         m6, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m6
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd        [r0], m1
    add           r0, r2

    movd          m6, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m6, m7
    punpcklbw     m1, m7
    pmullw        m6, m4
    pmullw        m1, m2
    paddw         m1, m6
    movq          m6, m1
    pmullw        m0, m5
    pmullw        m1, m3
    paddw         m0, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m0
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd        [r0], m1
    add           r0, r2
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif

    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d       ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d       ; x*(8-y)<<16 | (8-x)*(8-y)

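    ; With A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y and D = x*y,
    ; r4d now holds B<<16 | A and r5d holds D<<16 | C as packed 16-bit
    ; pairs; one pmaddwd against src[0,1,1,2] then yields
    ; A*src[i] + B*src[i+1] (or C/D for the next row) per 32-bit lane.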
    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5        ; mm5 = {A,B,A,B}
    punpckldq     m6, m6        ; mm6 = {C,D,C,D}
    pxor          m7, m7
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94  ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5        ; mm1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94  ; mm0 = src[0,1,1,2]
    movq          m2, m0
    pmaddwd       m0, m6
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0        ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd         r5d, m1
    mov         [r0], r5w
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28
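; H.264 rounds with 4 (1-D, >>3) and 32 (2-D, >>6); VC-1 uses the
; "no rounding" constants 3 and 28, which the _rnd/_nornd suffixes of
; the generated function names reflect.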

%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB         %1, %2
%endmacro
%macro COPY_AVG 3
    movd          %2, %3
    PAVGB         %1, %2
%endmacro
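; CHROMAMC_AVG / CHROMAMC_AVG4 are the put/avg hooks: NOTHING discards
; its arguments for the put functions, DIRECT_AVG averages the result
; with dst via pavgb, and COPY_AVG first loads dst into a scratch
; register for the narrow (4- and 2-pixel) cases.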

INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1,  _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40

%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test         r5d, r5d
    je .my_is_zero
    test         r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
    mov           r6, 8
    add           r4, 8         ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul          r6, r4        ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d       ; y *(x*255+8) = y *x<<8 | y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movdqa        m5, [rnd_2d_%2]
    movq          m0, [r1  ]
    movq          m1, [r1+1]
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    punpcklbw     m0, m1
    movlhps       m7, m7
    movlhps       m6, m6

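    ; SSSE3 keeps the weights as packed bytes: m7 = {A,B} and m6 = {C,D}
    ; replicated, with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y.
    ; Source pixels are interleaved as {src[i], src[i+1]} byte pairs, so a
    ; single pmaddubsw yields A*src[i] + B*src[i+1] (or C/D for the next
    ; row) per 16-bit lane, replacing the unpack/pmullw/paddw chains of
    ; the MMX version.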
.next2rows:
    movq          m1, [r1+r2*1  ]
    movq          m2, [r1+r2*1+1]
    movq          m3, [r1+r2*2  ]
    movq          m4, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    movdqa        m2, m1
    punpcklbw     m3, m4
    movdqa        m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movdqa        m0, m4
    psrlw         m3, 6
%ifidn %1, avg
    movq          m2, [r0   ]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3
    CHROMAMC_AVG  m1, m2
    movq     [r0   ], m1
    movhps   [r0+r2], m1
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero:
    mov          r5d, r4d
    shl          r4d, 8
    add           r4, 8
    sub           r4, r5        ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

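    ; Horizontal-only case: m7 holds the byte pair {8-x, x} replicated,
    ; and two rows per iteration are filtered as
    ;   dst[i] = ((8-x)*src[i] + x*src[i+1] + rnd) >> 3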
.next2xrows:
    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    movq          m2, [r1+r2  ]
    movq          m3, [r1+r2+1]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    mov          r4d, r5d
    shl          r5d, 8
    add           r5, 8
    sub           r5, r4        ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

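    ; Vertical-only case: same weights {8-y, y} in m7, but the pairs fed
    ; to pmaddubsw are taken vertically ({src[i], src[i+stride]}), with
    ; the middle row (m1/m2) shared by both output rows of the iteration.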
.next2yrows:
    movq          m0, [r1     ]
    movq          m1, [r1+r2  ]
    movdqa        m2, m1
    movq          m3, [r1+r2*2]
    lea           r1, [r1+r2*2]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
    mov           r6, 8
    add          r4d, 8         ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul         r6d, r4d       ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d       ; y *(x*255+8) = y *x<<8 | y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movq          m5, [pw_32]
    movd          m0, [r1  ]
    pshufw        m7, m7, 0
    punpcklbw     m0, [r1+1]
    pshufw        m6, m6, 0

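    ; 4-pixel-wide variant of the same pmaddubsw scheme in MMX-sized
    ; registers; the byte-interleaved previous row is carried in m0
    ; across iterations, as in the MMX mc4 above.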
.next2rows:
    movd          m1, [r1+r2*1  ]
    movd          m3, [r1+r2*2  ]
    punpcklbw     m1, [r1+r2*1+1]
    punpcklbw     m3, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1
    movq          m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movq          m0, m4
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0   ]
    CHROMAMC_AVG  m3, [r0+r2]
    movd     [r0   ], m1
    movd     [r0+r2], m3
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264