annotate ffmpeg/libavcodec/x86/h264_qpel_8bit.asm @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 ;*****************************************************************************
yading@10 2 ;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
yading@10 3 ;*****************************************************************************
yading@10 4 ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
yading@10 5 ;* Copyright (C) 2012 Daniel Kang
yading@10 6 ;*
yading@10 7 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
yading@10 8 ;*
yading@10 9 ;* This file is part of FFmpeg.
yading@10 10 ;*
yading@10 11 ;* FFmpeg is free software; you can redistribute it and/or
yading@10 12 ;* modify it under the terms of the GNU Lesser General Public
yading@10 13 ;* License as published by the Free Software Foundation; either
yading@10 14 ;* version 2.1 of the License, or (at your option) any later version.
yading@10 15 ;*
yading@10 16 ;* FFmpeg is distributed in the hope that it will be useful,
yading@10 17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 19 ;* Lesser General Public License for more details.
yading@10 20 ;*
yading@10 21 ;* You should have received a copy of the GNU Lesser General Public
yading@10 22 ;* License along with FFmpeg; if not, write to the Free Software
yading@10 23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 24 ;******************************************************************************
yading@10 25
yading@10 26 %include "libavutil/x86/x86util.asm"
yading@10 27
yading@10 28 SECTION_RODATA 32
yading@10 29
yading@10 30 cextern pw_16
yading@10 31 cextern pw_5
yading@10 32 cextern pb_0
yading@10 33
yading@10 34 SECTION .text
yading@10 35
yading@10 36
;-----------------------------------------------------------------------
; Store-operation macros selected via the %1 (put/avg) macro argument of
; the filter macros below: "put" overwrites dst, "avg" averages the
; filtered result with the existing dst bytes (pavgb = (a+b+1)>>1).
; The *h variants move a half register (movh) for 4/8-byte stores.
;-----------------------------------------------------------------------
yading@10 37 %macro op_avgh 3 ; %1 = result reg, %2 = dst mem, %3 = scratch reg
yading@10 38 movh %3, %2 ; load current dst bytes
yading@10 39 pavgb %1, %3 ; average result with dst
yading@10 40 movh %2, %1 ; store back
yading@10 41 %endmacro
yading@10 42
yading@10 43 %macro op_avg 2-3 ; full-register avg; optional %3 ignored (arg-count parity with op_avgh)
yading@10 44 pavgb %1, %2 ; average with dst in memory
yading@10 45 mova %2, %1
yading@10 46 %endmacro
yading@10 47
yading@10 48 %macro op_puth 2-3 ; half-register put; optional %3 ignored
yading@10 49 movh %2, %1
yading@10 50 %endmacro
yading@10 51
yading@10 52 %macro op_put 2-3 ; full-register put; optional %3 ignored
yading@10 53 mova %2, %1
yading@10 54 %endmacro
yading@10 55
;-----------------------------------------------------------------------
; put/avg_h264_qpel4_h_lowpass(dst, src, dstStride, srcStride)
; 4-wide horizontal H.264 6-tap half-pel filter, 4 rows:
;   d[x] = clip8((s[x-2]+s[x+3] - 5*(s[x-1]+s[x+2])
;                 + 20*(s[x]+s[x+1]) + 16) >> 5)
; MMX: one 4-pixel row per loop iteration; %1 selects put/avg store.
;-----------------------------------------------------------------------
yading@10 56 %macro QPEL4_H_LOWPASS_OP 1
yading@10 57 cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
yading@10 58 movsxdifnidn r2, r2d
yading@10 59 movsxdifnidn r3, r3d
yading@10 60 pxor m7, m7 ; m7 = 0, zero register for byte->word unpack
yading@10 61 mova m4, [pw_5]
yading@10 62 mova m5, [pw_16] ; rounding bias
yading@10 63 mov r4d, 4 ; row counter
yading@10 64 .loop:
yading@10 65 movh m1, [r1-1]
yading@10 66 movh m2, [r1+0]
yading@10 67 movh m3, [r1+1]
yading@10 68 movh m0, [r1+2]
yading@10 69 punpcklbw m1, m7
yading@10 70 punpcklbw m2, m7
yading@10 71 punpcklbw m3, m7
yading@10 72 punpcklbw m0, m7
yading@10 73 paddw m1, m0 ; m1 = s[-1]+s[+2] (the -5 taps)
yading@10 74 paddw m2, m3 ; m2 = s[0]+s[+1]  (the 20 taps)
yading@10 75 movh m0, [r1-2]
yading@10 76 movh m3, [r1+3]
yading@10 77 punpcklbw m0, m7
yading@10 78 punpcklbw m3, m7
yading@10 79 paddw m0, m3 ; m0 = s[-2]+s[+3] (the +1 taps)
yading@10 80 psllw m2, 2
yading@10 81 psubw m2, m1
yading@10 82 pmullw m2, m4 ; m2 = 5*(4*(s0+s1) - (s-1+s2)) = 20*(s0+s1) - 5*(s-1+s2)
yading@10 83 paddw m0, m5 ; + 16 rounding
yading@10 84 paddw m0, m2
yading@10 85 psraw m0, 5
yading@10 86 packuswb m0, m0 ; saturate to u8
yading@10 87 op_%1h m0, [r0], m6
yading@10 88 add r0, r2
yading@10 89 add r1, r3
yading@10 90 dec r4d
yading@10 91 jg .loop
yading@10 92 REP_RET
yading@10 93 %endmacro
yading@10 94
yading@10 95 INIT_MMX mmxext
yading@10 96 QPEL4_H_LOWPASS_OP put
yading@10 97 QPEL4_H_LOWPASS_OP avg
yading@10 98
;-----------------------------------------------------------------------
; put/avg_h264_qpel8_h_lowpass(dst, src, dstStride, srcStride)
; 8-wide horizontal 6-tap filter (same taps as the qpel4 version), 8
; rows. MMX is only 4 words wide, so each row is processed as a
; low/high word pair: even registers hold pixels 0-3, odd hold 4-7.
;-----------------------------------------------------------------------
yading@10 99 %macro QPEL8_H_LOWPASS_OP 1
yading@10 100 cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
yading@10 101 movsxdifnidn r2, r2d
yading@10 102 movsxdifnidn r3, r3d
yading@10 103 mov r4d, 8 ; row counter
yading@10 104 pxor m7, m7 ; zero for unpacking
yading@10 105 mova m6, [pw_5]
yading@10 106 .loop:
yading@10 107 mova m0, [r1]
yading@10 108 mova m2, [r1+1]
yading@10 109 mova m1, m0
yading@10 110 mova m3, m2
yading@10 111 punpcklbw m0, m7
yading@10 112 punpckhbw m1, m7
yading@10 113 punpcklbw m2, m7
yading@10 114 punpckhbw m3, m7
yading@10 115 paddw m0, m2 ; lo: s[x]+s[x+1]
yading@10 116 paddw m1, m3 ; hi: s[x]+s[x+1]
yading@10 117 psllw m0, 2
yading@10 118 psllw m1, 2
yading@10 119 mova m2, [r1-1]
yading@10 120 mova m4, [r1+2]
yading@10 121 mova m3, m2
yading@10 122 mova m5, m4
yading@10 123 punpcklbw m2, m7
yading@10 124 punpckhbw m3, m7
yading@10 125 punpcklbw m4, m7
yading@10 126 punpckhbw m5, m7
yading@10 127 paddw m2, m4 ; lo: s[x-1]+s[x+2]
yading@10 128 paddw m5, m3 ; hi: s[x-1]+s[x+2]
yading@10 129 psubw m0, m2
yading@10 130 psubw m1, m5
yading@10 131 pmullw m0, m6 ; lo: 20*(s0+s1) - 5*(s-1+s2)
yading@10 132 pmullw m1, m6 ; hi: same
yading@10 133 movd m2, [r1-2] ; bytes s[-2..1]
yading@10 134 movd m5, [r1+7] ; bytes s[7..10]
yading@10 135 punpcklbw m2, m7
yading@10 136 punpcklbw m5, m7
yading@10 137 paddw m2, m3 ; lo outer taps: s[x-2]+s[x+3], x=0..3 (m3 = words s[3..6])
yading@10 138 paddw m4, m5 ; hi outer taps: s[x-2]+s[x+3], x=4..7 (m4 = words s[2..5])
yading@10 139 mova m5, [pw_16]
yading@10 140 paddw m2, m5 ; + rounding
yading@10 141 paddw m4, m5
yading@10 142 paddw m0, m2
yading@10 143 paddw m1, m4
yading@10 144 psraw m0, 5
yading@10 145 psraw m1, 5
yading@10 146 packuswb m0, m1 ; combine lo/hi, saturate to u8
yading@10 147 op_%1 m0, [r0], m4
yading@10 148 add r0, r2
yading@10 149 add r1, r3
yading@10 150 dec r4d
yading@10 151 jg .loop
yading@10 152 REP_RET
yading@10 153 %endmacro
yading@10 154
yading@10 155 INIT_MMX mmxext
yading@10 156 QPEL8_H_LOWPASS_OP put
yading@10 157 QPEL8_H_LOWPASS_OP avg
yading@10 158
;-----------------------------------------------------------------------
; SSSE3 version of the 8-wide horizontal 6-tap filter. One unaligned
; 16-byte load covers s[-2..13]; after byte->word unpack, palignr
; builds the five shifted windows (s[x-1]..s[x+3]) from the lo/hi word
; halves, replacing the five separate loads of the MMX version.
;-----------------------------------------------------------------------
yading@10 159 %macro QPEL8_H_LOWPASS_OP_XMM 1
yading@10 160 cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
yading@10 161 movsxdifnidn r2, r2d
yading@10 162 movsxdifnidn r3, r3d
yading@10 163 mov r4d, 8 ; row counter
yading@10 164 pxor m7, m7 ; zero for unpacking
yading@10 165 mova m6, [pw_5]
yading@10 166 .loop:
yading@10 167 movu m1, [r1-2] ; 16 source bytes starting at s[-2]
yading@10 168 mova m0, m1
yading@10 169 punpckhbw m1, m7 ; m1 = words s[6..13]
yading@10 170 punpcklbw m0, m7 ; m0 = words s[-2..5]
yading@10 171 mova m2, m1
yading@10 172 mova m3, m1
yading@10 173 mova m4, m1
yading@10 174 mova m5, m1
yading@10 175 palignr m4, m0, 2 ; s[x-1]
yading@10 176 palignr m3, m0, 4 ; s[x]
yading@10 177 palignr m2, m0, 6 ; s[x+1]
yading@10 178 palignr m1, m0, 8 ; s[x+2]
yading@10 179 palignr m5, m0, 10 ; s[x+3]
yading@10 180 paddw m0, m5 ; s[x-2]+s[x+3] (+1 taps)
yading@10 181 paddw m2, m3 ; s[x]+s[x+1]   (20 taps)
yading@10 182 paddw m1, m4 ; s[x-1]+s[x+2] (-5 taps)
yading@10 183 psllw m2, 2
yading@10 184 psubw m2, m1
yading@10 185 paddw m0, [pw_16] ; + rounding
yading@10 186 pmullw m2, m6 ; 20*(s0+s1) - 5*(s-1+s2)
yading@10 187 paddw m2, m0
yading@10 188 psraw m2, 5
yading@10 189 packuswb m2, m2 ; saturate to u8
yading@10 190 op_%1h m2, [r0], m4
yading@10 191 add r1, r3
yading@10 192 add r0, r2
yading@10 193 dec r4d
yading@10 194 jne .loop
yading@10 195 REP_RET
yading@10 196 %endmacro
yading@10 197
yading@10 198 INIT_XMM ssse3
yading@10 199 QPEL8_H_LOWPASS_OP_XMM put
yading@10 200 QPEL8_H_LOWPASS_OP_XMM avg
yading@10 201
yading@10 202
;-----------------------------------------------------------------------
; put/avg_h264_qpel4_h_lowpass_l2(dst, src, src2, dstStride, srcStride)
; Same 4-wide horizontal 6-tap filter as above, then the result is
; rounded-averaged (pavgb) with a second reference row from src2 before
; the put/avg store — used for the "l2" quarter-pel positions.
; NOTE(review): both dst (r0) and src (r1) advance by r3; src2 (r2) by
; r4 — callers apparently pass equal dst/src strides here; confirm.
;-----------------------------------------------------------------------
yading@10 203 %macro QPEL4_H_LOWPASS_L2_OP 1
yading@10 204 cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
yading@10 205 movsxdifnidn r3, r3d
yading@10 206 movsxdifnidn r4, r4d
yading@10 207 pxor m7, m7 ; zero for unpacking
yading@10 208 mova m4, [pw_5]
yading@10 209 mova m5, [pw_16] ; rounding bias
yading@10 210 mov r5d, 4 ; row counter
yading@10 211 .loop:
yading@10 212 movh m1, [r1-1]
yading@10 213 movh m2, [r1+0]
yading@10 214 movh m3, [r1+1]
yading@10 215 movh m0, [r1+2]
yading@10 216 punpcklbw m1, m7
yading@10 217 punpcklbw m2, m7
yading@10 218 punpcklbw m3, m7
yading@10 219 punpcklbw m0, m7
yading@10 220 paddw m1, m0 ; s[x-1]+s[x+2]
yading@10 221 paddw m2, m3 ; s[x]+s[x+1]
yading@10 222 movh m0, [r1-2]
yading@10 223 movh m3, [r1+3]
yading@10 224 punpcklbw m0, m7
yading@10 225 punpcklbw m3, m7
yading@10 226 paddw m0, m3 ; s[x-2]+s[x+3]
yading@10 227 psllw m2, 2
yading@10 228 psubw m2, m1
yading@10 229 pmullw m2, m4 ; 20*(s0+s1) - 5*(s-1+s2)
yading@10 230 paddw m0, m5
yading@10 231 paddw m0, m2
yading@10 232 movh m3, [r2] ; second reference row
yading@10 233 psraw m0, 5
yading@10 234 packuswb m0, m0
yading@10 235 pavgb m0, m3 ; average with src2
yading@10 236 op_%1h m0, [r0], m6
yading@10 237 add r0, r3
yading@10 238 add r1, r3
yading@10 239 add r2, r4
yading@10 240 dec r5d
yading@10 241 jg .loop
yading@10 242 REP_RET
yading@10 243 %endmacro
yading@10 244
yading@10 245 INIT_MMX mmxext
yading@10 246 QPEL4_H_LOWPASS_L2_OP put
yading@10 247 QPEL4_H_LOWPASS_L2_OP avg
yading@10 248
yading@10 249
;-----------------------------------------------------------------------
; put/avg_h264_qpel8_h_lowpass_l2(dst, src, src2, dstStride, srcStride)
; 8-wide MMX version of the l2 horizontal filter: identical filtering
; to QPEL8_H_LOWPASS_OP (lo/hi word halves), with an extra pavgb
; against a row from src2 before the store.
; NOTE(review): dst and src both advance by r3, src2 by r4 — assumes
; equal dst/src strides at the call sites; confirm.
;-----------------------------------------------------------------------
yading@10 250 %macro QPEL8_H_LOWPASS_L2_OP 1
yading@10 251 cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
yading@10 252 movsxdifnidn r3, r3d
yading@10 253 movsxdifnidn r4, r4d
yading@10 254 mov r5d, 8 ; row counter
yading@10 255 pxor m7, m7 ; zero for unpacking
yading@10 256 mova m6, [pw_5]
yading@10 257 .loop:
yading@10 258 mova m0, [r1]
yading@10 259 mova m2, [r1+1]
yading@10 260 mova m1, m0
yading@10 261 mova m3, m2
yading@10 262 punpcklbw m0, m7
yading@10 263 punpckhbw m1, m7
yading@10 264 punpcklbw m2, m7
yading@10 265 punpckhbw m3, m7
yading@10 266 paddw m0, m2 ; lo: s[x]+s[x+1]
yading@10 267 paddw m1, m3 ; hi: s[x]+s[x+1]
yading@10 268 psllw m0, 2
yading@10 269 psllw m1, 2
yading@10 270 mova m2, [r1-1]
yading@10 271 mova m4, [r1+2]
yading@10 272 mova m3, m2
yading@10 273 mova m5, m4
yading@10 274 punpcklbw m2, m7
yading@10 275 punpckhbw m3, m7
yading@10 276 punpcklbw m4, m7
yading@10 277 punpckhbw m5, m7
yading@10 278 paddw m2, m4 ; lo: s[x-1]+s[x+2]
yading@10 279 paddw m5, m3 ; hi: s[x-1]+s[x+2]
yading@10 280 psubw m0, m2
yading@10 281 psubw m1, m5
yading@10 282 pmullw m0, m6 ; lo: 20*(s0+s1) - 5*(s-1+s2)
yading@10 283 pmullw m1, m6 ; hi: same
yading@10 284 movd m2, [r1-2]
yading@10 285 movd m5, [r1+7]
yading@10 286 punpcklbw m2, m7
yading@10 287 punpcklbw m5, m7
yading@10 288 paddw m2, m3 ; lo outer taps s[x-2]+s[x+3]
yading@10 289 paddw m4, m5 ; hi outer taps s[x-2]+s[x+3]
yading@10 290 mova m5, [pw_16]
yading@10 291 paddw m2, m5 ; + rounding
yading@10 292 paddw m4, m5
yading@10 293 paddw m0, m2
yading@10 294 paddw m1, m4
yading@10 295 psraw m0, 5
yading@10 296 psraw m1, 5
yading@10 297 mova m4, [r2] ; second reference row
yading@10 298 packuswb m0, m1
yading@10 299 pavgb m0, m4 ; average with src2
yading@10 300 op_%1 m0, [r0], m4
yading@10 301 add r0, r3
yading@10 302 add r1, r3
yading@10 303 add r2, r4
yading@10 304 dec r5d
yading@10 305 jg .loop
yading@10 306 REP_RET
yading@10 307 %endmacro
yading@10 308
yading@10 309 INIT_MMX mmxext
yading@10 310 QPEL8_H_LOWPASS_L2_OP put
yading@10 311 QPEL8_H_LOWPASS_L2_OP avg
yading@10 312
yading@10 313
;-----------------------------------------------------------------------
; SSSE3 version of the 8-wide l2 horizontal filter: one lddqu covers
; all taps, palignr builds the shifted windows (as in the non-l2 SSSE3
; version), then the result is pavgb'd with a src2 row before storing.
;-----------------------------------------------------------------------
yading@10 314 %macro QPEL8_H_LOWPASS_L2_OP_XMM 1
yading@10 315 cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
yading@10 316 movsxdifnidn r3, r3d
yading@10 317 movsxdifnidn r4, r4d
yading@10 318 mov r5d, 8 ; row counter
yading@10 319 pxor m7, m7 ; zero for unpacking
yading@10 320 mova m6, [pw_5]
yading@10 321 .loop:
yading@10 322 lddqu m1, [r1-2] ; 16 bytes from s[-2]
yading@10 323 mova m0, m1
yading@10 324 punpckhbw m1, m7 ; words s[6..13]
yading@10 325 punpcklbw m0, m7 ; words s[-2..5]
yading@10 326 mova m2, m1
yading@10 327 mova m3, m1
yading@10 328 mova m4, m1
yading@10 329 mova m5, m1
yading@10 330 palignr m4, m0, 2 ; s[x-1]
yading@10 331 palignr m3, m0, 4 ; s[x]
yading@10 332 palignr m2, m0, 6 ; s[x+1]
yading@10 333 palignr m1, m0, 8 ; s[x+2]
yading@10 334 palignr m5, m0, 10 ; s[x+3]
yading@10 335 paddw m0, m5 ; s[x-2]+s[x+3]
yading@10 336 paddw m2, m3 ; s[x]+s[x+1]
yading@10 337 paddw m1, m4 ; s[x-1]+s[x+2]
yading@10 338 psllw m2, 2
yading@10 339 movh m3, [r2] ; second reference row
yading@10 340 psubw m2, m1
yading@10 341 paddw m0, [pw_16] ; + rounding
yading@10 342 pmullw m2, m6 ; 20*(s0+s1) - 5*(s-1+s2)
yading@10 343 paddw m2, m0
yading@10 344 psraw m2, 5
yading@10 345 packuswb m2, m2
yading@10 346 pavgb m2, m3 ; average with src2
yading@10 347 op_%1h m2, [r0], m4
yading@10 348 add r1, r3
yading@10 349 add r0, r3
yading@10 350 add r2, r4
yading@10 351 dec r5d
yading@10 352 jg .loop
yading@10 353 REP_RET
yading@10 354 %endmacro
yading@10 355
yading@10 356 INIT_XMM ssse3
yading@10 357 QPEL8_H_LOWPASS_L2_OP_XMM put
yading@10 358 QPEL8_H_LOWPASS_L2_OP_XMM avg
yading@10 359
yading@10 360
yading@10 361 ; All functions that call this are required to have function arguments of
yading@10 362 ; dst, src, dstStride, srcStride
yading@10 363 %macro FILT_V 1 ; %1 = put/avg store op
; Emit one output row of the vertical 6-tap filter. On entry m0..m4
; hold the five previous unpacked source rows; the sixth row is loaded
; here. Computes (r0+r5 - 5*(r1+r4) + 20*(r2+r3) + 16) >> 5, stores via
; op_%1h, advances src/dst pointers, then SWAP rotates the row window
; so the next invocation reuses m0..m4.
yading@10 364 mova m6, m2
yading@10 365 movh m5, [r1] ; load next (sixth) source row
yading@10 366 paddw m6, m3 ; rows 2+3 (20 taps)
yading@10 367 psllw m6, 2
yading@10 368 psubw m6, m1 ; - row 1
yading@10 369 psubw m6, m4 ; - row 4 (completes 4*(r2+r3)-(r1+r4))
yading@10 370 punpcklbw m5, m7
yading@10 371 pmullw m6, [pw_5] ; *5 -> 20*(r2+r3) - 5*(r1+r4)
yading@10 372 paddw m0, [pw_16] ; rounding on the outer-tap sum
yading@10 373 add r1, r3
yading@10 374 paddw m0, m5 ; rows 0+5 (+1 taps)
yading@10 375 paddw m6, m0
yading@10 376 psraw m6, 5
yading@10 377 packuswb m6, m6 ; saturate to u8
yading@10 378 op_%1h m6, [r0], m0 ; 1
yading@10 379 add r0, r2
yading@10 380 SWAP 0, 1, 2, 3, 4, 5 ; rotate the 6-row register window
yading@10 381 %endmacro
yading@10 382
;-----------------------------------------------------------------------
; put/avg_h264_qpel4_v_lowpass(dst, src, dstStride, srcStride)
; 4-wide vertical 6-tap filter, 4 output rows. Rewinds src by two rows,
; preloads five rows into m0..m4, then FILT_V produces one row per
; invocation (loading the next source row itself).
;-----------------------------------------------------------------------
yading@10 383 %macro QPEL4_V_LOWPASS_OP 1
yading@10 384 cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
yading@10 385 movsxdifnidn r2, r2d
yading@10 386 movsxdifnidn r3, r3d
yading@10 387 sub r1, r3 ; rewind src two rows for the
yading@10 388 sub r1, r3 ; filter's negative taps
yading@10 389 pxor m7, m7 ; zero for unpacking
yading@10 390 movh m0, [r1]
yading@10 391 movh m1, [r1+r3]
yading@10 392 lea r1, [r1+2*r3]
yading@10 393 movh m2, [r1]
yading@10 394 movh m3, [r1+r3]
yading@10 395 lea r1, [r1+2*r3]
yading@10 396 movh m4, [r1]
yading@10 397 add r1, r3
yading@10 398 punpcklbw m0, m7
yading@10 399 punpcklbw m1, m7
yading@10 400 punpcklbw m2, m7
yading@10 401 punpcklbw m3, m7
yading@10 402 punpcklbw m4, m7
yading@10 403 FILT_V %1 ; rows 0-3
yading@10 404 FILT_V %1
yading@10 405 FILT_V %1
yading@10 406 FILT_V %1
yading@10 407 RET
yading@10 408 %endmacro
yading@10 409
yading@10 410 INIT_MMX mmxext
yading@10 411 QPEL4_V_LOWPASS_OP put
yading@10 412 QPEL4_V_LOWPASS_OP avg
yading@10 413
yading@10 414
yading@10 415
;-----------------------------------------------------------------------
; put/avg_h264_qpel8or16_v_lowpass(dst, src, dstStride, srcStride, h)
; Vertical 6-tap filter for 8 or 16 rows (h selects). SSE2 does the
; two-row src rewind itself; the mmxext variant is built under an
; "_op" name and omits the rewind — presumably wrapped by a C-side
; caller that pre-adjusts src (wrapper not visible here; confirm).
; Fully unrolled: 8 FILT_V rows, plus 8 more when h == 16.
;-----------------------------------------------------------------------
yading@10 416 %macro QPEL8OR16_V_LOWPASS_OP 1
yading@10 417 %if cpuflag(sse2)
yading@10 418 cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
yading@10 419 movsxdifnidn r2, r2d
yading@10 420 movsxdifnidn r3, r3d
yading@10 421 sub r1, r3 ; rewind src two rows
yading@10 422 sub r1, r3
yading@10 423 %else
yading@10 424 cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
yading@10 425 movsxdifnidn r2, r2d
yading@10 426 movsxdifnidn r3, r3d
yading@10 427 %endif
yading@10 428 pxor m7, m7 ; zero for unpacking
yading@10 429 movh m0, [r1] ; preload five rows into m0..m4
yading@10 430 movh m1, [r1+r3]
yading@10 431 lea r1, [r1+2*r3]
yading@10 432 movh m2, [r1]
yading@10 433 movh m3, [r1+r3]
yading@10 434 lea r1, [r1+2*r3]
yading@10 435 movh m4, [r1]
yading@10 436 add r1, r3
yading@10 437 punpcklbw m0, m7
yading@10 438 punpcklbw m1, m7
yading@10 439 punpcklbw m2, m7
yading@10 440 punpcklbw m3, m7
yading@10 441 punpcklbw m4, m7
yading@10 442 FILT_V %1 ; rows 0-7
yading@10 443 FILT_V %1
yading@10 444 FILT_V %1
yading@10 445 FILT_V %1
yading@10 446 FILT_V %1
yading@10 447 FILT_V %1
yading@10 448 FILT_V %1
yading@10 449 FILT_V %1
yading@10 450 cmp r4d, 16
yading@10 451 jne .end ; only 8 rows requested
yading@10 452 FILT_V %1 ; rows 8-15
yading@10 453 FILT_V %1
yading@10 454 FILT_V %1
yading@10 455 FILT_V %1
yading@10 456 FILT_V %1
yading@10 457 FILT_V %1
yading@10 458 FILT_V %1
yading@10 459 FILT_V %1
yading@10 460 .end:
yading@10 461 REP_RET
yading@10 462 %endmacro
yading@10 463
yading@10 464 INIT_MMX mmxext
yading@10 465 QPEL8OR16_V_LOWPASS_OP put
yading@10 466 QPEL8OR16_V_LOWPASS_OP avg
yading@10 467
yading@10 468 INIT_XMM sse2
yading@10 469 QPEL8OR16_V_LOWPASS_OP put
yading@10 470 QPEL8OR16_V_LOWPASS_OP avg
yading@10 471
yading@10 472
yading@10 473 ; All functions that use this are required to have args:
yading@10 474 ; src, tmp, srcSize
yading@10 475 %macro FILT_HV 1 ; offset
; Vertical first pass of the 2D (hv) filter: same 6-tap combination as
; FILT_V (including the +16 rounding), but the 16-bit result is stored
; unshifted to the tmp buffer at [r1+%1] for the later horizontal pass,
; instead of being shifted/packed/stored to dst. %1 = byte offset of
; this row in tmp (row stride 24 or 48 at the call sites).
yading@10 476 mova m6, m2
yading@10 477 movh m5, [r0] ; load next (sixth) source row
yading@10 478 paddw m6, m3 ; rows 2+3 (20 taps)
yading@10 479 psllw m6, 2
yading@10 480 paddw m0, [pw_16] ; rounding on the outer-tap sum
yading@10 481 psubw m6, m1 ; - row 1
yading@10 482 psubw m6, m4 ; - row 4
yading@10 483 punpcklbw m5, m7
yading@10 484 pmullw m6, [pw_5] ; 20*(r2+r3) - 5*(r1+r4)
yading@10 485 paddw m0, m5 ; rows 0+5 (+1 taps)
yading@10 486 add r0, r2
yading@10 487 paddw m6, m0
yading@10 488 mova [r1+%1], m6 ; store 16-bit intermediate, no shift
yading@10 489 SWAP 0, 1, 2, 3, 4, 5 ; rotate the 6-row register window
yading@10 490 %endmacro
yading@10 491
;-----------------------------------------------------------------------
; Two-pass 4x4 hv (center half-pel) filter, split into:
;  _v(src, tmp, srcStride): vertical 6-tap pass into a 16-bit tmp
;    buffer, row stride 24 bytes (FILT_HV offsets 0/24/48/72).
;  _h(tmp, dst, dstStride): horizontal 6-tap pass over tmp words.
;    Uses shifts instead of pmullw: with A = t[-2]+t[3],
;    B = t[-1]+t[2], C = t[0]+t[1], computes
;    (((A - B) >> 2 - B + C) >> 2 + C) >> 6, which approximates
;    (A - 5B + 20C) / 1024 on the prescaled intermediates (paddsw
;    saturates the inner sum). Final packuswb clips to u8.
;-----------------------------------------------------------------------
yading@10 492 %macro QPEL4_HV1_LOWPASS_OP 1
yading@10 493 cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
yading@10 494 movsxdifnidn r2, r2d
yading@10 495 pxor m7, m7 ; zero for unpacking
yading@10 496 movh m0, [r0] ; preload five rows
yading@10 497 movh m1, [r0+r2]
yading@10 498 lea r0, [r0+2*r2]
yading@10 499 movh m2, [r0]
yading@10 500 movh m3, [r0+r2]
yading@10 501 lea r0, [r0+2*r2]
yading@10 502 movh m4, [r0]
yading@10 503 add r0, r2
yading@10 504 punpcklbw m0, m7
yading@10 505 punpcklbw m1, m7
yading@10 506 punpcklbw m2, m7
yading@10 507 punpcklbw m3, m7
yading@10 508 punpcklbw m4, m7
yading@10 509 FILT_HV 0*24 ; tmp row stride = 24 bytes
yading@10 510 FILT_HV 1*24
yading@10 511 FILT_HV 2*24
yading@10 512 FILT_HV 3*24
yading@10 513 RET
yading@10 514
yading@10 515 cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
yading@10 516 movsxdifnidn r2, r2d
yading@10 517 mov r3d, 4 ; row counter
yading@10 518 .loop:
yading@10 519 mova m0, [r0]
yading@10 520 paddw m0, [r0+10] ; A = t[x-2]+t[x+3] (word offsets 0 and 5)
yading@10 521 mova m1, [r0+2]
yading@10 522 paddw m1, [r0+8] ; B = t[x-1]+t[x+2]
yading@10 523 mova m2, [r0+4]
yading@10 524 paddw m2, [r0+6] ; C = t[x]+t[x+1]
yading@10 525 psubw m0, m1
yading@10 526 psraw m0, 2
yading@10 527 psubw m0, m1 ; (A-B)>>2 - B
yading@10 528 paddsw m0, m2 ; + C, saturating
yading@10 529 psraw m0, 2
yading@10 530 paddw m0, m2 ; + C again
yading@10 531 psraw m0, 6 ; final downscale
yading@10 532 packuswb m0, m0 ; clip to u8
yading@10 533 op_%1h m0, [r1], m7
yading@10 534 add r0, 24 ; next tmp row
yading@10 535 add r1, r2
yading@10 536 dec r3d
yading@10 537 jnz .loop
yading@10 538 REP_RET
yading@10 539 %endmacro
yading@10 540
yading@10 541 INIT_MMX mmxext
yading@10 542 QPEL4_HV1_LOWPASS_OP put
yading@10 543 QPEL4_HV1_LOWPASS_OP avg
yading@10 544
;-----------------------------------------------------------------------
; put/avg_h264_qpel8or16_hv1_lowpass_op(src, tmp, srcStride, size)
; Vertical first pass of the 2D filter for 8- or 16-row blocks:
; FILT_HV writes unshifted 16-bit rows into tmp at 48-byte stride;
; rows 8-15 are emitted only when size == 16. The horizontal second
; pass (hv2 below) consumes tmp. Note: the sse2 build instantiates
; only the "put" variant (see instantiations).
;-----------------------------------------------------------------------
yading@10 545 %macro QPEL8OR16_HV1_LOWPASS_OP 1
yading@10 546 cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
yading@10 547 movsxdifnidn r2, r2d
yading@10 548 pxor m7, m7 ; zero for unpacking
yading@10 549 movh m0, [r0] ; preload five rows
yading@10 550 movh m1, [r0+r2]
yading@10 551 lea r0, [r0+2*r2]
yading@10 552 movh m2, [r0]
yading@10 553 movh m3, [r0+r2]
yading@10 554 lea r0, [r0+2*r2]
yading@10 555 movh m4, [r0]
yading@10 556 add r0, r2
yading@10 557 punpcklbw m0, m7
yading@10 558 punpcklbw m1, m7
yading@10 559 punpcklbw m2, m7
yading@10 560 punpcklbw m3, m7
yading@10 561 punpcklbw m4, m7
yading@10 562 FILT_HV 0*48 ; tmp row stride = 48 bytes
yading@10 563 FILT_HV 1*48
yading@10 564 FILT_HV 2*48
yading@10 565 FILT_HV 3*48
yading@10 566 FILT_HV 4*48
yading@10 567 FILT_HV 5*48
yading@10 568 FILT_HV 6*48
yading@10 569 FILT_HV 7*48
yading@10 570 cmp r3d, 16
yading@10 571 jne .end ; only 8 rows requested
yading@10 572 FILT_HV 8*48
yading@10 573 FILT_HV 9*48
yading@10 574 FILT_HV 10*48
yading@10 575 FILT_HV 11*48
yading@10 576 FILT_HV 12*48
yading@10 577 FILT_HV 13*48
yading@10 578 FILT_HV 14*48
yading@10 579 FILT_HV 15*48
yading@10 580 .end:
yading@10 581 REP_RET
yading@10 582 %endmacro
yading@10 583
yading@10 584 INIT_MMX mmxext
yading@10 585 QPEL8OR16_HV1_LOWPASS_OP put
yading@10 586 QPEL8OR16_HV1_LOWPASS_OP avg
yading@10 587
yading@10 588 INIT_XMM sse2
yading@10 589 QPEL8OR16_HV1_LOWPASS_OP put
yading@10 590
yading@10 591
yading@10 592
;-----------------------------------------------------------------------
; put/avg_h264_qpel8or16_hv2_lowpass_op(dst, tmp, dstStride, unused, h)
; Horizontal second pass over the hv1 tmp buffer (48-byte row stride),
; 8 output pixels per row, h rows. Same shift-based tap evaluation as
; the 4-wide _h pass, done twice per row (lo words at byte offsets
; 0..14, hi words at 16..30): ((A-B)>>2 - B + C)>>2 + C, then >>6 and
; saturating pack to u8.
;-----------------------------------------------------------------------
yading@10 593 %macro QPEL8OR16_HV2_LOWPASS_OP 1
yading@10 594 ; unused is to match ssse3 and mmxext args
yading@10 595 cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
yading@10 596 movsxdifnidn r2, r2d
yading@10 597 .loop:
yading@10 598 mova m0, [r1] ; lo A-part: t[x-2]
yading@10 599 mova m3, [r1+8] ; hi A-part: t[x-2]
yading@10 600 mova m1, [r1+2] ; lo B-part: t[x-1]
yading@10 601 mova m4, [r1+10] ; hi B-part: t[x-1]
yading@10 602 paddw m0, m4 ; lo A = t[x-2]+t[x+3]
yading@10 603 paddw m1, m3 ; lo B = t[x-1]+t[x+2]
yading@10 604 paddw m3, [r1+18] ; hi A = t[x-2]+t[x+3]
yading@10 605 paddw m4, [r1+16] ; hi B = t[x-1]+t[x+2]
yading@10 606 mova m2, [r1+4]
yading@10 607 mova m5, [r1+12]
yading@10 608 paddw m2, [r1+6] ; lo C = t[x]+t[x+1]
yading@10 609 paddw m5, [r1+14] ; hi C = t[x]+t[x+1]
yading@10 610 psubw m0, m1
yading@10 611 psubw m3, m4
yading@10 612 psraw m0, 2
yading@10 613 psraw m3, 2
yading@10 614 psubw m0, m1 ; (A-B)>>2 - B
yading@10 615 psubw m3, m4
yading@10 616 paddsw m0, m2 ; + C, saturating
yading@10 617 paddsw m3, m5
yading@10 618 psraw m0, 2
yading@10 619 psraw m3, 2
yading@10 620 paddw m0, m2 ; + C again
yading@10 621 paddw m3, m5
yading@10 622 psraw m0, 6 ; final downscale
yading@10 623 psraw m3, 6
yading@10 624 packuswb m0, m3 ; combine lo/hi, clip to u8
yading@10 625 op_%1 m0, [r0], m7
yading@10 626 add r1, 48 ; next tmp row
yading@10 627 add r0, r2
yading@10 628 dec r4d
yading@10 629 jne .loop
yading@10 630 REP_RET
yading@10 631 %endmacro
yading@10 632
yading@10 633 INIT_MMX mmxext
yading@10 634 QPEL8OR16_HV2_LOWPASS_OP put
yading@10 635 QPEL8OR16_HV2_LOWPASS_OP avg
yading@10 636
;-----------------------------------------------------------------------
; SSSE3 horizontal second pass over the hv1 tmp buffer (48-byte row
; stride), for 8- or 16-wide output selected by the size argument.
; The shifted word windows t[x-1]..t[x+3] are built with palignr from
; adjacent 16-byte tmp chunks instead of unaligned word loads, then
; the same shift-based tap evaluation as the MMX version is applied:
; ((A-B)>>2 - B + C)>>2 + C, >>6, saturating pack to u8.
; .loop8 handles size==8 (one 8-pixel group); .op16 handles size==16
; (two groups per row, packed together before one 16-byte store).
;-----------------------------------------------------------------------
yading@10 637 %macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
yading@10 638 cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
yading@10 639 movsxdifnidn r2, r2d
yading@10 640 movsxdifnidn r3, r3d
yading@10 641 cmp r4d, 16
yading@10 642 je .op16
yading@10 643 .loop8:
yading@10 644 mova m1, [r1+16] ; next tmp chunk (words t[8..15])
yading@10 645 mova m0, [r1] ; words t[0..7]
yading@10 646 mova m2, m1
yading@10 647 mova m3, m1
yading@10 648 mova m4, m1
yading@10 649 mova m5, m1
yading@10 650 palignr m5, m0, 10 ; t[x+3] -> A-part
yading@10 651 palignr m4, m0, 8 ; t[x+2]
yading@10 652 palignr m3, m0, 6 ; t[x+1]
yading@10 653 palignr m2, m0, 4 ; t[x]
yading@10 654 palignr m1, m0, 2 ; t[x-1]
yading@10 655 paddw m0, m5 ; A = t[x-2]+t[x+3]
yading@10 656 paddw m1, m4 ; B = t[x-1]+t[x+2]
yading@10 657 paddw m2, m3 ; C = t[x]+t[x+1]
yading@10 658 psubw m0, m1
yading@10 659 psraw m0, 2
yading@10 660 psubw m0, m1 ; (A-B)>>2 - B
yading@10 661 paddw m0, m2 ; + C
yading@10 662 psraw m0, 2
yading@10 663 paddw m0, m2 ; + C again
yading@10 664 psraw m0, 6 ; final downscale
yading@10 665 packuswb m0, m0 ; clip to u8
yading@10 666 op_%1h m0, [r0], m7
yading@10 667 add r1, 48 ; next tmp row
yading@10 668 add r0, r2
yading@10 669 dec r4d
yading@10 670 jne .loop8
yading@10 671 jmp .done
yading@10 672 .op16:
yading@10 673 mova m4, [r1+32] ; tmp words t[16..23]
yading@10 674 mova m5, [r1+16] ; tmp words t[8..15]
yading@10 675 mova m7, [r1] ; tmp words t[0..7]
yading@10 676 mova m3, m4
yading@10 677 mova m2, m4
yading@10 678 mova m1, m4
yading@10 679 mova m0, m4
yading@10 680 palignr m0, m5, 10 ; high group: t[x+3]
yading@10 681 palignr m1, m5, 8 ; t[x+2]
yading@10 682 palignr m2, m5, 6 ; t[x+1]
yading@10 683 palignr m3, m5, 4 ; t[x]
yading@10 684 palignr m4, m5, 2 ; t[x-1]
yading@10 685 paddw m0, m5 ; high A
yading@10 686 paddw m1, m4 ; high B
yading@10 687 paddw m2, m3 ; high C
yading@10 688 mova m6, m5
yading@10 689 mova m4, m5
yading@10 690 mova m3, m5
yading@10 691 palignr m4, m7, 8 ; low group: t[x+2]
yading@10 692 palignr m6, m7, 2 ; t[x-1]
yading@10 693 palignr m3, m7, 10 ; t[x+3]
yading@10 694 paddw m4, m6 ; low B
yading@10 695 mova m6, m5
yading@10 696 palignr m5, m7, 6 ; t[x+1]
yading@10 697 palignr m6, m7, 4 ; t[x]
yading@10 698 paddw m3, m7 ; low A
yading@10 699 paddw m5, m6 ; low C
yading@10 700 psubw m0, m1
yading@10 701 psubw m3, m4
yading@10 702 psraw m0, 2
yading@10 703 psraw m3, 2
yading@10 704 psubw m0, m1 ; (A-B)>>2 - B, both halves
yading@10 705 psubw m3, m4
yading@10 706 paddw m0, m2 ; + C
yading@10 707 paddw m3, m5
yading@10 708 psraw m0, 2
yading@10 709 psraw m3, 2
yading@10 710 paddw m0, m2 ; + C again
yading@10 711 paddw m3, m5
yading@10 712 psraw m0, 6 ; final downscale
yading@10 713 psraw m3, 6
yading@10 714 packuswb m3, m0 ; low group in low 8 bytes, high group in high 8
yading@10 715 op_%1 m3, [r0], m7
yading@10 716 add r1, 48 ; next tmp row
yading@10 717 add r0, r2
yading@10 718 dec r4d
yading@10 719 jne .op16
yading@10 720 .done:
yading@10 721 REP_RET
yading@10 722 %endmacro
yading@10 723
yading@10 724 INIT_XMM ssse3
yading@10 725 QPEL8OR16_HV2_LOWPASS_OP_XMM put
yading@10 726 QPEL8OR16_HV2_LOWPASS_OP_XMM avg
yading@10 727
yading@10 728
;-----------------------------------------------------------------------
; put/avg_pixels4_l2_shift5(dst, src16, src8, dstStride, src8Stride, h)
; Downshift 16-bit intermediates from src16 by 5, pack to u8, average
; with the 8-bit rows from src8, then put/avg-store 4 bytes per row.
; src16 row stride is 24 bytes (matches the qpel4 hv tmp layout); all
; 4 rows are done unrolled, two at a time — the h argument (r5) is
; accepted but not read, presumably for signature parity; confirm.
;-----------------------------------------------------------------------
yading@10 729 %macro PIXELS4_L2_SHIFT5 1
yading@10 730 cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
yading@10 731 movsxdifnidn r3, r3d
yading@10 732 movsxdifnidn r4, r4d
yading@10 733 mova m0, [r1] ; rows 0 and 1 of src16
yading@10 734 mova m1, [r1+24]
yading@10 735 psraw m0, 5 ; downscale intermediates
yading@10 736 psraw m1, 5
yading@10 737 packuswb m0, m0 ; clip to u8
yading@10 738 packuswb m1, m1
yading@10 739 pavgb m0, [r2] ; average with src8 rows
yading@10 740 pavgb m1, [r2+r4]
yading@10 741 op_%1h m0, [r0], m4
yading@10 742 op_%1h m1, [r0+r3], m5
yading@10 743 lea r2, [r2+r4*2]
yading@10 744 lea r0, [r0+r3*2]
yading@10 745 mova m0, [r1+48] ; rows 2 and 3
yading@10 746 mova m1, [r1+72]
yading@10 747 psraw m0, 5
yading@10 748 psraw m1, 5
yading@10 749 packuswb m0, m0
yading@10 750 packuswb m1, m1
yading@10 751 pavgb m0, [r2]
yading@10 752 pavgb m1, [r2+r4]
yading@10 753 op_%1h m0, [r0], m4
yading@10 754 op_%1h m1, [r0+r3], m5
yading@10 755 RET
yading@10 756 %endmacro
yading@10 757
yading@10 758 INIT_MMX mmxext
yading@10 759 PIXELS4_L2_SHIFT5 put
yading@10 760 PIXELS4_L2_SHIFT5 avg
yading@10 761
yading@10 762
;-----------------------------------------------------------------------
; put/avg_pixels8_l2_shift5(dst, src16, src8, dstStride, src8Stride, h)
; 8-wide version: shift src16 words right by 5, pack lo/hi halves to
; 8 bytes, average with src8, and store two rows per loop iteration
; for h rows. src16 row stride is 48 bytes (matches qpel8/16 hv tmp).
;-----------------------------------------------------------------------
yading@10 763 %macro PIXELS8_L2_SHIFT5 1
yading@10 764 cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
yading@10 765 movsxdifnidn r3, r3d
yading@10 766 movsxdifnidn r4, r4d
yading@10 767 .loop:
yading@10 768 mova m0, [r1] ; row 0: lo words
yading@10 769 mova m1, [r1+8] ; row 0: hi words
yading@10 770 mova m2, [r1+48] ; row 1: lo words
yading@10 771 mova m3, [r1+48+8] ; row 1: hi words
yading@10 772 psraw m0, 5 ; downscale intermediates
yading@10 773 psraw m1, 5
yading@10 774 psraw m2, 5
yading@10 775 psraw m3, 5
yading@10 776 packuswb m0, m1 ; clip, combine lo/hi
yading@10 777 packuswb m2, m3
yading@10 778 pavgb m0, [r2] ; average with src8 rows
yading@10 779 pavgb m2, [r2+r4]
yading@10 780 op_%1 m0, [r0], m4
yading@10 781 op_%1 m2, [r0+r3], m5
yading@10 782 lea r2, [r2+2*r4]
yading@10 783 add r1, 48*2 ; advance two src16 rows
yading@10 784 lea r0, [r0+2*r3]
yading@10 785 sub r5d, 2 ; two rows per iteration
yading@10 786 jne .loop
yading@10 787 REP_RET
yading@10 788 %endmacro
yading@10 789
yading@10 790 INIT_MMX mmxext
yading@10 791 PIXELS8_L2_SHIFT5 put
yading@10 792 PIXELS8_L2_SHIFT5 avg
yading@10 793
yading@10 794
yading@10 795 %if ARCH_X86_64
;-----------------------------------------------------------------------
; put/avg_h264_qpel16_h_lowpass_l2(dst, src, src2, dstStride, src2Stride)
; x86-64 only (needs xmm8-15): 16-wide horizontal 6-tap filter with an
; l2 average against src2, 16 rows. Two lddqu loads cover s[-2..21];
; palignr builds the shifted windows for the low (m6..m9,m12 with m7)
; and high (m1..m4,m11 with m0) 8-pixel groups, processed in parallel.
; Constants are kept live in m13 (pw_16), m14 (pw_5), m15 (zero).
; NOTE(review): dst and src both advance by r3 (dstStride) — assumes
; equal dst/src strides at the call sites; confirm.
;-----------------------------------------------------------------------
yading@10 796 %macro QPEL16_H_LOWPASS_L2_OP 1
yading@10 797 cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
yading@10 798 movsxdifnidn r3, r3d
yading@10 799 movsxdifnidn r4, r4d
yading@10 800 mov r5d, 16 ; row counter
yading@10 801 pxor m15, m15 ; zero for unpacking
yading@10 802 mova m14, [pw_5]
yading@10 803 mova m13, [pw_16] ; rounding bias
yading@10 804 .loop:
yading@10 805 lddqu m1, [r1+6] ; bytes s[6..21]
yading@10 806 lddqu m7, [r1-2] ; bytes s[-2..13]
yading@10 807 mova m0, m1
yading@10 808 punpckhbw m1, m15 ; words s[14..21]
yading@10 809 punpcklbw m0, m15 ; words s[6..13]
yading@10 810 punpcklbw m7, m15 ; words s[-2..5]
yading@10 811 mova m2, m1
yading@10 812 mova m6, m0
yading@10 813 mova m3, m1
yading@10 814 mova m8, m0
yading@10 815 mova m4, m1
yading@10 816 mova m9, m0
yading@10 817 mova m12, m0
yading@10 818 mova m11, m1
yading@10 819 palignr m11, m0, 10 ; hi group: s[x+3]
yading@10 820 palignr m12, m7, 10 ; lo group: s[x+3]
yading@10 821 palignr m4, m0, 2 ; hi: s[x-1]
yading@10 822 palignr m9, m7, 2 ; lo: s[x-1]
yading@10 823 palignr m3, m0, 4 ; hi: s[x]
yading@10 824 palignr m8, m7, 4 ; lo: s[x]
yading@10 825 palignr m2, m0, 6 ; hi: s[x+1]
yading@10 826 palignr m6, m7, 6 ; lo: s[x+1]
yading@10 827 paddw m11, m0 ; hi: s[x-2]+s[x+3]
yading@10 828 palignr m1, m0, 8 ; hi: s[x+2]
yading@10 829 palignr m0, m7, 8 ; lo: s[x+2]
yading@10 830 paddw m7, m12 ; lo: s[x-2]+s[x+3]
yading@10 831 paddw m2, m3 ; hi: s[x]+s[x+1]
yading@10 832 paddw m6, m8 ; lo: s[x]+s[x+1]
yading@10 833 paddw m1, m4 ; hi: s[x-1]+s[x+2]
yading@10 834 paddw m0, m9 ; lo: s[x-1]+s[x+2]
yading@10 835 psllw m2, 2
yading@10 836 psllw m6, 2
yading@10 837 psubw m2, m1 ; 4*(s0+s1) - (s-1+s2)
yading@10 838 psubw m6, m0
yading@10 839 paddw m11, m13 ; + rounding
yading@10 840 paddw m7, m13
yading@10 841 pmullw m2, m14 ; *5 -> 20/-5 taps
yading@10 842 pmullw m6, m14
yading@10 843 lddqu m3, [r2] ; second reference row
yading@10 844 paddw m2, m11
yading@10 845 paddw m6, m7
yading@10 846 psraw m2, 5
yading@10 847 psraw m6, 5
yading@10 848 packuswb m6, m2 ; lo group in low 8 bytes, hi in high
yading@10 849 pavgb m6, m3 ; average with src2
yading@10 850 op_%1 m6, [r0], m11
yading@10 851 add r1, r3
yading@10 852 add r0, r3
yading@10 853 add r2, r4
yading@10 854 dec r5d
yading@10 855 jg .loop
yading@10 856 REP_RET
yading@10 857 %endmacro
yading@10 858
yading@10 859 INIT_XMM ssse3
yading@10 860 QPEL16_H_LOWPASS_L2_OP put
yading@10 861 QPEL16_H_LOWPASS_L2_OP avg
yading@10 862 %endif