annotate ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 ;*****************************************************************************
yading@10 2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
yading@10 3 ;*****************************************************************************
yading@10 4 ;* Copyright (C) 2005-2011 x264 project
yading@10 5 ;*
yading@10 6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
yading@10 7 ;*
yading@10 8 ;* This file is part of Libav.
yading@10 9 ;*
yading@10 10 ;* Libav is free software; you can redistribute it and/or
yading@10 11 ;* modify it under the terms of the GNU Lesser General Public
yading@10 12 ;* License as published by the Free Software Foundation; either
yading@10 13 ;* version 2.1 of the License, or (at your option) any later version.
yading@10 14 ;*
yading@10 15 ;* Libav is distributed in the hope that it will be useful,
yading@10 16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 18 ;* Lesser General Public License for more details.
yading@10 19 ;*
yading@10 20 ;* You should have received a copy of the GNU Lesser General Public
yading@10 21 ;* License along with Libav; if not, write to the Free Software
yading@10 22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 23 ;******************************************************************************
yading@10 24
yading@10 25 %include "libavutil/x86/x86util.asm"
yading@10 26
yading@10 27 SECTION_RODATA
yading@10 28
yading@10 29 cextern pw_16
yading@10 30 cextern pw_8
yading@10 31 cextern pw_4
yading@10 32 cextern pw_2
yading@10 33 cextern pw_1
yading@10 34
yading@10 35 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 ; per-column weights/offsets used by pred8x8_plane (x - 3 .. x + 4)
yading@10 36 pw_m3: times 8 dw -3 ; seeds the plane "c" accumulator at row -3
yading@10 37 pw_pixel_max: times 8 dw ((1 << 10)-1) ; 1023 = max 10-bit sample, clip limit for CLIPW
yading@10 38 pw_512: times 8 dw 512 ; 1 << (BIT_DEPTH-1), mid-grey fill for pred8x8l_128_dc
yading@10 39 pd_17: times 4 dd 17 ; plane-mode gradient multiplier (b = 17*H, c = 17*V)
yading@10 40 pd_16: times 4 dd 16 ; rounding bias before the >>5 in plane mode
yading@10 41
yading@10 42 SECTION .text
yading@10 43
yading@10 44 ; dest, left, right, src
yading@10 45 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
yading@10 46 %macro PRED4x4_LOWPASS 4 ; %1 = dst, %2 = left, %3 = right, %4 = centre src; clobbers %2
yading@10 47 paddw %2, %3 ; left + right
yading@10 48 psrlw %2, 1 ; (left + right) >> 1; pavgw's +1 rounding below restores the overall +2 bias
yading@10 49 pavgw %1, %4, %2 ; dst = (centre + ((left+right)>>1) + 1) >> 1, i.e. the 1-2-1 filter above
yading@10 50 %endmacro
yading@10 51
yading@10 52 ;-----------------------------------------------------------------------------
yading@10 53 ; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
yading@10 54 ;-----------------------------------------------------------------------------
yading@10 55 %macro PRED4x4_DR 0
yading@10 56 cglobal pred4x4_down_right_10, 3, 3 ; r0 = src, r1 = topright (unused), r2 = stride (bytes, 2 bytes/pixel — offsets like -8 = 4 pixels suggest byte stride; confirm against caller)
yading@10 57 sub r0, r2 ; r0 -> top neighbour row (src - stride)
yading@10 58 lea r1, [r0+r2*2] ; r1 -> src + stride
yading@10 59 movhps m1, [r1-8] ; high half <- 4 words ending at row 1's left neighbour
yading@10 60 movhps m2, [r0+r2*1-8] ; row 0's left neighbour in the top word
yading@10 61 movhps m4, [r0-8] ; top-left (lt) in the top word
yading@10 62 punpckhwd m2, m4 ; interleave l0 / lt
yading@10 63 movq m3, [r0] ; t0..t3 (top row)
yading@10 64 punpckhdq m1, m2 ; pack l1 l0 lt into the high lane
yading@10 65 PALIGNR m3, m1, 10, m1 ; build the diagonal source: top row followed by lt, l0, l1
yading@10 66 movhps m4, [r1+r2*1-8] ; l2 in the top word
yading@10 67 PALIGNR m0, m3, m4, 14, m4 ; diagonal shifted one step (l2 fed in)
yading@10 68 movhps m4, [r1+r2*2-8] ; l3 in the top word
yading@10 69 PALIGNR m2, m0, m4, 14, m4 ; diagonal shifted two steps (l3 fed in)
yading@10 70 PRED4x4_LOWPASS m0, m2, m3, m0 ; 1-2-1 filter along the down-right diagonal
yading@10 71 movq [r1+r2*2], m0 ; bottom row; each row above is the same data advanced one pixel
yading@10 72 psrldq m0, 2
yading@10 73 movq [r1+r2*1], m0
yading@10 74 psrldq m0, 2
yading@10 75 movq [r0+r2*2], m0
yading@10 76 psrldq m0, 2
yading@10 77 movq [r0+r2*1], m0
yading@10 78 RET
yading@10 79 %endmacro
yading@10 80
yading@10 81 INIT_XMM sse2
yading@10 82 PRED4x4_DR
yading@10 83 INIT_XMM ssse3
yading@10 84 PRED4x4_DR ; ssse3 build gets a real palignr instead of the sse2 shift/or emulation
yading@10 85 %if HAVE_AVX_EXTERNAL
yading@10 86 INIT_XMM avx
yading@10 87 PRED4x4_DR
yading@10 88 %endif
yading@10 89
yading@10 90 ;-----------------------------------------------------------------------------
yading@10 91 ; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
yading@10 92 ;-----------------------------------------------------------------------------
yading@10 93 %macro PRED4x4_VR 0
yading@10 94 cglobal pred4x4_vertical_right_10, 3, 3, 6 ; r0 = src, r2 = byte stride; 6 xmm regs
yading@10 95 sub r0, r2 ; r0 -> top neighbour row
yading@10 96 lea r1, [r0+r2*2] ; r1 -> src + stride
yading@10 97 movq m5, [r0] ; ........t3t2t1t0
yading@10 98 movhps m1, [r0-8] ; top-left in the high word
yading@10 99 PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
yading@10 100 pavgw m5, m0 ; even rows: (t[i-1] + t[i] + 1) >> 1
yading@10 101 movhps m1, [r0+r2*1-8]
yading@10 102 PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
yading@10 103 movhps m2, [r0+r2*2-8]
yading@10 104 PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
yading@10 105 movhps m3, [r1+r2*1-8]
yading@10 106 PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
yading@10 107 PRED4x4_LOWPASS m1, m0, m2, m1 ; odd rows: 1-2-1 filter over the same run
yading@10 108 pslldq m0, m1, 12 ; isolate the filtered left-column pixels at the top of the reg
yading@10 109 psrldq m1, 4
yading@10 110 movq [r0+r2*1], m5 ; row 0 (averaged)
yading@10 111 movq [r0+r2*2], m1 ; row 1 (filtered)
yading@10 112 PALIGNR m5, m0, 14, m2 ; rows 2/3 = rows 0/1 shifted right with a left pixel fed in
yading@10 113 pslldq m0, 2
yading@10 114 movq [r1+r2*1], m5 ; row 2
yading@10 115 PALIGNR m1, m0, 14, m0
yading@10 116 movq [r1+r2*2], m1 ; row 3
yading@10 117 RET
yading@10 118 %endmacro
yading@10 119
yading@10 120 INIT_XMM sse2
yading@10 121 PRED4x4_VR
yading@10 122 INIT_XMM ssse3
yading@10 123 PRED4x4_VR
yading@10 124 %if HAVE_AVX_EXTERNAL
yading@10 125 INIT_XMM avx
yading@10 126 PRED4x4_VR
yading@10 127 %endif
yading@10 128
yading@10 129 ;-----------------------------------------------------------------------------
yading@10 130 ; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
yading@10 131 ;-----------------------------------------------------------------------------
yading@10 132 %macro PRED4x4_HD 0
yading@10 133 cglobal pred4x4_horizontal_down_10, 3, 3 ; r0 = src, r2 = byte stride
yading@10 134 sub r0, r2 ; r0 -> top neighbour row
yading@10 135 lea r1, [r0+r2*2] ; r1 -> src + stride
yading@10 136 movq m0, [r0-8] ; lt ..
yading@10 137 movhps m0, [r0] ; top row in the high half
yading@10 138 pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
yading@10 139 movq m1, [r1+r2*2-8] ; l3
yading@10 140 movq m3, [r1+r2*1-8]
yading@10 141 punpcklwd m1, m3 ; l2 l3
yading@10 142 movq m2, [r0+r2*2-8] ; l1
yading@10 143 movq m3, [r0+r2*1-8]
yading@10 144 punpcklwd m2, m3 ; l0 l1
yading@10 145 punpckhdq m1, m2 ; l0 l1 l2 l3
yading@10 146 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 (full border in one reg)
yading@10 147 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
yading@10 148 psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
yading@10 149 pavgw m5, m1, m3 ; pairwise averages (left half of each predicted pair)
yading@10 150 PRED4x4_LOWPASS m3, m1, m0, m3 ; 1-2-1 filtered values (right half of each pair)
yading@10 151 punpcklwd m5, m3 ; interleave avg/filter -> the zig-zag output pattern
yading@10 152 psrldq m3, 8
yading@10 153 PALIGNR m3, m5, 12, m4 ; top row: filtered top pixels appended after the pairs
yading@10 154 movq [r1+r2*2], m5 ; row 3
yading@10 155 movhps [r0+r2*2], m5 ; row 1
yading@10 156 psrldq m5, 4
yading@10 157 movq [r1+r2*1], m5 ; row 2
yading@10 158 movq [r0+r2*1], m3 ; row 0
yading@10 159 RET
yading@10 160 %endmacro
yading@10 161
yading@10 162 INIT_XMM sse2
yading@10 163 PRED4x4_HD
yading@10 164 INIT_XMM ssse3
yading@10 165 PRED4x4_HD
yading@10 166 %if HAVE_AVX_EXTERNAL
yading@10 167 INIT_XMM avx
yading@10 168 PRED4x4_HD
yading@10 169 %endif
yading@10 170
yading@10 171 ;-----------------------------------------------------------------------------
yading@10 172 ; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
yading@10 173 ;-----------------------------------------------------------------------------
yading@10 174 %macro HADDD 2 ; sum junk -- horizontal add of dwords: result in the low dword of %1, %2 is scratch
yading@10 175 %if mmsize == 16
yading@10 176 movhlps %2, %1 ; fold high qword onto low
yading@10 177 paddd %1, %2
yading@10 178 pshuflw %2, %1, 0xE ; swap the two low dwords
yading@10 179 paddd %1, %2
yading@10 180 %else
yading@10 181 pshufw %2, %1, 0xE ; mmx: fold high dword onto low
yading@10 182 paddd %1, %2
yading@10 183 %endif
yading@10 184 %endmacro
yading@10 185
yading@10 186 %macro HADDW 2 ; horizontal add of words: %1 = sum of all words of %1, %2 is scratch
yading@10 187 pmaddwd %1, [pw_1] ; pairwise word sums -> dwords (no overflow concern for 10-bit sums)
yading@10 188 HADDD %1, %2
yading@10 189 %endmacro
yading@10 190
yading@10 191 INIT_MMX mmxext
yading@10 192 cglobal pred4x4_dc_10, 3, 3 ; dc = (sum(top) + sum(left) + 4) >> 3, splatted over the 4x4 block
yading@10 193 sub r0, r2 ; r0 -> top neighbour row
yading@10 194 lea r1, [r0+r2*2]
yading@10 195 movq m2, [r0+r2*1-8] ; each load puts that row's left neighbour in the top word
yading@10 196 paddw m2, [r0+r2*2-8]
yading@10 197 paddw m2, [r1+r2*1-8]
yading@10 198 paddw m2, [r1+r2*2-8]
yading@10 199 psrlq m2, 48 ; keep only the accumulated top word = sum of the 4 left pixels
yading@10 200 movq m0, [r0] ; t0..t3
yading@10 201 HADDW m0, m1 ; sum of the 4 top pixels
yading@10 202 paddw m0, [pw_4] ; rounding bias
yading@10 203 paddw m0, m2
yading@10 204 psrlw m0, 3
yading@10 205 SPLATW m0, m0, 0 ; broadcast dc to all 4 words
yading@10 206 movq [r0+r2*1], m0
yading@10 207 movq [r0+r2*2], m0
yading@10 208 movq [r1+r2*1], m0
yading@10 209 movq [r1+r2*2], m0
yading@10 210 RET
yading@10 211
yading@10 212 ;-----------------------------------------------------------------------------
yading@10 213 ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
yading@10 214 ;-----------------------------------------------------------------------------
yading@10 215 %macro PRED4x4_DL 0
yading@10 216 cglobal pred4x4_down_left_10, 3, 3 ; r0 = src, r1 = topright pointer, r2 = byte stride
yading@10 217 sub r0, r2 ; r0 -> top neighbour row
yading@10 218 movq m0, [r0] ; t0..t3
yading@10 219 movhps m0, [r1] ; t4..t7 (topright) in the high half
yading@10 220 psrldq m2, m0, 2 ; right-shifted copy (t1..t7, 0)
yading@10 221 pslldq m3, m0, 2 ; left-shifted copy (0, t0..t6)
yading@10 222 pshufhw m2, m2, 10100100b ; replace the shifted-in zero with a repeat of t7 (edge clamp)
yading@10 223 PRED4x4_LOWPASS m0, m3, m2, m0 ; 1-2-1 filter over t0..t7
yading@10 224 lea r1, [r0+r2*2]
yading@10 225 movhps [r1+r2*2], m0 ; bottom row from the high half of the filtered diagonal
yading@10 226 psrldq m0, 2 ; each row going up advances one pixel along the diagonal
yading@10 227 movq [r0+r2*1], m0
yading@10 228 psrldq m0, 2
yading@10 229 movq [r0+r2*2], m0
yading@10 230 psrldq m0, 2
yading@10 231 movq [r1+r2*1], m0
yading@10 232 RET
yading@10 233 %endmacro
yading@10 234
yading@10 235 INIT_XMM sse2
yading@10 236 PRED4x4_DL
yading@10 237 %if HAVE_AVX_EXTERNAL
yading@10 238 INIT_XMM avx
yading@10 239 PRED4x4_DL
yading@10 240 %endif
yading@10 241
yading@10 242 ;-----------------------------------------------------------------------------
yading@10 243 ; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
yading@10 244 ;-----------------------------------------------------------------------------
yading@10 245 %macro PRED4x4_VL 0
yading@10 246 cglobal pred4x4_vertical_left_10, 3, 3 ; r0 = src, r1 = topright pointer, r2 = byte stride
yading@10 247 sub r0, r2 ; r0 -> top neighbour row
yading@10 248 movu m1, [r0] ; t0..t3
yading@10 249 movhps m1, [r1] ; t4..t7 (topright)
yading@10 250 psrldq m0, m1, 2 ; t1..t7
yading@10 251 psrldq m2, m1, 4 ; t2..t7
yading@10 252 pavgw m4, m0, m1 ; even rows: (t[i] + t[i+1] + 1) >> 1
yading@10 253 PRED4x4_LOWPASS m0, m1, m2, m0 ; odd rows: 1-2-1 filter
yading@10 254 lea r1, [r0+r2*2]
yading@10 255 movq [r0+r2*1], m4 ; row 0
yading@10 256 movq [r0+r2*2], m0 ; row 1
yading@10 257 psrldq m4, 2 ; rows 2/3 = rows 0/1 advanced one pixel
yading@10 258 psrldq m0, 2
yading@10 259 movq [r1+r2*1], m4 ; row 2
yading@10 260 movq [r1+r2*2], m0 ; row 3
yading@10 261 RET
yading@10 262 %endmacro
yading@10 263
yading@10 264 INIT_XMM sse2
yading@10 265 PRED4x4_VL
yading@10 266 %if HAVE_AVX_EXTERNAL
yading@10 267 INIT_XMM avx
yading@10 268 PRED4x4_VL
yading@10 269 %endif
yading@10 270
yading@10 271 ;-----------------------------------------------------------------------------
yading@10 272 ; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
yading@10 273 ;-----------------------------------------------------------------------------
yading@10 274 INIT_MMX mmxext
yading@10 275 cglobal pred4x4_horizontal_up_10, 3, 3 ; r0 = src, r2 = byte stride; prediction from the left column only
yading@10 276 sub r0, r2
yading@10 277 lea r1, [r0+r2*2]
yading@10 278 movq m0, [r0+r2*1-8] ; left pixel of row 0 in the top word
yading@10 279 punpckhwd m0, [r0+r2*2-8]
yading@10 280 movq m1, [r1+r2*1-8]
yading@10 281 punpckhwd m1, [r1+r2*2-8]
yading@10 282 punpckhdq m0, m1 ; l0 l1 l2 l3 gathered
yading@10 283 pshufw m1, m1, 0xFF ; broadcast l3 (the last left pixel)
yading@10 284 movq [r1+r2*2], m1 ; bottom row = l3 everywhere
yading@10 285 movd [r1+r2*1+4], m1 ; right half of row 2 also saturates to l3
yading@10 286 pshufw m2, m0, 11111001b ; l1 l2 l3 l3 (shift with edge clamp)
yading@10 287 movq m1, m2
yading@10 288 pavgw m2, m0 ; averages: (l[i] + l[i+1] + 1) >> 1
yading@10 289
yading@10 290 pshufw m5, m0, 11111110b ; l2 l3 l3 l3
yading@10 291 PRED4x4_LOWPASS m1, m0, m5, m1 ; 1-2-1 filtered values
yading@10 292 movq m6, m2
yading@10 293 punpcklwd m6, m1 ; interleave avg/filtered pairs
yading@10 294 movq [r0+r2*1], m6 ; row 0
yading@10 295 psrlq m2, 16 ; advance both sequences one pixel per row
yading@10 296 psrlq m1, 16
yading@10 297 punpcklwd m2, m1
yading@10 298 movq [r0+r2*2], m2 ; row 1
yading@10 299 psrlq m2, 32
yading@10 300 movd [r1+r2*1], m2 ; left half of row 2 (right half written above)
yading@10 301 RET
yading@10 302
yading@10 303
yading@10 304
yading@10 305 ;-----------------------------------------------------------------------------
yading@10 306 ; void pred8x8_vertical(pixel *src, int stride)
yading@10 307 ;-----------------------------------------------------------------------------
yading@10 308 INIT_XMM sse2
yading@10 309 cglobal pred8x8_vertical_10, 2, 2 ; copy the top neighbour row into all 8 rows; r1 = byte stride
yading@10 310 sub r0, r1 ; r0 -> top neighbour row
yading@10 311 mova m0, [r0] ; 8 x 10-bit pixels = 16 bytes, one xmm reg
yading@10 312 %rep 3 ; unrolled: 3 x 2 rows, pointer bumped by 2 rows each time
yading@10 313 mova [r0+r1*1], m0
yading@10 314 mova [r0+r1*2], m0
yading@10 315 lea r0, [r0+r1*2]
yading@10 316 %endrep
yading@10 317 mova [r0+r1*1], m0 ; final 2 rows, no pointer bump needed
yading@10 318 mova [r0+r1*2], m0
yading@10 319 RET
yading@10 320
yading@10 321 ;-----------------------------------------------------------------------------
yading@10 322 ; void pred8x8_horizontal(pixel *src, int stride)
yading@10 323 ;-----------------------------------------------------------------------------
yading@10 324 INIT_XMM sse2
yading@10 325 cglobal pred8x8_horizontal_10, 2, 3 ; each row filled with its own left-neighbour pixel; r1 = byte stride
yading@10 326 mov r2d, 4 ; 4 iterations x 2 rows
yading@10 327 .loop:
yading@10 328 movq m0, [r0+r1*0-8] ; left pixel of row ends up in word 3
yading@10 329 movq m1, [r0+r1*1-8]
yading@10 330 pshuflw m0, m0, 0xff ; broadcast word 3 across the low half
yading@10 331 pshuflw m1, m1, 0xff
yading@10 332 punpcklqdq m0, m0 ; duplicate low half -> all 8 words
yading@10 333 punpcklqdq m1, m1
yading@10 334 mova [r0+r1*0], m0
yading@10 335 mova [r0+r1*1], m1
yading@10 336 lea r0, [r0+r1*2]
yading@10 337 dec r2d
yading@10 338 jg .loop
yading@10 339 REP_RET
yading@10 340
yading@10 341 ;-----------------------------------------------------------------------------
yading@10 342 ; void predict_8x8_dc(pixel *src, int stride)
yading@10 343 ;-----------------------------------------------------------------------------
yading@10 344 %macro MOV8 2-3 ; store 16 bytes at %1: two mmx regs (%2,%3) or one xmm reg (%2; %3 ignored)
yading@10 345 ; sort of a hack, but it works
yading@10 346 %if mmsize==8
yading@10 347 movq [%1+0], %2
yading@10 348 movq [%1+8], %3
yading@10 349 %else
yading@10 350 movdqa [%1], %2 ; xmm build: the optional 3rd arg is unused
yading@10 351 %endif
yading@10 352 %endmacro
yading@10 353
yading@10 354 %macro PRED8x8_DC 1 ; %1 = word-shuffle insn: pshufw (mmx) or pshuflw (sse2)
yading@10 355 cglobal pred8x8_dc_10, 2, 6 ; four 4x4 dc values (one per quadrant), H.264 chroma-style
yading@10 356 sub r0, r1 ; r0 -> top neighbour row
yading@10 357 pxor m4, m4 ; zero, used by pavgw below for the final /2
yading@10 358 movq m0, [r0+0] ; t0..t3
yading@10 359 movq m1, [r0+8] ; t4..t7
yading@10 360 %if mmsize==16
yading@10 361 punpcklwd m0, m1
yading@10 362 movhlps m1, m0
yading@10 363 paddw m0, m1
yading@10 364 %else
yading@10 365 pshufw m2, m0, 00001110b
yading@10 366 pshufw m3, m1, 00001110b
yading@10 367 paddw m0, m2
yading@10 368 paddw m1, m3
yading@10 369 punpcklwd m0, m1
yading@10 370 %endif
yading@10 371 %1 m2, m0, 00001110b
yading@10 372 paddw m0, m2 ; word0 = s0 = sum(t0..t3), word1 = s1 = sum(t4..t7)
yading@10 373
yading@10 374 lea r5, [r1*3]
yading@10 375 lea r4, [r0+r1*4] ; r4 -> middle of the block (start of rows 4..7)
yading@10 376 movzx r2d, word [r0+r1*1-2] ; sum the 4 upper-left pixels in a GPR
yading@10 377 movzx r3d, word [r0+r1*2-2]
yading@10 378 add r2d, r3d
yading@10 379 movzx r3d, word [r0+r5*1-2]
yading@10 380 add r2d, r3d
yading@10 381 movzx r3d, word [r4-2]
yading@10 382 add r2d, r3d
yading@10 383 movd m2, r2d ; s2
yading@10 384
yading@10 385 movzx r2d, word [r4+r1*1-2] ; sum the 4 lower-left pixels
yading@10 386 movzx r3d, word [r4+r1*2-2]
yading@10 387 add r2d, r3d
yading@10 388 movzx r3d, word [r4+r5*1-2]
yading@10 389 add r2d, r3d
yading@10 390 movzx r3d, word [r4+r1*4-2]
yading@10 391 add r2d, r3d
yading@10 392 movd m3, r2d ; s3
yading@10 393
yading@10 394 punpcklwd m2, m3
yading@10 395 punpckldq m0, m2 ; s0, s1, s2, s3
yading@10 396 %1 m3, m0, 11110110b ; s2, s1, s3, s3
yading@10 397 %1 m0, m0, 01110100b ; s0, s1, s3, s1
yading@10 398 paddw m0, m3
yading@10 399 psrlw m0, 2
yading@10 400 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3 -- dc per quadrant ~ (s+4)>>3 resp. (s+2)>>2, via >>2 then avg-with-0
yading@10 401 %if mmsize==16
yading@10 402 punpcklwd m0, m0 ; expand each dc across 4 lanes per quadrant
yading@10 403 pshufd m3, m0, 11111010b ; m3 = bottom-half row pattern (dc2 | dc3)
yading@10 404 punpckldq m0, m0
yading@10 405 SWAP 0,1 ; m1 = top-half row pattern (dc0 | dc1)
yading@10 406 %else
yading@10 407 pshufw m1, m0, 0x00 ; mmx: one reg per quadrant
yading@10 408 pshufw m2, m0, 0x55
yading@10 409 pshufw m3, m0, 0xaa
yading@10 410 pshufw m4, m0, 0xff
yading@10 411 %endif
yading@10 412 MOV8 r0+r1*1, m1, m2 ; rows 0..3 (xmm build ignores the 3rd arg)
yading@10 413 MOV8 r0+r1*2, m1, m2
yading@10 414 MOV8 r0+r5*1, m1, m2
yading@10 415 MOV8 r0+r1*4, m1, m2
yading@10 416 MOV8 r4+r1*1, m3, m4 ; rows 4..7
yading@10 417 MOV8 r4+r1*2, m3, m4
yading@10 418 MOV8 r4+r5*1, m3, m4
yading@10 419 MOV8 r4+r1*4, m3, m4
yading@10 420 RET
yading@10 421 %endmacro
yading@10 422
yading@10 423 INIT_MMX mmxext
yading@10 424 PRED8x8_DC pshufw
yading@10 425 INIT_XMM sse2
yading@10 426 PRED8x8_DC pshuflw
yading@10 427
yading@10 428 ;-----------------------------------------------------------------------------
yading@10 429 ; void pred8x8_top_dc(pixel *src, int stride)
yading@10 430 ;-----------------------------------------------------------------------------
yading@10 431 INIT_XMM sse2
yading@10 432 cglobal pred8x8_top_dc_10, 2, 4 ; two dcs from the top row only: left half from t0..t3, right from t4..t7
yading@10 433 sub r0, r1 ; r0 -> top neighbour row
yading@10 434 mova m0, [r0] ; t0..t7
yading@10 435 pshuflw m1, m0, 0x4e ; swap word pairs within each qword half...
yading@10 436 pshufhw m1, m1, 0x4e
yading@10 437 paddw m0, m1
yading@10 438 pshuflw m1, m0, 0xb1 ; ...then swap adjacent words: every word now holds its half's 4-pixel sum
yading@10 439 pshufhw m1, m1, 0xb1
yading@10 440 paddw m0, m1
yading@10 441 lea r2, [r1*3]
yading@10 442 lea r3, [r0+r1*4]
yading@10 443 paddw m0, [pw_2] ; dc = (sum + 2) >> 2 per half
yading@10 444 psrlw m0, 2
yading@10 445 mova [r0+r1*1], m0 ; same pattern for all 8 rows
yading@10 446 mova [r0+r1*2], m0
yading@10 447 mova [r0+r2*1], m0
yading@10 448 mova [r0+r1*4], m0
yading@10 449 mova [r3+r1*1], m0
yading@10 450 mova [r3+r1*2], m0
yading@10 451 mova [r3+r2*1], m0
yading@10 452 mova [r3+r1*4], m0
yading@10 453 RET
yading@10 454
yading@10 455 ;-----------------------------------------------------------------------------
yading@10 456 ; void pred8x8_plane(pixel *src, int stride)
yading@10 457 ;-----------------------------------------------------------------------------
yading@10 458 INIT_XMM sse2
yading@10 459 cglobal pred8x8_plane_10, 2, 7, 7 ; H.264 plane mode: p[x,y] = clip((a + b*(x-3) + c*(y-3) + 16) >> 5)
yading@10 460 sub r0, r1 ; r0 -> top neighbour row
yading@10 461 lea r2, [r1*3]
yading@10 462 lea r3, [r0+r1*4]
yading@10 463 mova m2, [r0]
yading@10 464 pmaddwd m2, [pw_m32101234] ; weighted top-row sum for the horizontal gradient
yading@10 465 HADDD m2, m1
yading@10 466 movd m0, [r0-4] ; two packed left pixels; src[-1] in the high word
yading@10 467 psrld m0, 14 ; = 4*src[-1] (low 10-bit word shifted out entirely)
yading@10 468 psubw m2, m0 ; H
yading@10 469 movd m0, [r3+r1*4-4] ; bottom-left area, src[7*stride-1] in the high word
yading@10 470 movd m1, [r0+12] ; t6/t7, src[-stride+7] in the high word
yading@10 471 paddw m0, m1
yading@10 472 psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7]) -- "a", selected by SPLATW word 1 below
yading@10 473 movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
yading@10 474 movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
yading@10 475 sub r4d, r5d ; vertical gradient: weighted differences of mirrored left pixels
yading@10 476 movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
yading@10 477 movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
yading@10 478 sub r6d, r5d
yading@10 479 lea r4d, [r4+r6*2] ; weight 2
yading@10 480 movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
yading@10 481 movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
yading@10 482 sub r5d, r6d
yading@10 483 lea r5d, [r5*3] ; weight 3
yading@10 484 add r4d, r5d
yading@10 485 movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
yading@10 486 movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
yading@10 487 sub r6d, r5d
yading@10 488 lea r4d, [r4+r6*4] ; weight 4
yading@10 489 movd m3, r4d ; V
yading@10 490 punpckldq m2, m3 ; pack H | V in one reg
yading@10 491 pmaddwd m2, [pd_17] ; b = (17*H + 16) >> 5, c = (17*V + 16) >> 5
yading@10 492 paddd m2, [pd_16]
yading@10 493 psrad m2, 5 ; b, c
yading@10 494
yading@10 495 mova m3, [pw_pixel_max] ; clip bounds for CLIPW
yading@10 496 pxor m1, m1
yading@10 497 SPLATW m0, m0, 1 ; a (word 1 holds the 16*(...) sum)
yading@10 498 SPLATW m4, m2, 2 ; c (low word of the 2nd dword)
yading@10 499 SPLATW m2, m2, 0 ; b
yading@10 500 pmullw m2, [pw_m32101234] ; b*(x-3) per column
yading@10 501 pmullw m5, m4, [pw_m3] ; c*(y-3) accumulator, starting at y = 0 (-3c)
yading@10 502 paddw m5, [pw_16] ; + rounding bias
yading@10 503 mov r2d, 8 ; 8 rows
yading@10 504 add r0, r1 ; back to the first block row
yading@10 505 .loop:
yading@10 506 paddsw m6, m2, m5 ; b-term + (c-term + 16)
yading@10 507 paddsw m6, m0 ; + a
yading@10 508 psraw m6, 5
yading@10 509 CLIPW m6, m1, m3 ; clamp to [0, 1023]
yading@10 510 mova [r0], m6
yading@10 511 paddw m5, m4 ; advance the c accumulator by one row
yading@10 512 add r0, r1
yading@10 513 dec r2d
yading@10 514 jg .loop
yading@10 515 REP_RET
yading@10 516
yading@10 517
yading@10 518 ;-----------------------------------------------------------------------------
yading@10 519 ; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
yading@10 520 ;-----------------------------------------------------------------------------
yading@10 521 %macro PRED8x8L_128_DC 0
yading@10 522 cglobal pred8x8l_128_dc_10, 4, 4 ; no neighbours available: fill with mid-grey; r3 = byte stride
yading@10 523 mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
yading@10 524 lea r1, [r3*3]
yading@10 525 lea r2, [r0+r3*4]
yading@10 526 MOV8 r0+r3*0, m0, m0 ; 8 rows of constant 512
yading@10 527 MOV8 r0+r3*1, m0, m0
yading@10 528 MOV8 r0+r3*2, m0, m0
yading@10 529 MOV8 r0+r1*1, m0, m0
yading@10 530 MOV8 r2+r3*0, m0, m0
yading@10 531 MOV8 r2+r3*1, m0, m0
yading@10 532 MOV8 r2+r3*2, m0, m0
yading@10 533 MOV8 r2+r1*1, m0, m0
yading@10 534 RET
yading@10 535 %endmacro
yading@10 536
yading@10 537 INIT_MMX mmxext
yading@10 538 PRED8x8L_128_DC
yading@10 539 INIT_XMM sse2
yading@10 540 PRED8x8L_128_DC
yading@10 541
yading@10 542 ;-----------------------------------------------------------------------------
yading@10 543 ; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
yading@10 544 ;-----------------------------------------------------------------------------
yading@10 545 %macro PRED8x8L_TOP_DC 0
yading@10 546 cglobal pred8x8l_top_dc_10, 4, 4, 6 ; r1 = has_topleft, r2 = has_topright (NOTE(review): shr 14/13 implies the caller passes these as single high mask bits — confirm against h264pred.c)
yading@10 547 sub r0, r3 ; r0 -> top neighbour row
yading@10 548 mova m0, [r0] ; t0..t7
yading@10 549 shr r1d, 14 ; flag -> 2 if set, else 0
yading@10 550 shr r2d, 13
yading@10 551 neg r1 ; -> -2 (one pixel back) or 0
yading@10 552 pslldq m1, m0, 2
yading@10 553 psrldq m2, m0, 2
yading@10 554 pinsrw m1, [r0+r1], 0 ; left neighbour of t0: topleft if available, else t0 itself
yading@10 555 pinsrw m2, [r0+r2+14], 7 ; right neighbour of t7: t8 if topright available, else t7
yading@10 556 lea r1, [r3*3]
yading@10 557 lea r2, [r0+r3*4]
yading@10 558 PRED4x4_LOWPASS m0, m2, m1, m0 ; 1-2-1 filter of the top row
yading@10 559 HADDW m0, m1 ; dc = (sum of 8 filtered pixels + 4) >> 3
yading@10 560 paddw m0, [pw_4]
yading@10 561 psrlw m0, 3
yading@10 562 SPLATW m0, m0, 0
yading@10 563 mova [r0+r3*1], m0 ; fill all 8 rows
yading@10 564 mova [r0+r3*2], m0
yading@10 565 mova [r0+r1*1], m0
yading@10 566 mova [r0+r3*4], m0
yading@10 567 mova [r2+r3*1], m0
yading@10 568 mova [r2+r3*2], m0
yading@10 569 mova [r2+r1*1], m0
yading@10 570 mova [r2+r3*4], m0
yading@10 571 RET
yading@10 572 %endmacro
yading@10 573
yading@10 574 INIT_XMM sse2
yading@10 575 PRED8x8L_TOP_DC
yading@10 576 %if HAVE_AVX_EXTERNAL
yading@10 577 INIT_XMM avx
yading@10 578 PRED8x8L_TOP_DC
yading@10 579 %endif
yading@10 580
yading@10 581 ;-----------------------------------------------------------------------------
yading@10 582 ;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
yading@10 583 ;-----------------------------------------------------------------------------
yading@10 584 ;TODO: see if scalar is faster
yading@10 585 %macro PRED8x8L_DC 0
yading@10 586 cglobal pred8x8l_dc_10, 4, 6, 6 ; dc over filtered top row + filtered left column; r1/r2 = availability flags
yading@10 587 sub r0, r3 ; r0 -> top neighbour row
yading@10 588 lea r4, [r0+r3*4]
yading@10 589 lea r5, [r3*3]
yading@10 590 mova m0, [r0+r3*2-16] ; gather the 8 left pixels (top word of each row load)
yading@10 591 punpckhwd m0, [r0+r3*1-16]
yading@10 592 mova m1, [r4+r3*0-16]
yading@10 593 punpckhwd m1, [r0+r5*1-16]
yading@10 594 punpckhdq m1, m0
yading@10 595 mova m2, [r4+r3*2-16]
yading@10 596 punpckhwd m2, [r4+r3*1-16]
yading@10 597 mova m3, [r4+r3*4-16]
yading@10 598 punpckhwd m3, [r4+r5*1-16]
yading@10 599 punpckhdq m3, m2
yading@10 600 punpckhqdq m3, m1 ; m3 = full left column in one reg
yading@10 601 mova m0, [r0] ; t0..t7
yading@10 602 shr r1d, 14 ; has_topleft flag -> 2/0 (see top_dc: caller passes mask bits)
yading@10 603 shr r2d, 13 ; has_topright flag -> 2/0
yading@10 604 neg r1
yading@10 605 pslldq m1, m0, 2
yading@10 606 psrldq m2, m0, 2
yading@10 607 pinsrw m1, [r0+r1], 0 ; topleft if available, else repeat t0
yading@10 608 pinsrw m2, [r0+r2+14], 7 ; t8 if topright available, else repeat t7
yading@10 609 not r1 ; -2 -> 1, 0 -> -1 ...
yading@10 610 and r1, r3 ; ... so r1 = 0 (topleft avail) or stride (use first left pixel instead)
yading@10 611 pslldq m4, m3, 2
yading@10 612 psrldq m5, m3, 2
yading@10 613 pshuflw m4, m4, 11100101b ; duplicate the first left pixel over the shifted-in zero
yading@10 614 pinsrw m5, [r0+r1-2], 7 ; neighbour past the column end: topleft or clamped edge
yading@10 615 PRED4x4_LOWPASS m3, m4, m5, m3 ; filter the left column
yading@10 616 PRED4x4_LOWPASS m0, m2, m1, m0 ; filter the top row
yading@10 617 paddw m0, m3
yading@10 618 HADDW m0, m1 ; dc = (sum of 16 filtered pixels + 8) >> 4
yading@10 619 paddw m0, [pw_8]
yading@10 620 psrlw m0, 4
yading@10 621 SPLATW m0, m0
yading@10 622 mova [r0+r3*1], m0 ; fill all 8 rows
yading@10 623 mova [r0+r3*2], m0
yading@10 624 mova [r0+r5*1], m0
yading@10 625 mova [r0+r3*4], m0
yading@10 626 mova [r4+r3*1], m0
yading@10 627 mova [r4+r3*2], m0
yading@10 628 mova [r4+r5*1], m0
yading@10 629 mova [r4+r3*4], m0
yading@10 630 RET
yading@10 631 %endmacro
yading@10 632
yading@10 633 INIT_XMM sse2
yading@10 634 PRED8x8L_DC
yading@10 635 %if HAVE_AVX_EXTERNAL
yading@10 636 INIT_XMM avx
yading@10 637 PRED8x8L_DC
yading@10 638 %endif
yading@10 639
yading@10 640 ;-----------------------------------------------------------------------------
yading@10 641 ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
yading@10 642 ;-----------------------------------------------------------------------------
yading@10 643 %macro PRED8x8L_VERTICAL 0
yading@10 644 cglobal pred8x8l_vertical_10, 4, 4, 6 ; filtered top row copied into all 8 rows; r1/r2 = availability flags
yading@10 645 sub r0, r3 ; r0 -> top neighbour row
yading@10 646 mova m0, [r0] ; t0..t7
yading@10 647 shr r1d, 14 ; has_topleft flag -> 2/0 (mask-bit convention, see top_dc)
yading@10 648 shr r2d, 13 ; has_topright flag -> 2/0
yading@10 649 neg r1
yading@10 650 pslldq m1, m0, 2
yading@10 651 psrldq m2, m0, 2
yading@10 652 pinsrw m1, [r0+r1], 0 ; topleft or repeated t0
yading@10 653 pinsrw m2, [r0+r2+14], 7 ; t8 or repeated t7
yading@10 654 lea r1, [r3*3]
yading@10 655 lea r2, [r0+r3*4]
yading@10 656 PRED4x4_LOWPASS m0, m2, m1, m0 ; 1-2-1 filter of the top row
yading@10 657 mova [r0+r3*1], m0 ; same filtered row for all 8 rows
yading@10 658 mova [r0+r3*2], m0
yading@10 659 mova [r0+r1*1], m0
yading@10 660 mova [r0+r3*4], m0
yading@10 661 mova [r2+r3*1], m0
yading@10 662 mova [r2+r3*2], m0
yading@10 663 mova [r2+r1*1], m0
yading@10 664 mova [r2+r3*4], m0
yading@10 665 RET
yading@10 666 %endmacro
yading@10 667
yading@10 668 INIT_XMM sse2
yading@10 669 PRED8x8L_VERTICAL
yading@10 670 %if HAVE_AVX_EXTERNAL
yading@10 671 INIT_XMM avx
yading@10 672 PRED8x8L_VERTICAL
yading@10 673 %endif
yading@10 674
yading@10 675 ;-----------------------------------------------------------------------------
yading@10 676 ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
yading@10 677 ;-----------------------------------------------------------------------------
yading@10 678 %macro PRED8x8L_HORIZONTAL 0
yading@10 679 cglobal pred8x8l_horizontal_10, 4, 4, 5 ; each row filled with its filtered left-column pixel
yading@10 680 mova m0, [r0-16] ; row 0's left area
yading@10 681 shr r1d, 14 ; has_topleft flag -> 2/0 (mask-bit convention)
yading@10 682 dec r1 ; 2 -> 1, 0 -> -1 ...
yading@10 683 and r1, r3 ; ... so r1 = 0 (even stride) or stride
yading@10 684 sub r1, r3 ; r1 = -stride (use topleft row) or 0 (clamp to row 0)
yading@10 685 punpckhwd m0, [r0+r1-16]
yading@10 686 mova m1, [r0+r3*2-16]
yading@10 687 punpckhwd m1, [r0+r3*1-16]
yading@10 688 lea r2, [r0+r3*4]
yading@10 689 lea r1, [r3*3]
yading@10 690 punpckhdq m1, m0
yading@10 691 mova m2, [r2+r3*0-16]
yading@10 692 punpckhwd m2, [r0+r1-16]
yading@10 693 mova m3, [r2+r3*2-16]
yading@10 694 punpckhwd m3, [r2+r3*1-16]
yading@10 695 punpckhdq m3, m2
yading@10 696 punpckhqdq m3, m1 ; m3 = left column (plus topleft) gathered in one reg
yading@10 697 PALIGNR m4, m3, [r2+r1-16], 14, m0 ; shift in one extra sample from the bottom-left row
yading@10 698 pslldq m0, m4, 2
yading@10 699 pshuflw m0, m0, 11100101b ; duplicate the edge sample over the shifted-in zero
yading@10 700 PRED4x4_LOWPASS m4, m3, m0, m4 ; 1-2-1 filter of the left column
yading@10 701 punpckhwd m3, m4, m4 ; duplicate each filtered pixel into word pairs
yading@10 702 punpcklwd m4, m4
yading@10 703 pshufd m0, m3, 0xff ; broadcast one filtered pixel per row
yading@10 704 pshufd m1, m3, 0xaa
yading@10 705 pshufd m2, m3, 0x55
yading@10 706 pshufd m3, m3, 0x00
yading@10 707 mova [r0+r3*0], m0 ; rows 0..3
yading@10 708 mova [r0+r3*1], m1
yading@10 709 mova [r0+r3*2], m2
yading@10 710 mova [r0+r1*1], m3
yading@10 711 pshufd m0, m4, 0xff
yading@10 712 pshufd m1, m4, 0xaa
yading@10 713 pshufd m2, m4, 0x55
yading@10 714 pshufd m3, m4, 0x00
yading@10 715 mova [r2+r3*0], m0 ; rows 4..7
yading@10 716 mova [r2+r3*1], m1
yading@10 717 mova [r2+r3*2], m2
yading@10 718 mova [r2+r1*1], m3
yading@10 719 RET
yading@10 720 %endmacro
yading@10 721
yading@10 722 INIT_XMM sse2
yading@10 723 PRED8x8L_HORIZONTAL
yading@10 724 INIT_XMM ssse3
yading@10 725 PRED8x8L_HORIZONTAL
yading@10 726 %if HAVE_AVX_EXTERNAL
yading@10 727 INIT_XMM avx
yading@10 728 PRED8x8L_HORIZONTAL
yading@10 729 %endif
yading@10 730
yading@10 731 ;-----------------------------------------------------------------------------
yading@10 732 ;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
yading@10 733 ;-----------------------------------------------------------------------------
yading@10 734 %macro PRED8x8L_DOWN_LEFT 0
yading@10 735 cglobal pred8x8l_down_left_10, 4, 4, 7 ; r1 = has_topleft, r2 = has_topright (mask-bit convention)
yading@10 736 sub r0, r3 ; r0 -> top neighbour row
yading@10 737 mova m3, [r0] ; t0..t7
yading@10 738 shr r1d, 14 ; has_topleft -> 2/0
yading@10 739 neg r1
yading@10 740 shr r2d, 13 ; has_topright -> 2/0; ZF set when absent
yading@10 741 pslldq m1, m3, 2
yading@10 742 psrldq m2, m3, 2
yading@10 743 pinsrw m1, [r0+r1], 0 ; topleft or repeated t0
yading@10 744 pinsrw m2, [r0+r2+14], 7 ; t8 or repeated t7
yading@10 745 PRED4x4_LOWPASS m6, m2, m1, m3 ; filtered t0..t7
yading@10 746 jz .fix_tr ; flags from shr r2d (SIMD insns above leave EFLAGS untouched)
yading@10 747 mova m1, [r0+16] ; t8..t15 (topright block)
yading@10 748 psrldq m5, m1, 2
yading@10 749 PALIGNR m2, m1, m3, 14, m3
yading@10 750 pshufhw m5, m5, 10100100b ; repeat t15 over the shifted-in zero (edge clamp)
yading@10 751 PRED4x4_LOWPASS m1, m2, m5, m1 ; filtered t8..t15
yading@10 752 .do_topright:
yading@10 753 lea r1, [r3*3]
yading@10 754 psrldq m5, m1, 14 ; last topright pixel
yading@10 755 lea r2, [r0+r3*4]
yading@10 756 PALIGNR m2, m1, m6, 2, m0 ; neighbours of the 16-pixel diagonal for a 2nd filter pass
yading@10 757 PALIGNR m3, m1, m6, 14, m0
yading@10 758 PALIGNR m5, m1, 2, m0
yading@10 759 pslldq m4, m6, 2
yading@10 760 PRED4x4_LOWPASS m6, m4, m2, m6 ; low half of the prediction diagonal
yading@10 761 PRED4x4_LOWPASS m1, m3, m5, m1 ; high half
yading@10 762 mova [r2+r3*4], m1 ; bottom row; each row above shifts one pixel back along the diagonal
yading@10 763 PALIGNR m1, m6, 14, m2
yading@10 764 pslldq m6, 2
yading@10 765 mova [r2+r1*1], m1
yading@10 766 PALIGNR m1, m6, 14, m2
yading@10 767 pslldq m6, 2
yading@10 768 mova [r2+r3*2], m1
yading@10 769 PALIGNR m1, m6, 14, m2
yading@10 770 pslldq m6, 2
yading@10 771 mova [r2+r3*1], m1
yading@10 772 PALIGNR m1, m6, 14, m2
yading@10 773 pslldq m6, 2
yading@10 774 mova [r0+r3*4], m1
yading@10 775 PALIGNR m1, m6, 14, m2
yading@10 776 pslldq m6, 2
yading@10 777 mova [r0+r1*1], m1
yading@10 778 PALIGNR m1, m6, 14, m2
yading@10 779 pslldq m6, 2
yading@10 780 mova [r0+r3*2], m1
yading@10 781 PALIGNR m1, m6, 14, m6
yading@10 782 mova [r0+r3*1], m1
yading@10 783 RET
yading@10 784 .fix_tr: ; no topright block: extend t7 across the whole topright half
yading@10 785 punpckhwd m3, m3
yading@10 786 pshufd m1, m3, 0xFF
yading@10 787 jmp .do_topright
yading@10 788 %endmacro
yading@10 789
yading@10 790 INIT_XMM sse2
yading@10 791 PRED8x8L_DOWN_LEFT
yading@10 792 INIT_XMM ssse3
yading@10 793 PRED8x8L_DOWN_LEFT
yading@10 794 %if HAVE_AVX_EXTERNAL
yading@10 795 INIT_XMM avx
yading@10 796 PRED8x8L_DOWN_LEFT
yading@10 797 %endif
yading@10 798
yading@10 799 ;-----------------------------------------------------------------------------
yading@10 800 ;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
yading@10 801 ;-----------------------------------------------------------------------------
yading@10 802 %macro PRED8x8L_DOWN_RIGHT 0
yading@10 803 ; standard forbids this when has_topleft is false
yading@10 804 ; no need to check
yading@10 805 cglobal pred8x8l_down_right_10, 4, 5, 8 ; r2 = has_topright (mask bit); topleft assumed present (see note)
yading@10 806 sub r0, r3 ; r0 -> top neighbour row
yading@10 807 lea r4, [r0+r3*4]
yading@10 808 lea r1, [r3*3]
yading@10 809 mova m0, [r0+r3*1-16] ; gather the 8 left pixels (top word of each row load)
yading@10 810 punpckhwd m0, [r0+r3*0-16]
yading@10 811 mova m1, [r0+r1*1-16]
yading@10 812 punpckhwd m1, [r0+r3*2-16]
yading@10 813 punpckhdq m1, m0
yading@10 814 mova m2, [r4+r3*1-16]
yading@10 815 punpckhwd m2, [r4+r3*0-16]
yading@10 816 mova m3, [r4+r1*1-16]
yading@10 817 punpckhwd m3, [r4+r3*2-16]
yading@10 818 punpckhdq m3, m2
yading@10 819 punpckhqdq m3, m1 ; m3 = left column incl. topleft at the top
yading@10 820 mova m0, [r4+r3*4-16] ; bottom-left row (neighbour below the column)
yading@10 821 mova m1, [r0]
yading@10 822 PALIGNR m4, m3, m0, 14, m0 ; column shifted down one (l7 fed in)
yading@10 823 PALIGNR m1, m3, 2, m2 ; column shifted up one (t0 fed in)
yading@10 824 pslldq m0, m4, 2
yading@10 825 pshuflw m0, m0, 11100101b ; duplicate the edge sample over the shifted-in zero
yading@10 826 PRED4x4_LOWPASS m6, m1, m4, m3 ; filtered left column (diagonal low half)
yading@10 827 PRED4x4_LOWPASS m4, m3, m0, m4
yading@10 828 mova m3, [r0] ; t0..t7 again for the top-row filter
yading@10 829 shr r2d, 13 ; has_topright -> 2/0
yading@10 830 pslldq m1, m3, 2
yading@10 831 psrldq m2, m3, 2
yading@10 832 pinsrw m1, [r0-2], 0 ; topleft (always valid for this mode)
yading@10 833 pinsrw m2, [r0+r2+14], 7 ; t8 or repeated t7
yading@10 834 PRED4x4_LOWPASS m3, m2, m1, m3 ; filtered top row
yading@10 835 PALIGNR m2, m3, m6, 2, m0 ; 2nd filter pass across the joined left+top diagonal
yading@10 836 PALIGNR m5, m3, m6, 14, m0
yading@10 837 psrldq m7, m3, 2
yading@10 838 PRED4x4_LOWPASS m6, m4, m2, m6 ; low half of the prediction diagonal
yading@10 839 PRED4x4_LOWPASS m3, m5, m7, m3 ; high half
yading@10 840 mova [r4+r3*4], m6 ; bottom row; each row above advances one pixel along the diagonal
yading@10 841 PALIGNR m3, m6, 14, m2
yading@10 842 pslldq m6, 2
yading@10 843 mova [r0+r3*1], m3
yading@10 844 PALIGNR m3, m6, 14, m2
yading@10 845 pslldq m6, 2
yading@10 846 mova [r0+r3*2], m3
yading@10 847 PALIGNR m3, m6, 14, m2
yading@10 848 pslldq m6, 2
yading@10 849 mova [r0+r1*1], m3
yading@10 850 PALIGNR m3, m6, 14, m2
yading@10 851 pslldq m6, 2
yading@10 852 mova [r0+r3*4], m3
yading@10 853 PALIGNR m3, m6, 14, m2
yading@10 854 pslldq m6, 2
yading@10 855 mova [r4+r3*1], m3
yading@10 856 PALIGNR m3, m6, 14, m2
yading@10 857 pslldq m6, 2
yading@10 858 mova [r4+r3*2], m3
yading@10 859 PALIGNR m3, m6, 14, m6
yading@10 860 mova [r4+r1*1], m3
yading@10 861 RET
yading@10 862 %endmacro
yading@10 863
yading@10 864 INIT_XMM sse2
yading@10 865 PRED8x8L_DOWN_RIGHT
yading@10 866 INIT_XMM ssse3
yading@10 867 PRED8x8L_DOWN_RIGHT
yading@10 868 %if HAVE_AVX_EXTERNAL
yading@10 869 INIT_XMM avx
yading@10 870 PRED8x8L_DOWN_RIGHT
yading@10 871 %endif
yading@10 872
yading@10 873 ;-----------------------------------------------------------------------------
yading@10 874 ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
yading@10 875 ;-----------------------------------------------------------------------------
yading@10 876 %macro PRED8x8L_VERTICAL_RIGHT 0
yading@10 877 ; likewise with 8x8l_down_right
yading@10 878 cglobal pred8x8l_vertical_right_10, 4, 5, 7
yading@10 879 sub r0, r3
yading@10 880 lea r4, [r0+r3*4]
yading@10 881 lea r1, [r3*3]
yading@10 882 mova m0, [r0+r3*1-16]
yading@10 883 punpckhwd m0, [r0+r3*0-16]
yading@10 884 mova m1, [r0+r1*1-16]
yading@10 885 punpckhwd m1, [r0+r3*2-16]
yading@10 886 punpckhdq m1, m0
yading@10 887 mova m2, [r4+r3*1-16]
yading@10 888 punpckhwd m2, [r4+r3*0-16]
yading@10 889 mova m3, [r4+r1*1-16]
yading@10 890 punpckhwd m3, [r4+r3*2-16]
yading@10 891 punpckhdq m3, m2
yading@10 892 punpckhqdq m3, m1
yading@10 893 mova m0, [r4+r3*4-16]
yading@10 894 mova m1, [r0]
yading@10 895 PALIGNR m4, m3, m0, 14, m0
yading@10 896 PALIGNR m1, m3, 2, m2
yading@10 897 PRED4x4_LOWPASS m3, m1, m4, m3
yading@10 898 mova m2, [r0]
yading@10 899 shr r2d, 13
yading@10 900 pslldq m1, m2, 2
yading@10 901 psrldq m5, m2, 2
yading@10 902 pinsrw m1, [r0-2], 0
yading@10 903 pinsrw m5, [r0+r2+14], 7
yading@10 904 PRED4x4_LOWPASS m2, m5, m1, m2
yading@10 905 PALIGNR m6, m2, m3, 12, m1
yading@10 906 PALIGNR m5, m2, m3, 14, m0
yading@10 907 PRED4x4_LOWPASS m0, m6, m2, m5
yading@10 908 pavgw m2, m5
yading@10 909 mova [r0+r3*2], m0
yading@10 910 mova [r0+r3*1], m2
yading@10 911 pslldq m6, m3, 4
yading@10 912 pslldq m1, m3, 2
yading@10 913 PRED4x4_LOWPASS m1, m3, m6, m1
yading@10 914 PALIGNR m2, m1, 14, m4
yading@10 915 mova [r0+r1*1], m2
yading@10 916 pslldq m1, 2
yading@10 917 PALIGNR m0, m1, 14, m3
yading@10 918 mova [r0+r3*4], m0
yading@10 919 pslldq m1, 2
yading@10 920 PALIGNR m2, m1, 14, m4
yading@10 921 mova [r4+r3*1], m2
yading@10 922 pslldq m1, 2
yading@10 923 PALIGNR m0, m1, 14, m3
yading@10 924 mova [r4+r3*2], m0
yading@10 925 pslldq m1, 2
yading@10 926 PALIGNR m2, m1, 14, m4
yading@10 927 mova [r4+r1*1], m2
yading@10 928 pslldq m1, 2
yading@10 929 PALIGNR m0, m1, 14, m1
yading@10 930 mova [r4+r3*4], m0
yading@10 931 RET
yading@10 932 %endmacro
yading@10 933
yading@10 934 INIT_XMM sse2
yading@10 935 PRED8x8L_VERTICAL_RIGHT
yading@10 936 INIT_XMM ssse3
yading@10 937 PRED8x8L_VERTICAL_RIGHT
yading@10 938 %if HAVE_AVX_EXTERNAL
yading@10 939 INIT_XMM avx
yading@10 940 PRED8x8L_VERTICAL_RIGHT
yading@10 941 %endif
yading@10 942
yading@10 943 ;-----------------------------------------------------------------------------
yading@10 944 ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
yading@10 945 ;-----------------------------------------------------------------------------
yading@10 946 %macro PRED8x8L_HORIZONTAL_UP 0
yading@10 947 cglobal pred8x8l_horizontal_up_10, 4, 4, 6
yading@10 948 mova m0, [r0+r3*0-16]
yading@10 949 punpckhwd m0, [r0+r3*1-16]
yading@10 950 shr r1d, 14
yading@10 951 dec r1
yading@10 952 and r1, r3
yading@10 953 sub r1, r3
yading@10 954 mova m4, [r0+r1*1-16]
yading@10 955 lea r1, [r3*3]
yading@10 956 lea r2, [r0+r3*4]
yading@10 957 mova m1, [r0+r3*2-16]
yading@10 958 punpckhwd m1, [r0+r1*1-16]
yading@10 959 punpckhdq m0, m1
yading@10 960 mova m2, [r2+r3*0-16]
yading@10 961 punpckhwd m2, [r2+r3*1-16]
yading@10 962 mova m3, [r2+r3*2-16]
yading@10 963 punpckhwd m3, [r2+r1*1-16]
yading@10 964 punpckhdq m2, m3
yading@10 965 punpckhqdq m0, m2
yading@10 966 PALIGNR m1, m0, m4, 14, m4
yading@10 967 psrldq m2, m0, 2
yading@10 968 pshufhw m2, m2, 10100100b
yading@10 969 PRED4x4_LOWPASS m0, m1, m2, m0
yading@10 970 psrldq m1, m0, 2
yading@10 971 psrldq m2, m0, 4
yading@10 972 pshufhw m1, m1, 10100100b
yading@10 973 pshufhw m2, m2, 01010100b
yading@10 974 pavgw m4, m0, m1
yading@10 975 PRED4x4_LOWPASS m1, m2, m0, m1
yading@10 976 punpckhwd m5, m4, m1
yading@10 977 punpcklwd m4, m1
yading@10 978 mova [r2+r3*0], m5
yading@10 979 mova [r0+r3*0], m4
yading@10 980 pshufd m0, m5, 11111001b
yading@10 981 pshufd m1, m5, 11111110b
yading@10 982 pshufd m2, m5, 11111111b
yading@10 983 mova [r2+r3*1], m0
yading@10 984 mova [r2+r3*2], m1
yading@10 985 mova [r2+r1*1], m2
yading@10 986 PALIGNR m2, m5, m4, 4, m0
yading@10 987 PALIGNR m3, m5, m4, 8, m1
yading@10 988 PALIGNR m5, m5, m4, 12, m4
yading@10 989 mova [r0+r3*1], m2
yading@10 990 mova [r0+r3*2], m3
yading@10 991 mova [r0+r1*1], m5
yading@10 992 RET
yading@10 993 %endmacro
yading@10 994
yading@10 995 INIT_XMM sse2
yading@10 996 PRED8x8L_HORIZONTAL_UP
yading@10 997 INIT_XMM ssse3
yading@10 998 PRED8x8L_HORIZONTAL_UP
yading@10 999 %if HAVE_AVX_EXTERNAL
yading@10 1000 INIT_XMM avx
yading@10 1001 PRED8x8L_HORIZONTAL_UP
yading@10 1002 %endif
yading@10 1003
yading@10 1004
yading@10 1005 ;-----------------------------------------------------------------------------
yading@10 1006 ; void pred16x16_vertical(pixel *src, int stride)
yading@10 1007 ;-----------------------------------------------------------------------------
yading@10 1008 %macro MOV16 3-5
yading@10 1009 mova [%1+ 0], %2
yading@10 1010 mova [%1+mmsize], %3
yading@10 1011 %if mmsize==8
yading@10 1012 mova [%1+ 16], %4
yading@10 1013 mova [%1+ 24], %5
yading@10 1014 %endif
yading@10 1015 %endmacro
yading@10 1016
yading@10 1017 %macro PRED16x16_VERTICAL 0
yading@10 1018 cglobal pred16x16_vertical_10, 2, 3
yading@10 1019 sub r0, r1
yading@10 1020 mov r2d, 8
yading@10 1021 mova m0, [r0+ 0]
yading@10 1022 mova m1, [r0+mmsize]
yading@10 1023 %if mmsize==8
yading@10 1024 mova m2, [r0+16]
yading@10 1025 mova m3, [r0+24]
yading@10 1026 %endif
yading@10 1027 .loop:
yading@10 1028 MOV16 r0+r1*1, m0, m1, m2, m3
yading@10 1029 MOV16 r0+r1*2, m0, m1, m2, m3
yading@10 1030 lea r0, [r0+r1*2]
yading@10 1031 dec r2d
yading@10 1032 jg .loop
yading@10 1033 REP_RET
yading@10 1034 %endmacro
yading@10 1035
yading@10 1036 INIT_MMX mmxext
yading@10 1037 PRED16x16_VERTICAL
yading@10 1038 INIT_XMM sse2
yading@10 1039 PRED16x16_VERTICAL
yading@10 1040
yading@10 1041 ;-----------------------------------------------------------------------------
yading@10 1042 ; void pred16x16_horizontal(pixel *src, int stride)
yading@10 1043 ;-----------------------------------------------------------------------------
yading@10 1044 %macro PRED16x16_HORIZONTAL 0
yading@10 1045 cglobal pred16x16_horizontal_10, 2, 3
yading@10 1046 mov r2d, 8
yading@10 1047 .vloop:
yading@10 1048 movd m0, [r0+r1*0-4]
yading@10 1049 movd m1, [r0+r1*1-4]
yading@10 1050 SPLATW m0, m0, 1
yading@10 1051 SPLATW m1, m1, 1
yading@10 1052 MOV16 r0+r1*0, m0, m0, m0, m0
yading@10 1053 MOV16 r0+r1*1, m1, m1, m1, m1
yading@10 1054 lea r0, [r0+r1*2]
yading@10 1055 dec r2d
yading@10 1056 jg .vloop
yading@10 1057 REP_RET
yading@10 1058 %endmacro
yading@10 1059
yading@10 1060 INIT_MMX mmxext
yading@10 1061 PRED16x16_HORIZONTAL
yading@10 1062 INIT_XMM sse2
yading@10 1063 PRED16x16_HORIZONTAL
yading@10 1064
yading@10 1065 ;-----------------------------------------------------------------------------
yading@10 1066 ; void pred16x16_dc(pixel *src, int stride)
yading@10 1067 ;-----------------------------------------------------------------------------
yading@10 1068 %macro PRED16x16_DC 0
yading@10 1069 cglobal pred16x16_dc_10, 2, 6
yading@10 1070 mov r5, r0
yading@10 1071 sub r0, r1
yading@10 1072 mova m0, [r0+0]
yading@10 1073 paddw m0, [r0+mmsize]
yading@10 1074 %if mmsize==8
yading@10 1075 paddw m0, [r0+16]
yading@10 1076 paddw m0, [r0+24]
yading@10 1077 %endif
yading@10 1078 HADDW m0, m2
yading@10 1079
yading@10 1080 lea r0, [r0+r1-2]
yading@10 1081 movzx r3d, word [r0]
yading@10 1082 movzx r4d, word [r0+r1]
yading@10 1083 %rep 7
yading@10 1084 lea r0, [r0+r1*2]
yading@10 1085 movzx r2d, word [r0]
yading@10 1086 add r3d, r2d
yading@10 1087 movzx r2d, word [r0+r1]
yading@10 1088 add r4d, r2d
yading@10 1089 %endrep
yading@10 1090 lea r3d, [r3+r4+16]
yading@10 1091
yading@10 1092 movd m1, r3d
yading@10 1093 paddw m0, m1
yading@10 1094 psrlw m0, 5
yading@10 1095 SPLATW m0, m0
yading@10 1096 mov r3d, 8
yading@10 1097 .loop:
yading@10 1098 MOV16 r5+r1*0, m0, m0, m0, m0
yading@10 1099 MOV16 r5+r1*1, m0, m0, m0, m0
yading@10 1100 lea r5, [r5+r1*2]
yading@10 1101 dec r3d
yading@10 1102 jg .loop
yading@10 1103 REP_RET
yading@10 1104 %endmacro
yading@10 1105
yading@10 1106 INIT_MMX mmxext
yading@10 1107 PRED16x16_DC
yading@10 1108 INIT_XMM sse2
yading@10 1109 PRED16x16_DC
yading@10 1110
yading@10 1111 ;-----------------------------------------------------------------------------
yading@10 1112 ; void pred16x16_top_dc(pixel *src, int stride)
yading@10 1113 ;-----------------------------------------------------------------------------
yading@10 1114 %macro PRED16x16_TOP_DC 0
yading@10 1115 cglobal pred16x16_top_dc_10, 2, 3
yading@10 1116 sub r0, r1
yading@10 1117 mova m0, [r0+0]
yading@10 1118 paddw m0, [r0+mmsize]
yading@10 1119 %if mmsize==8
yading@10 1120 paddw m0, [r0+16]
yading@10 1121 paddw m0, [r0+24]
yading@10 1122 %endif
yading@10 1123 HADDW m0, m2
yading@10 1124
yading@10 1125 SPLATW m0, m0
yading@10 1126 paddw m0, [pw_8]
yading@10 1127 psrlw m0, 4
yading@10 1128 mov r2d, 8
yading@10 1129 .loop:
yading@10 1130 MOV16 r0+r1*1, m0, m0, m0, m0
yading@10 1131 MOV16 r0+r1*2, m0, m0, m0, m0
yading@10 1132 lea r0, [r0+r1*2]
yading@10 1133 dec r2d
yading@10 1134 jg .loop
yading@10 1135 REP_RET
yading@10 1136 %endmacro
yading@10 1137
yading@10 1138 INIT_MMX mmxext
yading@10 1139 PRED16x16_TOP_DC
yading@10 1140 INIT_XMM sse2
yading@10 1141 PRED16x16_TOP_DC
yading@10 1142
yading@10 1143 ;-----------------------------------------------------------------------------
yading@10 1144 ; void pred16x16_left_dc(pixel *src, int stride)
yading@10 1145 ;-----------------------------------------------------------------------------
yading@10 1146 %macro PRED16x16_LEFT_DC 0
yading@10 1147 cglobal pred16x16_left_dc_10, 2, 6
yading@10 1148 mov r5, r0
yading@10 1149
yading@10 1150 sub r0, 2
yading@10 1151 movzx r3d, word [r0]
yading@10 1152 movzx r4d, word [r0+r1]
yading@10 1153 %rep 7
yading@10 1154 lea r0, [r0+r1*2]
yading@10 1155 movzx r2d, word [r0]
yading@10 1156 add r3d, r2d
yading@10 1157 movzx r2d, word [r0+r1]
yading@10 1158 add r4d, r2d
yading@10 1159 %endrep
yading@10 1160 lea r3d, [r3+r4+8]
yading@10 1161 shr r3d, 4
yading@10 1162
yading@10 1163 movd m0, r3d
yading@10 1164 SPLATW m0, m0
yading@10 1165 mov r3d, 8
yading@10 1166 .loop:
yading@10 1167 MOV16 r5+r1*0, m0, m0, m0, m0
yading@10 1168 MOV16 r5+r1*1, m0, m0, m0, m0
yading@10 1169 lea r5, [r5+r1*2]
yading@10 1170 dec r3d
yading@10 1171 jg .loop
yading@10 1172 REP_RET
yading@10 1173 %endmacro
yading@10 1174
yading@10 1175 INIT_MMX mmxext
yading@10 1176 PRED16x16_LEFT_DC
yading@10 1177 INIT_XMM sse2
yading@10 1178 PRED16x16_LEFT_DC
yading@10 1179
yading@10 1180 ;-----------------------------------------------------------------------------
yading@10 1181 ; void pred16x16_128_dc(pixel *src, int stride)
yading@10 1182 ;-----------------------------------------------------------------------------
yading@10 1183 %macro PRED16x16_128_DC 0
yading@10 1184 cglobal pred16x16_128_dc_10, 2,3
yading@10 1185 mova m0, [pw_512]
yading@10 1186 mov r2d, 8
yading@10 1187 .loop:
yading@10 1188 MOV16 r0+r1*0, m0, m0, m0, m0
yading@10 1189 MOV16 r0+r1*1, m0, m0, m0, m0
yading@10 1190 lea r0, [r0+r1*2]
yading@10 1191 dec r2d
yading@10 1192 jg .loop
yading@10 1193 REP_RET
yading@10 1194 %endmacro
yading@10 1195
yading@10 1196 INIT_MMX mmxext
yading@10 1197 PRED16x16_128_DC
yading@10 1198 INIT_XMM sse2
yading@10 1199 PRED16x16_128_DC