;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)
pd_32:        times 4 dd 32

SECTION .text

;-----------------------------------------------------------------------------
; void h264_idct_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
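; Rough C sketch of what the 4x4 path computes (for orientation only; the
; CLIP helper and variable names are illustrative, not part of this file):
;   a horizontal then a vertical 1-D pass of
;     z0 = a + c;         z1 = a - c;
;     z2 = (b >> 1) - d;  z3 = b + (d >> 1);
;     out = { z0+z3, z1+z2, z1-z2, z0-z3 }
;   followed by dst[i] = CLIP(dst[i] + ((res[i] + 32) >> 6), 0, 1023)
; The +32 rounding term is folded in via pd_32 before the second pass.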
%macro STORE_DIFFx2 6
    psrad     %1, 6
    psrad     %2, 6
    packssdw  %1, %2
    movq      %3, [%5]
    movhps    %3, [%5+%6]
    paddsw    %1, %3
    CLIPW     %1, %4, [pw_pixel_max]
    movq      [%5], %1
    movhps    [%5+%6], %1
%endmacro
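; STORE_DIFFx2 above shifts two rows of 32-bit residuals down by 6, packs
; them to words, adds two stride-separated rows of pixels (loaded unaligned
; with movq/movhps), clips to [0, pw_pixel_max] and stores the rows back.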

%macro STORE_DIFF16 5
    psrad     %1, 6
    psrad     %2, 6
    packssdw  %1, %2
    paddsw    %1, [%5]
    CLIPW     %1, %3, %4
    mova      [%5], %1
%endmacro
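; STORE_DIFF16 does the same for one aligned 8-pixel row, so a single
; mova load/store pair suffices.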

; %1=dst, %2=in (coefficients), %3=stride
%macro IDCT4_ADD_10 3
    mova      m0, [%2+ 0]
    mova      m1, [%2+16]
    mova      m2, [%2+32]
    mova      m3, [%2+48]
    IDCT4_1D  d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd     m0, [pd_32]
    IDCT4_1D  d,0,1,2,3,4,5
    pxor      m5, m5
    mova      [%2+ 0], m5
    mova      [%2+16], m5
    mova      [%2+32], m5
    mova      [%2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro

%macro IDCT_ADD_10 0
cglobal h264_idct_add_10, 3,3
    IDCT4_ADD_10 r0, r1, r2
    RET
%endmacro

INIT_XMM sse2
IDCT_ADD_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
%endif

;-----------------------------------------------------------------------------
; void h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
; NOTE: no FATE samples trigger this code path
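; The nnzc offsets used below (4+1*8, 5+1*8, ...) follow the scan8-style
; layout of the caller's non-zero-count table: one byte per 4x4 block in an
; 8-wide raster, with luma block 0 at column 4, row 1.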
%macro ADD4x4IDCT 0
add4x4_idct %+ SUFFIX:
    add       r5, r0
    mova      m0, [r2+ 0]
    mova      m1, [r2+16]
    mova      m2, [r2+32]
    mova      m3, [r2+48]
    IDCT4_1D  d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd     m0, [pd_32]
    IDCT4_1D  d,0,1,2,3,4,5
    pxor      m5, m5
    mova      [r2+ 0], m5
    mova      [r2+16], m5
    mova      [r2+32], m5
    mova      [r2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, r5, r3
    lea       r5, [r5+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, r5, r3
    ret
%endmacro
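; add4x4_idct is a bare (non-cglobal) helper reached via call: it repeats
; the IDCT4_ADD_10 body with r5 holding the dst offset of the current block
; and r2/r3 as block/stride.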

INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
%endif

%macro ADD16_OP 2
    cmp       byte [r4+%2], 0
    jz .skipblock%1
    mov       r5d, [r1+%1*4]
    call add4x4_idct %+ SUFFIX
.skipblock%1:
%if %1<15
    add       r2, 64
%endif
%endmacro
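; ADD16_OP %1=block index, %2=nnzc offset: skip the block when its nnzc
; byte is zero, otherwise fetch block_offset[%1] and run the shared 4x4
; helper; r2 advances 64 bytes (16 dword coefficients) per block.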

%macro IDCT_ADD16_10 0
cglobal h264_idct_add16_10, 5,6
    ADD16_OP 0, 4+1*8
    ADD16_OP 1, 5+1*8
    ADD16_OP 2, 4+2*8
    ADD16_OP 3, 5+2*8
    ADD16_OP 4, 6+1*8
    ADD16_OP 5, 7+1*8
    ADD16_OP 6, 6+2*8
    ADD16_OP 7, 7+2*8
    ADD16_OP 8, 4+3*8
    ADD16_OP 9, 5+3*8
    ADD16_OP 10, 4+4*8
    ADD16_OP 11, 5+4*8
    ADD16_OP 12, 6+3*8
    ADD16_OP 13, 7+3*8
    ADD16_OP 14, 6+4*8
    ADD16_OP 15, 7+4*8
    REP_RET
%endmacro

INIT_XMM sse2
IDCT_ADD16_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
%endif

;-----------------------------------------------------------------------------
; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
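; In C terms this is roughly (a sketch; CLIP is an assumed helper):
;   dc = (block[0] + 32) >> 6;  block[0] = 0;
;   for each pixel of the 4x4 (or 8x8) block:
;       dst[x] = CLIP(dst[x] + dc, 0, 1023);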
%macro IDCT_DC_ADD_OP_10 3
    pxor      m5, m5
%if avx_enabled
    paddw     m1, m0, [%1+0   ]
    paddw     m2, m0, [%1+%2  ]
    paddw     m3, m0, [%1+%2*2]
    paddw     m4, m0, [%1+%3  ]
%else
    mova      m1, [%1+0   ]
    mova      m2, [%1+%2  ]
    mova      m3, [%1+%2*2]
    mova      m4, [%1+%3  ]
    paddw     m1, m0
    paddw     m2, m0
    paddw     m3, m0
    paddw     m4, m0
%endif
    CLIPW     m1, m5, m6
    CLIPW     m2, m5, m6
    CLIPW     m3, m5, m6
    CLIPW     m4, m5, m6
    mova      [%1+0   ], m1
    mova      [%1+%2  ], m2
    mova      [%1+%2*2], m3
    mova      [%1+%3  ], m4
%endmacro
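; The avx_enabled branch uses the VEX three-operand form of paddw, folding
; the load and the add of the broadcast DC into one instruction per row.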

INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
    movd      m0, [r1]
    mov       dword [r1], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    lea       r1, [r2*3]
    pshufw    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET

;-----------------------------------------------------------------------------
; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,4,7
    movd      m0, [r1]
    mov       dword [r1], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    lea       r1, [r2*3]
    SPLATW    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea       r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
%endmacro

INIT_XMM sse2
IDCT8_DC_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
%endif

;-----------------------------------------------------------------------------
; void h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
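; Intra blocks are handled in horizontal pairs: a nonzero nnzc word sends a
; pair through the full 4x4 transform (the .ac labels), otherwise the two DC
; coefficients at [r2] and [r2+64] are tested and, if either is set, applied
; with idct_dc_add, which broadcasts both DCs via pshuflw/pshufhw and adds
; them to two horizontally adjacent 4x4 blocks in one pass.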
%macro AC 1
.ac%1:
    mov       r5d, [r1+(%1+0)*4]
    call add4x4_idct %+ SUFFIX
    mov       r5d, [r1+(%1+1)*4]
    add       r2, 64
    call add4x4_idct %+ SUFFIX
    add       r2, 64
    jmp .skipadd%1
%endmacro

%assign last_block 16
%macro ADD16_OP_INTRA 2
    cmp       word [r4+%2], 0
    jnz .ac%1
    mov       r5d, [r2+ 0]
    or        r5d, [r2+64]
    jz .skipblock%1
    mov       r5d, [r1+(%1+0)*4]
    call idct_dc_add %+ SUFFIX
.skipblock%1:
%if %1<last_block-2
    add       r2, 128
%endif
.skipadd%1:
%endmacro

%macro IDCT_ADD16INTRA_10 0
idct_dc_add %+ SUFFIX:
    add       r5, r0
    movq      m0, [r2+ 0]
    movhps    m0, [r2+64]
    mov       dword [r2+ 0], 0
    mov       dword [r2+64], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    pshufhw   m0, m0, 0
    pshuflw   m0, m0, 0
    lea       r6, [r3*3]
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r5, r3, r6
    ret

cglobal h264_idct_add16intra_10,5,7,8
    ADD16_OP_INTRA 0, 4+1*8
    ADD16_OP_INTRA 2, 4+2*8
    ADD16_OP_INTRA 4, 6+1*8
    ADD16_OP_INTRA 6, 6+2*8
    ADD16_OP_INTRA 8, 4+3*8
    ADD16_OP_INTRA 10, 4+4*8
    ADD16_OP_INTRA 12, 6+3*8
    ADD16_OP_INTRA 14, 6+4*8
    REP_RET
    AC 8
    AC 10
    AC 12
    AC 14
    AC 0
    AC 2
    AC 4
    AC 6
%endmacro

INIT_XMM sse2
IDCT_ADD16INTRA_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
%endif

%assign last_block 36
;-----------------------------------------------------------------------------
; void h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
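; dst is an array of plane pointers here. r2 first skips the 16 luma 4x4
; blocks (16*64 = 1024 bytes) to reach Cb, and the second adjustment below
; re-aligns it to block 32 (Cr) after the two Cb pairs advanced it.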
%macro IDCT_ADD8 0
cglobal h264_idct_add8_10,5,8,7
%if ARCH_X86_64
    mov       r7, r0
%endif
    add       r2, 1024
    mov       r0, [r0]
    ADD16_OP_INTRA 16, 4+ 6*8
    ADD16_OP_INTRA 18, 4+ 7*8
    add       r2, 1024-128*2
%if ARCH_X86_64
    mov       r0, [r7+gprsize]
%else
    mov       r0, r0m
    mov       r0, [r0+gprsize]
%endif
    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
    REP_RET
    AC 16
    AC 18
    AC 32
    AC 34

%endmacro ; IDCT_ADD8

INIT_XMM sse2
IDCT_ADD8
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
%endif

;-----------------------------------------------------------------------------
; void h264_idct8_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
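; Rough C reference for one 1-D pass of the 8x8 transform (a sketch for
; orientation; src0..src7 name one row/column of coefficients):
;   a0 =  src0 + src4;        a2 =  src0 - src4;
;   a4 = (src2>>1) - src6;    a6 = (src6>>1) + src2;
;   b0 = a0 + a6;  b2 = a2 + a4;  b4 = a2 - a4;  b6 = a0 - a6;
;   a1 = -src3 + src5 - src7 - (src7>>1);
;   a3 =  src1 + src7 - src3 - (src3>>1);
;   a5 = -src1 + src7 + src5 + (src5>>1);
;   a7 =  src3 + src5 + src1 + (src1>>1);
;   b1 = (a7>>2) + a1;  b3 = a3 + (a5>>2);
;   b5 = (a3>>2) - a5;  b7 = a7 - (a1>>2);
;   out = { b0+b7, b2+b5, b4+b3, b6+b1, b6-b1, b4-b3, b2-b5, b0-b7 }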
%macro IDCT8_1D 2
    SWAP      0, 1
    psrad     m4, m5, 1
    psrad     m1, m0, 1
    paddd     m4, m5
    paddd     m1, m0
    paddd     m4, m7
    paddd     m1, m5
    psubd     m4, m0
    paddd     m1, m3

    psubd     m0, m3
    psubd     m5, m3
    paddd     m0, m7
    psubd     m5, m7
    psrad     m3, 1
    psrad     m7, 1
    psubd     m0, m3
    psubd     m5, m7

    SWAP      1, 7
    psrad     m1, m7, 2
    psrad     m3, m4, 2
    paddd     m3, m0
    psrad     m0, 2
    paddd     m1, m5
    psrad     m5, 2
    psubd     m0, m4
    psubd     m7, m5

    SWAP      5, 6
    psrad     m4, m2, 1
    psrad     m6, m5, 1
    psubd     m4, m5
    paddd     m6, m2

    mova      m2, %1
    mova      m5, %2
    SUMSUB_BA d, 5, 2
    SUMSUB_BA d, 6, 5
    SUMSUB_BA d, 4, 2
    SUMSUB_BA d, 7, 6
    SUMSUB_BA d, 0, 4
    SUMSUB_BA d, 3, 2
    SUMSUB_BA d, 1, 5
    SWAP      7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
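; The even-part inputs (rows 0 and 4) are passed as memory operands %1/%2
; and only loaded near the end to ease register pressure; the final SWAP
; renumbers the registers so m0-m7 hold output rows 0-7 in order.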

%macro IDCT8_1D_FULL 1
    mova      m7, [%1+112*2]
    mova      m6, [%1+ 96*2]
    mova      m5, [%1+ 80*2]
    mova      m3, [%1+ 48*2]
    mova      m2, [%1+ 32*2]
    mova      m1, [%1+ 16*2]
    IDCT8_1D  [%1], [%1+ 64*2]
%endmacro

; %1=int32_t *block, %2=int32_t *dstblock
%macro IDCT8_ADD_SSE_START 2
    IDCT8_1D_FULL %1
%if ARCH_X86_64
    TRANSPOSE4x4D 0,1,2,3,8
    mova      [%2    ], m0
    TRANSPOSE4x4D 4,5,6,7,8
    mova      [%2+8*2], m4
%else
    mova      [%1], m7
    TRANSPOSE4x4D 0,1,2,3,7
    mova      m7, [%1]
    mova      [%2     ], m0
    mova      [%2+16*2], m1
    mova      [%2+32*2], m2
    mova      [%2+48*2], m3
    TRANSPOSE4x4D 4,5,6,7,3
    mova      [%2+ 8*2], m4
    mova      [%2+24*2], m5
    mova      [%2+40*2], m6
    mova      [%2+56*2], m7
%endif
%endmacro
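; Each xmm register holds only four 32-bit coefficients, so the 8x8
; transform is processed in two 4-column halves staged through the %2
; scratch buffer; on x86-64 most of the transposed rows can instead stay
; in the extra registers m8-m15 (see the SWAPs in IDCT8_ADD below).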

; %1=uint16_t *dst, %2=int32_t *block, %3=int stride
%macro IDCT8_ADD_SSE_END 3
    IDCT8_1D_FULL %2
    mova      [%2     ], m6
    mova      [%2+16*2], m7

    pxor      m7, m7
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
    mova      m0, [%2     ]
    mova      m1, [%2+16*2]
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro

%macro IDCT8_ADD 0
cglobal h264_idct8_add_10, 3,4,16
%if UNIX64 == 0
%assign pad 16-gprsize-(stack_offset&15)
    sub       rsp, pad
    call h264_idct8_add1_10 %+ SUFFIX
    add       rsp, pad
    RET
%endif

ALIGN 16
; TODO: does not need to use the stack
h264_idct8_add1_10 %+ SUFFIX:
%assign pad 256+16-gprsize
    sub       rsp, pad
    add       dword [r1], 32

%if ARCH_X86_64
    IDCT8_ADD_SSE_START r1, rsp
    SWAP      1, 9
    SWAP      2, 10
    SWAP      3, 11
    SWAP      5, 13
    SWAP      6, 14
    SWAP      7, 15
    IDCT8_ADD_SSE_START r1+16, rsp+128
    PERMUTE   1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
    IDCT8_1D  [rsp], [rsp+128]
    SWAP      0, 8
    SWAP      1, 9
    SWAP      2, 10
    SWAP      3, 11
    SWAP      4, 12
    SWAP      5, 13
    SWAP      6, 14
    SWAP      7, 15
    IDCT8_1D  [rsp+16], [rsp+144]
    psrad     m8, 6
    psrad     m0, 6
    packssdw  m8, m0
    paddsw    m8, [r0]
    pxor      m0, m0
    mova      [r1+ 0], m0
    mova      [r1+ 16], m0
    mova      [r1+ 32], m0
    mova      [r1+ 48], m0
    mova      [r1+ 64], m0
    mova      [r1+ 80], m0
    mova      [r1+ 96], m0
    mova      [r1+112], m0
    mova      [r1+128], m0
    mova      [r1+144], m0
    mova      [r1+160], m0
    mova      [r1+176], m0
    mova      [r1+192], m0
    mova      [r1+208], m0
    mova      [r1+224], m0
    mova      [r1+240], m0
    CLIPW     m8, m0, [pw_pixel_max]
    mova      [r0], m8
    mova      m8, [pw_pixel_max]
    STORE_DIFF16 m9, m1, m0, m8, r0+r2
    lea       r0, [r0+r2*2]
    STORE_DIFF16 m10, m2, m0, m8, r0
    STORE_DIFF16 m11, m3, m0, m8, r0+r2
    lea       r0, [r0+r2*2]
    STORE_DIFF16 m12, m4, m0, m8, r0
    STORE_DIFF16 m13, m5, m0, m8, r0+r2
    lea       r0, [r0+r2*2]
    STORE_DIFF16 m14, m6, m0, m8, r0
    STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
    IDCT8_ADD_SSE_START r1,    rsp
    IDCT8_ADD_SSE_START r1+16, rsp+128
    lea       r3, [r0+8]
    IDCT8_ADD_SSE_END r0, rsp,    r2
    IDCT8_ADD_SSE_END r3, rsp+16, r2
    mova      [r1+ 0], m7
    mova      [r1+ 16], m7
    mova      [r1+ 32], m7
    mova      [r1+ 48], m7
    mova      [r1+ 64], m7
    mova      [r1+ 80], m7
    mova      [r1+ 96], m7
    mova      [r1+112], m7
    mova      [r1+128], m7
    mova      [r1+144], m7
    mova      [r1+160], m7
    mova      [r1+176], m7
    mova      [r1+192], m7
    mova      [r1+208], m7
    mova      [r1+224], m7
    mova      [r1+240], m7
%endif ; ARCH_X86_64

    add       rsp, pad
    ret
%endmacro

INIT_XMM sse2
IDCT8_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
%endif

;-----------------------------------------------------------------------------
; void h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
; NOTE: no FATE samples trigger this code path
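; Same skip-by-nnzc pattern as ADD16_OP, but per 8x8 block: dst is resolved
; from block_offset[%1] plus the plane base in r5, and r1 advances 256 bytes
; (one 8x8 block of dword coefficients) per iteration.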
%macro IDCT8_ADD4_OP 2
    cmp       byte [r4+%2], 0
    jz .skipblock%1
    mov       r0d, [r6+%1*4]
    add       r0, r5
    call h264_idct8_add1_10 %+ SUFFIX
.skipblock%1:
%if %1<12
    add       r1, 256
%endif
%endmacro

%macro IDCT8_ADD4 0
cglobal h264_idct8_add4_10, 0,7,16
%assign pad 16-gprsize-(stack_offset&15)
    SUB       rsp, pad
    mov       r5, r0mp
    mov       r6, r1mp
    mov       r1, r2mp
    mov       r2d, r3m
    movifnidn r4, r4mp
    IDCT8_ADD4_OP  0, 4+1*8
    IDCT8_ADD4_OP  4, 6+1*8
    IDCT8_ADD4_OP  8, 4+3*8
    IDCT8_ADD4_OP 12, 6+3*8
    ADD       rsp, pad
    RET
%endmacro ; IDCT8_ADD4

INIT_XMM sse2
IDCT8_ADD4
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
%endif