;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001@163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
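; scan8 maps a block index (the 16 luma 4x4 blocks followed by the chroma
; blocks) to its position in the decoder's 8-entries-per-row
; non_zero_count cache, so nnzc[scan8[i]] tells whether block i has any
; coefficients at all and can be skipped otherwise.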
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    movq    [%2+ 0], m7
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
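; IDCT4_ADD is the usual row pass / transpose / column pass sequence;
; the +32 (pw_32) added between the passes makes the final >>6 in
; STORE_DIFFx2 round to nearest, and the coefficient block is cleared
; afterwards as the decoder expects. Each IDCT4_1D is the 4-point H.264
; integer butterfly, roughly this C per row/column (illustrative sketch,
; not taken from this file):
;
;     z0 = b0 + b2;          z1 = b0 - b2;
;     z2 = (b1 >> 1) - b3;   z3 = b1 + (b3 >> 1);
;     b0 = z0 + z3;  b1 = z1 + z2;  b2 = z1 - z2;  b3 = z0 - z3;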

INIT_MMX mmx
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET

%macro IDCT8_1D 2
    mova         m0, m1
    psraw        m1, 1
    mova         m4, m5
    psraw        m4, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    mova         m7, m1
    psraw        m1, 2
    mova         m3, m4
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    mova         m5, m6
    psraw        m6, 1
    mova         m4, m2
    psraw        m4, 1
    paddw        m6, m2
    psubw        m4, m5

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA     w, 5, 2
    SUMSUB_BA     w, 6, 5
    SUMSUB_BA     w, 4, 2
    SUMSUB_BA     w, 7, 6
    SUMSUB_BA     w, 0, 4
    SUMSUB_BA     w, 3, 2
    SUMSUB_BA     w, 1, 5
    SWAP          7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
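; The 8-point 1D transform keeps rows 1-3 and 5-7 in m1-m3/m5-m7 and
; takes rows 0 and 4 as memory operands (%1/%2): with only eight MMX
; registers there is no room to hold all eight rows at once. The odd
; half is computed first, the even half is then folded in with the
; SUMSUB_BA butterflies, and the final SWAP puts the outputs back into
; natural 0-7 order.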

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro
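; An 8x8 transpose does not fit in the MMX register file, so the first
; 1D pass is run on the left and right 4-column halves separately (one
; row is spilled through [%1] to free a temporary for TRANSPOSE4x4W);
; each transposed half is written to a scratch buffer, from which
; IDCT8_ADD_MMX_END runs the second pass.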

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
%if %0 == 4
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add word  [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2, r1
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET
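; Two tricks worth noting here: the pad computed from stack_offset
; keeps the 128-byte scratch buffer 8-byte aligned, and the
; "add word [r1], 32" folds the usual +32 rounding into the DC
; coefficient up front - since the transform is linear and the DC
; contributes equally to every sample, this is equivalent to adding 32
; to each output before the final >>6.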

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%if ARCH_X86_64 == 0
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    mova    [%2+  0], m7
    mova    [%2+ 16], m7
    mova    [%2+ 32], m7
    mova    [%2+ 48], m7
    mova    [%2+ 64], m7
    mova    [%2+ 80], m7
    mova    [%2+ 96], m7
    mova    [%2+112], m7
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET
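; With 128-bit registers the whole 8x8 block stays in flight: one 1D
; pass, a full 8x8 transpose, then the second pass. On x86-64 the two
; extra registers m8/m9 stand in for rows 0 and 4, while on x86-32
; those rows are bounced through the coefficient buffer instead.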

%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro
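; DC-only blocks need no transform: every output pixel just gets
; (dc + 32) >> 6 added to it. INIT splats that value into m0 and its
; negation into m1 as unsigned-saturated bytes (one of the two is all
; zeros depending on the sign), so OP clamps correctly in both
; directions with paddusb/psubusb - roughly, per pixel:
;
;     dc = (block[0] + 32) >> 6;
;     dst[i] = av_clip_uint8(dst[i] + dc);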

INIT_MMX mmxext
; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP movh, r0, r2, r3
    RET

; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    RET
%else
cglobal h264_idct_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2
    RET

; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif

INIT_MMX mmx
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
;                        int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
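; The add16 entry points walk all 16 luma 4x4 blocks of a macroblock:
; nnzc[scan8[i]] is checked first so blocks without coefficients cost
; only the table lookup, and block_offset[i] gives each block's pixel
; offset within the destination plane.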

; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
;                        int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add word   [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp, r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

INIT_MMX mmxext
; ff_h264_idct_add16_mmxext(uint8_t *dst, const int *block_offset,
;                           int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov  word  [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
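; The mmxext version adds a fast path: nnzc == 1 means the block has a
; single coefficient, and if that coefficient is the DC it can use the
; saturating DC-add above instead of a full 4x4 transform. On x86-32
; the dst2 temporary aliases r1 (block_offset), which is reloaded from
; the stack afterwards.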

INIT_MMX mmx
; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
;                             int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmxext
; ff_h264_idct_add16intra_mmxext(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov  word  [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmxext(uint8_t *dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov  word  [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add word   [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp, r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET

INIT_XMM sse2
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
;                         int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov  word  [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
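; Note the INIT_MMX cpuname / INIT_XMM cpuname switches inside the
; function: the DC path reuses the 64-bit DC_ADD macros above, so the
; m# aliases are temporarily re-pointed at MMX registers, then switched
; back to XMM for the full 8x8 path.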

INIT_MMX mmx
h264_idct_add8_mmx_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
;                       int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif
    call         h264_idct_add8_mmx_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx_plane
    RET
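; For the chroma case, dest is an array of plane pointers rather than a
; single plane, which is why the second call advances it by gprsize (to
; the next pointer, i.e. the second chroma plane). r5 starts at 16 and
; 32 to index the chroma rows of scan8, and the block pointer is first
; advanced past the 16 luma blocks (512 bytes); each plane helper then
; loops over 4 blocks (the "test r5, 3" exit condition).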

h264_idct_add8_mmxext_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov  word  [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; ff_h264_idct_add8_mmxext(uint8_t **dest, const int *block_offset,
;                          int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%if ARCH_X86_64
    mov       dst2q, r0
%endif
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
    call         h264_idct_add8_mmxext_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmxext_plane
    RET

; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6 = clobbered
h264_idct_dc_add8_mmxext:
    movd         m0, [r2   ]          ; 0 0 X D
    mov word [r2+ 0], 0
    punpcklwd    m0, [r2+32]          ; x X d D
    mov word [r2+32], 0
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d  d  D  D
    pxor         m1, m1               ;  0  0  0  0
    psubw        m1, m0               ; -d -d -D -D
    packuswb     m0, m1               ; -d -d -D -D  d  d  D  D
    pshufw       m1, m0, 0xFA         ; -d -d -d -d -D -D -D -D
    punpcklwd    m0, m0               ;  d  d  d  d  D  D  D  D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret
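; This helper applies the DCs of two horizontally adjacent 4x4 blocks
; (at [r2] and [r2+32]) in one pass over an 8-pixel-wide area: the
; shuffles build the positive values in m0 and the negated ones in m1,
; exactly the layout DC_ADD_MMXEXT_OP expects. It is paired with the
; 8x4 SSE2 block helper below.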

ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
h264_add8x4_idct_sse2:
    movq         m0, [r2+ 0]
    movq         m1, [r2+ 8]
    movq         m2, [r2+16]
    movq         m3, [r2+24]
    movhps       m0, [r2+32]
    movhps       m1, [r2+40]
    movhps       m2, [r2+48]
    movhps       m3, [r2+56]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    paddw        m0, [pw_32]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    mova    [r2+ 0], m7
    mova    [r2+16], m7
    mova    [r2+32], m7
    mova    [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea          r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret
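; Two neighbouring 4x4 blocks are transformed at once here: movq fills
; the low halves of the xmm registers with the rows of the first block
; and movhps puts the second block's rows in the high halves, so each
; IDCT4_1D step works on both blocks in parallel.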

%macro add16_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r5
%else
    add          r0, r0m
%endif
    call         h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
;                         int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov          r5, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET
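; Each unrolled cycle covers one pair of adjacent 4x4 blocks. The
; second argument is the scan8 position of the pair's first block
; (0xc = scan8[0], 0x14 = scan8[2], ...), and the word-sized load from
; nnzc tests the non-zero counts of both blocks with a single compare.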

%macro add16intra_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call         h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call         h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
;                              int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov          r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

%macro add8_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call         h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call         h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add          r2, 384+64
%elif %1 < 3
    add          r2, 64
%endif
%endmacro

; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
;                        int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    add          r2, 512
%if ARCH_X86_64
    mov          r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
    RET

;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)

%macro WALSH4_1D 5
    SUMSUB_BADC   w, %4, %3, %2, %1, %5
    SUMSUB_BADC   w, %4, %2, %3, %1, %5
    SWAP         %1, %4, %3
%endmacro
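; The luma DC block goes through a 4x4 Hadamard (Walsh) transform
; rather than the regular IDCT: WALSH4_1D is two levels of butterflies
; with no shifts, applied once per dimension around a transpose.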

%macro DEQUANT_MMX 3
    mova         m7, [pw_1]
    mova         m4, %1
    punpcklwd    %1, m7
    punpckhwd    m4, m7
    mova         m5, %2
    punpcklwd    %2, m7
    punpckhwd    m5, m7
    movd         m7, t3d
    punpckldq    m7, m7
    pmaddwd      %1, m7
    pmaddwd      %2, m7
    pmaddwd      m4, m7
    pmaddwd      m5, m7
    psrad        %1, %3
    psrad        %2, %3
    psrad        m4, %3
    psrad        m5, %3
    packssdw     %1, m4
    packssdw     %2, m5
%endmacro
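; The dequant step computes (dc * qmul + 128) >> 8 for each DC.
; Interleaving the coefficients with pw_1 lets a single pmaddwd do the
; multiply and the rounding add at once, because t3d holds qmul in its
; low word and 128 in its high word (the 128 << 16 added by the callers
; below).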

%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    psrldq       %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%else
    movd        t0d, %1
    psrlq        %1, 32
    movd        t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endif
%endmacro

%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    movd       xmm4, t3d
    movq       xmm5, [pw_1]
    pshufd     xmm4, xmm4, 0
    movq2dq    xmm0, m0
    movq2dq    xmm1, m1
    movq2dq    xmm2, m2
    movq2dq    xmm3, m3
    punpcklwd  xmm0, xmm5
    punpcklwd  xmm1, xmm5
    punpcklwd  xmm2, xmm5
    punpcklwd  xmm3, xmm5
    pmaddwd    xmm0, xmm4
    pmaddwd    xmm1, xmm4
    pmaddwd    xmm2, xmm4
    pmaddwd    xmm3, xmm4
    psrad      xmm0, %1
    psrad      xmm1, %1
    psrad      xmm2, %1
    psrad      xmm3, %1
    packssdw   xmm0, xmm1
    packssdw   xmm2, xmm3
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT_MMX  m0, m1, %1
    STORE_WORDS  m0,  0,  1,  4,  5
    STORE_WORDS  m1,  2,  3,  6,  7

    DEQUANT_MMX  m2, m3, %1
    STORE_WORDS  m2,  8,  9, 12, 13
    STORE_WORDS  m3, 10, 11, 14, 15
%endif
%endmacro

%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq         m3, [r1+24]
    movq         m2, [r1+16]
    movq         m1, [r1+ 8]
    movq         m0, [r1+ 0]
    WALSH4_1D     0, 1, 2, 3, 4
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    WALSH4_1D     0, 1, 2, 3, 4

    ; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0, 3, 1, 2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg         r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3, 1, 0, 2
%else
    DECLARE_REG_TMP 1, 3, 0, 2
%endif

    cmp         t3d, 32767
    jg .big_qmul
    add         t3d, 128 << 16
    DEQUANT_STORE 8
    RET
.big_qmul:
    bsr         t0d, t3d
    add         t3d, 128 << 16
    mov         t1d, 7
    cmp         t0d, t1d
    cmovg       t0d, t1d
    inc         t1d
    shr         t3d, t0b
    sub         t1d, t0d
%if cpuflag(sse2)
    movd       xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd         m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro
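; STORE_WORDS scatters each dequantized DC to t2+n*32 because the
; output holds 16 coefficient blocks of 32 bytes each, with every DC at
; the start of its block. The .big_qmul path exists because a qmul
; above 32767 would not fit the signed 16-bit lanes pmaddwd multiplies:
; qmul is pre-shifted right (the bsr/cmovg clamp keeps that shift at
; most 7 bits) and the final shift is reduced by the same amount to
; compensate.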

INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7