;*****************************************************************************
;* MMX optimized DSP utils
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

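; DIFF_PIXELS_1 %1=dst, %2=tmp, %3=[pix1 address], %4=[pix2 address]
; computes pix1 - pix2 for 8 pixels as 8 signed words in %1, clobbering %2.
; The interleave trick (punpcklbw with itself vs. with the other row) avoids
; needing a zero register: (p1<<8|p1) - (p1<<8|p2) == p1 - p2 per 16-bit lane.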
%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1
    punpcklbw       %1, %1
    psubw           %1, %2
%endmacro

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova            [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro

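; in-place 8-point Hadamard transform of the eight word vectors m0-m7,
; implemented as three butterfly stages of SUMSUB_BADC; the resulting
; coefficient order is irrelevant here since only the sum of absolute
; values is used afterwards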
%macro HADAMARD8 0
    SUMSUB_BADC     w, 0, 1, 2, 3
    SUMSUB_BADC     w, 4, 5, 6, 7
    SUMSUB_BADC     w, 0, 2, 1, 3
    SUMSUB_BADC     w, 4, 6, 5, 7
    SUMSUB_BADC     w, 0, 4, 1, 5
    SUMSUB_BADC     w, 2, 6, 3, 7
%endmacro

%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

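; ABS_SUM_8x8_64/_32 %1=temporary storage: accumulate the absolute values of
; the eight coefficient vectors m0-m7 into m0, with unsigned saturation.
; The _64 variant uses m8/m9 as scratch (x86-64, >= 10 xmm registers);
; the _32 variant spills m7 to [%1] so that m7 can serve as scratch instead.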
%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova            [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1
%endmacro

; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
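; HSUM %1=src (clobbered), %2=temp, %3=GPR destination: horizontal sum of the
; unsigned words in %1 (4 for MMX, 8 for SSE2) into %3, using saturating adds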
%macro HSUM 3
%if cpuflag(sse2)
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmxext)
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmx)
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endif
%endmacro

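; store/load four consecutive mm/xmm registers to/from a contiguous buffer at %1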
%macro STORE4 5
    mova            [%1+mmsize*0], %2
    mova            [%1+mmsize*1], %3
    mova            [%1+mmsize*2], %4
    mova            [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova            %2, [%1+mmsize*0]
    mova            %3, [%1+mmsize*1]
    mova            %4, [%1+mmsize*2]
    mova            %5, [%1+mmsize*3]
%endmacro

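; hadamard8_16_wrapper %1=number of xmm registers used, %2=stack space needed
; (in units of mmsize): emits the hadamard8_diff (single 8x8) and
; hadamard8_diff16 (8x16 / 16x16) entry points, both of which call
; hadamard8x8_diff_<cpu> and allocate aligned stack space when m8 is unavailable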
%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB             rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD             rsp, pad
%endif
    RET

cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB             rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov             r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

    cmp             r4d, 16
    jne .done

    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

.done:
    mov             eax, r5d
%ifndef m8
    ADD             rsp, pad
%endif
    RET
%endmacro

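; HADAMARD8_DIFF [%1=xmm register count]: defines hadamard8x8_diff_<cpu>, the
; SATD of the difference of two 8x8 blocks (sum of absolute values of the 2-D
; Hadamard transform of pix1 - pix2), then instantiates the public wrappers.
; The SSE2/SSSE3 path transforms the whole 8x8 block of words at once; the MMX
; path below works on two 4x8 halves and stages intermediates on the stack.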
%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea             r0, [r3*3]
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W   0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W   0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8     rsp+gprsize
    HSUM            m0, m1, eax
    and             eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
;                          int stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this 2x2 (and that's why we access rsp+gprsize
; everywhere, which is the rsp of the calling function)
hadamard8x8_diff %+ SUFFIX:
    lea             r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova            [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W   0, 1, 2, 3, 7
    STORE4          rsp+gprsize, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W   4, 5, 6, 7, 0
    STORE4          rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8   r1, r2, 4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova            [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W   0, 1, 2, 3, 7
    STORE4          rsp+gprsize+0x20, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W   4, 5, 6, 7, 0

    LOAD4           rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32  rsp+gprsize+0x60
    mova            [rsp+gprsize+0x60], m0

    LOAD4           rsp+gprsize     , m0, m1, m2, m3
    LOAD4           rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32  rsp+gprsize
    paddusw         m0, [rsp+gprsize+0x60]

    HSUM            m0, m1, eax
    and             rax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro

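; instantiate hadamard8_diff / hadamard8_diff16 for each instruction set;
; ABS_SUM_8x8 selects which absolute-sum helper the SSE2/SSSE3 path uses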
INIT_MMX mmx
HADAMARD8_DIFF

INIT_MMX mmxext
HADAMARD8_DIFF

INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9

INIT_XMM sse2
; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
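; returns the sum of squared differences between pix1 and pix2 over a 16-pixel
; wide block of height h (h is assumed even: two rows are processed per loop)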
cglobal sse16, 5, 5, 8
    shr             r4d, 1
    pxor            m0, m0          ; mm0 = 0
    pxor            m7, m7          ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu            m1, [r1   ]     ; mm1 = pix1[0][0-15]
    movu            m2, [r2   ]     ; mm2 = pix2[0][0-15]
    movu            m3, [r1+r3]     ; mm3 = pix1[1][0-15]
    movu            m4, [r2+r3]     ; mm4 = pix2[1][0-15]

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the result to get the absolute difference
    mova            m5, m1
    mova            m6, m3
    psubusb         m1, m2
    psubusb         m3, m4
    psubusb         m2, m5
    psubusb         m4, m6

    por             m2, m1
    por             m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova            m1, m2
    mova            m3, m4

    punpckhbw       m2, m0
    punpckhbw       m4, m0
    punpcklbw       m1, m0          ; mm1 now spread over (mm1,mm2)
    punpcklbw       m3, m0          ; mm4 now spread over (mm3,mm4)

    pmaddwd         m2, m2
    pmaddwd         m4, m4
    pmaddwd         m1, m1
    pmaddwd         m3, m3

    lea             r1, [r1+r3*2]   ; pix1 += 2*line_size
    lea             r2, [r2+r3*2]   ; pix2 += 2*line_size

    paddd           m1, m2
    paddd           m3, m4
    paddd           m7, m1
    paddd           m7, m3

    dec             r4
    jnz .next2lines

    mova            m1, m7
    psrldq          m7, 8           ; shift hi qword to lo
    paddd           m7, m1
    mova            m1, m7
    psrldq          m7, 4           ; shift hi dword to lo
    paddd           m7, m1
    movd            eax, m7         ; return value
    RET

INIT_MMX mmx
; get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
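; reads an 8x8 block of bytes from pixels[] and stores it zero-extended as
; 64 int16_t values at block[] (two input rows per loop iteration)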
cglobal get_pixels, 3,4
    movsxdifnidn    r2, r2d
    add             r0, 128
    mov             r3, -128
    pxor            m7, m7
.loop:
    mova            m0, [r1]
    mova            m2, [r1+r2]
    mova            m1, m0
    mova            m3, m2
    punpcklbw       m0, m7
    punpckhbw       m1, m7
    punpcklbw       m2, m7
    punpckhbw       m3, m7
    mova            [r0+r3+ 0], m0
    mova            [r0+r3+ 8], m1
    mova            [r0+r3+16], m2
    mova            [r0+r3+24], m3
    lea             r1, [r1+r2*2]
    add             r3, 32
    js .loop
    REP_RET

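; SSE2 version of get_pixels: same operation, fully unrolled; each 8-pixel row
; is widened to words with punpcklbw against a zero register and stored as one
; 16-byte output row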
INIT_XMM sse2
cglobal get_pixels, 3, 4
    movsxdifnidn    r2, r2d
    lea             r3, [r2*3]
    pxor            m4, m4
    movh            m0, [r1]
    movh            m1, [r1+r2]
    movh            m2, [r1+r2*2]
    movh            m3, [r1+r3]
    lea             r1, [r1+r2*4]
    punpcklbw       m0, m4
    punpcklbw       m1, m4
    punpcklbw       m2, m4
    punpcklbw       m3, m4
    mova            [r0], m0
    mova            [r0+0x10], m1
    mova            [r0+0x20], m2
    mova            [r0+0x30], m3
    movh            m0, [r1]
    movh            m1, [r1+r2*1]
    movh            m2, [r1+r2*2]
    movh            m3, [r1+r3]
    punpcklbw       m0, m4
    punpcklbw       m1, m4
    punpcklbw       m2, m4
    punpcklbw       m3, m4
    mova            [r0+0x40], m0
    mova            [r0+0x50], m1
    mova            [r0+0x60], m2
    mova            [r0+0x70], m3
    RET

INIT_MMX mmx
; diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride)
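; stores s1 - s2 for an 8x8 block as 64 signed 16-bit differences at block[]
; (one row of 8 pixels per loop iteration)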
cglobal diff_pixels, 4,5
    movsxdifnidn    r3, r3d
    pxor            m7, m7
    add             r0, 128
    mov             r4, -128
.loop:
    mova            m0, [r1]
    mova            m2, [r2]
    mova            m1, m0
    mova            m3, m2
    punpcklbw       m0, m7
    punpckhbw       m1, m7
    punpcklbw       m2, m7
    punpckhbw       m3, m7
    psubw           m0, m2
    psubw           m1, m3
    mova            [r0+r4+0], m0
    mova            [r0+r4+8], m1
    add             r1, r3
    add             r2, r3
    add             r4, 16
    jne .loop
    REP_RET

INIT_MMX mmx
; pix_sum16_mmx(uint8_t * pix, int line_size)
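; returns the sum of all 256 pixel values of a 16x16 block; only the low
; 16 bits of the accumulator are kept, which is enough since 256*255 < 65536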
cglobal pix_sum16, 2, 3
    movsxdifnidn    r1, r1d
    mov             r2, r1
    neg             r2
    shl             r2, 4
    sub             r0, r2
    pxor            m7, m7
    pxor            m6, m6
.loop:
    mova            m0, [r0+r2+0]
    mova            m1, [r0+r2+0]
    mova            m2, [r0+r2+8]
    mova            m3, [r0+r2+8]
    punpcklbw       m0, m7
    punpckhbw       m1, m7
    punpcklbw       m2, m7
    punpckhbw       m3, m7
    paddw           m1, m0
    paddw           m3, m2
    paddw           m3, m1
    paddw           m6, m3
    add             r2, r1
    js .loop
    mova            m5, m6
    psrlq           m6, 32
    paddw           m6, m5
    mova            m5, m6
    psrlq           m6, 16
    paddw           m6, m5
    movd            eax, m6
    and             eax, 0xffff
    RET

INIT_MMX mmx
; pix_norm1_mmx(uint8_t *pix, int line_size)
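; returns the sum of the squared pixel values of a 16x16 block
; (at most 16*16*255*255, which still fits in 32 bits)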
cglobal pix_norm1, 2, 4
    movsxdifnidn    r1, r1d
    mov             r2, 16
    pxor            m0, m0
    pxor            m7, m7
.loop:
    mova            m2, [r0+0]
    mova            m3, [r0+8]
    mova            m1, m2
    punpckhbw       m1, m0
    punpcklbw       m2, m0
    mova            m4, m3
    punpckhbw       m3, m0
    punpcklbw       m4, m0
    pmaddwd         m1, m1
    pmaddwd         m2, m2
    pmaddwd         m3, m3
    pmaddwd         m4, m4
    paddd           m2, m1
    paddd           m4, m3
    paddd           m7, m2
    add             r0, r1
    paddd           m7, m4
    dec             r2
    jne .loop
    mova            m1, m7
    psrlq           m7, 32
    paddd           m1, m7
    movd            eax, m1
    RET