annotate ffmpeg/libavcodec/x86/rv40dsp.asm @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 ;******************************************************************************
yading@10 2 ;* MMX/SSE2-optimized functions for the RV40 decoder
yading@10 3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
yading@10 4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
yading@10 5 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
yading@10 6 ;*
yading@10 7 ;* This file is part of Libav.
yading@10 8 ;*
yading@10 9 ;* Libav is free software; you can redistribute it and/or
yading@10 10 ;* modify it under the terms of the GNU Lesser General Public
yading@10 11 ;* License as published by the Free Software Foundation; either
yading@10 12 ;* version 2.1 of the License, or (at your option) any later version.
yading@10 13 ;*
yading@10 14 ;* Libav is distributed in the hope that it will be useful,
yading@10 15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 17 ;* Lesser General Public License for more details.
yading@10 18 ;*
yading@10 19 ;* You should have received a copy of the GNU Lesser General Public
yading@10 20 ;* License along with Libav; if not, write to the Free Software
yading@10 21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 22 ;******************************************************************************
yading@10 23
yading@10 24 %include "libavutil/x86/x86util.asm"
yading@10 25
yading@10 26 SECTION_RODATA
yading@10 27
yading@10 28 align 16
yading@10 29 pw_1024: times 8 dw 1 << (16 - 6) ; 8x 1024: pmulhrsw by this == round-and-shift-right by 5
yading@10 30
yading@10 31 sixtap_filter_hb_m: times 8 db 1, -5 ; byte coefficient pairs for pmaddubsw (SSSE3 path)
yading@10 32 times 8 db 52, 20
yading@10 33 ; multiplied by 2 to have the same shift
yading@10 34 times 8 db 2, -10
yading@10 35 times 8 db 40, 40
yading@10 36 ; back to normal
yading@10 37 times 8 db 1, -5
yading@10 38 times 8 db 20, 52
yading@10 39
yading@10 40 sixtap_filter_v_m: times 8 dw 1 ; word coefficients for pmullw (MMX/SSE2 path)
yading@10 41 times 8 dw -5
yading@10 42 times 8 dw 52
yading@10 43 times 8 dw 20
yading@10 44 ; multiplied by 2 to have the same shift
yading@10 45 times 8 dw 2
yading@10 46 times 8 dw -10
yading@10 47 times 8 dw 40
yading@10 48 times 8 dw 40
yading@10 49 ; back to normal
yading@10 50 times 8 dw 1
yading@10 51 times 8 dw -5
yading@10 52 times 8 dw 20
yading@10 53 times 8 dw 52
yading@10 54
yading@10 55 %ifdef PIC ; PIC build: tables are reached through a base pointer kept in picregq
yading@10 56 %define sixtap_filter_hw picregq
yading@10 57 %define sixtap_filter_hb picregq
yading@10 58 %define sixtap_filter_v picregq
yading@10 59 %define npicregs 1 ; reserve one extra register in cglobal for the PIC base
yading@10 60 %else
yading@10 61 %define sixtap_filter_hw sixtap_filter_hw_m ; NOTE(review): sixtap_filter_hw_m has no definition in this file and the hw alias appears unused -- confirm
yading@10 62 %define sixtap_filter_hb sixtap_filter_hb_m
yading@10 63 %define sixtap_filter_v sixtap_filter_v_m
yading@10 64 %define npicregs 0
yading@10 65 %endif
yading@10 66
yading@10 67 filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 ; pshufb masks pairing neighbouring taps for pmaddubsw
yading@10 68 filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
yading@10 69 filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
yading@10 70
yading@10 71 cextern pw_32 ; rounding bias used before the >>6 in the word-filter paths
yading@10 72 cextern pw_16 ; rounding bias used before the >>5 in the weight functions
yading@10 73 cextern pw_512 ; pmulhrsw by 512 == round-and-shift-right by 6
yading@10 74
yading@10 75 SECTION .text
yading@10 76
yading@10 77 ;-----------------------------------------------------------------------------
yading@10 78 ; subpel MC functions:
yading@10 79 ;
yading@10 80 ; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
yading@10 81 ; uint8_t *src, int srcstride,
yading@10 82 ; int len, int m);
yading@10 83 ;----------------------------------------------------------------------
yading@10 84 %macro LOAD 2 ; LOAD mx/my, table: turn the filter index in %1 into a pointer into table %2
yading@10 85 %if WIN64
yading@10 86 movsxd %1q, %1d ; Win64 passes this arg as 32-bit; extend before using it as an address
yading@10 87 %endif
yading@10 88 %ifdef PIC
yading@10 89 add %1q, picregq ; PIC: picregq already holds the table base address
yading@10 90 %else
yading@10 91 add %1q, %2 ; non-PIC: add the table's absolute address
yading@10 92 %endif
yading@10 93 %endmacro
yading@10 94
yading@10 95 %macro STORE 3 ; STORE result, scratch, put|avg: clip words to bytes and write one row to dst
yading@10 96 %ifidn %3, avg
yading@10 97 movh %2, [dstq] ; avg: fetch the existing destination pixels
yading@10 98 %endif
yading@10 99 packuswb %1, %1 ; saturate signed words to unsigned bytes
yading@10 100 %ifidn %3, avg
yading@10 101 %if cpuflag(3dnow)
yading@10 102 pavgusb %1, %2 ; 3dnow rounded byte average
yading@10 103 %else
yading@10 104 pavgb %1, %2 ; rounded byte average with the existing pixels
yading@10 105 %endif
yading@10 106 %endif
yading@10 107 movh [dstq], %1
yading@10 108 %endmacro
yading@10 109
yading@10 110 %macro FILTER_V 1 ; %1 = put|avg: vertical 6-tap qpel filter over a rolling 5-row window
yading@10 111 cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
yading@10 112 %ifdef PIC
yading@10 113 lea picregq, [sixtap_filter_v_m]
yading@10 114 %endif
yading@10 115 pxor m7, m7 ; zero register for byte->word unpacking
yading@10 116 LOAD my, sixtap_filter_v ; myq -> coefficient set selected by the subpel position
yading@10 117
yading@10 118 ; read 5 lines
yading@10 119 sub srcq, srcstrideq
yading@10 120 sub srcq, srcstrideq ; start two rows above the current one
yading@10 121 movh m0, [srcq]
yading@10 122 movh m1, [srcq+srcstrideq]
yading@10 123 movh m2, [srcq+srcstrideq*2]
yading@10 124 lea srcq, [srcq+srcstrideq*2]
yading@10 125 add srcq, srcstrideq
yading@10 126 movh m3, [srcq]
yading@10 127 movh m4, [srcq+srcstrideq]
yading@10 128 punpcklbw m0, m7 ; widen all five rows to words
yading@10 129 punpcklbw m1, m7
yading@10 130 punpcklbw m2, m7
yading@10 131 punpcklbw m3, m7
yading@10 132 punpcklbw m4, m7
yading@10 133
yading@10 134 %ifdef m8 ; 8+ registers available: cache the four coefficient vectors
yading@10 135 mova m8, [myq+ 0]
yading@10 136 mova m9, [myq+16]
yading@10 137 mova m10, [myq+32]
yading@10 138 mova m11, [myq+48]
yading@10 139 %define COEFF05 m8
yading@10 140 %define COEFF14 m9
yading@10 141 %define COEFF2 m10
yading@10 142 %define COEFF3 m11
yading@10 143 %else
yading@10 144 %define COEFF05 [myq+ 0]
yading@10 145 %define COEFF14 [myq+16]
yading@10 146 %define COEFF2 [myq+32]
yading@10 147 %define COEFF3 [myq+48]
yading@10 148 %endif
yading@10 149 .nextrow:
yading@10 150 mova m6, m1
yading@10 151 movh m5, [srcq+2*srcstrideq] ; read new row
yading@10 152 paddw m6, m4 ; rows -1 and +2 share COEFF14
yading@10 153 punpcklbw m5, m7
yading@10 154 pmullw m6, COEFF14
yading@10 155 paddw m0, m5 ; rows -2 and +3 share COEFF05
yading@10 156 pmullw m0, COEFF05
yading@10 157 paddw m6, m0
yading@10 158 mova m0, m1 ; begin sliding the 5-row window down one line
yading@10 159 paddw m6, [pw_32] ; rounding bias for the final >>6
yading@10 160 mova m1, m2
yading@10 161 pmullw m2, COEFF2
yading@10 162 paddw m6, m2
yading@10 163 mova m2, m3
yading@10 164 pmullw m3, COEFF3
yading@10 165 paddw m6, m3
yading@10 166
yading@10 167 ; round/clip/store
yading@10 168 mova m3, m4
yading@10 169 psraw m6, 6
yading@10 170 mova m4, m5 ; window slide complete: m0..m4 hold the next row's taps
yading@10 171 STORE m6, m5, %1
yading@10 172
yading@10 173 ; go to next line
yading@10 174 add dstq, dststrideq
yading@10 175 add srcq, srcstrideq
yading@10 176 dec heightd ; next row
yading@10 177 jg .nextrow
yading@10 178 REP_RET
yading@10 179 %endmacro
yading@10 180
yading@10 181 %macro FILTER_H 1 ; %1 = put|avg: horizontal 6-tap qpel filter, one row per iteration
yading@10 182 cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
yading@10 183 %ifdef PIC
yading@10 184 lea picregq, [sixtap_filter_v_m]
yading@10 185 %endif
yading@10 186 pxor m7, m7 ; zero register for byte->word unpacking
yading@10 187 LOAD mx, sixtap_filter_v ; mxq -> coefficient set selected by the subpel position
yading@10 188 mova m6, [pw_32] ; rounding bias for the final >>6
yading@10 189 %ifdef m8 ; 8+ registers available: cache the four coefficient vectors
yading@10 190 mova m8, [mxq+ 0]
yading@10 191 mova m9, [mxq+16]
yading@10 192 mova m10, [mxq+32]
yading@10 193 mova m11, [mxq+48]
yading@10 194 %define COEFF05 m8
yading@10 195 %define COEFF14 m9
yading@10 196 %define COEFF2 m10
yading@10 197 %define COEFF3 m11
yading@10 198 %else
yading@10 199 %define COEFF05 [mxq+ 0]
yading@10 200 %define COEFF14 [mxq+16]
yading@10 201 %define COEFF2 [mxq+32]
yading@10 202 %define COEFF3 [mxq+48]
yading@10 203 %endif
yading@10 204 .nextrow:
yading@10 205 movq m0, [srcq-2] ; taps at x-2 and x+3 share COEFF05
yading@10 206 movq m5, [srcq+3]
yading@10 207 movq m1, [srcq-1] ; taps at x-1 and x+2 share COEFF14
yading@10 208 movq m4, [srcq+2]
yading@10 209 punpcklbw m0, m7 ; widen to words
yading@10 210 punpcklbw m5, m7
yading@10 211 punpcklbw m1, m7
yading@10 212 punpcklbw m4, m7
yading@10 213 movq m2, [srcq-0] ; centre taps x and x+1
yading@10 214 movq m3, [srcq+1]
yading@10 215 paddw m0, m5
yading@10 216 paddw m1, m4
yading@10 217 punpcklbw m2, m7
yading@10 218 punpcklbw m3, m7
yading@10 219 pmullw m0, COEFF05
yading@10 220 pmullw m1, COEFF14
yading@10 221 pmullw m2, COEFF2
yading@10 222 pmullw m3, COEFF3
yading@10 223 paddw m0, m6 ; add rounding bias
yading@10 224 paddw m1, m2
yading@10 225 paddw m0, m3
yading@10 226 paddw m0, m1
yading@10 227 psraw m0, 6
yading@10 228 STORE m0, m1, %1
yading@10 229
yading@10 230 ; go to next line
yading@10 231 add dstq, dststrideq
yading@10 232 add srcq, srcstrideq
yading@10 233 dec heightd ; next row
yading@10 234 jg .nextrow
yading@10 235 REP_RET
yading@10 236 %endmacro
yading@10 237
yading@10 238 %if ARCH_X86_32 ; MMX-class variants are only built for 32-bit; x86-64 always has SSE2
yading@10 239 INIT_MMX mmx
yading@10 240 FILTER_V put
yading@10 241 FILTER_H put
yading@10 242
yading@10 243 INIT_MMX mmxext ; avg needs pavgb, hence mmxext/3dnow rather than plain mmx
yading@10 244 FILTER_V avg
yading@10 245 FILTER_H avg
yading@10 246
yading@10 247 INIT_MMX 3dnow
yading@10 248 FILTER_V avg
yading@10 249 FILTER_H avg
yading@10 250 %endif
yading@10 251
yading@10 252 INIT_XMM sse2
yading@10 253 FILTER_H put
yading@10 254 FILTER_H avg
yading@10 255 FILTER_V put
yading@10 256 FILTER_V avg
yading@10 257
yading@10 258 %macro FILTER_SSSE3 1 ; %1 = put|avg: qpel filters using pmaddubsw on byte coefficient pairs
yading@10 259 cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
yading@10 260 %ifdef PIC
yading@10 261 lea picregq, [sixtap_filter_hb_m]
yading@10 262 %endif
yading@10 263
yading@10 264 ; read 5 lines
yading@10 265 sub srcq, srcstrideq
yading@10 266 LOAD my, sixtap_filter_hb ; myq -> byte coefficient set for this subpel position
yading@10 267 sub srcq, srcstrideq ; start two rows above the current one
yading@10 268 movh m0, [srcq]
yading@10 269 movh m1, [srcq+srcstrideq]
yading@10 270 movh m2, [srcq+srcstrideq*2]
yading@10 271 lea srcq, [srcq+srcstrideq*2]
yading@10 272 add srcq, srcstrideq
yading@10 273 mova m5, [myq] ; outer coefficient pair, shared by rows (-2,-1) and (+3,+2)
yading@10 274 movh m3, [srcq]
yading@10 275 movh m4, [srcq+srcstrideq]
yading@10 276 lea srcq, [srcq+2*srcstrideq]
yading@10 277
yading@10 278 .nextrow:
yading@10 279 mova m6, m2
yading@10 280 punpcklbw m0, m1 ; pair rows (-2,-1) for pmaddubsw
yading@10 281 punpcklbw m6, m3 ; pair rows (0,+1)
yading@10 282 pmaddubsw m0, m5
yading@10 283 pmaddubsw m6, [myq+16] ; centre coefficient pair
yading@10 284 movh m7, [srcq] ; read new row
yading@10 285 paddw m6, m0
yading@10 286 mova m0, m1 ; slide the 5-row window down one line
yading@10 287 mova m1, m2
yading@10 288 mova m2, m3
yading@10 289 mova m3, m4
yading@10 290 mova m4, m7
yading@10 291 punpcklbw m7, m3 ; pair rows (+3,+2)
yading@10 292 pmaddubsw m7, m5
yading@10 293 paddw m6, m7
yading@10 294 pmulhrsw m6, [pw_512] ; round and shift right by 6
yading@10 295 STORE m6, m7, %1
yading@10 296
yading@10 297 ; go to next line
yading@10 298 add dstq, dststrideq
yading@10 299 add srcq, srcstrideq
yading@10 300 dec heightd ; next row
yading@10 301 jg .nextrow
yading@10 302 REP_RET
yading@10 303
yading@10 304 cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
yading@10 305 %ifdef PIC
yading@10 306 lea picregq, [sixtap_filter_hb_m]
yading@10 307 %endif
yading@10 308 mova m3, [filter_h6_shuf2]
yading@10 309 mova m4, [filter_h6_shuf3]
yading@10 310 LOAD mx, sixtap_filter_hb
yading@10 311 mova m5, [mxq] ; set up 6tap filter in bytes
yading@10 312 mova m6, [mxq+16]
yading@10 313 mova m7, [filter_h6_shuf1]
yading@10 314
yading@10 315 .nextrow:
yading@10 316 movu m0, [srcq-2] ; one unaligned load covers all taps of 8 output pixels
yading@10 317 mova m1, m0
yading@10 318 mova m2, m0
yading@10 319 pshufb m0, m7 ; pairs (x-2, x-1)
yading@10 320 pshufb m1, m3 ; pairs (x, x+1)
yading@10 321 pshufb m2, m4 ; pairs (x+3, x+2), reusing the outer coefficients in m5
yading@10 322 pmaddubsw m0, m5
yading@10 323 pmaddubsw m1, m6
yading@10 324 pmaddubsw m2, m5
yading@10 325 paddw m0, m1
yading@10 326 paddw m0, m2
yading@10 327 pmulhrsw m0, [pw_512] ; round and shift right by 6
yading@10 328 STORE m0, m1, %1
yading@10 329
yading@10 330 ; go to next line
yading@10 331 add dstq, dststrideq
yading@10 332 add srcq, srcstrideq
yading@10 333 dec heightd ; next row
yading@10 334 jg .nextrow
yading@10 335 REP_RET
yading@10 336 %endmacro
yading@10 337
yading@10 338 INIT_XMM ssse3 ; SSSE3 h/v qpel pairs, put and avg flavours
yading@10 339 FILTER_SSSE3 put
yading@10 340 FILTER_SSSE3 avg
yading@10 341
yading@10 342 ; %1=1 for 5-bit weights, 0 for 14-bit; %2=dst %3=src1 %4=src2 %5=stride (8x8 sse2 only)
yading@10 343 %macro RV40_WCORE 4-5
yading@10 344 movh m4, [%3 + r6 + 0] ; first half / first row from src1
yading@10 345 movh m5, [%4 + r6 + 0] ; first half / first row from src2
yading@10 346 %if %0 == 4
yading@10 347 %define OFFSET r6 + mmsize / 2 ; second half of the same row
yading@10 348 %else
yading@10 349 ; 8x8 block and sse2, stride was provided
yading@10 350 %define OFFSET r6
yading@10 351 add r6, r5 ; advance to the next row for the second pair of loads
yading@10 352 %endif
yading@10 353 movh m6, [%3 + OFFSET]
yading@10 354 movh m7, [%4 + OFFSET]
yading@10 355
yading@10 356 %if %1 == 0
yading@10 357 ; 14bits weights
yading@10 358 punpcklbw m4, m0 ; widen pixels to words (m0 is zero)
yading@10 359 punpcklbw m5, m0
yading@10 360 punpcklbw m6, m0
yading@10 361 punpcklbw m7, m0
yading@10 362
yading@10 363 psllw m4, 7 ; p<<7 so pmulhw yields (p*weight)>>9; the later >>5 completes >>14
yading@10 364 psllw m5, 7
yading@10 365 psllw m6, 7
yading@10 366 psllw m7, 7
yading@10 367 pmulhw m4, m3 ; multiply by the splatted weights, keeping the high 16 bits
yading@10 368 pmulhw m5, m2
yading@10 369 pmulhw m6, m3
yading@10 370 pmulhw m7, m2
yading@10 371
yading@10 372 paddw m4, m5 ; weighted sum of the two sources
yading@10 373 paddw m6, m7
yading@10 374 %else
yading@10 375 ; 5bits weights
yading@10 376 %if cpuflag(ssse3)
yading@10 377 punpcklbw m4, m5 ; interleave src1/src2 bytes to match the interleaved weights in m3
yading@10 378 punpcklbw m6, m7
yading@10 379
yading@10 380 pmaddubsw m4, m3 ; one madd computes the weighted sum of each src1/src2 pair
yading@10 381 pmaddubsw m6, m3
yading@10 382 %else
yading@10 383 punpcklbw m4, m0 ; widen pixels to words (m0 is zero)
yading@10 384 punpcklbw m5, m0
yading@10 385 punpcklbw m6, m0
yading@10 386 punpcklbw m7, m0
yading@10 387
yading@10 388 pmullw m4, m3
yading@10 389 pmullw m5, m2
yading@10 390 pmullw m6, m3
yading@10 391 pmullw m7, m2
yading@10 392 paddw m4, m5 ; weighted sum of the two sources
yading@10 393 paddw m6, m7
yading@10 394 %endif
yading@10 395
yading@10 396 %endif
yading@10 397
yading@10 398 ; bias and shift down
yading@10 399 %if cpuflag(ssse3)
yading@10 400 pmulhrsw m4, m1 ; m1 = pw_1024: rounded (x+16)>>5
yading@10 401 pmulhrsw m6, m1
yading@10 402 %else
yading@10 403 paddw m4, m1 ; m1 = pw_16: bias before the >>5
yading@10 404 paddw m6, m1
yading@10 405 psrlw m4, 5
yading@10 406 psrlw m6, 5
yading@10 407 %endif
yading@10 408
yading@10 409 packuswb m4, m6 ; clip to bytes, both halves/rows in one register
yading@10 410 %if %0 == 5
yading@10 411 ; Only called for 8x8 blocks and sse2
yading@10 412 sub r6, r5 ; back to the first of the two rows
yading@10 413 movh [%2 + r6], m4 ; low half -> first row
yading@10 414 add r6, r5
yading@10 415 movhps [%2 + r6], m4 ; high half -> second row
yading@10 416 %else
yading@10 417 mova [%2 + r6], m4
yading@10 418 %endif
yading@10 419 %endmacro
yading@10 420
yading@10 421
yading@10 422 %macro MAIN_LOOP 2 ; %1 = block size (8/16), %2 = weight mode (RND) forwarded to RV40_WCORE
yading@10 423 %if mmsize == 8 ; MMX: 8 bytes per call, so 16-wide blocks need two calls per row
yading@10 424 RV40_WCORE %2, r0, r1, r2
yading@10 425 %if %1 == 16
yading@10 426 RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
yading@10 427 %endif
yading@10 428
yading@10 429 ; Prepare for next loop
yading@10 430 add r6, r5 ; also sets ZF for the caller's jnz when r6 reaches 0
yading@10 431 %else
yading@10 432 %ifidn %1, 8 ; SSE2 on 8x8: WCORE processes two rows per call using the stride
yading@10 433 RV40_WCORE %2, r0, r1, r2, r5
yading@10 434 ; Prepare 2 next lines
yading@10 435 add r6, r5 ; also sets ZF for the caller's jnz when r6 reaches 0
yading@10 436 %else ; SSE2 on 16-wide: one full row per call
yading@10 437 RV40_WCORE %2, r0, r1, r2
yading@10 438 ; Prepare single next line
yading@10 439 add r6, r5 ; also sets ZF for the caller's jnz when r6 reaches 0
yading@10 440 %endif
yading@10 441 %endif
yading@10 442
yading@10 443 %endmacro
yading@10 444
yading@10 445 ; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
yading@10 446 ; %1=rnd|nornd (14-bit vs 5-bit weights) %2=block size %3=log2(block size)
yading@10 447 ; The weights are FP0.14 notation of fractions depending on pts.
yading@10 448 ; For timebases without rounding error (i.e. PAL), the fractions
yading@10 449 ; can be simplified, and several operations can be avoided.
yading@10 450 ; Therefore, we check here whether they are multiples of 2^9 for
yading@10 451 ; those simplifications to occur.
yading@10 452 %macro RV40_WEIGHT 3
yading@10 453 cglobal rv40_weight_func_%1_%2, 6, 7, 8
yading@10 454 %if cpuflag(ssse3)
yading@10 455 mova m1, [pw_1024] ; pmulhrsw by 1024 == rounded (x+16)>>5
yading@10 456 %else
yading@10 457 mova m1, [pw_16] ; bias used before the explicit psrlw 5
yading@10 458 %endif
yading@10 459 pxor m0, m0 ; zero register for byte->word unpacking in RV40_WCORE
yading@10 460 ; Set loop counter and increments
yading@10 461 mov r6, r5
yading@10 462 shl r6, %3 ; r6 = stride * block height = total byte span
yading@10 463 add r0, r6 ; point all three buffers at their ends...
yading@10 464 add r1, r6
yading@10 465 add r2, r6
yading@10 466 neg r6 ; ...and index with a negative counter that runs up to 0
yading@10 467
yading@10 468 movd m2, r3d ; w1
yading@10 469 movd m3, r4d ; w2
yading@10 470 %ifidn %1,rnd
yading@10 471 %define RND 0 ; 14-bit weight path in RV40_WCORE
yading@10 472 SPLATW m2, m2
yading@10 473 %else
yading@10 474 %define RND 1 ; 5-bit weight path
yading@10 475 %if cpuflag(ssse3)
yading@10 476 punpcklbw m3, m2 ; interleave the two weights as bytes for pmaddubsw
yading@10 477 %else
yading@10 478 SPLATW m2, m2
yading@10 479 %endif
yading@10 480 %endif
yading@10 481 SPLATW m3, m3
yading@10 482
yading@10 483 .loop:
yading@10 484 MAIN_LOOP %2, RND
yading@10 485 jnz .loop ; ZF comes from MAIN_LOOP's final add r6, r5
yading@10 486 REP_RET
yading@10 487 %endmacro
yading@10 488
yading@10 489 INIT_MMX mmxext ; weight functions: every rnd/nornd x 8/16 combination per instruction set
yading@10 490 RV40_WEIGHT rnd, 8, 3
yading@10 491 RV40_WEIGHT rnd, 16, 4
yading@10 492 RV40_WEIGHT nornd, 8, 3
yading@10 493 RV40_WEIGHT nornd, 16, 4
yading@10 494
yading@10 495 INIT_XMM sse2
yading@10 496 RV40_WEIGHT rnd, 8, 3
yading@10 497 RV40_WEIGHT rnd, 16, 4
yading@10 498 RV40_WEIGHT nornd, 8, 3
yading@10 499 RV40_WEIGHT nornd, 16, 4
yading@10 500
yading@10 501 INIT_XMM ssse3
yading@10 502 RV40_WEIGHT rnd, 8, 3
yading@10 503 RV40_WEIGHT rnd, 16, 4
yading@10 504 RV40_WEIGHT nornd, 8, 3
yading@10 505 RV40_WEIGHT nornd, 16, 4