35 #if COMPILE_TEMPLATE_AMD3DNOW 36 #define PREFETCH "prefetch" 37 #define PAVGB "pavgusb" 38 #elif COMPILE_TEMPLATE_MMXEXT 39 #define PREFETCH "prefetchnta" 42 #define PREFETCH " # nop" 45 #if COMPILE_TEMPLATE_AMD3DNOW 52 #if COMPILE_TEMPLATE_MMXEXT 53 #define MOVNTQ "movntq" 54 #define SFENCE "sfence" 57 #define SFENCE " # nop" 60 #if !COMPILE_TEMPLATE_SSE2 62 #if !COMPILE_TEMPLATE_AMD3DNOW 71 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
73 __asm__
volatile(
"movq %0, %%mm7"::
"m"(mask32a):
"memory");
77 "movd (%1), %%mm0 \n\t" 78 "punpckldq 3(%1), %%mm0 \n\t" 79 "movd 6(%1), %%mm1 \n\t" 80 "punpckldq 9(%1), %%mm1 \n\t" 81 "movd 12(%1), %%mm2 \n\t" 82 "punpckldq 15(%1), %%mm2 \n\t" 83 "movd 18(%1), %%mm3 \n\t" 84 "punpckldq 21(%1), %%mm3 \n\t" 85 "por %%mm7, %%mm0 \n\t" 86 "por %%mm7, %%mm1 \n\t" 87 "por %%mm7, %%mm2 \n\t" 88 "por %%mm7, %%mm3 \n\t" 91 MOVNTQ" %%mm2, 16(%0) \n\t" 98 __asm__
volatile(
SFENCE:::
"memory");
99 __asm__
volatile(
EMMS:::
"memory");
108 #define STORE_BGR24_MMX \ 109 "psrlq $8, %%mm2 \n\t" \ 110 "psrlq $8, %%mm3 \n\t" \ 111 "psrlq $8, %%mm6 \n\t" \ 112 "psrlq $8, %%mm7 \n\t" \ 113 "pand "MANGLE(mask24l)", %%mm0\n\t" \ 114 "pand "MANGLE(mask24l)", %%mm1\n\t" \ 115 "pand "MANGLE(mask24l)", %%mm4\n\t" \ 116 "pand "MANGLE(mask24l)", %%mm5\n\t" \ 117 "pand "MANGLE(mask24h)", %%mm2\n\t" \ 118 "pand "MANGLE(mask24h)", %%mm3\n\t" \ 119 "pand "MANGLE(mask24h)", %%mm6\n\t" \ 120 "pand "MANGLE(mask24h)", %%mm7\n\t" \ 121 "por %%mm2, %%mm0 \n\t" \ 122 "por %%mm3, %%mm1 \n\t" \ 123 "por %%mm6, %%mm4 \n\t" \ 124 "por %%mm7, %%mm5 \n\t" \ 126 "movq %%mm1, %%mm2 \n\t" \ 127 "movq %%mm4, %%mm3 \n\t" \ 128 "psllq $48, %%mm2 \n\t" \ 129 "psllq $32, %%mm3 \n\t" \ 130 "por %%mm2, %%mm0 \n\t" \ 131 "psrlq $16, %%mm1 \n\t" \ 132 "psrlq $32, %%mm4 \n\t" \ 133 "psllq $16, %%mm5 \n\t" \ 134 "por %%mm3, %%mm1 \n\t" \ 135 "por %%mm5, %%mm4 \n\t" \ 137 MOVNTQ" %%mm0, (%0) \n\t" \ 138 MOVNTQ" %%mm1, 8(%0) \n\t" \ 139 MOVNTQ" %%mm4, 16(%0)" 149 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
154 "movq (%1), %%mm0 \n\t" 155 "movq 8(%1), %%mm1 \n\t" 156 "movq 16(%1), %%mm4 \n\t" 157 "movq 24(%1), %%mm5 \n\t" 158 "movq %%mm0, %%mm2 \n\t" 159 "movq %%mm1, %%mm3 \n\t" 160 "movq %%mm4, %%mm6 \n\t" 161 "movq %%mm5, %%mm7 \n\t" 168 __asm__
volatile(
SFENCE:::
"memory");
169 __asm__
volatile(
EMMS:::
"memory");
191 __asm__
volatile(
PREFETCH" %0"::
"m"(*s));
192 __asm__
volatile(
"movq %0, %%mm4"::
"m"(mask15s));
197 "movq (%1), %%mm0 \n\t" 198 "movq 8(%1), %%mm2 \n\t" 199 "movq %%mm0, %%mm1 \n\t" 200 "movq %%mm2, %%mm3 \n\t" 201 "pand %%mm4, %%mm0 \n\t" 202 "pand %%mm4, %%mm2 \n\t" 203 "paddw %%mm1, %%mm0 \n\t" 204 "paddw %%mm3, %%mm2 \n\t" 212 __asm__
volatile(
SFENCE:::
"memory");
213 __asm__
volatile(
EMMS:::
"memory");
216 register unsigned x= *((
const uint32_t *)s);
217 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
222 register unsigned short x= *((
const uint16_t *)s);
223 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
234 __asm__
volatile(
PREFETCH" %0"::
"m"(*s));
235 __asm__
volatile(
"movq %0, %%mm7"::
"m"(mask15rg));
236 __asm__
volatile(
"movq %0, %%mm6"::
"m"(mask15b));
241 "movq (%1), %%mm0 \n\t" 242 "movq 8(%1), %%mm2 \n\t" 243 "movq %%mm0, %%mm1 \n\t" 244 "movq %%mm2, %%mm3 \n\t" 245 "psrlq $1, %%mm0 \n\t" 246 "psrlq $1, %%mm2 \n\t" 247 "pand %%mm7, %%mm0 \n\t" 248 "pand %%mm7, %%mm2 \n\t" 249 "pand %%mm6, %%mm1 \n\t" 250 "pand %%mm6, %%mm3 \n\t" 251 "por %%mm1, %%mm0 \n\t" 252 "por %%mm3, %%mm2 \n\t" 260 __asm__
volatile(
SFENCE:::
"memory");
261 __asm__
volatile(
EMMS:::
"memory");
264 register uint32_t
x= *((
const uint32_t*)s);
265 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
270 register uint16_t
x= *((
const uint16_t*)s);
271 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
280 uint16_t *
d = (uint16_t *)
dst;
284 "movq %3, %%mm5 \n\t" 285 "movq %4, %%mm6 \n\t" 286 "movq %5, %%mm7 \n\t" 291 "movd (%1), %%mm0 \n\t" 292 "movd 4(%1), %%mm3 \n\t" 293 "punpckldq 8(%1), %%mm0 \n\t" 294 "punpckldq 12(%1), %%mm3 \n\t" 295 "movq %%mm0, %%mm1 \n\t" 296 "movq %%mm3, %%mm4 \n\t" 297 "pand %%mm6, %%mm0 \n\t" 298 "pand %%mm6, %%mm3 \n\t" 299 "pmaddwd %%mm7, %%mm0 \n\t" 300 "pmaddwd %%mm7, %%mm3 \n\t" 301 "pand %%mm5, %%mm1 \n\t" 302 "pand %%mm5, %%mm4 \n\t" 303 "por %%mm1, %%mm0 \n\t" 304 "por %%mm4, %%mm3 \n\t" 305 "psrld $5, %%mm0 \n\t" 306 "pslld $11, %%mm3 \n\t" 307 "por %%mm3, %%mm0 \n\t" 315 :
"r" (mm_end),
"m" (mask3216g),
"m" (mask3216br),
"m" (mul3216)
317 __asm__
volatile(
SFENCE:::
"memory");
318 __asm__
volatile(
EMMS:::
"memory");
320 register int rgb = *(
const uint32_t*)s; s += 4;
321 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
330 uint16_t *
d = (uint16_t *)
dst;
332 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
334 "movq %0, %%mm7 \n\t" 335 "movq %1, %%mm6 \n\t" 336 ::
"m"(red_16mask),
"m"(green_16mask));
341 "movd (%1), %%mm0 \n\t" 342 "movd 4(%1), %%mm3 \n\t" 343 "punpckldq 8(%1), %%mm0 \n\t" 344 "punpckldq 12(%1), %%mm3 \n\t" 345 "movq %%mm0, %%mm1 \n\t" 346 "movq %%mm0, %%mm2 \n\t" 347 "movq %%mm3, %%mm4 \n\t" 348 "movq %%mm3, %%mm5 \n\t" 349 "psllq $8, %%mm0 \n\t" 350 "psllq $8, %%mm3 \n\t" 351 "pand %%mm7, %%mm0 \n\t" 352 "pand %%mm7, %%mm3 \n\t" 353 "psrlq $5, %%mm1 \n\t" 354 "psrlq $5, %%mm4 \n\t" 355 "pand %%mm6, %%mm1 \n\t" 356 "pand %%mm6, %%mm4 \n\t" 357 "psrlq $19, %%mm2 \n\t" 358 "psrlq $19, %%mm5 \n\t" 359 "pand %2, %%mm2 \n\t" 360 "pand %2, %%mm5 \n\t" 361 "por %%mm1, %%mm0 \n\t" 362 "por %%mm4, %%mm3 \n\t" 363 "por %%mm2, %%mm0 \n\t" 364 "por %%mm5, %%mm3 \n\t" 365 "psllq $16, %%mm3 \n\t" 366 "por %%mm3, %%mm0 \n\t" 368 ::
"r"(
d),
"r"(s),
"m"(blue_16mask):
"memory");
372 __asm__
volatile(
SFENCE:::
"memory");
373 __asm__
volatile(
EMMS:::
"memory");
375 register int rgb = *(
const uint32_t*)s; s += 4;
376 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
385 uint16_t *
d = (uint16_t *)
dst;
389 "movq %3, %%mm5 \n\t" 390 "movq %4, %%mm6 \n\t" 391 "movq %5, %%mm7 \n\t" 396 "movd (%1), %%mm0 \n\t" 397 "movd 4(%1), %%mm3 \n\t" 398 "punpckldq 8(%1), %%mm0 \n\t" 399 "punpckldq 12(%1), %%mm3 \n\t" 400 "movq %%mm0, %%mm1 \n\t" 401 "movq %%mm3, %%mm4 \n\t" 402 "pand %%mm6, %%mm0 \n\t" 403 "pand %%mm6, %%mm3 \n\t" 404 "pmaddwd %%mm7, %%mm0 \n\t" 405 "pmaddwd %%mm7, %%mm3 \n\t" 406 "pand %%mm5, %%mm1 \n\t" 407 "pand %%mm5, %%mm4 \n\t" 408 "por %%mm1, %%mm0 \n\t" 409 "por %%mm4, %%mm3 \n\t" 410 "psrld $6, %%mm0 \n\t" 411 "pslld $10, %%mm3 \n\t" 412 "por %%mm3, %%mm0 \n\t" 420 :
"r" (mm_end),
"m" (mask3215g),
"m" (mask3216br),
"m" (mul3215)
422 __asm__
volatile(
SFENCE:::
"memory");
423 __asm__
volatile(
EMMS:::
"memory");
425 register int rgb = *(
const uint32_t*)s; s += 4;
426 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
435 uint16_t *
d = (uint16_t *)
dst;
437 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
439 "movq %0, %%mm7 \n\t" 440 "movq %1, %%mm6 \n\t" 441 ::
"m"(red_15mask),
"m"(green_15mask));
446 "movd (%1), %%mm0 \n\t" 447 "movd 4(%1), %%mm3 \n\t" 448 "punpckldq 8(%1), %%mm0 \n\t" 449 "punpckldq 12(%1), %%mm3 \n\t" 450 "movq %%mm0, %%mm1 \n\t" 451 "movq %%mm0, %%mm2 \n\t" 452 "movq %%mm3, %%mm4 \n\t" 453 "movq %%mm3, %%mm5 \n\t" 454 "psllq $7, %%mm0 \n\t" 455 "psllq $7, %%mm3 \n\t" 456 "pand %%mm7, %%mm0 \n\t" 457 "pand %%mm7, %%mm3 \n\t" 458 "psrlq $6, %%mm1 \n\t" 459 "psrlq $6, %%mm4 \n\t" 460 "pand %%mm6, %%mm1 \n\t" 461 "pand %%mm6, %%mm4 \n\t" 462 "psrlq $19, %%mm2 \n\t" 463 "psrlq $19, %%mm5 \n\t" 464 "pand %2, %%mm2 \n\t" 465 "pand %2, %%mm5 \n\t" 466 "por %%mm1, %%mm0 \n\t" 467 "por %%mm4, %%mm3 \n\t" 468 "por %%mm2, %%mm0 \n\t" 469 "por %%mm5, %%mm3 \n\t" 470 "psllq $16, %%mm3 \n\t" 471 "por %%mm3, %%mm0 \n\t" 473 ::
"r"(
d),
"r"(s),
"m"(blue_15mask):
"memory");
477 __asm__
volatile(
SFENCE:::
"memory");
478 __asm__
volatile(
EMMS:::
"memory");
480 register int rgb = *(
const uint32_t*)s; s += 4;
481 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
490 uint16_t *
d = (uint16_t *)
dst;
492 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
494 "movq %0, %%mm7 \n\t" 495 "movq %1, %%mm6 \n\t" 496 ::
"m"(red_16mask),
"m"(green_16mask));
501 "movd (%1), %%mm0 \n\t" 502 "movd 3(%1), %%mm3 \n\t" 503 "punpckldq 6(%1), %%mm0 \n\t" 504 "punpckldq 9(%1), %%mm3 \n\t" 505 "movq %%mm0, %%mm1 \n\t" 506 "movq %%mm0, %%mm2 \n\t" 507 "movq %%mm3, %%mm4 \n\t" 508 "movq %%mm3, %%mm5 \n\t" 509 "psrlq $3, %%mm0 \n\t" 510 "psrlq $3, %%mm3 \n\t" 511 "pand %2, %%mm0 \n\t" 512 "pand %2, %%mm3 \n\t" 513 "psrlq $5, %%mm1 \n\t" 514 "psrlq $5, %%mm4 \n\t" 515 "pand %%mm6, %%mm1 \n\t" 516 "pand %%mm6, %%mm4 \n\t" 517 "psrlq $8, %%mm2 \n\t" 518 "psrlq $8, %%mm5 \n\t" 519 "pand %%mm7, %%mm2 \n\t" 520 "pand %%mm7, %%mm5 \n\t" 521 "por %%mm1, %%mm0 \n\t" 522 "por %%mm4, %%mm3 \n\t" 523 "por %%mm2, %%mm0 \n\t" 524 "por %%mm5, %%mm3 \n\t" 525 "psllq $16, %%mm3 \n\t" 526 "por %%mm3, %%mm0 \n\t" 528 ::
"r"(
d),
"r"(s),
"m"(blue_16mask):
"memory");
532 __asm__
volatile(
SFENCE:::
"memory");
533 __asm__
volatile(
EMMS:::
"memory");
538 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
547 uint16_t *
d = (uint16_t *)
dst;
549 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
551 "movq %0, %%mm7 \n\t" 552 "movq %1, %%mm6 \n\t" 553 ::
"m"(red_16mask),
"m"(green_16mask));
558 "movd (%1), %%mm0 \n\t" 559 "movd 3(%1), %%mm3 \n\t" 560 "punpckldq 6(%1), %%mm0 \n\t" 561 "punpckldq 9(%1), %%mm3 \n\t" 562 "movq %%mm0, %%mm1 \n\t" 563 "movq %%mm0, %%mm2 \n\t" 564 "movq %%mm3, %%mm4 \n\t" 565 "movq %%mm3, %%mm5 \n\t" 566 "psllq $8, %%mm0 \n\t" 567 "psllq $8, %%mm3 \n\t" 568 "pand %%mm7, %%mm0 \n\t" 569 "pand %%mm7, %%mm3 \n\t" 570 "psrlq $5, %%mm1 \n\t" 571 "psrlq $5, %%mm4 \n\t" 572 "pand %%mm6, %%mm1 \n\t" 573 "pand %%mm6, %%mm4 \n\t" 574 "psrlq $19, %%mm2 \n\t" 575 "psrlq $19, %%mm5 \n\t" 576 "pand %2, %%mm2 \n\t" 577 "pand %2, %%mm5 \n\t" 578 "por %%mm1, %%mm0 \n\t" 579 "por %%mm4, %%mm3 \n\t" 580 "por %%mm2, %%mm0 \n\t" 581 "por %%mm5, %%mm3 \n\t" 582 "psllq $16, %%mm3 \n\t" 583 "por %%mm3, %%mm0 \n\t" 585 ::
"r"(
d),
"r"(s),
"m"(blue_16mask):
"memory");
589 __asm__
volatile(
SFENCE:::
"memory");
590 __asm__
volatile(
EMMS:::
"memory");
595 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
604 uint16_t *
d = (uint16_t *)
dst;
606 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
608 "movq %0, %%mm7 \n\t" 609 "movq %1, %%mm6 \n\t" 610 ::
"m"(red_15mask),
"m"(green_15mask));
615 "movd (%1), %%mm0 \n\t" 616 "movd 3(%1), %%mm3 \n\t" 617 "punpckldq 6(%1), %%mm0 \n\t" 618 "punpckldq 9(%1), %%mm3 \n\t" 619 "movq %%mm0, %%mm1 \n\t" 620 "movq %%mm0, %%mm2 \n\t" 621 "movq %%mm3, %%mm4 \n\t" 622 "movq %%mm3, %%mm5 \n\t" 623 "psrlq $3, %%mm0 \n\t" 624 "psrlq $3, %%mm3 \n\t" 625 "pand %2, %%mm0 \n\t" 626 "pand %2, %%mm3 \n\t" 627 "psrlq $6, %%mm1 \n\t" 628 "psrlq $6, %%mm4 \n\t" 629 "pand %%mm6, %%mm1 \n\t" 630 "pand %%mm6, %%mm4 \n\t" 631 "psrlq $9, %%mm2 \n\t" 632 "psrlq $9, %%mm5 \n\t" 633 "pand %%mm7, %%mm2 \n\t" 634 "pand %%mm7, %%mm5 \n\t" 635 "por %%mm1, %%mm0 \n\t" 636 "por %%mm4, %%mm3 \n\t" 637 "por %%mm2, %%mm0 \n\t" 638 "por %%mm5, %%mm3 \n\t" 639 "psllq $16, %%mm3 \n\t" 640 "por %%mm3, %%mm0 \n\t" 642 ::
"r"(
d),
"r"(s),
"m"(blue_15mask):
"memory");
646 __asm__
volatile(
SFENCE:::
"memory");
647 __asm__
volatile(
EMMS:::
"memory");
652 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
661 uint16_t *
d = (uint16_t *)
dst;
663 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
665 "movq %0, %%mm7 \n\t" 666 "movq %1, %%mm6 \n\t" 667 ::
"m"(red_15mask),
"m"(green_15mask));
672 "movd (%1), %%mm0 \n\t" 673 "movd 3(%1), %%mm3 \n\t" 674 "punpckldq 6(%1), %%mm0 \n\t" 675 "punpckldq 9(%1), %%mm3 \n\t" 676 "movq %%mm0, %%mm1 \n\t" 677 "movq %%mm0, %%mm2 \n\t" 678 "movq %%mm3, %%mm4 \n\t" 679 "movq %%mm3, %%mm5 \n\t" 680 "psllq $7, %%mm0 \n\t" 681 "psllq $7, %%mm3 \n\t" 682 "pand %%mm7, %%mm0 \n\t" 683 "pand %%mm7, %%mm3 \n\t" 684 "psrlq $6, %%mm1 \n\t" 685 "psrlq $6, %%mm4 \n\t" 686 "pand %%mm6, %%mm1 \n\t" 687 "pand %%mm6, %%mm4 \n\t" 688 "psrlq $19, %%mm2 \n\t" 689 "psrlq $19, %%mm5 \n\t" 690 "pand %2, %%mm2 \n\t" 691 "pand %2, %%mm5 \n\t" 692 "por %%mm1, %%mm0 \n\t" 693 "por %%mm4, %%mm3 \n\t" 694 "por %%mm2, %%mm0 \n\t" 695 "por %%mm5, %%mm3 \n\t" 696 "psllq $16, %%mm3 \n\t" 697 "por %%mm3, %%mm0 \n\t" 699 ::
"r"(
d),
"r"(s),
"m"(blue_15mask):
"memory");
703 __asm__
volatile(
SFENCE:::
"memory");
704 __asm__
volatile(
EMMS:::
"memory");
709 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
716 const uint16_t *mm_end;
718 const uint16_t *
s = (
const uint16_t*)
src;
719 end = s + src_size/2;
720 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
725 "movq (%1), %%mm0 \n\t" 726 "movq (%1), %%mm1 \n\t" 727 "movq (%1), %%mm2 \n\t" 728 "pand %2, %%mm0 \n\t" 729 "pand %3, %%mm1 \n\t" 730 "pand %4, %%mm2 \n\t" 731 "psllq $5, %%mm0 \n\t" 732 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t" 733 "pmulhw "MANGLE(mul15_mid)
", %%mm1 \n\t" 734 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t" 735 "movq %%mm0, %%mm3 \n\t" 736 "movq %%mm1, %%mm4 \n\t" 737 "movq %%mm2, %%mm5 \n\t" 738 "punpcklwd %5, %%mm0 \n\t" 739 "punpcklwd %5, %%mm1 \n\t" 740 "punpcklwd %5, %%mm2 \n\t" 741 "punpckhwd %5, %%mm3 \n\t" 742 "punpckhwd %5, %%mm4 \n\t" 743 "punpckhwd %5, %%mm5 \n\t" 744 "psllq $8, %%mm1 \n\t" 745 "psllq $16, %%mm2 \n\t" 746 "por %%mm1, %%mm0 \n\t" 747 "por %%mm2, %%mm0 \n\t" 748 "psllq $8, %%mm4 \n\t" 749 "psllq $16, %%mm5 \n\t" 750 "por %%mm4, %%mm3 \n\t" 751 "por %%mm5, %%mm3 \n\t" 753 "movq %%mm0, %%mm6 \n\t" 754 "movq %%mm3, %%mm7 \n\t" 756 "movq 8(%1), %%mm0 \n\t" 757 "movq 8(%1), %%mm1 \n\t" 758 "movq 8(%1), %%mm2 \n\t" 759 "pand %2, %%mm0 \n\t" 760 "pand %3, %%mm1 \n\t" 761 "pand %4, %%mm2 \n\t" 762 "psllq $5, %%mm0 \n\t" 763 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t" 764 "pmulhw "MANGLE(mul15_mid)
", %%mm1 \n\t" 765 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t" 766 "movq %%mm0, %%mm3 \n\t" 767 "movq %%mm1, %%mm4 \n\t" 768 "movq %%mm2, %%mm5 \n\t" 769 "punpcklwd %5, %%mm0 \n\t" 770 "punpcklwd %5, %%mm1 \n\t" 771 "punpcklwd %5, %%mm2 \n\t" 772 "punpckhwd %5, %%mm3 \n\t" 773 "punpckhwd %5, %%mm4 \n\t" 774 "punpckhwd %5, %%mm5 \n\t" 775 "psllq $8, %%mm1 \n\t" 776 "psllq $16, %%mm2 \n\t" 777 "por %%mm1, %%mm0 \n\t" 778 "por %%mm2, %%mm0 \n\t" 779 "psllq $8, %%mm4 \n\t" 780 "psllq $16, %%mm5 \n\t" 781 "por %%mm4, %%mm3 \n\t" 782 "por %%mm5, %%mm3 \n\t" 785 :
"r"(
s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r),
"m"(mmx_null)
789 "movq %%mm0, %%mm4 \n\t" 790 "movq %%mm3, %%mm5 \n\t" 791 "movq %%mm6, %%mm0 \n\t" 792 "movq %%mm7, %%mm1 \n\t" 794 "movq %%mm4, %%mm6 \n\t" 795 "movq %%mm5, %%mm7 \n\t" 796 "movq %%mm0, %%mm2 \n\t" 797 "movq %%mm1, %%mm3 \n\t" 806 __asm__
volatile(
SFENCE:::
"memory");
807 __asm__
volatile(
EMMS:::
"memory");
809 register uint16_t bgr;
811 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
812 *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
813 *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
820 const uint16_t *mm_end;
822 const uint16_t *
s = (
const uint16_t *)
src;
823 end = s + src_size/2;
824 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
829 "movq (%1), %%mm0 \n\t" 830 "movq (%1), %%mm1 \n\t" 831 "movq (%1), %%mm2 \n\t" 832 "pand %2, %%mm0 \n\t" 833 "pand %3, %%mm1 \n\t" 834 "pand %4, %%mm2 \n\t" 835 "psllq $5, %%mm0 \n\t" 836 "psrlq $1, %%mm2 \n\t" 837 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t" 838 "pmulhw "MANGLE(mul16_mid)
", %%mm1 \n\t" 839 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t" 840 "movq %%mm0, %%mm3 \n\t" 841 "movq %%mm1, %%mm4 \n\t" 842 "movq %%mm2, %%mm5 \n\t" 843 "punpcklwd %5, %%mm0 \n\t" 844 "punpcklwd %5, %%mm1 \n\t" 845 "punpcklwd %5, %%mm2 \n\t" 846 "punpckhwd %5, %%mm3 \n\t" 847 "punpckhwd %5, %%mm4 \n\t" 848 "punpckhwd %5, %%mm5 \n\t" 849 "psllq $8, %%mm1 \n\t" 850 "psllq $16, %%mm2 \n\t" 851 "por %%mm1, %%mm0 \n\t" 852 "por %%mm2, %%mm0 \n\t" 853 "psllq $8, %%mm4 \n\t" 854 "psllq $16, %%mm5 \n\t" 855 "por %%mm4, %%mm3 \n\t" 856 "por %%mm5, %%mm3 \n\t" 858 "movq %%mm0, %%mm6 \n\t" 859 "movq %%mm3, %%mm7 \n\t" 861 "movq 8(%1), %%mm0 \n\t" 862 "movq 8(%1), %%mm1 \n\t" 863 "movq 8(%1), %%mm2 \n\t" 864 "pand %2, %%mm0 \n\t" 865 "pand %3, %%mm1 \n\t" 866 "pand %4, %%mm2 \n\t" 867 "psllq $5, %%mm0 \n\t" 868 "psrlq $1, %%mm2 \n\t" 869 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t" 870 "pmulhw "MANGLE(mul16_mid)
", %%mm1 \n\t" 871 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t" 872 "movq %%mm0, %%mm3 \n\t" 873 "movq %%mm1, %%mm4 \n\t" 874 "movq %%mm2, %%mm5 \n\t" 875 "punpcklwd %5, %%mm0 \n\t" 876 "punpcklwd %5, %%mm1 \n\t" 877 "punpcklwd %5, %%mm2 \n\t" 878 "punpckhwd %5, %%mm3 \n\t" 879 "punpckhwd %5, %%mm4 \n\t" 880 "punpckhwd %5, %%mm5 \n\t" 881 "psllq $8, %%mm1 \n\t" 882 "psllq $16, %%mm2 \n\t" 883 "por %%mm1, %%mm0 \n\t" 884 "por %%mm2, %%mm0 \n\t" 885 "psllq $8, %%mm4 \n\t" 886 "psllq $16, %%mm5 \n\t" 887 "por %%mm4, %%mm3 \n\t" 888 "por %%mm5, %%mm3 \n\t" 890 :
"r"(
s),
"m"(mask16b),
"m"(mask16g),
"m"(mask16r),
"m"(mmx_null)
894 "movq %%mm0, %%mm4 \n\t" 895 "movq %%mm3, %%mm5 \n\t" 896 "movq %%mm6, %%mm0 \n\t" 897 "movq %%mm7, %%mm1 \n\t" 899 "movq %%mm4, %%mm6 \n\t" 900 "movq %%mm5, %%mm7 \n\t" 901 "movq %%mm0, %%mm2 \n\t" 902 "movq %%mm1, %%mm3 \n\t" 911 __asm__
volatile(
SFENCE:::
"memory");
912 __asm__
volatile(
EMMS:::
"memory");
914 register uint16_t bgr;
916 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
917 *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
918 *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
930 "packuswb %%mm7, %%mm0 \n\t" \ 931 "packuswb %%mm7, %%mm1 \n\t" \ 932 "packuswb %%mm7, %%mm2 \n\t" \ 933 "punpcklbw %%mm1, %%mm0 \n\t" \ 934 "punpcklbw %%mm6, %%mm2 \n\t" \ 935 "movq %%mm0, %%mm3 \n\t" \ 936 "punpcklwd %%mm2, %%mm0 \n\t" \ 937 "punpckhwd %%mm2, %%mm3 \n\t" \ 938 MOVNTQ" %%mm0, (%0) \n\t" \ 939 MOVNTQ" %%mm3, 8(%0) \n\t" \ 944 const uint16_t *mm_end;
946 const uint16_t *
s = (
const uint16_t *)
src;
947 end = s + src_size/2;
948 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
949 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
950 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
955 "movq (%1), %%mm0 \n\t" 956 "movq (%1), %%mm1 \n\t" 957 "movq (%1), %%mm2 \n\t" 958 "pand %2, %%mm0 \n\t" 959 "pand %3, %%mm1 \n\t" 960 "pand %4, %%mm2 \n\t" 961 "psllq $5, %%mm0 \n\t" 962 "pmulhw %5, %%mm0 \n\t" 963 "pmulhw %5, %%mm1 \n\t" 964 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t" 966 ::
"r"(
d),
"r"(s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r) ,
"m"(mul15_mid)
971 __asm__
volatile(
SFENCE:::
"memory");
972 __asm__
volatile(
EMMS:::
"memory");
974 register uint16_t bgr;
976 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
977 *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
978 *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
986 const uint16_t *mm_end;
988 const uint16_t *
s = (
const uint16_t*)
src;
989 end = s + src_size/2;
990 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
991 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
992 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
997 "movq (%1), %%mm0 \n\t" 998 "movq (%1), %%mm1 \n\t" 999 "movq (%1), %%mm2 \n\t" 1000 "pand %2, %%mm0 \n\t" 1001 "pand %3, %%mm1 \n\t" 1002 "pand %4, %%mm2 \n\t" 1003 "psllq $5, %%mm0 \n\t" 1004 "psrlq $1, %%mm2 \n\t" 1005 "pmulhw %5, %%mm0 \n\t" 1006 "pmulhw "MANGLE(mul16_mid)
", %%mm1 \n\t" 1007 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t" 1009 ::
"r"(
d),
"r"(s),
"m"(mask16b),
"m"(mask16g),
"m"(mask16r),
"m"(mul15_mid)
1014 __asm__
volatile(
SFENCE:::
"memory");
1015 __asm__
volatile(
EMMS:::
"memory");
1017 register uint16_t bgr;
1019 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
1020 *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
1021 *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
1035 "movq %3, %%mm7 \n\t" 1036 "pxor %4, %%mm7 \n\t" 1037 "movq %%mm7, %%mm6 \n\t" 1038 "pxor %5, %%mm7 \n\t" 1042 "movq (%1, %0), %%mm0 \n\t" 1043 "movq 8(%1, %0), %%mm1 \n\t" 1044 # if COMPILE_TEMPLATE_MMXEXT 1045 "pshufw $177, %%mm0, %%mm3 \n\t" 1046 "pshufw $177, %%mm1, %%mm5 \n\t" 1047 "pand %%mm7, %%mm0 \n\t" 1048 "pand %%mm6, %%mm3 \n\t" 1049 "pand %%mm7, %%mm1 \n\t" 1050 "pand %%mm6, %%mm5 \n\t" 1051 "por %%mm3, %%mm0 \n\t" 1052 "por %%mm5, %%mm1 \n\t" 1054 "movq %%mm0, %%mm2 \n\t" 1055 "movq %%mm1, %%mm4 \n\t" 1056 "pand %%mm7, %%mm0 \n\t" 1057 "pand %%mm6, %%mm2 \n\t" 1058 "pand %%mm7, %%mm1 \n\t" 1059 "pand %%mm6, %%mm4 \n\t" 1060 "movq %%mm2, %%mm3 \n\t" 1061 "movq %%mm4, %%mm5 \n\t" 1062 "pslld $16, %%mm2 \n\t" 1063 "psrld $16, %%mm3 \n\t" 1064 "pslld $16, %%mm4 \n\t" 1065 "psrld $16, %%mm5 \n\t" 1066 "por %%mm2, %%mm0 \n\t" 1067 "por %%mm4, %%mm1 \n\t" 1068 "por %%mm3, %%mm0 \n\t" 1069 "por %%mm5, %%mm1 \n\t" 1071 MOVNTQ" %%mm0, (%2, %0) \n\t" 1072 MOVNTQ" %%mm1, 8(%2, %0) \n\t" 1079 :
"r" (s),
"r" (
d),
"m" (mask32b),
"m" (mask32r),
"m" (mmx_one)
1081 for (; idx<15; idx+=4) {
1082 register int v = *(
const uint32_t *)&s[idx],
g = v & 0xff00ff00;
1084 *(uint32_t *)&d[idx] = (v>>16) +
g + (v<<16);
1091 x86_reg mmx_size= 23 - src_size;
1093 "test %%"REG_a
", %%"REG_a
" \n\t" 1095 "movq "MANGLE(mask24r)
", %%mm5 \n\t" 1096 "movq "MANGLE(mask24g)
", %%mm6 \n\t" 1097 "movq "MANGLE(mask24b)
", %%mm7 \n\t" 1101 "movq (%1, %%"REG_a
"), %%mm0 \n\t" 1102 "movq (%1, %%"REG_a
"), %%mm1 \n\t" 1103 "movq 2(%1, %%"REG_a
"), %%mm2 \n\t" 1104 "psllq $16, %%mm0 \n\t" 1105 "pand %%mm5, %%mm0 \n\t" 1106 "pand %%mm6, %%mm1 \n\t" 1107 "pand %%mm7, %%mm2 \n\t" 1108 "por %%mm0, %%mm1 \n\t" 1109 "por %%mm2, %%mm1 \n\t" 1110 "movq 6(%1, %%"REG_a
"), %%mm0 \n\t" 1111 MOVNTQ" %%mm1, (%2, %%"REG_a
") \n\t" 1112 "movq 8(%1, %%"REG_a
"), %%mm1 \n\t" 1113 "movq 10(%1, %%"REG_a
"), %%mm2 \n\t" 1114 "pand %%mm7, %%mm0 \n\t" 1115 "pand %%mm5, %%mm1 \n\t" 1116 "pand %%mm6, %%mm2 \n\t" 1117 "por %%mm0, %%mm1 \n\t" 1118 "por %%mm2, %%mm1 \n\t" 1119 "movq 14(%1, %%"REG_a
"), %%mm0 \n\t" 1120 MOVNTQ" %%mm1, 8(%2, %%"REG_a
") \n\t" 1121 "movq 16(%1, %%"REG_a
"), %%mm1 \n\t" 1122 "movq 18(%1, %%"REG_a
"), %%mm2 \n\t" 1123 "pand %%mm6, %%mm0 \n\t" 1124 "pand %%mm7, %%mm1 \n\t" 1125 "pand %%mm5, %%mm2 \n\t" 1126 "por %%mm0, %%mm1 \n\t" 1127 "por %%mm2, %%mm1 \n\t" 1128 MOVNTQ" %%mm1, 16(%2, %%"REG_a
") \n\t" 1129 "add $24, %%"REG_a
" \n\t" 1133 :
"r" (
src-mmx_size),
"r"(
dst-mmx_size)
1136 __asm__
volatile(
SFENCE:::
"memory");
1137 __asm__
volatile(
EMMS:::
"memory");
1139 if (mmx_size==23)
return;
1143 src_size= 23-mmx_size;
1146 for (i=0; i<src_size; i+=3) {
1157 int lumStride,
int chromStride,
int dstStride,
int vertLumPerChroma)
1161 for (y=0; y<
height; y++) {
1164 "xor %%"REG_a
", %%"REG_a
" \n\t" 1167 PREFETCH" 32(%1, %%"REG_a
", 2) \n\t" 1170 "movq (%2, %%"REG_a
"), %%mm0 \n\t" 1171 "movq %%mm0, %%mm2 \n\t" 1172 "movq (%3, %%"REG_a
"), %%mm1 \n\t" 1173 "punpcklbw %%mm1, %%mm0 \n\t" 1174 "punpckhbw %%mm1, %%mm2 \n\t" 1176 "movq (%1, %%"REG_a
",2), %%mm3 \n\t" 1177 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t" 1178 "movq %%mm3, %%mm4 \n\t" 1179 "movq %%mm5, %%mm6 \n\t" 1180 "punpcklbw %%mm0, %%mm3 \n\t" 1181 "punpckhbw %%mm0, %%mm4 \n\t" 1182 "punpcklbw %%mm2, %%mm5 \n\t" 1183 "punpckhbw %%mm2, %%mm6 \n\t" 1185 MOVNTQ" %%mm3, (%0, %%"REG_a
", 4) \n\t" 1186 MOVNTQ" %%mm4, 8(%0, %%"REG_a
", 4) \n\t" 1187 MOVNTQ" %%mm5, 16(%0, %%"REG_a
", 4) \n\t" 1188 MOVNTQ" %%mm6, 24(%0, %%"REG_a
", 4) \n\t" 1190 "add $8, %%"REG_a
" \n\t" 1191 "cmp %4, %%"REG_a
" \n\t" 1193 ::
"r"(
dst),
"r"(ysrc),
"r"(usrc),
"r"(vsrc),
"g" (chromWidth)
1196 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1197 usrc += chromStride;
1198 vsrc += chromStride;
1214 int lumStride,
int chromStride,
int dstStride)
1222 int lumStride,
int chromStride,
int dstStride,
int vertLumPerChroma)
1226 for (y=0; y<
height; y++) {
1229 "xor %%"REG_a
", %%"REG_a
" \n\t" 1232 PREFETCH" 32(%1, %%"REG_a
", 2) \n\t" 1235 "movq (%2, %%"REG_a
"), %%mm0 \n\t" 1236 "movq %%mm0, %%mm2 \n\t" 1237 "movq (%3, %%"REG_a
"), %%mm1 \n\t" 1238 "punpcklbw %%mm1, %%mm0 \n\t" 1239 "punpckhbw %%mm1, %%mm2 \n\t" 1241 "movq (%1, %%"REG_a
",2), %%mm3 \n\t" 1242 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t" 1243 "movq %%mm0, %%mm4 \n\t" 1244 "movq %%mm2, %%mm6 \n\t" 1245 "punpcklbw %%mm3, %%mm0 \n\t" 1246 "punpckhbw %%mm3, %%mm4 \n\t" 1247 "punpcklbw %%mm5, %%mm2 \n\t" 1248 "punpckhbw %%mm5, %%mm6 \n\t" 1250 MOVNTQ" %%mm0, (%0, %%"REG_a
", 4) \n\t" 1251 MOVNTQ" %%mm4, 8(%0, %%"REG_a
", 4) \n\t" 1252 MOVNTQ" %%mm2, 16(%0, %%"REG_a
", 4) \n\t" 1253 MOVNTQ" %%mm6, 24(%0, %%"REG_a
", 4) \n\t" 1255 "add $8, %%"REG_a
" \n\t" 1256 "cmp %4, %%"REG_a
" \n\t" 1258 ::
"r"(
dst),
"r"(ysrc),
"r"(usrc),
"r"(vsrc),
"g" (chromWidth)
1261 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1262 usrc += chromStride;
1263 vsrc += chromStride;
1279 int lumStride,
int chromStride,
int dstStride)
1290 int lumStride,
int chromStride,
int dstStride)
1300 int lumStride,
int chromStride,
int dstStride)
1311 int lumStride,
int chromStride,
int srcStride)
1315 for (y=0; y<
height; y+=2) {
1317 "xor %%"REG_a
", %%"REG_a
" \n\t" 1318 "pcmpeqw %%mm7, %%mm7 \n\t" 1319 "psrlw $8, %%mm7 \n\t" 1322 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t" 1323 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" 1324 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" 1325 "movq %%mm0, %%mm2 \n\t" 1326 "movq %%mm1, %%mm3 \n\t" 1327 "psrlw $8, %%mm0 \n\t" 1328 "psrlw $8, %%mm1 \n\t" 1329 "pand %%mm7, %%mm2 \n\t" 1330 "pand %%mm7, %%mm3 \n\t" 1331 "packuswb %%mm1, %%mm0 \n\t" 1332 "packuswb %%mm3, %%mm2 \n\t" 1334 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t" 1336 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t" 1337 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t" 1338 "movq %%mm1, %%mm3 \n\t" 1339 "movq %%mm2, %%mm4 \n\t" 1340 "psrlw $8, %%mm1 \n\t" 1341 "psrlw $8, %%mm2 \n\t" 1342 "pand %%mm7, %%mm3 \n\t" 1343 "pand %%mm7, %%mm4 \n\t" 1344 "packuswb %%mm2, %%mm1 \n\t" 1345 "packuswb %%mm4, %%mm3 \n\t" 1347 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t" 1349 "movq %%mm0, %%mm2 \n\t" 1350 "movq %%mm1, %%mm3 \n\t" 1351 "psrlw $8, %%mm0 \n\t" 1352 "psrlw $8, %%mm1 \n\t" 1353 "pand %%mm7, %%mm2 \n\t" 1354 "pand %%mm7, %%mm3 \n\t" 1355 "packuswb %%mm1, %%mm0 \n\t" 1356 "packuswb %%mm3, %%mm2 \n\t" 1358 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t" 1359 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t" 1361 "add $8, %%"REG_a
" \n\t" 1362 "cmp %4, %%"REG_a
" \n\t" 1364 ::
"r"(
src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1365 :
"memory",
"%"REG_a
1372 "xor %%"REG_a
", %%"REG_a
" \n\t" 1375 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t" 1376 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" 1377 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" 1378 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t" 1379 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t" 1380 "pand %%mm7, %%mm0 \n\t" 1381 "pand %%mm7, %%mm1 \n\t" 1382 "pand %%mm7, %%mm2 \n\t" 1383 "pand %%mm7, %%mm3 \n\t" 1384 "packuswb %%mm1, %%mm0 \n\t" 1385 "packuswb %%mm3, %%mm2 \n\t" 1387 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t" 1388 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t" 1390 "add $8, %%"REG_a
" \n\t" 1391 "cmp %4, %%"REG_a
" \n\t" 1394 ::
"r"(
src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1395 :
"memory",
"%"REG_a
1397 udst += chromStride;
1398 vdst += chromStride;
1402 __asm__
volatile(
EMMS" \n\t" 1408 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW 1416 for (x=0; x<srcWidth-1; x++) {
1420 dst[2*srcWidth-1]=
src[srcWidth-1];
1424 for (y=1; y<srcHeight; y++) {
1425 const x86_reg mmxSize= srcWidth&~15;
1427 "mov %4, %%"REG_a
" \n\t" 1428 "movq "MANGLE(mmx_ff)
", %%mm0 \n\t" 1429 "movq (%0, %%"REG_a
"), %%mm4 \n\t" 1430 "movq %%mm4, %%mm2 \n\t" 1431 "psllq $8, %%mm4 \n\t" 1432 "pand %%mm0, %%mm2 \n\t" 1433 "por %%mm2, %%mm4 \n\t" 1434 "movq (%1, %%"REG_a
"), %%mm5 \n\t" 1435 "movq %%mm5, %%mm3 \n\t" 1436 "psllq $8, %%mm5 \n\t" 1437 "pand %%mm0, %%mm3 \n\t" 1438 "por %%mm3, %%mm5 \n\t" 1440 "movq (%0, %%"REG_a
"), %%mm0 \n\t" 1441 "movq (%1, %%"REG_a
"), %%mm1 \n\t" 1442 "movq 1(%0, %%"REG_a
"), %%mm2 \n\t" 1443 "movq 1(%1, %%"REG_a
"), %%mm3 \n\t" 1444 PAVGB" %%mm0, %%mm5 \n\t" 1445 PAVGB" %%mm0, %%mm3 \n\t" 1446 PAVGB" %%mm0, %%mm5 \n\t" 1447 PAVGB" %%mm0, %%mm3 \n\t" 1448 PAVGB" %%mm1, %%mm4 \n\t" 1449 PAVGB" %%mm1, %%mm2 \n\t" 1450 PAVGB" %%mm1, %%mm4 \n\t" 1451 PAVGB" %%mm1, %%mm2 \n\t" 1452 "movq %%mm5, %%mm7 \n\t" 1453 "movq %%mm4, %%mm6 \n\t" 1454 "punpcklbw %%mm3, %%mm5 \n\t" 1455 "punpckhbw %%mm3, %%mm7 \n\t" 1456 "punpcklbw %%mm2, %%mm4 \n\t" 1457 "punpckhbw %%mm2, %%mm6 \n\t" 1458 MOVNTQ" %%mm5, (%2, %%"REG_a
", 2) \n\t" 1459 MOVNTQ" %%mm7, 8(%2, %%"REG_a
", 2) \n\t" 1460 MOVNTQ" %%mm4, (%3, %%"REG_a
", 2) \n\t" 1461 MOVNTQ" %%mm6, 8(%3, %%"REG_a
", 2) \n\t" 1462 "add $8, %%"REG_a
" \n\t" 1463 "movq -1(%0, %%"REG_a
"), %%mm4 \n\t" 1464 "movq -1(%1, %%"REG_a
"), %%mm5 \n\t" 1466 ::
"r" (
src + mmxSize ),
"r" (
src + srcStride + mmxSize ),
1467 "r" (
dst + mmxSize*2),
"r" (
dst + dstStride + mmxSize*2),
1472 for (x=mmxSize-1; x<srcWidth-1; x++) {
1473 dst[2*x +1]= (3*
src[x+0] +
src[x+srcStride+1])>>2;
1474 dst[2*x+dstStride+2]= (
src[x+0] + 3*
src[x+srcStride+1])>>2;
1475 dst[2*x+dstStride+1]= (
src[x+1] + 3*
src[x+srcStride ])>>2;
1476 dst[2*x +2]= (3*
src[x+1] +
src[x+srcStride ])>>2;
1478 dst[srcWidth*2 -1 ]= (3*
src[srcWidth-1] +
src[srcWidth-1 + srcStride])>>2;
1479 dst[srcWidth*2 -1 + dstStride]= (
src[srcWidth-1] + 3*
src[srcWidth-1 + srcStride])>>2;
1488 for (x=0; x<srcWidth-1; x++) {
1492 dst[2*srcWidth-1]=
src[srcWidth-1];
1494 __asm__
volatile(
EMMS" \n\t" 1500 #if !COMPILE_TEMPLATE_AMD3DNOW 1509 int lumStride,
int chromStride,
int srcStride)
1513 for (y=0; y<
height; y+=2) {
1515 "xor %%"REG_a
", %%"REG_a
" \n\t" 1516 "pcmpeqw %%mm7, %%mm7 \n\t" 1517 "psrlw $8, %%mm7 \n\t" 1520 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t" 1521 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" 1522 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" 1523 "movq %%mm0, %%mm2 \n\t" 1524 "movq %%mm1, %%mm3 \n\t" 1525 "pand %%mm7, %%mm0 \n\t" 1526 "pand %%mm7, %%mm1 \n\t" 1527 "psrlw $8, %%mm2 \n\t" 1528 "psrlw $8, %%mm3 \n\t" 1529 "packuswb %%mm1, %%mm0 \n\t" 1530 "packuswb %%mm3, %%mm2 \n\t" 1532 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t" 1534 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t" 1535 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t" 1536 "movq %%mm1, %%mm3 \n\t" 1537 "movq %%mm2, %%mm4 \n\t" 1538 "pand %%mm7, %%mm1 \n\t" 1539 "pand %%mm7, %%mm2 \n\t" 1540 "psrlw $8, %%mm3 \n\t" 1541 "psrlw $8, %%mm4 \n\t" 1542 "packuswb %%mm2, %%mm1 \n\t" 1543 "packuswb %%mm4, %%mm3 \n\t" 1545 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t" 1547 "movq %%mm0, %%mm2 \n\t" 1548 "movq %%mm1, %%mm3 \n\t" 1549 "psrlw $8, %%mm0 \n\t" 1550 "psrlw $8, %%mm1 \n\t" 1551 "pand %%mm7, %%mm2 \n\t" 1552 "pand %%mm7, %%mm3 \n\t" 1553 "packuswb %%mm1, %%mm0 \n\t" 1554 "packuswb %%mm3, %%mm2 \n\t" 1556 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t" 1557 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t" 1559 "add $8, %%"REG_a
" \n\t" 1560 "cmp %4, %%"REG_a
" \n\t" 1562 ::
"r"(
src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1563 :
"memory",
"%"REG_a
1570 "xor %%"REG_a
", %%"REG_a
" \n\t" 1573 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t" 1574 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" 1575 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" 1576 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t" 1577 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t" 1578 "psrlw $8, %%mm0 \n\t" 1579 "psrlw $8, %%mm1 \n\t" 1580 "psrlw $8, %%mm2 \n\t" 1581 "psrlw $8, %%mm3 \n\t" 1582 "packuswb %%mm1, %%mm0 \n\t" 1583 "packuswb %%mm3, %%mm2 \n\t" 1585 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t" 1586 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t" 1588 "add $8, %%"REG_a
" \n\t" 1589 "cmp %4, %%"REG_a
" \n\t" 1592 ::
"r"(
src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1593 :
"memory",
"%"REG_a
1595 udst += chromStride;
1596 vdst += chromStride;
1600 __asm__
volatile(
EMMS" \n\t" 1616 int lumStride,
int chromStride,
int srcStride,
1619 #define BGR2Y_IDX "16*4+16*32" 1620 #define BGR2U_IDX "16*4+16*33" 1621 #define BGR2V_IDX "16*4+16*34" 1624 for (y=0; y<
height-2; y+=2) {
1626 for (i=0; i<2; i++) {
1628 "mov %2, %%"REG_a
" \n\t" 1629 "movq "BGR2Y_IDX
"(%3), %%mm6 \n\t" 1630 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t" 1631 "pxor %%mm7, %%mm7 \n\t" 1632 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t" 1636 "movd (%0, %%"REG_d
"), %%mm0 \n\t" 1637 "movd 3(%0, %%"REG_d
"), %%mm1 \n\t" 1638 "punpcklbw %%mm7, %%mm0 \n\t" 1639 "punpcklbw %%mm7, %%mm1 \n\t" 1640 "movd 6(%0, %%"REG_d
"), %%mm2 \n\t" 1641 "movd 9(%0, %%"REG_d
"), %%mm3 \n\t" 1642 "punpcklbw %%mm7, %%mm2 \n\t" 1643 "punpcklbw %%mm7, %%mm3 \n\t" 1644 "pmaddwd %%mm6, %%mm0 \n\t" 1645 "pmaddwd %%mm6, %%mm1 \n\t" 1646 "pmaddwd %%mm6, %%mm2 \n\t" 1647 "pmaddwd %%mm6, %%mm3 \n\t" 1648 "psrad $8, %%mm0 \n\t" 1649 "psrad $8, %%mm1 \n\t" 1650 "psrad $8, %%mm2 \n\t" 1651 "psrad $8, %%mm3 \n\t" 1652 "packssdw %%mm1, %%mm0 \n\t" 1653 "packssdw %%mm3, %%mm2 \n\t" 1654 "pmaddwd %%mm5, %%mm0 \n\t" 1655 "pmaddwd %%mm5, %%mm2 \n\t" 1656 "packssdw %%mm2, %%mm0 \n\t" 1657 "psraw $7, %%mm0 \n\t" 1659 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t" 1660 "movd 15(%0, %%"REG_d
"), %%mm1 \n\t" 1661 "punpcklbw %%mm7, %%mm4 \n\t" 1662 "punpcklbw %%mm7, %%mm1 \n\t" 1663 "movd 18(%0, %%"REG_d
"), %%mm2 \n\t" 1664 "movd 21(%0, %%"REG_d
"), %%mm3 \n\t" 1665 "punpcklbw %%mm7, %%mm2 \n\t" 1666 "punpcklbw %%mm7, %%mm3 \n\t" 1667 "pmaddwd %%mm6, %%mm4 \n\t" 1668 "pmaddwd %%mm6, %%mm1 \n\t" 1669 "pmaddwd %%mm6, %%mm2 \n\t" 1670 "pmaddwd %%mm6, %%mm3 \n\t" 1671 "psrad $8, %%mm4 \n\t" 1672 "psrad $8, %%mm1 \n\t" 1673 "psrad $8, %%mm2 \n\t" 1674 "psrad $8, %%mm3 \n\t" 1675 "packssdw %%mm1, %%mm4 \n\t" 1676 "packssdw %%mm3, %%mm2 \n\t" 1677 "pmaddwd %%mm5, %%mm4 \n\t" 1678 "pmaddwd %%mm5, %%mm2 \n\t" 1679 "add $24, %%"REG_d
" \n\t" 1680 "packssdw %%mm2, %%mm4 \n\t" 1681 "psraw $7, %%mm4 \n\t" 1683 "packuswb %%mm4, %%mm0 \n\t" 1684 "paddusb "MANGLE(ff_bgr2YOffset)
", %%mm0 \n\t" 1686 MOVNTQ" %%mm0, (%1, %%"REG_a
") \n\t" 1687 "add $8, %%"REG_a
" \n\t" 1690 :
"%"REG_a,
"%"REG_d
1697 "mov %4, %%"REG_a
" \n\t" 1698 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t" 1699 "movq "BGR2U_IDX
"(%5), %%mm6 \n\t" 1700 "pxor %%mm7, %%mm7 \n\t" 1701 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t" 1702 "add %%"REG_d
", %%"REG_d
" \n\t" 1707 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW 1708 "movq (%0, %%"REG_d
"), %%mm0 \n\t" 1709 "movq (%1, %%"REG_d
"), %%mm1 \n\t" 1710 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t" 1711 "movq 6(%1, %%"REG_d
"), %%mm3 \n\t" 1712 PAVGB" %%mm1, %%mm0 \n\t" 1713 PAVGB" %%mm3, %%mm2 \n\t" 1714 "movq %%mm0, %%mm1 \n\t" 1715 "movq %%mm2, %%mm3 \n\t" 1716 "psrlq $24, %%mm0 \n\t" 1717 "psrlq $24, %%mm2 \n\t" 1718 PAVGB" %%mm1, %%mm0 \n\t" 1719 PAVGB" %%mm3, %%mm2 \n\t" 1720 "punpcklbw %%mm7, %%mm0 \n\t" 1721 "punpcklbw %%mm7, %%mm2 \n\t" 1723 "movd (%0, %%"REG_d
"), %%mm0 \n\t" 1724 "movd (%1, %%"REG_d
"), %%mm1 \n\t" 1725 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t" 1726 "movd 3(%1, %%"REG_d
"), %%mm3 \n\t" 1727 "punpcklbw %%mm7, %%mm0 \n\t" 1728 "punpcklbw %%mm7, %%mm1 \n\t" 1729 "punpcklbw %%mm7, %%mm2 \n\t" 1730 "punpcklbw %%mm7, %%mm3 \n\t" 1731 "paddw %%mm1, %%mm0 \n\t" 1732 "paddw %%mm3, %%mm2 \n\t" 1733 "paddw %%mm2, %%mm0 \n\t" 1734 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t" 1735 "movd 6(%1, %%"REG_d
"), %%mm1 \n\t" 1736 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t" 1737 "movd 9(%1, %%"REG_d
"), %%mm3 \n\t" 1738 "punpcklbw %%mm7, %%mm4 \n\t" 1739 "punpcklbw %%mm7, %%mm1 \n\t" 1740 "punpcklbw %%mm7, %%mm2 \n\t" 1741 "punpcklbw %%mm7, %%mm3 \n\t" 1742 "paddw %%mm1, %%mm4 \n\t" 1743 "paddw %%mm3, %%mm2 \n\t" 1744 "paddw %%mm4, %%mm2 \n\t" 1745 "psrlw $2, %%mm0 \n\t" 1746 "psrlw $2, %%mm2 \n\t" 1748 "movq "BGR2V_IDX
"(%5), %%mm1 \n\t" 1749 "movq "BGR2V_IDX
"(%5), %%mm3 \n\t" 1751 "pmaddwd %%mm0, %%mm1 \n\t" 1752 "pmaddwd %%mm2, %%mm3 \n\t" 1753 "pmaddwd %%mm6, %%mm0 \n\t" 1754 "pmaddwd %%mm6, %%mm2 \n\t" 1755 "psrad $8, %%mm0 \n\t" 1756 "psrad $8, %%mm1 \n\t" 1757 "psrad $8, %%mm2 \n\t" 1758 "psrad $8, %%mm3 \n\t" 1759 "packssdw %%mm2, %%mm0 \n\t" 1760 "packssdw %%mm3, %%mm1 \n\t" 1761 "pmaddwd %%mm5, %%mm0 \n\t" 1762 "pmaddwd %%mm5, %%mm1 \n\t" 1763 "packssdw %%mm1, %%mm0 \n\t" 1764 "psraw $7, %%mm0 \n\t" 1766 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW 1767 "movq 12(%0, %%"REG_d
"), %%mm4 \n\t" 1768 "movq 12(%1, %%"REG_d
"), %%mm1 \n\t" 1769 "movq 18(%0, %%"REG_d
"), %%mm2 \n\t" 1770 "movq 18(%1, %%"REG_d
"), %%mm3 \n\t" 1771 PAVGB" %%mm1, %%mm4 \n\t" 1772 PAVGB" %%mm3, %%mm2 \n\t" 1773 "movq %%mm4, %%mm1 \n\t" 1774 "movq %%mm2, %%mm3 \n\t" 1775 "psrlq $24, %%mm4 \n\t" 1776 "psrlq $24, %%mm2 \n\t" 1777 PAVGB" %%mm1, %%mm4 \n\t" 1778 PAVGB" %%mm3, %%mm2 \n\t" 1779 "punpcklbw %%mm7, %%mm4 \n\t" 1780 "punpcklbw %%mm7, %%mm2 \n\t" 1782 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t" 1783 "movd 12(%1, %%"REG_d
"), %%mm1 \n\t" 1784 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t" 1785 "movd 15(%1, %%"REG_d
"), %%mm3 \n\t" 1786 "punpcklbw %%mm7, %%mm4 \n\t" 1787 "punpcklbw %%mm7, %%mm1 \n\t" 1788 "punpcklbw %%mm7, %%mm2 \n\t" 1789 "punpcklbw %%mm7, %%mm3 \n\t" 1790 "paddw %%mm1, %%mm4 \n\t" 1791 "paddw %%mm3, %%mm2 \n\t" 1792 "paddw %%mm2, %%mm4 \n\t" 1793 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t" 1794 "movd 18(%1, %%"REG_d
"), %%mm1 \n\t" 1795 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t" 1796 "movd 21(%1, %%"REG_d
"), %%mm3 \n\t" 1797 "punpcklbw %%mm7, %%mm5 \n\t" 1798 "punpcklbw %%mm7, %%mm1 \n\t" 1799 "punpcklbw %%mm7, %%mm2 \n\t" 1800 "punpcklbw %%mm7, %%mm3 \n\t" 1801 "paddw %%mm1, %%mm5 \n\t" 1802 "paddw %%mm3, %%mm2 \n\t" 1803 "paddw %%mm5, %%mm2 \n\t" 1804 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t" 1805 "psrlw $2, %%mm4 \n\t" 1806 "psrlw $2, %%mm2 \n\t" 1808 "movq "BGR2V_IDX
"(%5), %%mm1 \n\t" 1809 "movq "BGR2V_IDX
"(%5), %%mm3 \n\t" 1811 "pmaddwd %%mm4, %%mm1 \n\t" 1812 "pmaddwd %%mm2, %%mm3 \n\t" 1813 "pmaddwd %%mm6, %%mm4 \n\t" 1814 "pmaddwd %%mm6, %%mm2 \n\t" 1815 "psrad $8, %%mm4 \n\t" 1816 "psrad $8, %%mm1 \n\t" 1817 "psrad $8, %%mm2 \n\t" 1818 "psrad $8, %%mm3 \n\t" 1819 "packssdw %%mm2, %%mm4 \n\t" 1820 "packssdw %%mm3, %%mm1 \n\t" 1821 "pmaddwd %%mm5, %%mm4 \n\t" 1822 "pmaddwd %%mm5, %%mm1 \n\t" 1823 "add $24, %%"REG_d
" \n\t" 1824 "packssdw %%mm1, %%mm4 \n\t" 1825 "psraw $7, %%mm4 \n\t" 1827 "movq %%mm0, %%mm1 \n\t" 1828 "punpckldq %%mm4, %%mm0 \n\t" 1829 "punpckhdq %%mm4, %%mm1 \n\t" 1830 "packsswb %%mm1, %%mm0 \n\t" 1831 "paddb "MANGLE(ff_bgr2UVOffset)
", %%mm0 \n\t" 1832 "movd %%mm0, (%2, %%"REG_a
") \n\t" 1833 "punpckhdq %%mm0, %%mm0 \n\t" 1834 "movd %%mm0, (%3, %%"REG_a
") \n\t" 1835 "add $4, %%"REG_a
" \n\t" 1837 : :
"r" (
src+chromWidth*6),
"r" (
src+srcStride+chromWidth*6),
"r" (udst+chromWidth),
"r" (vdst+chromWidth),
"g" (-chromWidth),
"r"(rgb2yuv)
1838 :
"%"REG_a,
"%"REG_d
1841 udst += chromStride;
1842 vdst += chromStride;
1846 __asm__
volatile(
EMMS" \n\t" 1855 #if !COMPILE_TEMPLATE_AMD3DNOW 1858 int src2Stride,
int dstStride)
1862 for (h=0; h <
height; h++) {
1865 #if COMPILE_TEMPLATE_SSE2 1867 "xor %%"REG_a
", %%"REG_a
" \n\t" 1871 "movdqa (%1, %%"REG_a
"), %%xmm0 \n\t" 1872 "movdqa (%1, %%"REG_a
"), %%xmm1 \n\t" 1873 "movdqa (%2, %%"REG_a
"), %%xmm2 \n\t" 1874 "punpcklbw %%xmm2, %%xmm0 \n\t" 1875 "punpckhbw %%xmm2, %%xmm1 \n\t" 1876 "movntdq %%xmm0, (%0, %%"REG_a
", 2) \n\t" 1877 "movntdq %%xmm1, 16(%0, %%"REG_a
", 2) \n\t" 1878 "add $16, %%"REG_a
" \n\t" 1879 "cmp %3, %%"REG_a
" \n\t" 1882 :
"memory",
"%"REG_a
"" 1886 "xor %%"REG_a
", %%"REG_a
" \n\t" 1890 "movq (%1, %%"REG_a
"), %%mm0 \n\t" 1891 "movq 8(%1, %%"REG_a
"), %%mm2 \n\t" 1892 "movq %%mm0, %%mm1 \n\t" 1893 "movq %%mm2, %%mm3 \n\t" 1894 "movq (%2, %%"REG_a
"), %%mm4 \n\t" 1895 "movq 8(%2, %%"REG_a
"), %%mm5 \n\t" 1896 "punpcklbw %%mm4, %%mm0 \n\t" 1897 "punpckhbw %%mm4, %%mm1 \n\t" 1898 "punpcklbw %%mm5, %%mm2 \n\t" 1899 "punpckhbw %%mm5, %%mm3 \n\t" 1900 MOVNTQ" %%mm0, (%0, %%"REG_a
", 2) \n\t" 1901 MOVNTQ" %%mm1, 8(%0, %%"REG_a
", 2) \n\t" 1902 MOVNTQ" %%mm2, 16(%0, %%"REG_a
", 2) \n\t" 1903 MOVNTQ" %%mm3, 24(%0, %%"REG_a
", 2) \n\t" 1904 "add $16, %%"REG_a
" \n\t" 1905 "cmp %3, %%"REG_a
" \n\t" 1908 :
"memory",
"%"REG_a
1912 dest[2*w+0] = src1[
w];
1913 dest[2*w+1] = src2[
w];
1927 #if !COMPILE_TEMPLATE_SSE2 1928 #if !COMPILE_TEMPLATE_AMD3DNOW 1932 int srcStride1,
int srcStride2,
1933 int dstStride1,
int dstStride2)
1941 ::
"m"(*(src1+srcStride1)),
"m"(*(src2+srcStride2)):
"memory");
1943 const uint8_t*
s1=src1+srcStride1*(y>>1);
1946 for (;x<w-31;x+=32) {
1949 "movq (%1,%2), %%mm0 \n\t" 1950 "movq 8(%1,%2), %%mm2 \n\t" 1951 "movq 16(%1,%2), %%mm4 \n\t" 1952 "movq 24(%1,%2), %%mm6 \n\t" 1953 "movq %%mm0, %%mm1 \n\t" 1954 "movq %%mm2, %%mm3 \n\t" 1955 "movq %%mm4, %%mm5 \n\t" 1956 "movq %%mm6, %%mm7 \n\t" 1957 "punpcklbw %%mm0, %%mm0 \n\t" 1958 "punpckhbw %%mm1, %%mm1 \n\t" 1959 "punpcklbw %%mm2, %%mm2 \n\t" 1960 "punpckhbw %%mm3, %%mm3 \n\t" 1961 "punpcklbw %%mm4, %%mm4 \n\t" 1962 "punpckhbw %%mm5, %%mm5 \n\t" 1963 "punpcklbw %%mm6, %%mm6 \n\t" 1964 "punpckhbw %%mm7, %%mm7 \n\t" 1965 MOVNTQ" %%mm0, (%0,%2,2) \n\t" 1966 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" 1967 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" 1968 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" 1969 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" 1970 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" 1971 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" 1972 MOVNTQ" %%mm7, 56(%0,%2,2)" 1973 ::
"r"(
d),
"r"(s1),
"r"(
x)
1976 for (;x<
w;x++) d[2*x]=d[2*x+1]=s1[x];
1979 const uint8_t*
s2=src2+srcStride2*(y>>1);
1982 for (;x<w-31;x+=32) {
1985 "movq (%1,%2), %%mm0 \n\t" 1986 "movq 8(%1,%2), %%mm2 \n\t" 1987 "movq 16(%1,%2), %%mm4 \n\t" 1988 "movq 24(%1,%2), %%mm6 \n\t" 1989 "movq %%mm0, %%mm1 \n\t" 1990 "movq %%mm2, %%mm3 \n\t" 1991 "movq %%mm4, %%mm5 \n\t" 1992 "movq %%mm6, %%mm7 \n\t" 1993 "punpcklbw %%mm0, %%mm0 \n\t" 1994 "punpckhbw %%mm1, %%mm1 \n\t" 1995 "punpcklbw %%mm2, %%mm2 \n\t" 1996 "punpckhbw %%mm3, %%mm3 \n\t" 1997 "punpcklbw %%mm4, %%mm4 \n\t" 1998 "punpckhbw %%mm5, %%mm5 \n\t" 1999 "punpcklbw %%mm6, %%mm6 \n\t" 2000 "punpckhbw %%mm7, %%mm7 \n\t" 2001 MOVNTQ" %%mm0, (%0,%2,2) \n\t" 2002 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" 2003 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" 2004 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" 2005 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" 2006 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" 2007 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" 2008 MOVNTQ" %%mm7, 56(%0,%2,2)" 2009 ::
"r"(
d),
"r"(s2),
"r"(
x)
2012 for (;x<
w;x++) d[2*x]=d[2*x+1]=s2[x];
2024 int srcStride1,
int srcStride2,
2025 int srcStride3,
int dstStride)
2031 const uint8_t* yp=src1+srcStride1*
y;
2032 const uint8_t* up=src2+srcStride2*(y>>2);
2033 const uint8_t* vp=src3+srcStride3*(y>>2);
2041 "movq (%1, %0, 4), %%mm0 \n\t" 2042 "movq (%2, %0), %%mm1 \n\t" 2043 "movq (%3, %0), %%mm2 \n\t" 2044 "movq %%mm0, %%mm3 \n\t" 2045 "movq %%mm1, %%mm4 \n\t" 2046 "movq %%mm2, %%mm5 \n\t" 2047 "punpcklbw %%mm1, %%mm1 \n\t" 2048 "punpcklbw %%mm2, %%mm2 \n\t" 2049 "punpckhbw %%mm4, %%mm4 \n\t" 2050 "punpckhbw %%mm5, %%mm5 \n\t" 2052 "movq %%mm1, %%mm6 \n\t" 2053 "punpcklbw %%mm2, %%mm1 \n\t" 2054 "punpcklbw %%mm1, %%mm0 \n\t" 2055 "punpckhbw %%mm1, %%mm3 \n\t" 2056 MOVNTQ" %%mm0, (%4, %0, 8) \n\t" 2057 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t" 2059 "punpckhbw %%mm2, %%mm6 \n\t" 2060 "movq 8(%1, %0, 4), %%mm0 \n\t" 2061 "movq %%mm0, %%mm3 \n\t" 2062 "punpcklbw %%mm6, %%mm0 \n\t" 2063 "punpckhbw %%mm6, %%mm3 \n\t" 2064 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t" 2065 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t" 2067 "movq %%mm4, %%mm6 \n\t" 2068 "movq 16(%1, %0, 4), %%mm0 \n\t" 2069 "movq %%mm0, %%mm3 \n\t" 2070 "punpcklbw %%mm5, %%mm4 \n\t" 2071 "punpcklbw %%mm4, %%mm0 \n\t" 2072 "punpckhbw %%mm4, %%mm3 \n\t" 2073 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t" 2074 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t" 2076 "punpckhbw %%mm5, %%mm6 \n\t" 2077 "movq 24(%1, %0, 4), %%mm0 \n\t" 2078 "movq %%mm0, %%mm3 \n\t" 2079 "punpcklbw %%mm6, %%mm0 \n\t" 2080 "punpckhbw %%mm6, %%mm3 \n\t" 2081 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t" 2082 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t" 2085 :
"r"(yp),
"r" (up),
"r"(vp),
"r"(
d)
2089 const int x2 = x<<2;
2092 d[8*x+2] = yp[x2+1];
2094 d[8*x+4] = yp[x2+2];
2096 d[8*x+6] = yp[x2+3];
2117 "pcmpeqw %%mm7, %%mm7 \n\t" 2118 "psrlw $8, %%mm7 \n\t" 2120 "movq -30(%1, %0, 2), %%mm0 \n\t" 2121 "movq -22(%1, %0, 2), %%mm1 \n\t" 2122 "movq -14(%1, %0, 2), %%mm2 \n\t" 2123 "movq -6(%1, %0, 2), %%mm3 \n\t" 2124 "pand %%mm7, %%mm0 \n\t" 2125 "pand %%mm7, %%mm1 \n\t" 2126 "pand %%mm7, %%mm2 \n\t" 2127 "pand %%mm7, %%mm3 \n\t" 2128 "packuswb %%mm1, %%mm0 \n\t" 2129 "packuswb %%mm3, %%mm2 \n\t" 2130 MOVNTQ" %%mm0,-15(%2, %0) \n\t" 2131 MOVNTQ" %%mm2,- 7(%2, %0) \n\t" 2145 #if !COMPILE_TEMPLATE_AMD3DNOW 2155 "pcmpeqw %%mm7, %%mm7 \n\t" 2156 "psrlw $8, %%mm7 \n\t" 2158 "movq -28(%1, %0, 4), %%mm0 \n\t" 2159 "movq -20(%1, %0, 4), %%mm1 \n\t" 2160 "movq -12(%1, %0, 4), %%mm2 \n\t" 2161 "movq -4(%1, %0, 4), %%mm3 \n\t" 2162 "pand %%mm7, %%mm0 \n\t" 2163 "pand %%mm7, %%mm1 \n\t" 2164 "pand %%mm7, %%mm2 \n\t" 2165 "pand %%mm7, %%mm3 \n\t" 2166 "packuswb %%mm1, %%mm0 \n\t" 2167 "packuswb %%mm3, %%mm2 \n\t" 2168 "movq %%mm0, %%mm1 \n\t" 2169 "movq %%mm2, %%mm3 \n\t" 2170 "psrlw $8, %%mm0 \n\t" 2171 "psrlw $8, %%mm2 \n\t" 2172 "pand %%mm7, %%mm1 \n\t" 2173 "pand %%mm7, %%mm3 \n\t" 2174 "packuswb %%mm2, %%mm0 \n\t" 2175 "packuswb %%mm3, %%mm1 \n\t" 2176 MOVNTQ" %%mm0,- 7(%3, %0) \n\t" 2177 MOVNTQ" %%mm1,- 7(%2, %0) \n\t" 2181 :
"r"(
src),
"r"(dst0),
"r"(dst1)
2204 "pcmpeqw %%mm7, %%mm7 \n\t" 2205 "psrlw $8, %%mm7 \n\t" 2207 "movq -28(%1, %0, 4), %%mm0 \n\t" 2208 "movq -20(%1, %0, 4), %%mm1 \n\t" 2209 "movq -12(%1, %0, 4), %%mm2 \n\t" 2210 "movq -4(%1, %0, 4), %%mm3 \n\t" 2211 PAVGB" -28(%2, %0, 4), %%mm0 \n\t" 2212 PAVGB" -20(%2, %0, 4), %%mm1 \n\t" 2213 PAVGB" -12(%2, %0, 4), %%mm2 \n\t" 2214 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" 2215 "pand %%mm7, %%mm0 \n\t" 2216 "pand %%mm7, %%mm1 \n\t" 2217 "pand %%mm7, %%mm2 \n\t" 2218 "pand %%mm7, %%mm3 \n\t" 2219 "packuswb %%mm1, %%mm0 \n\t" 2220 "packuswb %%mm3, %%mm2 \n\t" 2221 "movq %%mm0, %%mm1 \n\t" 2222 "movq %%mm2, %%mm3 \n\t" 2223 "psrlw $8, %%mm0 \n\t" 2224 "psrlw $8, %%mm2 \n\t" 2225 "pand %%mm7, %%mm1 \n\t" 2226 "pand %%mm7, %%mm3 \n\t" 2227 "packuswb %%mm2, %%mm0 \n\t" 2228 "packuswb %%mm3, %%mm1 \n\t" 2229 MOVNTQ" %%mm0,- 7(%4, %0) \n\t" 2230 MOVNTQ" %%mm1,- 7(%3, %0) \n\t" 2234 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
2246 #if !COMPILE_TEMPLATE_AMD3DNOW 2256 "pcmpeqw %%mm7, %%mm7 \n\t" 2257 "psrlw $8, %%mm7 \n\t" 2259 "movq -28(%1, %0, 4), %%mm0 \n\t" 2260 "movq -20(%1, %0, 4), %%mm1 \n\t" 2261 "movq -12(%1, %0, 4), %%mm2 \n\t" 2262 "movq -4(%1, %0, 4), %%mm3 \n\t" 2263 "psrlw $8, %%mm0 \n\t" 2264 "psrlw $8, %%mm1 \n\t" 2265 "psrlw $8, %%mm2 \n\t" 2266 "psrlw $8, %%mm3 \n\t" 2267 "packuswb %%mm1, %%mm0 \n\t" 2268 "packuswb %%mm3, %%mm2 \n\t" 2269 "movq %%mm0, %%mm1 \n\t" 2270 "movq %%mm2, %%mm3 \n\t" 2271 "psrlw $8, %%mm0 \n\t" 2272 "psrlw $8, %%mm2 \n\t" 2273 "pand %%mm7, %%mm1 \n\t" 2274 "pand %%mm7, %%mm3 \n\t" 2275 "packuswb %%mm2, %%mm0 \n\t" 2276 "packuswb %%mm3, %%mm1 \n\t" 2277 MOVNTQ" %%mm0,- 7(%3, %0) \n\t" 2278 MOVNTQ" %%mm1,- 7(%2, %0) \n\t" 2282 :
"r"(
src),
"r"(dst0),
"r"(dst1)
2306 "pcmpeqw %%mm7, %%mm7 \n\t" 2307 "psrlw $8, %%mm7 \n\t" 2309 "movq -28(%1, %0, 4), %%mm0 \n\t" 2310 "movq -20(%1, %0, 4), %%mm1 \n\t" 2311 "movq -12(%1, %0, 4), %%mm2 \n\t" 2312 "movq -4(%1, %0, 4), %%mm3 \n\t" 2313 PAVGB" -28(%2, %0, 4), %%mm0 \n\t" 2314 PAVGB" -20(%2, %0, 4), %%mm1 \n\t" 2315 PAVGB" -12(%2, %0, 4), %%mm2 \n\t" 2316 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" 2317 "psrlw $8, %%mm0 \n\t" 2318 "psrlw $8, %%mm1 \n\t" 2319 "psrlw $8, %%mm2 \n\t" 2320 "psrlw $8, %%mm3 \n\t" 2321 "packuswb %%mm1, %%mm0 \n\t" 2322 "packuswb %%mm3, %%mm2 \n\t" 2323 "movq %%mm0, %%mm1 \n\t" 2324 "movq %%mm2, %%mm3 \n\t" 2325 "psrlw $8, %%mm0 \n\t" 2326 "psrlw $8, %%mm2 \n\t" 2327 "pand %%mm7, %%mm1 \n\t" 2328 "pand %%mm7, %%mm3 \n\t" 2329 "packuswb %%mm2, %%mm0 \n\t" 2330 "packuswb %%mm3, %%mm1 \n\t" 2331 MOVNTQ" %%mm0,- 7(%4, %0) \n\t" 2332 MOVNTQ" %%mm1,- 7(%3, %0) \n\t" 2336 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
2352 int lumStride,
int chromStride,
int srcStride)
2355 const int chromWidth= -((-
width)>>1);
2357 for (y=0; y<
height; y++) {
2375 #if !COMPILE_TEMPLATE_AMD3DNOW 2378 int lumStride,
int chromStride,
int srcStride)
2381 const int chromWidth= -((-
width)>>1);
2383 for (y=0; y<
height; y++) {
2402 int lumStride,
int chromStride,
int srcStride)
2405 const int chromWidth= -((-
width)>>1);
2407 for (y=0; y<
height; y++) {
2425 #if !COMPILE_TEMPLATE_AMD3DNOW 2428 int lumStride,
int chromStride,
int srcStride)
2431 const int chromWidth= -((-
width)>>1);
2433 for (y=0; y<
height; y++) {
2453 #if !COMPILE_TEMPLATE_SSE2 2454 #if !COMPILE_TEMPLATE_AMD3DNOW 2484 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW 2495 #if !COMPILE_TEMPLATE_AMD3DNOW static void RENAME() rgb32tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yuyvtoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() vu9_to_vu12(const uint8_t *src1, const uint8_t *src2, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride1, int srcStride2, int dstStride1, int dstStride2)
static void RENAME() uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv)
Height should be a multiple of 2 and width should be a multiple of 2.
static void RENAME() rgb16tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb32tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb24tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yuv422ptoyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
static void RENAME() extract_even2(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() rgb15to32(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 16.
output residual component w
static void RENAME() yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 16.
static void RENAME() rgb24tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() extract_odd2avg(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() rgb24tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() uyvytoyuv420(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() shuffle_bytes_2103(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yv12touyvy(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16 (If this is a problem for anyon...
static void RENAME() yuvPlanartouyvy(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
void(* ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv)
Height should be a multiple of 2 and width should be a multiple of 2.
static void RENAME() rgb24to15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb15to16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb16to15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb32to16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() extract_even2avg(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() rgb2rgb_init(void)
static void RENAME() yuv422ptouyvy(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
typedef void(RENAME(mix_any_func_type))
static void RENAME() yuvPlanartoyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
static void RENAME() rgb16to32(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb15tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() extract_even(const uint8_t *src, uint8_t *dst, x86_reg count)
BYTE int const BYTE int int int height
synthesis window for stochastic i
static void RENAME() rgb32tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yuyvtoyuv420(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() yvu9_to_yuy2(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, uint8_t *dst, int width, int height, int srcStride1, int srcStride2, int srcStride3, int dstStride)
static void RENAME() extract_odd2(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() rgb24tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16.
else dst[i][x+y *dst_stride[i]]
void(* planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride)
static void RENAME() rgb24to16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb32to15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() interleaveBytes(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, int width, int height, int src1Stride, int src2Stride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 2.