#if COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)

#if !COMPILE_TEMPLATE_MMXEXT
    __asm__ volatile(
        "pxor      %%mm0, %%mm0\n\t"
        "movq       (%0), %%mm3\n\t"
        "movq      %%mm3, %%mm4\n\t"
        "psrlq       $24, %%mm3\n\t"
        "psllq       $40, %%mm4\n\t"
        "por       %%mm4, %%mm3\n\t"
        "movq      %%mm3, %%mm4\n\t"
        "punpcklbw %%mm0, %%mm3\n\t"
        "punpckhbw %%mm0, %%mm4\n\t"

    __asm__ volatile(
        "pxor      %%mm0, %%mm0\n\t"
        "movq       (%0), %%mm3\n\t"
        "movq      %%mm3, %%mm4\n\t"
        "punpcklbw %%mm0, %%mm3\n\t"
        "punpckhbw %%mm0, %%mm4\n\t"

        "punpcklwd %%mm1, %%mm1\n\t"
        "punpckldq %%mm1, %%mm1\n\t"
        "paddw     %%mm1, %%mm3\n\t"
        "paddw     %%mm1, %%mm4\n\t"

        "movq      %%mm3, %%mm6\n\t"
        "movq      %%mm4, %%mm7\n\t"

        "mov %0, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" \
        "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" \
        "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" \
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "psraw $3, %%mm3 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm3 \n\t"
        MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
        "add $8, %%"REG_c" \n\t"\
        "cmp %2, %%"REG_c" \n\t"\
        "movq %%mm6, %%mm3\n\t"
        "movq %%mm7, %%mm4\n\t"
        "mov %0, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        : "%"REG_d, "%"REG_S, "%"REG_c
#define YSCALEYUV2PACKEDX_UV \
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "movq 8(%%"REG_d"), %%mm0 \n\t" \
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" \
    "add %6, %%"REG_S" \n\t" \
    "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" \
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" \
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" \
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" \
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW_reg), "m"(uv_off) \
    : "%"REG_a, "%"REG_d, "%"REG_S \

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    "xor %%"REG_a", %%"REG_a" \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" \
    "add %6, %%"REG_S" \n\t" \
    "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" \
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" \
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" \
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "add %6, %%"REG_S" \n\t" \
    "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" \
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" \
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" \
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" \
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" \
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" \
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" \
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" \
    "movq %%mm3, %%mm2 \n\t" \
    "movq %%mm4, %%mm5 \n\t" \
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" \
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" \
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" \
    "movq "#r", "#t" \n\t" \
    "punpcklbw "#g", "#b" \n\t" \
    "punpcklbw "#a", "#r" \n\t" \
    "punpckhbw "#g", "#q2" \n\t" \
    "punpckhbw "#a", "#t" \n\t" \
    "movq "#b", "#q0" \n\t" \
    "movq "#q2", "#q3" \n\t" \
    "punpcklwd "#r", "#q0" \n\t" \
    "punpckhwd "#r", "#b" \n\t" \
    "punpcklwd "#t", "#q2" \n\t" \
    "punpckhwd "#t", "#q3" \n\t" \
    MOVNTQ( q0,   (dst, index, 4))\
    MOVNTQ(  b,  8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\

#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)

                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc,
                              int chrFilterSize, const int16_t **alpSrc,
353 "movq %%mm2, "U_TEMP"(%0) \n\t" 354 "movq %%mm4, "V_TEMP"(%0) \n\t" 355 "movq %%mm5, "Y_TEMP"(%0) \n\t" 357 "movq "Y_TEMP"(%0), %%mm5 \n\t" 358 "psraw $3, %%mm1 \n\t" 359 "psraw $3, %%mm7 \n\t" 360 "packuswb %%mm7, %%mm1 \n\t" 361 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
366 "pcmpeqd %%mm7, %%mm7 \n\t" 367 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc,
                              int chrFilterSize, const int16_t **alpSrc,

        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" \
    "pand "MANGLE(bFC)", %%mm4 \n\t" \
    "pand "MANGLE(bF8)", %%mm5 \n\t" \
    "psrlq $3, %%mm2 \n\t"\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\

#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)

                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc,
                              int chrFilterSize, const int16_t **alpSrc,
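/*
 * WRITERGB16 above packs 8-bit channels into RGB565: two of the channels keep
 * their top five bits (bF8 mask), the third its top six (bFC mask), and the
 * fields are merged into one 16-bit word per pixel. A scalar sketch of one
 * common 565 packing (variable names illustrative; the macro's channel order
 * follows the destination pixel format):
 */
#if 0
static inline uint16_t pack_rgb565_sketch(uint8_t r, uint8_t g, uint8_t b)
{
    return ((r & 0xF8) << 8) |   /* 5 bits -> bits 15..11 */
           ((g & 0xFC) << 3) |   /* 6 bits -> bits 10..5  */
           ( b          >> 3);   /* 5 bits -> bits 4..0   */
}
#endif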
442 "pxor %%mm7, %%mm7 \n\t" 454 const int16_t **lumSrc,
int lumFilterSize,
455 const int16_t *chrFilter,
const int16_t **chrUSrc,
456 const int16_t **chrVSrc,
457 int chrFilterSize,
const int16_t **alpSrc,
466 "pxor %%mm7, %%mm7 \n\t" 477 #define REAL_WRITERGB15(dst, dstw, index) \ 478 "pand "MANGLE(bF8)", %%mm2 \n\t" \ 479 "pand "MANGLE(bF8)", %%mm4 \n\t" \ 480 "pand "MANGLE(bF8)", %%mm5 \n\t" \ 481 "psrlq $3, %%mm2 \n\t"\ 482 "psrlq $1, %%mm5 \n\t"\ 484 "movq %%mm2, %%mm1 \n\t"\ 485 "movq %%mm4, %%mm3 \n\t"\ 487 "punpcklbw %%mm7, %%mm3 \n\t"\ 488 "punpcklbw %%mm5, %%mm2 \n\t"\ 489 "punpckhbw %%mm7, %%mm4 \n\t"\ 490 "punpckhbw %%mm5, %%mm1 \n\t"\ 492 "psllq $2, %%mm3 \n\t"\ 493 "psllq $2, %%mm4 \n\t"\ 495 "por %%mm3, %%mm2 \n\t"\ 496 "por %%mm4, %%mm1 \n\t"\ 498 MOVNTQ(%%mm2, (dst, index, 2))\ 499 MOVNTQ(%%mm1, 8(dst, index, 2))\ 501 "add $8, "#index" \n\t"\ 502 "cmp "#dstw", "#index" \n\t"\ 504 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) 507 const int16_t **lumSrc,
int lumFilterSize,
508 const int16_t *chrFilter,
const int16_t **chrUSrc,
509 const int16_t **chrVSrc,
510 int chrFilterSize,
const int16_t **alpSrc,
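/*
 * WRITERGB15 above is the RGB555 variant of the same packing: all three
 * channels are masked to their top five bits (bF8) and one bit per pixel is
 * left unused, e.g. ((r & 0xF8) << 7) | ((g & 0xF8) << 2) | (b >> 3) in a
 * scalar sketch (channel order again depends on the destination format).
 */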
519 "pxor %%mm7, %%mm7 \n\t" 531 const int16_t **lumSrc,
int lumFilterSize,
532 const int16_t *chrFilter,
const int16_t **chrUSrc,
533 const int16_t **chrVSrc,
534 int chrFilterSize,
const int16_t **alpSrc,
543 "pxor %%mm7, %%mm7 \n\t" 554 #define WRITEBGR24MMX(dst, dstw, index) \ 556 "movq %%mm2, %%mm1 \n\t" \ 557 "movq %%mm5, %%mm6 \n\t" \ 558 "punpcklbw %%mm4, %%mm2 \n\t" \ 559 "punpcklbw %%mm7, %%mm5 \n\t" \ 560 "punpckhbw %%mm4, %%mm1 \n\t" \ 561 "punpckhbw %%mm7, %%mm6 \n\t" \ 562 "movq %%mm2, %%mm0 \n\t" \ 563 "movq %%mm1, %%mm3 \n\t" \ 564 "punpcklwd %%mm5, %%mm0 \n\t" \ 565 "punpckhwd %%mm5, %%mm2 \n\t" \ 566 "punpcklwd %%mm6, %%mm1 \n\t" \ 567 "punpckhwd %%mm6, %%mm3 \n\t" \ 569 "movq %%mm0, %%mm4 \n\t" \ 570 "movq %%mm2, %%mm6 \n\t" \ 571 "movq %%mm1, %%mm5 \n\t" \ 572 "movq %%mm3, %%mm7 \n\t" \ 574 "psllq $40, %%mm0 \n\t" \ 575 "psllq $40, %%mm2 \n\t" \ 576 "psllq $40, %%mm1 \n\t" \ 577 "psllq $40, %%mm3 \n\t" \ 579 "punpckhdq %%mm4, %%mm0 \n\t" \ 580 "punpckhdq %%mm6, %%mm2 \n\t" \ 581 "punpckhdq %%mm5, %%mm1 \n\t" \ 582 "punpckhdq %%mm7, %%mm3 \n\t" \ 584 "psrlq $8, %%mm0 \n\t" \ 585 "movq %%mm2, %%mm6 \n\t" \ 586 "psllq $40, %%mm2 \n\t" \ 587 "por %%mm2, %%mm0 \n\t" \ 588 MOVNTQ(%%mm0, (dst))\ 590 "psrlq $24, %%mm6 \n\t" \ 591 "movq %%mm1, %%mm5 \n\t" \ 592 "psllq $24, %%mm1 \n\t" \ 593 "por %%mm1, %%mm6 \n\t" \ 594 MOVNTQ(%%mm6, 8(dst))\ 596 "psrlq $40, %%mm5 \n\t" \ 597 "psllq $8, %%mm3 \n\t" \ 598 "por %%mm3, %%mm5 \n\t" \ 599 MOVNTQ(%%mm5, 16(dst))\ 601 "add $24, "#dst" \n\t"\ 603 "add $8, "#index" \n\t"\ 604 "cmp "#dstw", "#index" \n\t"\ 607 #define WRITEBGR24MMXEXT(dst, dstw, index) \ 609 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ 610 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\ 611 "pshufw $0x50, %%mm2, %%mm1 \n\t" \ 612 "pshufw $0x50, %%mm4, %%mm3 \n\t" \ 613 "pshufw $0x00, %%mm5, %%mm6 \n\t" \ 615 "pand %%mm0, %%mm1 \n\t" \ 616 "pand %%mm0, %%mm3 \n\t" \ 617 "pand %%mm7, %%mm6 \n\t" \ 619 "psllq $8, %%mm3 \n\t" \ 620 "por %%mm1, %%mm6 \n\t"\ 621 "por %%mm3, %%mm6 \n\t"\ 622 MOVNTQ(%%mm6, (dst))\ 624 "psrlq $8, %%mm4 \n\t" \ 625 "pshufw $0xA5, %%mm2, %%mm1 \n\t" \ 626 "pshufw $0x55, %%mm4, %%mm3 \n\t" \ 627 "pshufw $0xA5, %%mm5, %%mm6 \n\t" \ 629 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" \ 630 "pand %%mm7, %%mm3 \n\t" \ 631 "pand %%mm0, %%mm6 \n\t" \ 633 "por %%mm1, %%mm3 \n\t" \ 634 "por %%mm3, %%mm6 \n\t"\ 635 MOVNTQ(%%mm6, 8(dst))\ 637 "pshufw $0xFF, %%mm2, %%mm1 \n\t" \ 638 "pshufw $0xFA, %%mm4, %%mm3 \n\t" \ 639 "pshufw $0xFA, %%mm5, %%mm6 \n\t" \ 641 "pand %%mm7, %%mm1 \n\t" \ 642 "pand %%mm0, %%mm3 \n\t" \ 643 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" \ 645 "por %%mm1, %%mm3 \n\t"\ 646 "por %%mm3, %%mm6 \n\t"\ 647 MOVNTQ(%%mm6, 16(dst))\ 649 "add $24, "#dst" \n\t"\ 651 "add $8, "#index" \n\t"\ 652 "cmp "#dstw", "#index" \n\t"\ 655 #if COMPILE_TEMPLATE_MMXEXT 657 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index) 660 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) 664 const int16_t **lumSrc,
int lumFilterSize,
665 const int16_t *chrFilter,
const int16_t **chrUSrc,
666 const int16_t **chrVSrc,
667 int chrFilterSize,
const int16_t **alpSrc,
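/*
 * The two 24-bit writers above produce the same result by different means:
 * WRITEBGR24MMX interleaves the channels with punpck, shift and por
 * sequences, while WRITEBGR24MMXEXT uses pshufw plus the ff_M24A/B/C masks
 * to splice them. Either way each iteration emits 8 pixels as 24 packed
 * bytes (three MOVNTQ stores), advancing dst by 24 and the index by 8.
 */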
676 "pxor %%mm7, %%mm7 \n\t" 677 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
"\n\t" 678 "add %4, %%"REG_c
" \n\t" 680 ::
"r" (&
c->redDither),
681 "m" (dummy),
"m" (
dummy),
"m" (dummy),
682 "r" (
dest),
"m" (dstW_reg),
"m"(uv_off)
683 :
"%"REG_a,
"%"REG_c,
"%"REG_d,
"%"REG_S
688 const int16_t **lumSrc,
int lumFilterSize,
689 const int16_t *chrFilter,
const int16_t **chrUSrc,
690 const int16_t **chrVSrc,
691 int chrFilterSize,
const int16_t **alpSrc,
700 "pxor %%mm7, %%mm7 \n\t" 701 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
" \n\t" 702 "add %4, %%"REG_c
" \n\t" 704 ::
"r" (&
c->redDither),
705 "m" (dummy),
"m" (
dummy),
"m" (dummy),
706 "r" (
dest),
"m" (dstW_reg),
"m"(uv_off)
707 :
"%"REG_a,
"%"REG_c,
"%"REG_d,
"%"REG_S
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
    MOVNTQ(%%mm1,  (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\

#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)

                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc,
                              int chrFilterSize, const int16_t **alpSrc,
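/*
 * WRITEYUY2 above packs the planar results back into interleaved YUYV
 * (Y0 U Y1 V per pixel pair): luma is packed to bytes in mm1, chroma in
 * mm3/mm4, and punpcklbw/punpckhbw interleave them before two MOVNTQ stores
 * of 8 output bytes each.
 */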
741 "psraw $3, %%mm3 \n\t" 742 "psraw $3, %%mm4 \n\t" 743 "psraw $3, %%mm1 \n\t" 744 "psraw $3, %%mm7 \n\t" 750 const int16_t **lumSrc,
int lumFilterSize,
751 const int16_t *chrFilter,
const int16_t **chrUSrc,
752 const int16_t **chrVSrc,
753 int chrFilterSize,
const int16_t **alpSrc,
762 "psraw $3, %%mm3 \n\t" 763 "psraw $3, %%mm4 \n\t" 764 "psraw $3, %%mm1 \n\t" 765 "psraw $3, %%mm7 \n\t" 770 #define REAL_YSCALEYUV2RGB_UV(index, c) \ 771 "xor "#index", "#index" \n\t"\ 774 "movq (%2, "#index"), %%mm2 \n\t" \ 775 "movq (%3, "#index"), %%mm3 \n\t" \ 776 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 777 "movq (%2, "#index"), %%mm5 \n\t" \ 778 "movq (%3, "#index"), %%mm4 \n\t" \ 779 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 780 "psubw %%mm3, %%mm2 \n\t" \ 781 "psubw %%mm4, %%mm5 \n\t" \ 782 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 783 "pmulhw %%mm0, %%mm2 \n\t" \ 784 "pmulhw %%mm0, %%mm5 \n\t" \ 785 "psraw $4, %%mm3 \n\t" \ 786 "psraw $4, %%mm4 \n\t" \ 787 "paddw %%mm2, %%mm3 \n\t" \ 788 "paddw %%mm5, %%mm4 \n\t" \ 789 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \ 790 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \ 791 "movq %%mm3, %%mm2 \n\t" \ 792 "movq %%mm4, %%mm5 \n\t" \ 793 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 794 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 797 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ 798 "movq ("#b1", "#index", 2), %%mm0 \n\t" \ 799 "movq ("#b2", "#index", 2), %%mm1 \n\t" \ 800 "movq 8("#b1", "#index", 2), %%mm6 \n\t" \ 801 "movq 8("#b2", "#index", 2), %%mm7 \n\t" \ 802 "psubw %%mm1, %%mm0 \n\t" \ 803 "psubw %%mm7, %%mm6 \n\t" \ 804 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \ 805 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \ 806 "psraw $4, %%mm1 \n\t" \ 807 "psraw $4, %%mm7 \n\t" \ 808 "paddw %%mm0, %%mm1 \n\t" \ 809 "paddw %%mm6, %%mm7 \n\t" \ 811 #define REAL_YSCALEYUV2RGB_COEFF(c) \ 812 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 813 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 814 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \ 815 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \ 816 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 817 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 819 "paddw %%mm3, %%mm4 \n\t"\ 820 "movq %%mm2, %%mm0 \n\t"\ 821 "movq %%mm5, %%mm6 \n\t"\ 822 "movq %%mm4, %%mm3 \n\t"\ 823 "punpcklwd %%mm2, %%mm2 \n\t"\ 824 "punpcklwd %%mm5, %%mm5 \n\t"\ 825 "punpcklwd %%mm4, %%mm4 \n\t"\ 826 "paddw %%mm1, %%mm2 \n\t"\ 827 "paddw %%mm1, %%mm5 \n\t"\ 828 "paddw %%mm1, %%mm4 \n\t"\ 829 "punpckhwd %%mm0, %%mm0 \n\t"\ 830 "punpckhwd %%mm6, %%mm6 \n\t"\ 831 "punpckhwd %%mm3, %%mm3 \n\t"\ 832 "paddw %%mm7, %%mm0 \n\t"\ 833 "paddw %%mm7, %%mm6 \n\t"\ 834 "paddw %%mm7, %%mm3 \n\t"\ 836 "packuswb %%mm0, %%mm2 \n\t"\ 837 "packuswb %%mm6, %%mm5 \n\t"\ 838 "packuswb %%mm3, %%mm4 \n\t"\ 840 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) 842 #define YSCALEYUV2RGB(index, c) \ 843 REAL_YSCALEYUV2RGB_UV(index, c) \ 844 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ 845 REAL_YSCALEYUV2RGB_COEFF(c) 851 const int16_t *ubuf[2],
const int16_t *vbuf[2],
853 int dstW,
int yalpha,
int uvalpha,
int y)
855 const int16_t *buf0 =
buf[0], *buf1 =
buf[1],
856 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
859 const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
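/*
 * The REAL_YSCALEYUV2RGB_UV/_YA pair above implements the two-line vertical
 * interpolation used by the *_2() output functions: each component is blended
 * between the two input lines with a pmulhw-based fraction (yalpha for luma,
 * uvalpha for chroma) before the shared _COEFF block converts to RGB.
 * A hedged scalar sketch, names illustrative:
 */
#if 0
static inline int blend_lines_sketch(int a, int b, int frac16)
{
    /* frac16 is a 16-bit fraction; pmulhw supplies the >> 16 */
    return b + (((a - b) * frac16) >> 16);
}
#endif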
864 "psraw $3, %%mm1 \n\t" 865 "psraw $3, %%mm7 \n\t" 866 "packuswb %%mm7, %%mm1 \n\t" 867 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
868 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"r" (
dest),
870 "r" (abuf0),
"r" (abuf1)
874 c->u_temp=(intptr_t)abuf0;
875 c->v_temp=(intptr_t)abuf1;
878 "mov %4, %%"REG_b
" \n\t" 879 "push %%"REG_BP
" \n\t" 883 "mov "U_TEMP"(%5), %0 \n\t" 884 "mov "V_TEMP"(%5), %1 \n\t" 886 "psraw $3, %%mm1 \n\t" 887 "psraw $3, %%mm7 \n\t" 888 "packuswb %%mm7, %%mm1 \n\t" 891 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
892 "pop %%"REG_BP
" \n\t" 894 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
901 "mov %4, %%"REG_b
" \n\t" 902 "push %%"REG_BP
" \n\t" 904 "pcmpeqd %%mm7, %%mm7 \n\t" 905 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
906 "pop %%"REG_BP
" \n\t" 908 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
915 const int16_t *ubuf[2],
const int16_t *vbuf[2],
917 int dstW,
int yalpha,
int uvalpha,
int y)
919 const int16_t *buf0 =
buf[0], *buf1 =
buf[1],
920 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
925 "mov %4, %%"REG_b
" \n\t" 926 "push %%"REG_BP
" \n\t" 928 "pxor %%mm7, %%mm7 \n\t" 930 "pop %%"REG_BP
" \n\t" 932 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
938 const int16_t *ubuf[2],
const int16_t *vbuf[2],
940 int dstW,
int yalpha,
int uvalpha,
int y)
942 const int16_t *buf0 =
buf[0], *buf1 =
buf[1],
943 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
948 "mov %4, %%"REG_b
" \n\t" 949 "push %%"REG_BP
" \n\t" 951 "pxor %%mm7, %%mm7 \n\t" 959 "pop %%"REG_BP
" \n\t" 961 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
967 const int16_t *ubuf[2],
const int16_t *vbuf[2],
969 int dstW,
int yalpha,
int uvalpha,
int y)
971 const int16_t *buf0 =
buf[0], *buf1 =
buf[1],
972 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
977 "mov %4, %%"REG_b
" \n\t" 978 "push %%"REG_BP
" \n\t" 980 "pxor %%mm7, %%mm7 \n\t" 988 "pop %%"REG_BP
" \n\t" 990 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
995 #define REAL_YSCALEYUV2PACKED(index, c) \ 996 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 997 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ 998 "psraw $3, %%mm0 \n\t"\ 999 "psraw $3, %%mm1 \n\t"\ 1000 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 1001 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 1002 "xor "#index", "#index" \n\t"\ 1005 "movq (%2, "#index"), %%mm2 \n\t" \ 1006 "movq (%3, "#index"), %%mm3 \n\t" \ 1007 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1008 "movq (%2, "#index"), %%mm5 \n\t" \ 1009 "movq (%3, "#index"), %%mm4 \n\t" \ 1010 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1011 "psubw %%mm3, %%mm2 \n\t" \ 1012 "psubw %%mm4, %%mm5 \n\t" \ 1013 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 1014 "pmulhw %%mm0, %%mm2 \n\t" \ 1015 "pmulhw %%mm0, %%mm5 \n\t" \ 1016 "psraw $7, %%mm3 \n\t" \ 1017 "psraw $7, %%mm4 \n\t" \ 1018 "paddw %%mm2, %%mm3 \n\t" \ 1019 "paddw %%mm5, %%mm4 \n\t" \ 1020 "movq (%0, "#index", 2), %%mm0 \n\t" \ 1021 "movq (%1, "#index", 2), %%mm1 \n\t" \ 1022 "movq 8(%0, "#index", 2), %%mm6 \n\t" \ 1023 "movq 8(%1, "#index", 2), %%mm7 \n\t" \ 1024 "psubw %%mm1, %%mm0 \n\t" \ 1025 "psubw %%mm7, %%mm6 \n\t" \ 1026 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \ 1027 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \ 1028 "psraw $7, %%mm1 \n\t" \ 1029 "psraw $7, %%mm7 \n\t" \ 1030 "paddw %%mm0, %%mm1 \n\t" \ 1031 "paddw %%mm6, %%mm7 \n\t" \ 1033 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) 1036 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1038 int dstW,
int yalpha,
int uvalpha,
int y)
1040 const int16_t *buf0 =
buf[0], *buf1 =
buf[1],
1041 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1046 "mov %4, %%"REG_b
" \n\t" 1047 "push %%"REG_BP
" \n\t" 1050 "pop %%"REG_BP
" \n\t" 1052 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
1057 #define REAL_YSCALEYUV2RGB1(index, c) \ 1058 "xor "#index", "#index" \n\t"\ 1061 "movq (%2, "#index"), %%mm3 \n\t" \ 1062 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1063 "movq (%2, "#index"), %%mm4 \n\t" \ 1064 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1065 "psraw $4, %%mm3 \n\t" \ 1066 "psraw $4, %%mm4 \n\t" \ 1067 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \ 1068 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \ 1069 "movq %%mm3, %%mm2 \n\t" \ 1070 "movq %%mm4, %%mm5 \n\t" \ 1071 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 1072 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 1074 "movq (%0, "#index", 2), %%mm1 \n\t" \ 1075 "movq 8(%0, "#index", 2), %%mm7 \n\t" \ 1076 "psraw $4, %%mm1 \n\t" \ 1077 "psraw $4, %%mm7 \n\t" \ 1078 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 1079 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 1080 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \ 1081 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \ 1082 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 1083 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 1085 "paddw %%mm3, %%mm4 \n\t"\ 1086 "movq %%mm2, %%mm0 \n\t"\ 1087 "movq %%mm5, %%mm6 \n\t"\ 1088 "movq %%mm4, %%mm3 \n\t"\ 1089 "punpcklwd %%mm2, %%mm2 \n\t"\ 1090 "punpcklwd %%mm5, %%mm5 \n\t"\ 1091 "punpcklwd %%mm4, %%mm4 \n\t"\ 1092 "paddw %%mm1, %%mm2 \n\t"\ 1093 "paddw %%mm1, %%mm5 \n\t"\ 1094 "paddw %%mm1, %%mm4 \n\t"\ 1095 "punpckhwd %%mm0, %%mm0 \n\t"\ 1096 "punpckhwd %%mm6, %%mm6 \n\t"\ 1097 "punpckhwd %%mm3, %%mm3 \n\t"\ 1098 "paddw %%mm7, %%mm0 \n\t"\ 1099 "paddw %%mm7, %%mm6 \n\t"\ 1100 "paddw %%mm7, %%mm3 \n\t"\ 1102 "packuswb %%mm0, %%mm2 \n\t"\ 1103 "packuswb %%mm6, %%mm5 \n\t"\ 1104 "packuswb %%mm3, %%mm4 \n\t"\ 1106 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) 1109 #define REAL_YSCALEYUV2RGB1b(index, c) \ 1110 "xor "#index", "#index" \n\t"\ 1113 "movq (%2, "#index"), %%mm2 \n\t" \ 1114 "movq (%3, "#index"), %%mm3 \n\t" \ 1115 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1116 "movq (%2, "#index"), %%mm5 \n\t" \ 1117 "movq (%3, "#index"), %%mm4 \n\t" \ 1118 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1119 "paddw %%mm2, %%mm3 \n\t" \ 1120 "paddw %%mm5, %%mm4 \n\t" \ 1121 "psrlw $5, %%mm3 \n\t" \ 1122 "psrlw $5, %%mm4 \n\t" \ 1123 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \ 1124 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \ 1125 "movq %%mm3, %%mm2 \n\t" \ 1126 "movq %%mm4, %%mm5 \n\t" \ 1127 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 1128 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 1130 "movq (%0, "#index", 2), %%mm1 \n\t" \ 1131 "movq 8(%0, "#index", 2), %%mm7 \n\t" \ 1132 "psraw $4, %%mm1 \n\t" \ 1133 "psraw $4, %%mm7 \n\t" \ 1134 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 1135 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 1136 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \ 1137 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \ 1138 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 1139 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 1141 "paddw %%mm3, %%mm4 \n\t"\ 1142 "movq %%mm2, %%mm0 \n\t"\ 1143 "movq %%mm5, %%mm6 \n\t"\ 1144 "movq %%mm4, %%mm3 \n\t"\ 1145 "punpcklwd %%mm2, %%mm2 \n\t"\ 1146 "punpcklwd %%mm5, %%mm5 \n\t"\ 1147 "punpcklwd %%mm4, %%mm4 \n\t"\ 1148 "paddw %%mm1, %%mm2 \n\t"\ 1149 "paddw %%mm1, %%mm5 \n\t"\ 1150 "paddw %%mm1, %%mm4 \n\t"\ 1151 "punpckhwd %%mm0, %%mm0 \n\t"\ 1152 "punpckhwd %%mm6, %%mm6 \n\t"\ 1153 "punpckhwd %%mm3, %%mm3 \n\t"\ 1154 "paddw %%mm7, %%mm0 \n\t"\ 1155 "paddw %%mm7, %%mm6 \n\t"\ 1156 "paddw %%mm7, %%mm3 \n\t"\ 1158 "packuswb %%mm0, %%mm2 \n\t"\ 1159 "packuswb %%mm6, %%mm5 \n\t"\ 1160 "packuswb %%mm3, %%mm4 \n\t"\ 1162 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) 1164 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \ 
1165 "movq (%1, "#index", 2), %%mm7 \n\t" \ 1166 "movq 8(%1, "#index", 2), %%mm1 \n\t" \ 1167 "psraw $7, %%mm7 \n\t" \ 1168 "psraw $7, %%mm1 \n\t" \ 1169 "packuswb %%mm1, %%mm7 \n\t" 1170 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) 1176 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1178 int dstW,
int uvalpha,
int y)
1180 const int16_t *ubuf0 = ubuf[0];
1181 const int16_t *buf1= buf0;
1183 if (uvalpha < 2048) {
1184 const int16_t *ubuf1 = ubuf[0];
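        /*
         * Note: in these *_1() (unscaled vertical) functions the
         * uvalpha < 2048 test picks between the two chroma paths defined
         * above: the plain YSCALEYUV2*1 variant reads only ubuf0, while the
         * *1b variant averages ubuf0 and ubuf1 when the chroma phase sits
         * between two input lines.
         */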
1188 "mov %4, %%"REG_b
" \n\t" 1189 "push %%"REG_BP
" \n\t" 1192 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1193 "pop %%"REG_BP
" \n\t" 1195 ::
"c" (buf0),
"d" (abuf0),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
1201 "mov %4, %%"REG_b
" \n\t" 1202 "push %%"REG_BP
" \n\t" 1204 "pcmpeqd %%mm7, %%mm7 \n\t" 1205 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1206 "pop %%"REG_BP
" \n\t" 1208 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
1213 const int16_t *ubuf1 = ubuf[1];
1217 "mov %4, %%"REG_b
" \n\t" 1218 "push %%"REG_BP
" \n\t" 1221 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1222 "pop %%"REG_BP
" \n\t" 1224 ::
"c" (buf0),
"d" (abuf0),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
1230 "mov %4, %%"REG_b
" \n\t" 1231 "push %%"REG_BP
" \n\t" 1233 "pcmpeqd %%mm7, %%mm7 \n\t" 1234 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1235 "pop %%"REG_BP
" \n\t" 1237 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
1245 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1247 int dstW,
int uvalpha,
int y)
1249 const int16_t *ubuf0 = ubuf[0];
1250 const int16_t *buf1= buf0;
1252 if (uvalpha < 2048) {
1253 const int16_t *ubuf1 = ubuf[0];
1256 "mov %4, %%"REG_b
" \n\t" 1257 "push %%"REG_BP
" \n\t" 1259 "pxor %%mm7, %%mm7 \n\t" 1261 "pop %%"REG_BP
" \n\t" 1263 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
1267 const int16_t *ubuf1 = ubuf[1];
1270 "mov %4, %%"REG_b
" \n\t" 1271 "push %%"REG_BP
" \n\t" 1273 "pxor %%mm7, %%mm7 \n\t" 1275 "pop %%"REG_BP
" \n\t" 1277 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
1284 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1286 int dstW,
int uvalpha,
int y)
1288 const int16_t *ubuf0 = ubuf[0];
1289 const int16_t *buf1= buf0;
1291 if (uvalpha < 2048) {
1292 const int16_t *ubuf1 = ubuf[0];
1295 "mov %4, %%"REG_b
" \n\t" 1296 "push %%"REG_BP
" \n\t" 1298 "pxor %%mm7, %%mm7 \n\t" 1306 "pop %%"REG_BP
" \n\t" 1308 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
1312 const int16_t *ubuf1 = ubuf[1];
1315 "mov %4, %%"REG_b
" \n\t" 1316 "push %%"REG_BP
" \n\t" 1318 "pxor %%mm7, %%mm7 \n\t" 1326 "pop %%"REG_BP
" \n\t" 1328 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
1335 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1337 int dstW,
int uvalpha,
int y)
1339 const int16_t *ubuf0 = ubuf[0];
1340 const int16_t *buf1= buf0;
1342 if (uvalpha < 2048) {
1343 const int16_t *ubuf1 = ubuf[0];
1346 "mov %4, %%"REG_b
" \n\t" 1347 "push %%"REG_BP
" \n\t" 1349 "pxor %%mm7, %%mm7 \n\t" 1357 "pop %%"REG_BP
" \n\t" 1359 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
1363 const int16_t *ubuf1 = ubuf[1];
1366 "mov %4, %%"REG_b
" \n\t" 1367 "push %%"REG_BP
" \n\t" 1369 "pxor %%mm7, %%mm7 \n\t" 1377 "pop %%"REG_BP
" \n\t" 1379 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
1385 #define REAL_YSCALEYUV2PACKED1(index, c) \ 1386 "xor "#index", "#index" \n\t"\ 1389 "movq (%2, "#index"), %%mm3 \n\t" \ 1390 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1391 "movq (%2, "#index"), %%mm4 \n\t" \ 1392 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1393 "psraw $7, %%mm3 \n\t" \ 1394 "psraw $7, %%mm4 \n\t" \ 1395 "movq (%0, "#index", 2), %%mm1 \n\t" \ 1396 "movq 8(%0, "#index", 2), %%mm7 \n\t" \ 1397 "psraw $7, %%mm1 \n\t" \ 1398 "psraw $7, %%mm7 \n\t" \ 1400 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) 1402 #define REAL_YSCALEYUV2PACKED1b(index, c) \ 1403 "xor "#index", "#index" \n\t"\ 1406 "movq (%2, "#index"), %%mm2 \n\t" \ 1407 "movq (%3, "#index"), %%mm3 \n\t" \ 1408 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1409 "movq (%2, "#index"), %%mm5 \n\t" \ 1410 "movq (%3, "#index"), %%mm4 \n\t" \ 1411 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1412 "paddw %%mm2, %%mm3 \n\t" \ 1413 "paddw %%mm5, %%mm4 \n\t" \ 1414 "psrlw $8, %%mm3 \n\t" \ 1415 "psrlw $8, %%mm4 \n\t" \ 1416 "movq (%0, "#index", 2), %%mm1 \n\t" \ 1417 "movq 8(%0, "#index", 2), %%mm7 \n\t" \ 1418 "psraw $7, %%mm1 \n\t" \ 1419 "psraw $7, %%mm7 \n\t" 1420 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) 1423 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1425 int dstW,
int uvalpha,
int y)
1427 const int16_t *ubuf0 = ubuf[0];
1428 const int16_t *buf1= buf0;
1430 if (uvalpha < 2048) {
1431 const int16_t *ubuf1 = ubuf[0];
1434 "mov %4, %%"REG_b
" \n\t" 1435 "push %%"REG_BP
" \n\t" 1438 "pop %%"REG_BP
" \n\t" 1440 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
1444 const int16_t *ubuf1 = ubuf[1];
1447 "mov %4, %%"REG_b
" \n\t" 1448 "push %%"REG_BP
" \n\t" 1451 "pop %%"REG_BP
" \n\t" 1453 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (
dest),
#if COMPILE_TEMPLATE_MMXEXT
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    void *mmxextFilterCode = c->lumMmxextFilterCode;

        "mov %%"REG_b", %5 \n\t"
        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %6 \n\t"

        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %5 \n\t"

        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t"

#define CALL_MMXEXT_FILTER_CODE \
    "movl (%%"REG_b"), %%esi \n\t"\
    "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
    "add %%"REG_S", %%"REG_c" \n\t"\
    "add %%"REG_a", %%"REG_D" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\

#define CALL_MMXEXT_FILTER_CODE \
    "movl (%%"REG_b"), %%esi \n\t"\
    "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
    "add %%"REG_a", %%"REG_D" \n\t"\
    "xor %%"REG_a", %%"REG_a" \n\t"\

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

        "mov %5, %%"REG_b" \n\t"
        "mov %6, %%"REG_a" \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"

        "mov %5, %%"REG_a" \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"

        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
    for (i=dstWidth-1; (i*xInc)>>16 >= srcW-1; i--)

                              int dstWidth, const uint8_t *src1,
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    void *mmxextFilterCode = c->chrMmxextFilterCode;

        "mov %%"REG_b", %7 \n\t"
        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %8 \n\t"

        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %7 \n\t"

        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

        "xor %%"REG_a", %%"REG_a" \n\t"
        "mov %5, %%"REG_c" \n\t"
        "mov %6, %%"REG_D" \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

        "mov %7, %%"REG_b" \n\t"
        "mov %8, %%"REG_a" \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"

        "mov %7, %%"REG_a" \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"

        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
    for (i=dstWidth-1; (i*xInc)>>16 >= srcW-1; i--) {
        dst1[i] = src1[srcW-1]*128;
        dst2[i] = src2[srcW-1]*128;
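    /*
     * Right-edge handling: once (i * xInc) >> 16 would read past srcW - 1,
     * the remaining destination samples are filled with the last source
     * pixel, pre-multiplied by 128 to match the scaled intermediate format
     * produced by the fast bilinear horizontal scaler.
     */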
    c->use_mmx_vfilter= 0;
    switch (c->dstFormat) {

    c->use_mmx_vfilter= 1;
    switch (c->dstFormat) {

    switch (c->dstFormat) {

    if (c->srcBpc == 8 && c->dstBpc <= 14) {
#if COMPILE_TEMPLATE_MMXEXT
        c->hyscale_fast = NULL;
        c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMXEXT