/*
 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20 
/*
 * Per-template instruction selection: the MMXEXT build of this template
 * uses non-temporal stores (movntq, bypassing the cache) and prefetching;
 * the plain MMX build falls back to movq and a nop.
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef MOVNTQ2
#undef PREFETCH

#if COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif
/* extra expansion level so that macro arguments get expanded first */
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
41 #if !COMPILE_TEMPLATE_MMXEXT
42 static av_always_inline void
43 dither_8to16(const uint8_t *srcDither, int rot)
44 {
45  if (rot) {
46  __asm__ volatile("pxor %%mm0, %%mm0\n\t"
47  "movq (%0), %%mm3\n\t"
48  "movq %%mm3, %%mm4\n\t"
49  "psrlq $24, %%mm3\n\t"
50  "psllq $40, %%mm4\n\t"
51  "por %%mm4, %%mm3\n\t"
52  "movq %%mm3, %%mm4\n\t"
53  "punpcklbw %%mm0, %%mm3\n\t"
54  "punpckhbw %%mm0, %%mm4\n\t"
55  :: "r"(srcDither)
56  );
57  } else {
58  __asm__ volatile("pxor %%mm0, %%mm0\n\t"
59  "movq (%0), %%mm3\n\t"
60  "movq %%mm3, %%mm4\n\t"
61  "punpcklbw %%mm0, %%mm3\n\t"
62  "punpckhbw %%mm0, %%mm4\n\t"
63  :: "r"(srcDither)
64  );
65  }
66 }
67 #endif
68 
69 static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
70  const int16_t **src, uint8_t *dest, int dstW,
71  const uint8_t *dither, int offset)
72 {
73  dither_8to16(dither, offset);
74  filterSize--;
75  __asm__ volatile(
76  "movd %0, %%mm1\n\t"
77  "punpcklwd %%mm1, %%mm1\n\t"
78  "punpckldq %%mm1, %%mm1\n\t"
79  "psllw $3, %%mm1\n\t"
80  "paddw %%mm1, %%mm3\n\t"
81  "paddw %%mm1, %%mm4\n\t"
82  "psraw $4, %%mm3\n\t"
83  "psraw $4, %%mm4\n\t"
84  ::"m"(filterSize)
85  );
86 
87  __asm__ volatile(\
88  "movq %%mm3, %%mm6\n\t"
89  "movq %%mm4, %%mm7\n\t"
90  "movl %3, %%ecx\n\t"
91  "mov %0, %%"REG_d" \n\t"\
92  "mov (%%"REG_d"), %%"REG_S" \n\t"\
93  ".p2align 4 \n\t" /* FIXME Unroll? */\
94  "1: \n\t"\
95  "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
96  "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\
97  "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\
98  "add $16, %%"REG_d" \n\t"\
99  "mov (%%"REG_d"), %%"REG_S" \n\t"\
100  "test %%"REG_S", %%"REG_S" \n\t"\
101  "pmulhw %%mm0, %%mm2 \n\t"\
102  "pmulhw %%mm0, %%mm5 \n\t"\
103  "paddw %%mm2, %%mm3 \n\t"\
104  "paddw %%mm5, %%mm4 \n\t"\
105  " jnz 1b \n\t"\
106  "psraw $3, %%mm3 \n\t"\
107  "psraw $3, %%mm4 \n\t"\
108  "packuswb %%mm4, %%mm3 \n\t"
109  MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
110  "add $8, %%"REG_c" \n\t"\
111  "cmp %2, %%"REG_c" \n\t"\
112  "movq %%mm6, %%mm3\n\t"
113  "movq %%mm7, %%mm4\n\t"
114  "mov %0, %%"REG_d" \n\t"\
115  "mov (%%"REG_d"), %%"REG_S" \n\t"\
116  "jb 1b \n\t"\
117  :: "g" (filter),
118  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
119  : "%"REG_d, "%"REG_S, "%"REG_c
120  );
121 }
122 
/* Chroma part of the multi-tap vertical filter: accumulates U into %mm3
 * and V into %mm4 (null-terminated filter list at CHR_MMX_FILTER_OFFSET,
 * V plane reached by adding the uv_off operand %6). */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a"               \n\t"\
        ".p2align 4                             \n\t"\
        "nop                                    \n\t"\
        "1:                                     \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S"             \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq %%mm3, %%mm4                      \n\t"\
        ".p2align 4                             \n\t"\
        "2:                                     \n\t"\
        "movq 8(%%"REG_d"), %%mm0               \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a"), %%mm2     \n\t" /* UsrcData */\
        "add %6, %%"REG_S"                      \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm5     \n\t" /* VsrcData */\
        "add $16, %%"REG_d"                     \n\t"\
        "mov (%%"REG_d"), %%"REG_S"             \n\t"\
        "pmulhw %%mm0, %%mm2                    \n\t"\
        "pmulhw %%mm0, %%mm5                    \n\t"\
        "paddw %%mm2, %%mm3                     \n\t"\
        "paddw %%mm5, %%mm4                     \n\t"\
        "test %%"REG_S", %%"REG_S"              \n\t"\
        " jnz 2b                                \n\t"\

/* Luma (or alpha) part: same accumulation loop, parameterized on the
 * filter offset and on which MMX registers to use, so it can be reused
 * for the alpha plane. */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
        "lea "offset"(%0), %%"REG_d"            \n\t"\
        "mov (%%"REG_d"), %%"REG_S"             \n\t"\
        "movq "VROUNDER_OFFSET"(%0), "#dst1"    \n\t"\
        "movq "#dst1", "#dst2"                  \n\t"\
        ".p2align 4                             \n\t"\
        "2:                                     \n\t"\
        "movq 8(%%"REG_d"), "#coeff"            \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
        "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
        "add $16, %%"REG_d"                     \n\t"\
        "mov (%%"REG_d"), %%"REG_S"             \n\t"\
        "pmulhw "#coeff", "#src1"               \n\t"\
        "pmulhw "#coeff", "#src2"               \n\t"\
        "paddw "#src1", "#dst1"                 \n\t"\
        "paddw "#src2", "#dst2"                 \n\t"\
        "test %%"REG_S", %%"REG_S"              \n\t"\
        " jnz 2b                                \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

/* Common operand/clobber tail closing the asm statement opened by
 * YSCALEYUV2PACKEDX_UV; relies on locals dummy, dstW_reg, uv_off. */
#define YSCALEYUV2PACKEDX_END                     \
        :: "r" (&c->redDither),                   \
            "m" (dummy), "m" (dummy), "m" (dummy),\
            "r" (dest), "m" (dstW_reg), "m"(uv_off) \
        : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
177 
/* "Accurate rounding" chroma accumulation: pairs of taps are processed
 * with pmaddwd into 32-bit accumulators (%mm4..%mm7), then shifted,
 * packed and rounded; the biased U/V words are stashed in the context
 * at U_TEMP/V_TEMP because all registers are needed for the luma pass. */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a"               \n\t"\
        ".p2align 4                             \n\t"\
        "nop                                    \n\t"\
        "1:                                     \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S"             \n\t"\
        "pxor %%mm4, %%mm4                      \n\t"\
        "pxor %%mm5, %%mm5                      \n\t"\
        "pxor %%mm6, %%mm6                      \n\t"\
        "pxor %%mm7, %%mm7                      \n\t"\
        ".p2align 4                             \n\t"\
        "2:                                     \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm0     \n\t" /* UsrcData */\
        "add %6, %%"REG_S"                      \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm2     \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm1     \n\t" /* UsrcData */\
        "movq %%mm0, %%mm3                      \n\t"\
        "punpcklwd %%mm1, %%mm0                 \n\t"\
        "punpckhwd %%mm1, %%mm3                 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0                   \n\t"\
        "pmaddwd %%mm1, %%mm3                   \n\t"\
        "paddd %%mm0, %%mm4                     \n\t"\
        "paddd %%mm3, %%mm5                     \n\t"\
        "add %6, %%"REG_S"                      \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm3     \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d"       \n\t"\
        "test %%"REG_S", %%"REG_S"              \n\t"\
        "movq %%mm2, %%mm0                      \n\t"\
        "punpcklwd %%mm3, %%mm2                 \n\t"\
        "punpckhwd %%mm3, %%mm0                 \n\t"\
        "pmaddwd %%mm1, %%mm2                   \n\t"\
        "pmaddwd %%mm1, %%mm0                   \n\t"\
        "paddd %%mm2, %%mm6                     \n\t"\
        "paddd %%mm0, %%mm7                     \n\t"\
        " jnz 2b                                \n\t"\
        "psrad $16, %%mm4                       \n\t"\
        "psrad $16, %%mm5                       \n\t"\
        "psrad $16, %%mm6                       \n\t"\
        "psrad $16, %%mm7                       \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
        "packssdw %%mm5, %%mm4                  \n\t"\
        "packssdw %%mm7, %%mm6                  \n\t"\
        "paddw %%mm0, %%mm4                     \n\t"\
        "paddw %%mm0, %%mm6                     \n\t"\
        "movq %%mm4, "U_TEMP"(%0)               \n\t"\
        "movq %%mm6, "V_TEMP"(%0)               \n\t"\

/* Accurate luma/alpha accumulation (pmaddwd pairs), then reloads the
 * stashed U/V words into %mm3/%mm4 so YSCALEYUV2RGBX can follow. */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
        "lea "offset"(%0), %%"REG_d"            \n\t"\
        "mov (%%"REG_d"), %%"REG_S"             \n\t"\
        "pxor %%mm1, %%mm1                      \n\t"\
        "pxor %%mm5, %%mm5                      \n\t"\
        "pxor %%mm7, %%mm7                      \n\t"\
        "pxor %%mm6, %%mm6                      \n\t"\
        ".p2align 4                             \n\t"\
        "2:                                     \n\t"\
        "movq (%%"REG_S", %%"REG_a", 2), %%mm0  \n\t" /* Y1srcData */\
        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a", 2), %%mm4  \n\t" /* Y1srcData */\
        "movq %%mm0, %%mm3                      \n\t"\
        "punpcklwd %%mm4, %%mm0                 \n\t"\
        "punpckhwd %%mm4, %%mm3                 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
        "pmaddwd %%mm4, %%mm0                   \n\t"\
        "pmaddwd %%mm4, %%mm3                   \n\t"\
        "paddd %%mm0, %%mm1                     \n\t"\
        "paddd %%mm3, %%mm5                     \n\t"\
        "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d"       \n\t"\
        "test %%"REG_S", %%"REG_S"              \n\t"\
        "movq %%mm2, %%mm0                      \n\t"\
        "punpcklwd %%mm3, %%mm2                 \n\t"\
        "punpckhwd %%mm3, %%mm0                 \n\t"\
        "pmaddwd %%mm4, %%mm2                   \n\t"\
        "pmaddwd %%mm4, %%mm0                   \n\t"\
        "paddd %%mm2, %%mm7                     \n\t"\
        "paddd %%mm0, %%mm6                     \n\t"\
        " jnz 2b                                \n\t"\
        "psrad $16, %%mm1                       \n\t"\
        "psrad $16, %%mm5                       \n\t"\
        "psrad $16, %%mm7                       \n\t"\
        "psrad $16, %%mm6                       \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
        "packssdw %%mm5, %%mm1                  \n\t"\
        "packssdw %%mm6, %%mm7                  \n\t"\
        "paddw %%mm0, %%mm1                     \n\t"\
        "paddw %%mm0, %%mm7                     \n\t"\
        "movq "U_TEMP"(%0), %%mm3               \n\t"\
        "movq "V_TEMP"(%0), %%mm4               \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
278 
/* YUV->RGB color-space conversion on the accumulated words:
 * input  mm1=Y1, mm3=U, mm4=V, mm7=Y2
 * output mm2=B1B2, mm4=G1G2, mm5=R1R2 packed to unsigned bytes. */
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3    \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4    \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5              \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3   \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4   \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2   \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5   \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1    \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7    \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1    \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7    \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4             \n\t"\
    "movq %%mm2, %%mm0              \n\t"\
    "movq %%mm5, %%mm6              \n\t"\
    "movq %%mm4, %%mm3              \n\t"\
    "punpcklwd %%mm2, %%mm2         \n\t"\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "punpcklwd %%mm4, %%mm4         \n\t"\
    "paddw %%mm1, %%mm2             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "paddw %%mm1, %%mm4             \n\t"\
    "punpckhwd %%mm0, %%mm0         \n\t"\
    "punpckhwd %%mm6, %%mm6         \n\t"\
    "punpckhwd %%mm3, %%mm3         \n\t"\
    "paddw %%mm7, %%mm0             \n\t"\
    "paddw %%mm7, %%mm6             \n\t"\
    "paddw %%mm7, %%mm3             \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2          \n\t"\
    "packuswb %%mm6, %%mm5          \n\t"\
    "packuswb %%mm3, %%mm4          \n\t"\

/* Interleave packed B/G/R/A byte registers into 4 quadwords of 32-bit
 * BGRA pixels, store them (non-temporally when available), advance the
 * index by 8 pixels and loop back to label 1 while index < dstw. */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2"           \n\t" /* B */\
    "movq "#r", "#t"            \n\t" /* R */\
    "punpcklbw "#g", "#b"       \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r"       \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2"      \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t"       \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0"           \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3"          \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0"      \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b"       \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2"      \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3"      \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add $8, "#index"           \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb 1b                     \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
338 
339 static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
340  const int16_t **lumSrc, int lumFilterSize,
341  const int16_t *chrFilter, const int16_t **chrUSrc,
342  const int16_t **chrVSrc,
343  int chrFilterSize, const int16_t **alpSrc,
344  uint8_t *dest, int dstW, int dstY)
345 {
346  x86_reg dummy=0;
347  x86_reg dstW_reg = dstW;
348  x86_reg uv_off = c->uv_offx2;
349 
350  if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
353  "movq %%mm2, "U_TEMP"(%0) \n\t"
354  "movq %%mm4, "V_TEMP"(%0) \n\t"
355  "movq %%mm5, "Y_TEMP"(%0) \n\t"
357  "movq "Y_TEMP"(%0), %%mm5 \n\t"
358  "psraw $3, %%mm1 \n\t"
359  "psraw $3, %%mm7 \n\t"
360  "packuswb %%mm7, %%mm1 \n\t"
361  WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
363  } else {
366  "pcmpeqd %%mm7, %%mm7 \n\t"
367  WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
369  }
370 }
371 
372 static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
373  const int16_t **lumSrc, int lumFilterSize,
374  const int16_t *chrFilter, const int16_t **chrUSrc,
375  const int16_t **chrVSrc,
376  int chrFilterSize, const int16_t **alpSrc,
377  uint8_t *dest, int dstW, int dstY)
378 {
379  x86_reg dummy=0;
380  x86_reg dstW_reg = dstW;
381  x86_reg uv_off = c->uv_offx2;
382 
383  if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
386  YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
387  "psraw $3, %%mm1 \n\t"
388  "psraw $3, %%mm7 \n\t"
389  "packuswb %%mm7, %%mm1 \n\t"
390  WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
392  } else {
395  "pcmpeqd %%mm7, %%mm7 \n\t"
396  WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
398  }
399 }
400 
/* Pack mm2=B, mm4=G, mm5=R bytes down to RGB565 (5-6-5) and store two
 * quadwords (8 pixels); expects %mm7 == 0 for the byte->word unpacks. */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq $3, %%mm2            \n\t"\
\
    "movq %%mm2, %%mm1          \n\t"\
    "movq %%mm4, %%mm3          \n\t"\
\
    "punpcklbw %%mm7, %%mm3     \n\t"\
    "punpcklbw %%mm5, %%mm2     \n\t"\
    "punpckhbw %%mm7, %%mm4     \n\t"\
    "punpckhbw %%mm5, %%mm1     \n\t"\
\
    "psllq $3, %%mm3            \n\t"\
    "psllq $3, %%mm4            \n\t"\
\
    "por %%mm3, %%mm2           \n\t"\
    "por %%mm4, %%mm1           \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index"           \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb 1b                     \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
428 
429 static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
430  const int16_t **lumSrc, int lumFilterSize,
431  const int16_t *chrFilter, const int16_t **chrUSrc,
432  const int16_t **chrVSrc,
433  int chrFilterSize, const int16_t **alpSrc,
434  uint8_t *dest, int dstW, int dstY)
435 {
436  x86_reg dummy=0;
437  x86_reg dstW_reg = dstW;
438  x86_reg uv_off = c->uv_offx2;
439 
442  "pxor %%mm7, %%mm7 \n\t"
443  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
444 #ifdef DITHER1XBPP
445  "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
446  "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
447  "paddusb "RED_DITHER"(%0), %%mm5\n\t"
448 #endif
449  WRITERGB16(%4, %5, %%REGa)
451 }
452 
453 static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
454  const int16_t **lumSrc, int lumFilterSize,
455  const int16_t *chrFilter, const int16_t **chrUSrc,
456  const int16_t **chrVSrc,
457  int chrFilterSize, const int16_t **alpSrc,
458  uint8_t *dest, int dstW, int dstY)
459 {
460  x86_reg dummy=0;
461  x86_reg dstW_reg = dstW;
462  x86_reg uv_off = c->uv_offx2;
463 
466  "pxor %%mm7, %%mm7 \n\t"
467  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
468 #ifdef DITHER1XBPP
469  "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
470  "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
471  "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
472 #endif
473  WRITERGB16(%4, %5, %%REGa)
475 }
476 
/* Pack mm2=B, mm4=G, mm5=R bytes down to RGB555 (5-5-5) and store two
 * quadwords (8 pixels); expects %mm7 == 0 for the byte->word unpacks. */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq $3, %%mm2            \n\t"\
    "psrlq $1, %%mm5            \n\t"\
\
    "movq %%mm2, %%mm1          \n\t"\
    "movq %%mm4, %%mm3          \n\t"\
\
    "punpcklbw %%mm7, %%mm3     \n\t"\
    "punpcklbw %%mm5, %%mm2     \n\t"\
    "punpckhbw %%mm7, %%mm4     \n\t"\
    "punpckhbw %%mm5, %%mm1     \n\t"\
\
    "psllq $2, %%mm3            \n\t"\
    "psllq $2, %%mm4            \n\t"\
\
    "por %%mm3, %%mm2           \n\t"\
    "por %%mm4, %%mm1           \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index"           \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb 1b                     \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
505 
506 static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
507  const int16_t **lumSrc, int lumFilterSize,
508  const int16_t *chrFilter, const int16_t **chrUSrc,
509  const int16_t **chrVSrc,
510  int chrFilterSize, const int16_t **alpSrc,
511  uint8_t *dest, int dstW, int dstY)
512 {
513  x86_reg dummy=0;
514  x86_reg dstW_reg = dstW;
515  x86_reg uv_off = c->uv_offx2;
516 
519  "pxor %%mm7, %%mm7 \n\t"
520  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
521 #ifdef DITHER1XBPP
522  "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
523  "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
524  "paddusb "RED_DITHER"(%0), %%mm5\n\t"
525 #endif
526  WRITERGB15(%4, %5, %%REGa)
528 }
529 
530 static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
531  const int16_t **lumSrc, int lumFilterSize,
532  const int16_t *chrFilter, const int16_t **chrUSrc,
533  const int16_t **chrVSrc,
534  int chrFilterSize, const int16_t **alpSrc,
535  uint8_t *dest, int dstW, int dstY)
536 {
537  x86_reg dummy=0;
538  x86_reg dstW_reg = dstW;
539  x86_reg uv_off = c->uv_offx2;
540 
543  "pxor %%mm7, %%mm7 \n\t"
544  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
545 #ifdef DITHER1XBPP
546  "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
547  "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
548  "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
549 #endif
550  WRITERGB15(%4, %5, %%REGa)
552 }
553 
/* Interleave mm2=B, mm4=G, mm5=R (mm7 must be 0) into 24 bytes of packed
 * BGR24 (3 quadwords / 8 pixels) using shift+punpck shuffles, store them,
 * advance dst by 24 bytes and index by 8, then loop to label 1. */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1          \n\t" /* B */\
    "movq %%mm5, %%mm6          \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0          \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3          \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4          \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6          \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5          \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7          \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0           \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2           \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1           \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3           \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0            \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6          \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2           \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0           \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6           \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5          \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1           \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6           \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5           \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3            \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5           \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst"            \n\t"\
\
    "add $8, "#index"           \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb 1b                     \n\t"
/* BGR24 store using pshufw (MMXEXT only): shuffle B/G/R bytes into place
 * with pshufw + mask tables ff_M24A/B/C, combine with por, store 3
 * quadwords, advance dst by 24 and index by 8, loop to label 1. */
#define WRITEBGR24MMXEXT(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1          \n\t" /*    B2        B1       B0 */\
    "pand %%mm0, %%mm3          \n\t" /*    G2        G1       G0 */\
    "pand %%mm7, %%mm6          \n\t" /*       R1        R0       */\
\
    "psllq $8, %%mm3            \n\t" /* G2        G1       G0    */\
    "por %%mm1, %%mm6           \n\t"\
    "por %%mm3, %%mm6           \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4            \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3 */\
    "pand %%mm7, %%mm3          \n\t" /*       G4        G3       */\
    "pand %%mm0, %%mm6          \n\t" /*    R4        R3       R2 */\
\
    "por %%mm1, %%mm3           \n\t" /* B5 G4 B4     G3 B3       */\
    "por %%mm3, %%mm6           \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1          \n\t" /*       B7        B6       */\
    "pand %%mm0, %%mm3          \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5 */\
\
    "por %%mm1, %%mm3           \n\t"\
    "por %%mm3, %%mm6           \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst"            \n\t"\
\
    "add $8, "#index"           \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb 1b                     \n\t"

#if COMPILE_TEMPLATE_MMXEXT
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
662 
663 static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
664  const int16_t **lumSrc, int lumFilterSize,
665  const int16_t *chrFilter, const int16_t **chrUSrc,
666  const int16_t **chrVSrc,
667  int chrFilterSize, const int16_t **alpSrc,
668  uint8_t *dest, int dstW, int dstY)
669 {
670  x86_reg dummy=0;
671  x86_reg dstW_reg = dstW;
672  x86_reg uv_off = c->uv_offx2;
673 
676  "pxor %%mm7, %%mm7 \n\t"
677  "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
678  "add %4, %%"REG_c" \n\t"
679  WRITEBGR24(%%REGc, %5, %%REGa)
680  :: "r" (&c->redDither),
681  "m" (dummy), "m" (dummy), "m" (dummy),
682  "r" (dest), "m" (dstW_reg), "m"(uv_off)
683  : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
684  );
685 }
686 
687 static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
688  const int16_t **lumSrc, int lumFilterSize,
689  const int16_t *chrFilter, const int16_t **chrUSrc,
690  const int16_t **chrVSrc,
691  int chrFilterSize, const int16_t **alpSrc,
692  uint8_t *dest, int dstW, int dstY)
693 {
694  x86_reg dummy=0;
695  x86_reg dstW_reg = dstW;
696  x86_reg uv_off = c->uv_offx2;
697 
700  "pxor %%mm7, %%mm7 \n\t"
701  "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
702  "add %4, %%"REG_c" \n\t"
703  WRITEBGR24(%%REGc, %5, %%REGa)
704  :: "r" (&c->redDither),
705  "m" (dummy), "m" (dummy), "m" (dummy),
706  "r" (dest), "m" (dstW_reg), "m"(uv_off)
707  : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
708  );
709 }
710 
/* Pack mm1=Y1, mm3=U, mm4=V, mm7=Y2 words into interleaved YUYV bytes
 * and store two quadwords (8 pixels), then loop to label 1. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3      \n\t"\
    "packuswb %%mm4, %%mm4      \n\t"\
    "packuswb %%mm7, %%mm1      \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq %%mm1, %%mm7          \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1,  (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index"           \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb 1b                     \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
727 
728 static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
729  const int16_t **lumSrc, int lumFilterSize,
730  const int16_t *chrFilter, const int16_t **chrUSrc,
731  const int16_t **chrVSrc,
732  int chrFilterSize, const int16_t **alpSrc,
733  uint8_t *dest, int dstW, int dstY)
734 {
735  x86_reg dummy=0;
736  x86_reg dstW_reg = dstW;
737  x86_reg uv_off = c->uv_offx2;
738 
740  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
741  "psraw $3, %%mm3 \n\t"
742  "psraw $3, %%mm4 \n\t"
743  "psraw $3, %%mm1 \n\t"
744  "psraw $3, %%mm7 \n\t"
745  WRITEYUY2(%4, %5, %%REGa)
747 }
748 
749 static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
750  const int16_t **lumSrc, int lumFilterSize,
751  const int16_t *chrFilter, const int16_t **chrUSrc,
752  const int16_t **chrVSrc,
753  int chrFilterSize, const int16_t **alpSrc,
754  uint8_t *dest, int dstW, int dstY)
755 {
756  x86_reg dummy=0;
757  x86_reg dstW_reg = dstW;
758  x86_reg uv_off = c->uv_offx2;
759 
761  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
762  "psraw $3, %%mm3 \n\t"
763  "psraw $3, %%mm4 \n\t"
764  "psraw $3, %%mm1 \n\t"
765  "psraw $3, %%mm7 \n\t"
766  WRITEYUY2(%4, %5, %%REGa)
768 }
769 
/* Two-tap (bilinear) vertical blend of the chroma planes:
 * mm3 = blended U, mm4 = blended V, then multiply by the UG/VG
 * coefficients.  The V samples live uv_off bytes after the U samples
 * within the same buffer (UV_OFF_BYTE in the context). */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index"     \n\t"\
    ".p2align 4                 \n\t"\
    "1:                         \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "psubw %%mm3, %%mm2         \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5         \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2        \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5        \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3            \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4            \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3         \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4         \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2          \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5          \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

/* Two-tap vertical blend of a luma (or alpha) plane from buffers b1/b2:
 * result words in %mm1 (low 4 pixels) and %%mm7 (high 4 pixels). */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0   \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1   \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6  \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7  \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0                 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6                 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1                    \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7                    \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1                 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7                 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

/* Apply the YUV->RGB coefficients to the blended values; same register
 * contract as YSCALEYUV2RGBX above. */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1  \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7  \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1  \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7  \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4             \n\t"\
    "movq %%mm2, %%mm0              \n\t"\
    "movq %%mm5, %%mm6              \n\t"\
    "movq %%mm4, %%mm3              \n\t"\
    "punpcklwd %%mm2, %%mm2         \n\t"\
    "punpcklwd %%mm5, %%mm5         \n\t"\
    "punpcklwd %%mm4, %%mm4         \n\t"\
    "paddw %%mm1, %%mm2             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "paddw %%mm1, %%mm4             \n\t"\
    "punpckhwd %%mm0, %%mm0         \n\t"\
    "punpckhwd %%mm6, %%mm6         \n\t"\
    "punpckhwd %%mm3, %%mm3         \n\t"\
    "paddw %%mm7, %%mm0             \n\t"\
    "paddw %%mm7, %%mm6             \n\t"\
    "paddw %%mm7, %%mm3             \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2          \n\t"\
    "packuswb %%mm6, %%mm5          \n\t"\
    "packuswb %%mm3, %%mm4          \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
846 
847 /**
848  * vertical bilinear scale YV12 to RGB
849  */
850 static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
851  const int16_t *ubuf[2], const int16_t *vbuf[2],
852  const int16_t *abuf[2], uint8_t *dest,
853  int dstW, int yalpha, int uvalpha, int y)
854 {
855  const int16_t *buf0 = buf[0], *buf1 = buf[1],
856  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
857 
858  if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
859  const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
860 #if ARCH_X86_64
861  __asm__ volatile(
862  YSCALEYUV2RGB(%%r8, %5)
863  YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
864  "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
865  "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
866  "packuswb %%mm7, %%mm1 \n\t"
867  WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
868  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
869  "a" (&c->redDither),
870  "r" (abuf0), "r" (abuf1)
871  : "%r8"
872  );
873 #else
874  c->u_temp=(intptr_t)abuf0;
875  c->v_temp=(intptr_t)abuf1;
876  __asm__ volatile(
877  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
878  "mov %4, %%"REG_b" \n\t"
879  "push %%"REG_BP" \n\t"
880  YSCALEYUV2RGB(%%REGBP, %5)
881  "push %0 \n\t"
882  "push %1 \n\t"
883  "mov "U_TEMP"(%5), %0 \n\t"
884  "mov "V_TEMP"(%5), %1 \n\t"
885  YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
886  "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
887  "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
888  "packuswb %%mm7, %%mm1 \n\t"
889  "pop %1 \n\t"
890  "pop %0 \n\t"
891  WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
892  "pop %%"REG_BP" \n\t"
893  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
894  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
895  "a" (&c->redDither)
896  );
897 #endif
898  } else {
899  __asm__ volatile(
900  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
901  "mov %4, %%"REG_b" \n\t"
902  "push %%"REG_BP" \n\t"
903  YSCALEYUV2RGB(%%REGBP, %5)
904  "pcmpeqd %%mm7, %%mm7 \n\t"
905  WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
906  "pop %%"REG_BP" \n\t"
907  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
908  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
909  "a" (&c->redDither)
910  );
911  }
912 }
913 
914 static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
915  const int16_t *ubuf[2], const int16_t *vbuf[2],
916  const int16_t *abuf[2], uint8_t *dest,
917  int dstW, int yalpha, int uvalpha, int y)
918 {
919  const int16_t *buf0 = buf[0], *buf1 = buf[1],
920  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
921 
922  //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
923  __asm__ volatile(
924  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
925  "mov %4, %%"REG_b" \n\t"
926  "push %%"REG_BP" \n\t"
927  YSCALEYUV2RGB(%%REGBP, %5)
928  "pxor %%mm7, %%mm7 \n\t"
929  WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
930  "pop %%"REG_BP" \n\t"
931  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
932  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
933  "a" (&c->redDither)
934  );
935 }
936 
/* Vertical 2-tap (bilinear) output to packed RGB555. Same register dance as
 * yuv2bgr24_2: %%ebx saved to the context (possible PIC register), %%ebp
 * reused as loop counter. Optional ordered dithering (DITHER1XBPP) adds the
 * per-channel dither tables from the context before the 15-bit pack. */
937  static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
938  const int16_t *ubuf[2], const int16_t *vbuf[2],
939  const int16_t *abuf[2], uint8_t *dest,
940  int dstW, int yalpha, int uvalpha, int y)
941 {
942  const int16_t *buf0 = buf[0], *buf1 = buf[1],
943  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
944 
945  //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
946  __asm__ volatile(
947  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
948  "mov %4, %%"REG_b" \n\t"
949  "push %%"REG_BP" \n\t"
950  YSCALEYUV2RGB(%%REGBP, %5)
951  "pxor %%mm7, %%mm7 \n\t"
952  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
953 #ifdef DITHER1XBPP
954  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
955  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
956  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
957 #endif
958  WRITERGB15(%%REGb, 8280(%5), %%REGBP)
959  "pop %%"REG_BP" \n\t"
960  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
961  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
962  "a" (&c->redDither)
963  );
964 }
965 
/* Vertical 2-tap (bilinear) output to packed RGB565; identical structure to
 * yuv2rgb555_2 except the final pack uses WRITERGB16. Optional DITHER1XBPP
 * adds the context dither tables to B/G/R before packing. */
966  static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
967  const int16_t *ubuf[2], const int16_t *vbuf[2],
968  const int16_t *abuf[2], uint8_t *dest,
969  int dstW, int yalpha, int uvalpha, int y)
970 {
971  const int16_t *buf0 = buf[0], *buf1 = buf[1],
972  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
973 
974  //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
975  __asm__ volatile(
976  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
977  "mov %4, %%"REG_b" \n\t"
978  "push %%"REG_BP" \n\t"
979  YSCALEYUV2RGB(%%REGBP, %5)
980  "pxor %%mm7, %%mm7 \n\t"
981  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
982 #ifdef DITHER1XBPP
983  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
984  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
985  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
986 #endif
987  WRITERGB16(%%REGb, 8280(%5), %%REGBP)
988  "pop %%"REG_BP" \n\t"
989  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
990  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
991  "a" (&c->redDither)
992  );
993 }
994 
/* Loop body for the 2-tap packed-YUV path (no RGB conversion): vertically
 * blends two luma lines (%0/%1) and two chroma lines (%2/%3) using the
 * coefficients at CHR/LUM_MMX_FILTER_OFFSET+8 in the context ("#c"),
 * pre-shifted here by 3 to gain headroom for pmulhw. On exit Y is in
 * mm1/mm7 and U/V in mm3/mm4, ready for a writer such as WRITEYUY2.
 * V samples come from the same pointers at UV_OFF_BYTE("#c") distance. */
995  #define REAL_YSCALEYUV2PACKED(index, c) \
996  "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
997  "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
998  "psraw $3, %%mm0 \n\t"\
999  "psraw $3, %%mm1 \n\t"\
1000  "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
1001  "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
1002  "xor "#index", "#index" \n\t"\
1003  ".p2align 4 \n\t"\
1004  "1: \n\t"\
1005  "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1006  "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1007  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1008  "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1009  "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1010  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1011  "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
1012  "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
1013  "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
1014  "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
1015  "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
1016  "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
1017  "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
1018  "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
1019  "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
1020  "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
1021  "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
1022  "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
1023  "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
1024  "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
1025  "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
1026  "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
1027  "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
1028  "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1029  "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1030  "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
1031  "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
1032 
1033 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
1034 
/* Vertical 2-tap (bilinear) output to packed YUYV 4:2:2 — no RGB conversion,
 * YSCALEYUV2PACKED only blends the two input lines and WRITEYUY2 interleaves
 * Y/U/Y/V. Same %%ebx / %%ebp save-restore pattern as the RGB writers. */
1035  static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
1036  const int16_t *ubuf[2], const int16_t *vbuf[2],
1037  const int16_t *abuf[2], uint8_t *dest,
1038  int dstW, int yalpha, int uvalpha, int y)
1039 {
1040  const int16_t *buf0 = buf[0], *buf1 = buf[1],
1041  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1042 
1043  //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1044  __asm__ volatile(
1045  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1046  "mov %4, %%"REG_b" \n\t"
1047  "push %%"REG_BP" \n\t"
1048  YSCALEYUV2PACKED(%%REGBP, %5)
1049  WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1050  "pop %%"REG_BP" \n\t"
1051  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1052  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1053  "a" (&c->redDither)
1054  );
1055 }
1056 
/* Loop body for the 1-tap (unscaled) YUV->RGB path: reads a single luma line
 * (%0) and a single chroma line (%2, V at UV_OFF_BYTE("#c") distance), shifts
 * to working precision (>>4), applies the Y/U/V offsets and coefficients from
 * the context, and packs the results so that mm2=B, mm4=G, mm5=R as byte
 * pairs for the WRITE* macros. No vertical interpolation is performed. */
1057  #define REAL_YSCALEYUV2RGB1(index, c) \
1058  "xor "#index", "#index" \n\t"\
1059  ".p2align 4 \n\t"\
1060  "1: \n\t"\
1061  "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1062  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1063  "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1064  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1065  "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
1066  "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
1067  "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1068  "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1069  "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1070  "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1071  "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1072  "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1073  /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1074  "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1075  "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
1076  "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1077  "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1078  "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1079  "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1080  "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1081  "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1082  "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1083  "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1084  /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1085  "paddw %%mm3, %%mm4 \n\t"\
1086  "movq %%mm2, %%mm0 \n\t"\
1087  "movq %%mm5, %%mm6 \n\t"\
1088  "movq %%mm4, %%mm3 \n\t"\
1089  "punpcklwd %%mm2, %%mm2 \n\t"\
1090  "punpcklwd %%mm5, %%mm5 \n\t"\
1091  "punpcklwd %%mm4, %%mm4 \n\t"\
1092  "paddw %%mm1, %%mm2 \n\t"\
1093  "paddw %%mm1, %%mm5 \n\t"\
1094  "paddw %%mm1, %%mm4 \n\t"\
1095  "punpckhwd %%mm0, %%mm0 \n\t"\
1096  "punpckhwd %%mm6, %%mm6 \n\t"\
1097  "punpckhwd %%mm3, %%mm3 \n\t"\
1098  "paddw %%mm7, %%mm0 \n\t"\
1099  "paddw %%mm7, %%mm6 \n\t"\
1100  "paddw %%mm7, %%mm3 \n\t"\
1101  /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1102  "packuswb %%mm0, %%mm2 \n\t"\
1103  "packuswb %%mm6, %%mm5 \n\t"\
1104  "packuswb %%mm3, %%mm4 \n\t"\
1105 
1106 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
1107 
1108 // do vertical chrominance interpolation
/* Like REAL_YSCALEYUV2RGB1 but averages the two chroma lines (%2 and %3)
 * with a fixed 1:1 weight (paddw then psrlw $5) instead of taking only one.
 * Luma is still a single line (%0). Output register layout is identical:
 * mm2=B, mm4=G, mm5=R packed bytes. Note the in-source FIXME: the paddw
 * before psrlw $5 might overflow. */
1109 #define REAL_YSCALEYUV2RGB1b(index, c) \
1110  "xor "#index", "#index" \n\t"\
1111  ".p2align 4 \n\t"\
1112  "1: \n\t"\
1113  "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1114  "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1115  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1116  "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1117  "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1118  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1119  "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1120  "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1121  "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
1122  "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
1123  "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1124  "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1125  "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1126  "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1127  "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1128  "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1129  /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1130  "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1131  "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
1132  "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1133  "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
1134  "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1135  "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1136  "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1137  "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1138  "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1139  "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1140  /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1141  "paddw %%mm3, %%mm4 \n\t"\
1142  "movq %%mm2, %%mm0 \n\t"\
1143  "movq %%mm5, %%mm6 \n\t"\
1144  "movq %%mm4, %%mm3 \n\t"\
1145  "punpcklwd %%mm2, %%mm2 \n\t"\
1146  "punpcklwd %%mm5, %%mm5 \n\t"\
1147  "punpcklwd %%mm4, %%mm4 \n\t"\
1148  "paddw %%mm1, %%mm2 \n\t"\
1149  "paddw %%mm1, %%mm5 \n\t"\
1150  "paddw %%mm1, %%mm4 \n\t"\
1151  "punpckhwd %%mm0, %%mm0 \n\t"\
1152  "punpckhwd %%mm6, %%mm6 \n\t"\
1153  "punpckhwd %%mm3, %%mm3 \n\t"\
1154  "paddw %%mm7, %%mm0 \n\t"\
1155  "paddw %%mm7, %%mm6 \n\t"\
1156  "paddw %%mm7, %%mm3 \n\t"\
1157  /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1158  "packuswb %%mm0, %%mm2 \n\t"\
1159  "packuswb %%mm6, %%mm5 \n\t"\
1160  "packuswb %%mm3, %%mm4 \n\t"\
1161 
1162 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
1163 
/* Loads 8 alpha samples from abuf0 (passed as asm operand %1), scales them
 * back to 8 bits (>>7) and packs them into mm7 for the BGR32 writer. */
1164  #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
1165  "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
1166  "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
1167  "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
1168  "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
1169  "packuswb %%mm1, %%mm7 \n\t"
1170 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
1171 
1172 /**
1173  * YV12 to RGB without scaling or interpolating
 *
 * 1-tap output path: a single luma line, no vertical blending. When
 * uvalpha < 2048 the nearer chroma line (ubuf[0]) is used alone via
 * YSCALEYUV2RGB1 (slightly wrong: shifts chrominance by 0.5 pixels, but
 * faster); otherwise both chroma lines are averaged via YSCALEYUV2RGB1b.
 * With an alpha plane present, real alpha is packed by
 * YSCALEYUV2RGB1_ALPHA (abuf0 passed in the "d" operand); otherwise
 * pcmpeqd sets mm7 to all-ones for opaque alpha.
1174  */
1175 static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
1176  const int16_t *ubuf[2], const int16_t *vbuf[2],
1177  const int16_t *abuf0, uint8_t *dest,
1178  int dstW, int uvalpha, int y)
1179 {
1180  const int16_t *ubuf0 = ubuf[0];
1181  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1182 
1183  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1184  const int16_t *ubuf1 = ubuf[0];
1185  if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1186  __asm__ volatile(
1187  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1188  "mov %4, %%"REG_b" \n\t"
1189  "push %%"REG_BP" \n\t"
1190  YSCALEYUV2RGB1(%%REGBP, %5)
1191  YSCALEYUV2RGB1_ALPHA(%%REGBP)
1192  WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1193  "pop %%"REG_BP" \n\t"
1194  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1195  :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1196  "a" (&c->redDither)
1197  );
1198  } else {
1199  __asm__ volatile(
1200  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1201  "mov %4, %%"REG_b" \n\t"
1202  "push %%"REG_BP" \n\t"
1203  YSCALEYUV2RGB1(%%REGBP, %5)
1204  "pcmpeqd %%mm7, %%mm7 \n\t"
1205  WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1206  "pop %%"REG_BP" \n\t"
1207  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1208  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1209  "a" (&c->redDither)
1210  );
1211  }
1212  } else {
1213  const int16_t *ubuf1 = ubuf[1];
1214  if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1215  __asm__ volatile(
1216  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1217  "mov %4, %%"REG_b" \n\t"
1218  "push %%"REG_BP" \n\t"
1219  YSCALEYUV2RGB1b(%%REGBP, %5)
1220  YSCALEYUV2RGB1_ALPHA(%%REGBP)
1221  WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1222  "pop %%"REG_BP" \n\t"
1223  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1224  :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1225  "a" (&c->redDither)
1226  );
1227  } else {
1228  __asm__ volatile(
1229  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1230  "mov %4, %%"REG_b" \n\t"
1231  "push %%"REG_BP" \n\t"
1232  YSCALEYUV2RGB1b(%%REGBP, %5)
1233  "pcmpeqd %%mm7, %%mm7 \n\t"
1234  WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1235  "pop %%"REG_BP" \n\t"
1236  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1237  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1238  "a" (&c->redDither)
1239  );
1240  }
1241  }
1242 }
1243 
/* 1-tap (unscaled) output to packed BGR24. uvalpha < 2048 selects the
 * nearest-chroma path (YSCALEYUV2RGB1, half-pixel chroma shift, faster);
 * otherwise the two chroma lines are averaged (YSCALEYUV2RGB1b). No alpha
 * variant exists for 24-bit output. */
1244 static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
1245  const int16_t *ubuf[2], const int16_t *vbuf[2],
1246  const int16_t *abuf0, uint8_t *dest,
1247  int dstW, int uvalpha, int y)
1248 {
1249  const int16_t *ubuf0 = ubuf[0];
1250  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1251 
1252  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1253  const int16_t *ubuf1 = ubuf[0];
1254  __asm__ volatile(
1255  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1256  "mov %4, %%"REG_b" \n\t"
1257  "push %%"REG_BP" \n\t"
1258  YSCALEYUV2RGB1(%%REGBP, %5)
1259  "pxor %%mm7, %%mm7 \n\t"
1260  WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1261  "pop %%"REG_BP" \n\t"
1262  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1263  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1264  "a" (&c->redDither)
1265  );
1266  } else {
1267  const int16_t *ubuf1 = ubuf[1];
1268  __asm__ volatile(
1269  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1270  "mov %4, %%"REG_b" \n\t"
1271  "push %%"REG_BP" \n\t"
1272  YSCALEYUV2RGB1b(%%REGBP, %5)
1273  "pxor %%mm7, %%mm7 \n\t"
1274  WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1275  "pop %%"REG_BP" \n\t"
1276  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1277  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1278  "a" (&c->redDither)
1279  );
1280  }
1281 }
1282 
/* 1-tap (unscaled) output to packed RGB555, with optional DITHER1XBPP
 * ordered dithering before the 15-bit pack. Chroma-path selection by
 * uvalpha as in yuv2bgr24_1. */
1283 static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
1284  const int16_t *ubuf[2], const int16_t *vbuf[2],
1285  const int16_t *abuf0, uint8_t *dest,
1286  int dstW, int uvalpha, int y)
1287 {
1288  const int16_t *ubuf0 = ubuf[0];
1289  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1290 
1291  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1292  const int16_t *ubuf1 = ubuf[0];
1293  __asm__ volatile(
1294  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1295  "mov %4, %%"REG_b" \n\t"
1296  "push %%"REG_BP" \n\t"
1297  YSCALEYUV2RGB1(%%REGBP, %5)
1298  "pxor %%mm7, %%mm7 \n\t"
1299  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1300 #ifdef DITHER1XBPP
1301  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1302  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1303  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1304 #endif
1305  WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1306  "pop %%"REG_BP" \n\t"
1307  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1308  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1309  "a" (&c->redDither)
1310  );
1311  } else {
1312  const int16_t *ubuf1 = ubuf[1];
1313  __asm__ volatile(
1314  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1315  "mov %4, %%"REG_b" \n\t"
1316  "push %%"REG_BP" \n\t"
1317  YSCALEYUV2RGB1b(%%REGBP, %5)
1318  "pxor %%mm7, %%mm7 \n\t"
1319  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1320 #ifdef DITHER1XBPP
1321  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1322  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1323  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1324 #endif
1325  WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1326  "pop %%"REG_BP" \n\t"
1327  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1328  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1329  "a" (&c->redDither)
1330  );
1331  }
1332 }
1333 
/* 1-tap (unscaled) output to packed RGB565; same as yuv2rgb555_1 but the
 * final pack is WRITERGB16. */
1334 static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
1335  const int16_t *ubuf[2], const int16_t *vbuf[2],
1336  const int16_t *abuf0, uint8_t *dest,
1337  int dstW, int uvalpha, int y)
1338 {
1339  const int16_t *ubuf0 = ubuf[0];
1340  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1341 
1342  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1343  const int16_t *ubuf1 = ubuf[0];
1344  __asm__ volatile(
1345  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1346  "mov %4, %%"REG_b" \n\t"
1347  "push %%"REG_BP" \n\t"
1348  YSCALEYUV2RGB1(%%REGBP, %5)
1349  "pxor %%mm7, %%mm7 \n\t"
1350  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1351 #ifdef DITHER1XBPP
1352  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1353  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1354  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1355 #endif
1356  WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1357  "pop %%"REG_BP" \n\t"
1358  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1359  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1360  "a" (&c->redDither)
1361  );
1362  } else {
1363  const int16_t *ubuf1 = ubuf[1];
1364  __asm__ volatile(
1365  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1366  "mov %4, %%"REG_b" \n\t"
1367  "push %%"REG_BP" \n\t"
1368  YSCALEYUV2RGB1b(%%REGBP, %5)
1369  "pxor %%mm7, %%mm7 \n\t"
1370  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1371 #ifdef DITHER1XBPP
1372  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1373  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1374  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1375 #endif
1376  WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1377  "pop %%"REG_BP" \n\t"
1378  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1379  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1380  "a" (&c->redDither)
1381  );
1382  }
1383 }
1384 
/* 1-tap packed-YUV loop body: single chroma line (%2, V at UV_OFF_BYTE
 * distance), single luma line (%0); everything is just shifted back to
 * 8-bit range (>>7), leaving Y in mm1/mm7 and U/V in mm3/mm4 for WRITEYUY2. */
1385  #define REAL_YSCALEYUV2PACKED1(index, c) \
1386  "xor "#index", "#index" \n\t"\
1387  ".p2align 4 \n\t"\
1388  "1: \n\t"\
1389  "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1390  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1391  "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1392  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1393  "psraw $7, %%mm3 \n\t" \
1394  "psraw $7, %%mm4 \n\t" \
1395  "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1396  "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
1397  "psraw $7, %%mm1 \n\t" \
1398  "psraw $7, %%mm7 \n\t" \
1399 
1400 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
1401 
/* Variant that averages the two chroma lines (%2 and %3): paddw then
 * psrlw $8 combines the 1:1 blend with the >>7 scale-back in one shift. */
1402 #define REAL_YSCALEYUV2PACKED1b(index, c) \
1403  "xor "#index", "#index" \n\t"\
1404  ".p2align 4 \n\t"\
1405  "1: \n\t"\
1406  "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1407  "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1408  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1409  "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1410  "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1411  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1412  "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1413  "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1414  "psrlw $8, %%mm3 \n\t" \
1415  "psrlw $8, %%mm4 \n\t" \
1416  "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1417  "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
1418  "psraw $7, %%mm1 \n\t" \
1419  "psraw $7, %%mm7 \n\t"
1420 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
1421 
/* 1-tap (unscaled) output to packed YUYV 4:2:2. uvalpha < 2048 takes the
 * single-chroma-line path (YSCALEYUV2PACKED1, half-pixel chroma shift);
 * otherwise the two chroma lines are averaged (YSCALEYUV2PACKED1b). */
1422 static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
1423  const int16_t *ubuf[2], const int16_t *vbuf[2],
1424  const int16_t *abuf0, uint8_t *dest,
1425  int dstW, int uvalpha, int y)
1426 {
1427  const int16_t *ubuf0 = ubuf[0];
1428  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1429 
1430  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1431  const int16_t *ubuf1 = ubuf[0];
1432  __asm__ volatile(
1433  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1434  "mov %4, %%"REG_b" \n\t"
1435  "push %%"REG_BP" \n\t"
1436  YSCALEYUV2PACKED1(%%REGBP, %5)
1437  WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1438  "pop %%"REG_BP" \n\t"
1439  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1440  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1441  "a" (&c->redDither)
1442  );
1443  } else {
1444  const int16_t *ubuf1 = ubuf[1];
1445  __asm__ volatile(
1446  "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1447  "mov %4, %%"REG_b" \n\t"
1448  "push %%"REG_BP" \n\t"
1449  YSCALEYUV2PACKED1b(%%REGBP, %5)
1450  WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1451  "pop %%"REG_BP" \n\t"
1452  "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1453  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1454  "a" (&c->redDither)
1455  );
1456  }
1457 }
1458 
1459 #if COMPILE_TEMPLATE_MMXEXT
/* Fast bilinear horizontal luma scaling: drives the run-time-generated
 * MMXEXT filter code (c->lumMmxextFilterCode) with eight indirect
 * "call *%4" invocations, using c->hLumFilter / c->hLumFilterPos.
 * Under PIC, %%ebx (the GOT register) is spilled to ebxsave around the asm.
 * On x86-64 the word at -8(%%rsp) is saved/restored as well —
 * NOTE(review): presumably because the indirect call's pushed return
 * address lands in that red-zone slot; confirm against the code generator.
 * The scalar tail loop afterwards fills destination pixels whose source
 * position (i*xInc)>>16 would read past srcW-1, replicating the last
 * source sample scaled by 128. */
1460 static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
1461  int dstWidth, const uint8_t *src,
1462  int srcW, int xInc)
1463 {
1464  int32_t *filterPos = c->hLumFilterPos;
1465  int16_t *filter = c->hLumFilter;
1466  void *mmxextFilterCode = c->lumMmxextFilterCode;
1467  int i;
1468 #if defined(PIC)
     /* NOTE(review): hcscale_fast declares this with DECLARE_ALIGNED(8,...);
      * plain mov access does not require it, but consider aligning for
      * consistency. */
1469  uint64_t ebxsave;
1470 #endif
1471 #if ARCH_X86_64
1472  uint64_t retsave;
1473 #endif
1474 
1475  __asm__ volatile(
1476 #if defined(PIC)
1477  "mov %%"REG_b", %5 \n\t"
1478 #if ARCH_X86_64
1479  "mov -8(%%rsp), %%"REG_a" \n\t"
1480  "mov %%"REG_a", %6 \n\t"
1481 #endif
1482 #else
1483 #if ARCH_X86_64
1484  "mov -8(%%rsp), %%"REG_a" \n\t"
1485  "mov %%"REG_a", %5 \n\t"
1486 #endif
1487 #endif
1488  "pxor %%mm7, %%mm7 \n\t"
1489  "mov %0, %%"REG_c" \n\t"
1490  "mov %1, %%"REG_D" \n\t"
1491  "mov %2, %%"REG_d" \n\t"
1492  "mov %3, %%"REG_b" \n\t"
1493  "xor %%"REG_a", %%"REG_a" \n\t" // i
1494  PREFETCH" (%%"REG_c") \n\t"
1495  PREFETCH" 32(%%"REG_c") \n\t"
1496  PREFETCH" 64(%%"REG_c") \n\t"
1497 
1498 #if ARCH_X86_64
1499 #define CALL_MMXEXT_FILTER_CODE \
1500  "movl (%%"REG_b"), %%esi \n\t"\
1501  "call *%4 \n\t"\
1502  "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
1503  "add %%"REG_S", %%"REG_c" \n\t"\
1504  "add %%"REG_a", %%"REG_D" \n\t"\
1505  "xor %%"REG_a", %%"REG_a" \n\t"\
1506 
1507 #else
1508 #define CALL_MMXEXT_FILTER_CODE \
1509  "movl (%%"REG_b"), %%esi \n\t"\
1510  "call *%4 \n\t"\
1511  "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
1512  "add %%"REG_a", %%"REG_D" \n\t"\
1513  "xor %%"REG_a", %%"REG_a" \n\t"\
1514 
1515 #endif /* ARCH_X86_64 */
1516 
1517  CALL_MMXEXT_FILTER_CODE
1518  CALL_MMXEXT_FILTER_CODE
1519  CALL_MMXEXT_FILTER_CODE
1520  CALL_MMXEXT_FILTER_CODE
1521  CALL_MMXEXT_FILTER_CODE
1522  CALL_MMXEXT_FILTER_CODE
1523  CALL_MMXEXT_FILTER_CODE
1524  CALL_MMXEXT_FILTER_CODE
1525 
1526 #if defined(PIC)
1527  "mov %5, %%"REG_b" \n\t"
1528 #if ARCH_X86_64
1529  "mov %6, %%"REG_a" \n\t"
1530  "mov %%"REG_a", -8(%%rsp) \n\t"
1531 #endif
1532 #else
1533 #if ARCH_X86_64
1534  "mov %5, %%"REG_a" \n\t"
1535  "mov %%"REG_a", -8(%%rsp) \n\t"
1536 #endif
1537 #endif
1538  :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
1539  "m" (mmxextFilterCode)
1540 #if defined(PIC)
1541  ,"m" (ebxsave)
1542 #endif
1543 #if ARCH_X86_64
1544  ,"m"(retsave)
1545 #endif
1546  : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
1547 #if !defined(PIC)
1548  ,"%"REG_b
1549 #endif
1550  );
1551 
     /* Fill the right edge where the source index would overrun srcW-1. */
1552  for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
1553  dst[i] = src[srcW-1]*128;
1554 }
1555 
/* Fast bilinear horizontal chroma scaling: same scheme as hyscale_fast but
 * runs the generated chroma filter (c->chrMmxextFilterCode) twice — four
 * CALL_MMXEXT_FILTER_CODE rounds for src1->dst1 (U), then the index and
 * source/destination registers are reset to src2/dst2 (%5/%6) for the V
 * plane. The scalar tail loop replicates the last sample of both planes
 * at the right edge. */
1556 static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
1557  int dstWidth, const uint8_t *src1,
1558  const uint8_t *src2, int srcW, int xInc)
1559 {
1560  int32_t *filterPos = c->hChrFilterPos;
1561  int16_t *filter = c->hChrFilter;
1562  void *mmxextFilterCode = c->chrMmxextFilterCode;
1563  int i;
1564 #if defined(PIC)
1565  DECLARE_ALIGNED(8, uint64_t, ebxsave);
1566 #endif
1567 #if ARCH_X86_64
1568  DECLARE_ALIGNED(8, uint64_t, retsave);
1569 #endif
1570 
1571  __asm__ volatile(
1572 #if defined(PIC)
1573  "mov %%"REG_b", %7 \n\t"
1574 #if ARCH_X86_64
1575  "mov -8(%%rsp), %%"REG_a" \n\t"
1576  "mov %%"REG_a", %8 \n\t"
1577 #endif
1578 #else
1579 #if ARCH_X86_64
1580  "mov -8(%%rsp), %%"REG_a" \n\t"
1581  "mov %%"REG_a", %7 \n\t"
1582 #endif
1583 #endif
1584  "pxor %%mm7, %%mm7 \n\t"
1585  "mov %0, %%"REG_c" \n\t"
1586  "mov %1, %%"REG_D" \n\t"
1587  "mov %2, %%"REG_d" \n\t"
1588  "mov %3, %%"REG_b" \n\t"
1589  "xor %%"REG_a", %%"REG_a" \n\t" // i
1590  PREFETCH" (%%"REG_c") \n\t"
1591  PREFETCH" 32(%%"REG_c") \n\t"
1592  PREFETCH" 64(%%"REG_c") \n\t"
1593 
1594  CALL_MMXEXT_FILTER_CODE
1595  CALL_MMXEXT_FILTER_CODE
1596  CALL_MMXEXT_FILTER_CODE
1597  CALL_MMXEXT_FILTER_CODE
1598  "xor %%"REG_a", %%"REG_a" \n\t" // i
1599  "mov %5, %%"REG_c" \n\t" // src
1600  "mov %6, %%"REG_D" \n\t" // buf2
1601  PREFETCH" (%%"REG_c") \n\t"
1602  PREFETCH" 32(%%"REG_c") \n\t"
1603  PREFETCH" 64(%%"REG_c") \n\t"
1604 
1605  CALL_MMXEXT_FILTER_CODE
1606  CALL_MMXEXT_FILTER_CODE
1607  CALL_MMXEXT_FILTER_CODE
1608  CALL_MMXEXT_FILTER_CODE
1609 
1610 #if defined(PIC)
1611  "mov %7, %%"REG_b" \n\t"
1612 #if ARCH_X86_64
1613  "mov %8, %%"REG_a" \n\t"
1614  "mov %%"REG_a", -8(%%rsp) \n\t"
1615 #endif
1616 #else
1617 #if ARCH_X86_64
1618  "mov %7, %%"REG_a" \n\t"
1619  "mov %%"REG_a", -8(%%rsp) \n\t"
1620 #endif
1621 #endif
1622  :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
1623  "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
1624 #if defined(PIC)
1625  ,"m" (ebxsave)
1626 #endif
1627 #if ARCH_X86_64
1628  ,"m"(retsave)
1629 #endif
1630  : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
1631 #if !defined(PIC)
1632  ,"%"REG_b
1633 #endif
1634  );
1635 
     /* Fill the right edge of both chroma planes past srcW-1. */
1636  for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
1637  dst1[i] = src1[srcW-1]*128;
1638  dst2[i] = src2[srcW-1]*128;
1639  }
1640 }
1641 #endif /* COMPILE_TEMPLATE_MMXEXT */
1642 
/* Init: installs the MMX/MMXEXT output and horizontal-scale function
 * pointers on the context. The optimized packed-output paths are only used
 * for 8-bit non-NV12/NV21 destinations without SWS_BITEXACT; accurate-
 * rounding (_ar) variants are chosen under SWS_ACCURATE_RND, otherwise the
 * MMX vertical filter (use_mmx_vfilter) and plain variants are installed.
 * (Function signature is on the preceding line, outside this excerpt.) */
1644 {
1645  enum AVPixelFormat dstFormat = c->dstFormat;
1646 
1647  c->use_mmx_vfilter= 0;
     /* High-bit-depth, NV12/NV21 and bit-exact conversions keep the C paths. */
1648  if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
1649  && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
1650  if (c->flags & SWS_ACCURATE_RND) {
1651  if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1652  switch (c->dstFormat) {
1653  case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
1654  case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
1655  case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
1656  case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
1657  case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
1658  default: break;
1659  }
1660  }
1661  } else {
1662  c->use_mmx_vfilter= 1;
1663  c->yuv2planeX = RENAME(yuv2yuvX );
1664  if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1665  switch (c->dstFormat) {
1666  case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
1667  case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
1668  case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
1669  case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
1670  case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
1671  default: break;
1672  }
1673  }
1674  }
     /* 1-tap and 2-tap packed writers are shared by both rounding modes. */
1675  if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1676  switch (c->dstFormat) {
1677  case AV_PIX_FMT_RGB32:
1678  c->yuv2packed1 = RENAME(yuv2rgb32_1);
1679  c->yuv2packed2 = RENAME(yuv2rgb32_2);
1680  break;
1681  case AV_PIX_FMT_BGR24:
1682  c->yuv2packed1 = RENAME(yuv2bgr24_1);
1683  c->yuv2packed2 = RENAME(yuv2bgr24_2);
1684  break;
1685  case AV_PIX_FMT_RGB555:
1686  c->yuv2packed1 = RENAME(yuv2rgb555_1);
1687  c->yuv2packed2 = RENAME(yuv2rgb555_2);
1688  break;
1689  case AV_PIX_FMT_RGB565:
1690  c->yuv2packed1 = RENAME(yuv2rgb565_1);
1691  c->yuv2packed2 = RENAME(yuv2rgb565_2);
1692  break;
1693  case AV_PIX_FMT_YUYV422:
1694  c->yuv2packed1 = RENAME(yuv2yuyv422_1);
1695  c->yuv2packed2 = RENAME(yuv2yuyv422_2);
1696  break;
1697  default:
1698  break;
1699  }
1700  }
1701  }
1702 
1703  if (c->srcBpc == 8 && c->dstBpc <= 14) {
1704  // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
1705 #if COMPILE_TEMPLATE_MMXEXT
1706  if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
1707  c->hyscale_fast = RENAME(hyscale_fast);
1708  c->hcscale_fast = RENAME(hcscale_fast);
1709  } else {
1710 #endif /* COMPILE_TEMPLATE_MMXEXT */
1711  c->hyscale_fast = NULL;
1712  c->hcscale_fast = NULL;
1713 #if COMPILE_TEMPLATE_MMXEXT
1714  }
1715 #endif /* COMPILE_TEMPLATE_MMXEXT */
1716  }
1717 }
#define YSCALEYUV2RGB1_ALPHA(index)
#define ALP_MMX_FILTER_OFFSET
static void RENAME() yuv2rgb32_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define ARCH_X86_64
Definition: config.h:37
#define YSCALEYUV2RGBX
#define YSCALEYUV2PACKED1(index, c)
static void RENAME() yuv2yuyv422_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define SWS_FAST_BILINEAR
Definition: swscale.h:58
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:59
#define SWS_FULL_CHR_H_INT
Definition: swscale.h:79
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
void(* hyscale_fast)(struct SwsContext *c, int16_t *dst, int dstWidth, const uint8_t *src, int srcW, int xInc)
Scale one horizontal line of input data using a bilinear filter to produce one line of output data...
int dstY
Last destination vertical line output from last slice.
#define YSCALEYUV2PACKEDX_END
uint8_t
#define av_cold
Definition: attributes.h:78
the mask is usually to keep the same permissions Filters should remove permissions on reference they give to output whenever necessary It can be automatically done by setting the rej_perms field on the output pad Here are a few guidelines corresponding to common then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
#define SWS_BITEXACT
Definition: swscale.h:84
enum AVPixelFormat dstFormat
Destination pixel format.
#define WRITERGB15(dst, dstw, index)
#define ESP_OFFSET
static void RENAME() yuv2rgb565_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
static void RENAME() yuv2rgb32_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2rgb565_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define PREFETCH
#define YSCALEYUV2PACKEDX
#define GREEN_DITHER
static void RENAME() yuv2yuvX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset)
static void RENAME() yuv2rgb32_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
YV12 to RGB without scaling or interpolating.
#define WRITEBGR24(dst, dstw, index)
planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (firs...
Definition: pixfmt.h:93
#define YSCALEYUV2RGB1b(index, c)
static const uint8_t offset[127][2]
Definition: vf_spp.c:70
void(* hcscale_fast)(struct SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth, const uint8_t *src1, const uint8_t *src2, int srcW, int xInc)
#define YSCALEYUV2RGB_YA(index, c, b1, b2)
#define YSCALEYUV2PACKED(index, c)
static void RENAME() yuv2rgb565_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2bgr24_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define WRITERGB16(dst, dstw, index)
as above, but U and V bytes are swapped
Definition: pixfmt.h:94
#define YSCALEYUV2PACKEDX_ACCURATE
static av_always_inline int is9_OR_10BPS(enum AVPixelFormat pix_fmt)
static void RENAME() yuv2rgb555_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
#define YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
int32_t
#define SWS_ACCURATE_RND
Definition: swscale.h:83
static void RENAME() yuv2yuyv422_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:71
#define U_TEMP
int dstW
Width of destination luma/alpha planes.
static void RENAME() yuv2yuyv422_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
NULL
Definition: eval.c:55
dest
Definition: start.py:60
AVS_Value src
Definition: avisynth_c.h:523
static void RENAME() yuv2rgb555_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static av_always_inline void dither_8to16(const uint8_t *srcDither, int rot)
#define RENAME(a)
Definition: mpegaudiodec.c:109
#define AV_PIX_FMT_RGB32
Definition: pixfmt.h:259
#define YSCALEYUV2RGB(index, c)
static av_cold void RENAME() sws_init_swScale(SwsContext *c)
packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
Definition: pixfmt.h:69
void * buf
Definition: avisynth_c.h:594
#define YSCALEYUV2PACKED1b(index, c)
static void RENAME() yuv2rgb555_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2bgr24_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
synthesis window for stochastic i
#define V_TEMP
static void RENAME() yuv2yuyv422_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define Y_TEMP
static void RENAME() yuv2rgb32_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
vertical bilinear scale YV12 to RGB
#define CONFIG_SWSCALE_ALPHA
Definition: config.h:394
#define RED_DITHER
static double c[64]
#define AV_PIX_FMT_RGB555
Definition: pixfmt.h:269
#define MOVNTQ2
function y
Definition: D.m:1
static void RENAME() yuv2rgb565_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
int x86_reg
else dst[i][x+y *dst_stride[i]]
Definition: vf_mcdeint.c:160
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
#define AV_PIX_FMT_RGB565
Definition: pixfmt.h:268
static void RENAME() yuv2bgr24_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
#define BLUE_DITHER
#define av_always_inline
Definition: attributes.h:41
static void RENAME() yuv2rgb555_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
int dummy
Definition: motion-test.c:64
#define WRITEYUY2(dst, dstw, index)
int srcW
Width of source luma/alpha planes.
AVPixelFormat
Pixel format.
Definition: pixfmt.h:66
static void RENAME() yuv2bgr24_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
#define YSCALEYUV2RGB1(index, c)