dsputil_mmx.c
1 /*
2  * MMX optimized DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  *
22  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23  */
24 
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavcodec/dsputil.h"
29 #include "libavcodec/h264dsp.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "libavcodec/simple_idct.h"
32 #include "libavcodec/videodsp.h"
33 #include "dsputil_mmx.h"
34 #include "idct_xvid.h"
35 #include "diracdsp_mmx.h"
36 
37 //#undef NDEBUG
38 //#include <assert.h>
39 
40 /* pixel operations */
41 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
42 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
43 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
44 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
45 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
46 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
47 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
48 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
51 
52 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
53 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
54 
55 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
56 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
57 
58 
59 #if HAVE_YASM
60 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
61  int dstStride, int src1Stride, int h);
62 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
63  uint8_t *src2, int dstStride,
64  int src1Stride, int h);
65 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
66  int dstStride, int src1Stride, int h);
67 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
68  int dstStride, int src1Stride, int h);
69 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
70  int dstStride, int src1Stride, int h);
71 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
72  int dstStride, int src1Stride, int h);
73 void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
74  ptrdiff_t line_size, int h);
75 
76 static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
77  ptrdiff_t line_size, int h)
78 {
79  ff_put_pixels8_mmxext(block, pixels, line_size, h);
80  ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
81 }
82 
83 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
84  int dstStride, int srcStride, int h);
85 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
86  int dstStride, int srcStride, int h);
87 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
88  int dstStride, int srcStride,
89  int h);
90 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
91  int dstStride, int srcStride, int h);
92 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
93  int dstStride, int srcStride, int h);
94 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
95  int dstStride, int srcStride,
96  int h);
97 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
98  int dstStride, int srcStride);
99 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
100  int dstStride, int srcStride);
101 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
102  int dstStride, int srcStride);
103 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
104  int dstStride, int srcStride);
105 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
106  int dstStride, int srcStride);
107 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
108  int dstStride, int srcStride);
109 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
110 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
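// The aliases above are intentional: a whole-pixel copy involves no
// interpolation, so there is nothing to round and the "no_rnd" entry points
// can reuse the plain put functions unchanged.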
111 #endif /* HAVE_YASM */
112 
113 
114 #if HAVE_INLINE_ASM
115 
116 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
117 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
118 
119 #define MOVQ_BFE(regd) \
120  __asm__ volatile ( \
121  "pcmpeqd %%"#regd", %%"#regd" \n\t" \
122  "paddb %%"#regd", %%"#regd" \n\t" ::)
123 
124 #ifndef PIC
125 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
126 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
127 #else
128 // for shared libraries it's better to access constants this way
129 // pcmpeqd -> -1
130 #define MOVQ_BONE(regd) \
131  __asm__ volatile ( \
132  "pcmpeqd %%"#regd", %%"#regd" \n\t" \
133  "psrlw $15, %%"#regd" \n\t" \
134  "packuswb %%"#regd", %%"#regd" \n\t" ::)
135 
136 #define MOVQ_WTWO(regd) \
137  __asm__ volatile ( \
138  "pcmpeqd %%"#regd", %%"#regd" \n\t" \
139  "psrlw $15, %%"#regd" \n\t" \
140  "psllw $1, %%"#regd" \n\t"::)
141 
142 #endif
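// Rough sketch of what the register-only constants above evaluate to (assuming
// standard MMX semantics): pcmpeqd fills the register with all-one bits and
// psrlw $15 leaves 0x0001 in every 16-bit lane; packuswb then gives 0x01 in
// every byte (the ff_bone pattern), while psllw $1 instead gives 0x0002 per
// word (the ff_wtwo pattern).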
143 
144 // using regr as temporary and for the output result
145 // first argument is unmodified and second is trashed
146 // regfe is supposed to contain 0xfefefefefefefefe
147 #define PAVGB_MMX(rega, regb, regr, regfe) \
148  "movq "#rega", "#regr" \n\t" \
149  "por "#regb", "#regr" \n\t" \
150  "pxor "#rega", "#regb" \n\t" \
151  "pand "#regfe", "#regb" \n\t" \
152  "psrlq $1, "#regb" \n\t" \
153  "psubb "#regb", "#regr" \n\t"
154 
155 // mm6 is supposed to contain 0xfefefefefefefefe
156 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
157  "movq "#rega", "#regr" \n\t" \
158  "movq "#regc", "#regp" \n\t" \
159  "por "#regb", "#regr" \n\t" \
160  "por "#regd", "#regp" \n\t" \
161  "pxor "#rega", "#regb" \n\t" \
162  "pxor "#regc", "#regd" \n\t" \
163  "pand %%mm6, "#regb" \n\t" \
164  "pand %%mm6, "#regd" \n\t" \
165  "psrlq $1, "#regd" \n\t" \
166  "psrlq $1, "#regb" \n\t" \
167  "psubb "#regb", "#regr" \n\t" \
168  "psubb "#regd", "#regp" \n\t"
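// Scalar view of the byte-averaging trick used by PAVGB_MMX / PAVGBP_MMX
// (a sketch, assuming regfe / mm6 holds 0xfe in every byte): the rounded-up
// average (a + b + 1) >> 1 is computed without widening as
//     (a | b) - (((a ^ b) & 0xfe) >> 1)
// Since a | b == (a & b) + (a ^ b), subtracting half of the differing bits
// leaves (a & b) + ceil((a ^ b) / 2), i.e. the rounded-up average. The 0xfe
// mask keeps psrlq (which shifts the whole 64-bit register) from leaking a
// bit across byte boundaries.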
169 
170 /***********************************/
171 /* MMX rounding */
172 
173 #define DEF(x, y) x ## _ ## y ## _mmx
174 #define SET_RND MOVQ_WTWO
175 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
176 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
177 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
178 
179 #include "dsputil_rnd_template.c"
180 
181 #undef DEF
182 #undef SET_RND
183 #undef PAVGBP
184 #undef PAVGB
185 #undef OP_AVG
186 
187 #endif /* HAVE_INLINE_ASM */
188 
189 
190 #if HAVE_YASM
191 
192 /***********************************/
193 /* MMXEXT specific */
194 
195 //FIXME the following could be optimized too ...
196 static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
197  int line_size, int h)
198 {
199  ff_avg_pixels8_mmxext(block, pixels, line_size, h);
200  ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
201 }
202 
203 #endif /* HAVE_YASM */
204 
205 
206 #if HAVE_INLINE_ASM
207 /***********************************/
208 /* standard MMX */
209 
210 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
211  int line_size)
212 {
213  const int16_t *p;
214  uint8_t *pix;
215 
216  /* read the pixels */
217  p = block;
218  pix = pixels;
219  /* unrolled loop */
220  __asm__ volatile (
221  "movq (%3), %%mm0 \n\t"
222  "movq 8(%3), %%mm1 \n\t"
223  "movq 16(%3), %%mm2 \n\t"
224  "movq 24(%3), %%mm3 \n\t"
225  "movq 32(%3), %%mm4 \n\t"
226  "movq 40(%3), %%mm5 \n\t"
227  "movq 48(%3), %%mm6 \n\t"
228  "movq 56(%3), %%mm7 \n\t"
229  "packuswb %%mm1, %%mm0 \n\t"
230  "packuswb %%mm3, %%mm2 \n\t"
231  "packuswb %%mm5, %%mm4 \n\t"
232  "packuswb %%mm7, %%mm6 \n\t"
233  "movq %%mm0, (%0) \n\t"
234  "movq %%mm2, (%0, %1) \n\t"
235  "movq %%mm4, (%0, %1, 2) \n\t"
236  "movq %%mm6, (%0, %2) \n\t"
237  :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
238  "r"(p)
239  : "memory");
240  pix += line_size * 4;
241  p += 32;
242 
 243  // If this were an exact copy of the code above, the
 244  // compiler would generate some very strange code;
 245  // hence the "r" constraint.
246  __asm__ volatile (
247  "movq (%3), %%mm0 \n\t"
248  "movq 8(%3), %%mm1 \n\t"
249  "movq 16(%3), %%mm2 \n\t"
250  "movq 24(%3), %%mm3 \n\t"
251  "movq 32(%3), %%mm4 \n\t"
252  "movq 40(%3), %%mm5 \n\t"
253  "movq 48(%3), %%mm6 \n\t"
254  "movq 56(%3), %%mm7 \n\t"
255  "packuswb %%mm1, %%mm0 \n\t"
256  "packuswb %%mm3, %%mm2 \n\t"
257  "packuswb %%mm5, %%mm4 \n\t"
258  "packuswb %%mm7, %%mm6 \n\t"
259  "movq %%mm0, (%0) \n\t"
260  "movq %%mm2, (%0, %1) \n\t"
261  "movq %%mm4, (%0, %1, 2) \n\t"
262  "movq %%mm6, (%0, %2) \n\t"
263  :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
264  : "memory");
265 }
266 
267 #define put_signed_pixels_clamped_mmx_half(off) \
268  "movq "#off"(%2), %%mm1 \n\t" \
269  "movq 16 + "#off"(%2), %%mm2 \n\t" \
270  "movq 32 + "#off"(%2), %%mm3 \n\t" \
271  "movq 48 + "#off"(%2), %%mm4 \n\t" \
272  "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
273  "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
274  "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
275  "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
276  "paddb %%mm0, %%mm1 \n\t" \
277  "paddb %%mm0, %%mm2 \n\t" \
278  "paddb %%mm0, %%mm3 \n\t" \
279  "paddb %%mm0, %%mm4 \n\t" \
280  "movq %%mm1, (%0) \n\t" \
281  "movq %%mm2, (%0, %3) \n\t" \
282  "movq %%mm3, (%0, %3, 2) \n\t" \
283  "movq %%mm4, (%0, %1) \n\t"
284 
285 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
286  int line_size)
287 {
288  x86_reg line_skip = line_size;
289  x86_reg line_skip3;
290 
291  __asm__ volatile (
292  "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
293  "lea (%3, %3, 2), %1 \n\t"
294  put_signed_pixels_clamped_mmx_half(0)
295  "lea (%0, %3, 4), %0 \n\t"
296  put_signed_pixels_clamped_mmx_half(64)
297  : "+&r"(pixels), "=&r"(line_skip3)
298  : "r"(block), "r"(line_skip)
299  : "memory");
300 }
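// The ff_pb_80 bias above is what makes this the "signed" variant: packsswb
// saturates each coefficient to [-128, 127], and adding 0x80 to every byte
// then maps that range onto the unsigned [0, 255] pixel range.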
301 
302 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
303  int line_size)
304 {
305  const int16_t *p;
306  uint8_t *pix;
307  int i;
308 
309  /* read the pixels */
310  p = block;
311  pix = pixels;
312  MOVQ_ZERO(mm7);
313  i = 4;
314  do {
315  __asm__ volatile (
316  "movq (%2), %%mm0 \n\t"
317  "movq 8(%2), %%mm1 \n\t"
318  "movq 16(%2), %%mm2 \n\t"
319  "movq 24(%2), %%mm3 \n\t"
320  "movq %0, %%mm4 \n\t"
321  "movq %1, %%mm6 \n\t"
322  "movq %%mm4, %%mm5 \n\t"
323  "punpcklbw %%mm7, %%mm4 \n\t"
324  "punpckhbw %%mm7, %%mm5 \n\t"
325  "paddsw %%mm4, %%mm0 \n\t"
326  "paddsw %%mm5, %%mm1 \n\t"
327  "movq %%mm6, %%mm5 \n\t"
328  "punpcklbw %%mm7, %%mm6 \n\t"
329  "punpckhbw %%mm7, %%mm5 \n\t"
330  "paddsw %%mm6, %%mm2 \n\t"
331  "paddsw %%mm5, %%mm3 \n\t"
332  "packuswb %%mm1, %%mm0 \n\t"
333  "packuswb %%mm3, %%mm2 \n\t"
334  "movq %%mm0, %0 \n\t"
335  "movq %%mm2, %1 \n\t"
336  : "+m"(*pix), "+m"(*(pix + line_size))
337  : "r"(p)
338  : "memory");
339  pix += line_size * 2;
340  p += 16;
341  } while (--i);
342 }
343 
344 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
345  ptrdiff_t line_size, int h)
346 {
347  __asm__ volatile (
348  "lea (%3, %3), %%"REG_a" \n\t"
349  ".p2align 3 \n\t"
350  "1: \n\t"
351  "movq (%1 ), %%mm0 \n\t"
352  "movq (%1, %3), %%mm1 \n\t"
353  "movq %%mm0, (%2) \n\t"
354  "movq %%mm1, (%2, %3) \n\t"
355  "add %%"REG_a", %1 \n\t"
356  "add %%"REG_a", %2 \n\t"
357  "movq (%1 ), %%mm0 \n\t"
358  "movq (%1, %3), %%mm1 \n\t"
359  "movq %%mm0, (%2) \n\t"
360  "movq %%mm1, (%2, %3) \n\t"
361  "add %%"REG_a", %1 \n\t"
362  "add %%"REG_a", %2 \n\t"
363  "subl $4, %0 \n\t"
364  "jnz 1b \n\t"
365  : "+g"(h), "+r"(pixels), "+r"(block)
366  : "r"((x86_reg)line_size)
367  : "%"REG_a, "memory"
368  );
369 }
370 
371 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
372  ptrdiff_t line_size, int h)
373 {
374  __asm__ volatile (
375  "lea (%3, %3), %%"REG_a" \n\t"
376  ".p2align 3 \n\t"
377  "1: \n\t"
378  "movq (%1 ), %%mm0 \n\t"
379  "movq 8(%1 ), %%mm4 \n\t"
380  "movq (%1, %3), %%mm1 \n\t"
381  "movq 8(%1, %3), %%mm5 \n\t"
382  "movq %%mm0, (%2) \n\t"
383  "movq %%mm4, 8(%2) \n\t"
384  "movq %%mm1, (%2, %3) \n\t"
385  "movq %%mm5, 8(%2, %3) \n\t"
386  "add %%"REG_a", %1 \n\t"
387  "add %%"REG_a", %2 \n\t"
388  "movq (%1 ), %%mm0 \n\t"
389  "movq 8(%1 ), %%mm4 \n\t"
390  "movq (%1, %3), %%mm1 \n\t"
391  "movq 8(%1, %3), %%mm5 \n\t"
392  "movq %%mm0, (%2) \n\t"
393  "movq %%mm4, 8(%2) \n\t"
394  "movq %%mm1, (%2, %3) \n\t"
395  "movq %%mm5, 8(%2, %3) \n\t"
396  "add %%"REG_a", %1 \n\t"
397  "add %%"REG_a", %2 \n\t"
398  "subl $4, %0 \n\t"
399  "jnz 1b \n\t"
400  : "+g"(h), "+r"(pixels), "+r"(block)
401  : "r"((x86_reg)line_size)
402  : "%"REG_a, "memory"
403  );
404 }
405 
406 #define CLEAR_BLOCKS(name, n) \
407 static void name(int16_t *blocks) \
408 { \
409  __asm__ volatile ( \
410  "pxor %%mm7, %%mm7 \n\t" \
411  "mov %1, %%"REG_a" \n\t" \
412  "1: \n\t" \
413  "movq %%mm7, (%0, %%"REG_a") \n\t" \
414  "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
415  "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
416  "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
417  "add $32, %%"REG_a" \n\t" \
418  "js 1b \n\t" \
419  :: "r"(((uint8_t *)blocks) + 128 * n), \
420  "i"(-128 * n) \
421  : "%"REG_a \
422  ); \
423 }
424 CLEAR_BLOCKS(clear_blocks_mmx, 6)
425 CLEAR_BLOCKS(clear_block_mmx, 1)
426 
427 static void clear_block_sse(int16_t *block)
428 {
429  __asm__ volatile (
430  "xorps %%xmm0, %%xmm0 \n"
431  "movaps %%xmm0, (%0) \n"
432  "movaps %%xmm0, 16(%0) \n"
433  "movaps %%xmm0, 32(%0) \n"
434  "movaps %%xmm0, 48(%0) \n"
435  "movaps %%xmm0, 64(%0) \n"
436  "movaps %%xmm0, 80(%0) \n"
437  "movaps %%xmm0, 96(%0) \n"
438  "movaps %%xmm0, 112(%0) \n"
439  :: "r"(block)
440  : "memory"
441  );
442 }
443 
444 static void clear_blocks_sse(int16_t *blocks)
445 {
446  __asm__ volatile (
447  "xorps %%xmm0, %%xmm0 \n"
448  "mov %1, %%"REG_a" \n"
449  "1: \n"
450  "movaps %%xmm0, (%0, %%"REG_a") \n"
451  "movaps %%xmm0, 16(%0, %%"REG_a") \n"
452  "movaps %%xmm0, 32(%0, %%"REG_a") \n"
453  "movaps %%xmm0, 48(%0, %%"REG_a") \n"
454  "movaps %%xmm0, 64(%0, %%"REG_a") \n"
455  "movaps %%xmm0, 80(%0, %%"REG_a") \n"
456  "movaps %%xmm0, 96(%0, %%"REG_a") \n"
457  "movaps %%xmm0, 112(%0, %%"REG_a") \n"
458  "add $128, %%"REG_a" \n"
459  "js 1b \n"
460  :: "r"(((uint8_t *)blocks) + 128 * 6),
461  "i"(-128 * 6)
462  : "%"REG_a
463  );
464 }
465 
466 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
467 {
468  x86_reg i = 0;
469  __asm__ volatile (
470  "jmp 2f \n\t"
471  "1: \n\t"
472  "movq (%1, %0), %%mm0 \n\t"
473  "movq (%2, %0), %%mm1 \n\t"
474  "paddb %%mm0, %%mm1 \n\t"
475  "movq %%mm1, (%2, %0) \n\t"
476  "movq 8(%1, %0), %%mm0 \n\t"
477  "movq 8(%2, %0), %%mm1 \n\t"
478  "paddb %%mm0, %%mm1 \n\t"
479  "movq %%mm1, 8(%2, %0) \n\t"
480  "add $16, %0 \n\t"
481  "2: \n\t"
482  "cmp %3, %0 \n\t"
483  "js 1b \n\t"
484  : "+r"(i)
485  : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
486  );
487  for ( ; i < w; i++)
488  dst[i + 0] += src[i + 0];
489 }
490 
491 #if HAVE_7REGS
492 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
493  const uint8_t *diff, int w,
494  int *left, int *left_top)
495 {
496  x86_reg w2 = -w;
497  x86_reg x;
498  int l = *left & 0xff;
499  int tl = *left_top & 0xff;
500  int t;
501  __asm__ volatile (
502  "mov %7, %3 \n"
503  "1: \n"
504  "movzbl (%3, %4), %2 \n"
505  "mov %2, %k3 \n"
506  "sub %b1, %b3 \n"
507  "add %b0, %b3 \n"
508  "mov %2, %1 \n"
509  "cmp %0, %2 \n"
510  "cmovg %0, %2 \n"
511  "cmovg %1, %0 \n"
512  "cmp %k3, %0 \n"
513  "cmovg %k3, %0 \n"
514  "mov %7, %3 \n"
515  "cmp %2, %0 \n"
516  "cmovl %2, %0 \n"
517  "add (%6, %4), %b0 \n"
518  "mov %b0, (%5, %4) \n"
519  "inc %4 \n"
520  "jl 1b \n"
521  : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
522  : "r"(dst + w), "r"(diff + w), "rm"(top + w)
523  );
524  *left = l;
525  *left_top = tl;
526 }
527 #endif
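// Scalar sketch of the recurrence the cmov loop above implements (mid_pred()
// as in libavutil/common.h; parameter handling simplified): each output byte
// is the median of the left, top and (left + top - top_left) predictors plus
// the stored difference.
//
//     for (i = 0; i < w; i++) {
//         int t  = top[i];
//         l      = (mid_pred(l, t, (l + t - tl) & 0xff) + diff[i]) & 0xff;
//         tl     = t;
//         dst[i] = l;
//     }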
528 
 529 /* Draw the edges of width 'w' of an image of size width x height.
 530  * This MMX version can only handle w == 4, w == 8 or w == 16. */
531 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
532  int w, int h, int sides)
533 {
534  uint8_t *ptr, *last_line;
535  int i;
536 
537  last_line = buf + (height - 1) * wrap;
538  /* left and right */
539  ptr = buf;
540  if (w == 8) {
541  __asm__ volatile (
542  "1: \n\t"
543  "movd (%0), %%mm0 \n\t"
544  "punpcklbw %%mm0, %%mm0 \n\t"
545  "punpcklwd %%mm0, %%mm0 \n\t"
546  "punpckldq %%mm0, %%mm0 \n\t"
547  "movq %%mm0, -8(%0) \n\t"
548  "movq -8(%0, %2), %%mm1 \n\t"
549  "punpckhbw %%mm1, %%mm1 \n\t"
550  "punpckhwd %%mm1, %%mm1 \n\t"
551  "punpckhdq %%mm1, %%mm1 \n\t"
552  "movq %%mm1, (%0, %2) \n\t"
553  "add %1, %0 \n\t"
554  "cmp %3, %0 \n\t"
555  "jb 1b \n\t"
556  : "+r"(ptr)
557  : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
558  );
 559  } else if (w == 16) {
560  __asm__ volatile (
561  "1: \n\t"
562  "movd (%0), %%mm0 \n\t"
563  "punpcklbw %%mm0, %%mm0 \n\t"
564  "punpcklwd %%mm0, %%mm0 \n\t"
565  "punpckldq %%mm0, %%mm0 \n\t"
566  "movq %%mm0, -8(%0) \n\t"
567  "movq %%mm0, -16(%0) \n\t"
568  "movq -8(%0, %2), %%mm1 \n\t"
569  "punpckhbw %%mm1, %%mm1 \n\t"
570  "punpckhwd %%mm1, %%mm1 \n\t"
571  "punpckhdq %%mm1, %%mm1 \n\t"
572  "movq %%mm1, (%0, %2) \n\t"
573  "movq %%mm1, 8(%0, %2) \n\t"
574  "add %1, %0 \n\t"
575  "cmp %3, %0 \n\t"
576  "jb 1b \n\t"
577  : "+r"(ptr)
578  : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
579  );
580  } else {
581  av_assert1(w == 4);
582  __asm__ volatile (
583  "1: \n\t"
584  "movd (%0), %%mm0 \n\t"
585  "punpcklbw %%mm0, %%mm0 \n\t"
586  "punpcklwd %%mm0, %%mm0 \n\t"
587  "movd %%mm0, -4(%0) \n\t"
588  "movd -4(%0, %2), %%mm1 \n\t"
589  "punpcklbw %%mm1, %%mm1 \n\t"
590  "punpckhwd %%mm1, %%mm1 \n\t"
591  "punpckhdq %%mm1, %%mm1 \n\t"
592  "movd %%mm1, (%0, %2) \n\t"
593  "add %1, %0 \n\t"
594  "cmp %3, %0 \n\t"
595  "jb 1b \n\t"
596  : "+r"(ptr)
597  : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
598  );
599  }
600 
601  /* top and bottom (and hopefully also the corners) */
602  if (sides & EDGE_TOP) {
603  for (i = 0; i < h; i += 4) {
604  ptr = buf - (i + 1) * wrap - w;
605  __asm__ volatile (
606  "1: \n\t"
607  "movq (%1, %0), %%mm0 \n\t"
608  "movq %%mm0, (%0) \n\t"
609  "movq %%mm0, (%0, %2) \n\t"
610  "movq %%mm0, (%0, %2, 2) \n\t"
611  "movq %%mm0, (%0, %3) \n\t"
612  "add $8, %0 \n\t"
613  "cmp %4, %0 \n\t"
614  "jb 1b \n\t"
615  : "+r"(ptr)
616  : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
617  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
618  );
619  }
620  }
621 
622  if (sides & EDGE_BOTTOM) {
623  for (i = 0; i < h; i += 4) {
624  ptr = last_line + (i + 1) * wrap - w;
625  __asm__ volatile (
626  "1: \n\t"
627  "movq (%1, %0), %%mm0 \n\t"
628  "movq %%mm0, (%0) \n\t"
629  "movq %%mm0, (%0, %2) \n\t"
630  "movq %%mm0, (%0, %2, 2) \n\t"
631  "movq %%mm0, (%0, %3) \n\t"
632  "add $8, %0 \n\t"
633  "cmp %4, %0 \n\t"
634  "jb 1b \n\t"
635  : "+r"(ptr)
636  : "r"((x86_reg)last_line - (x86_reg)ptr - w),
637  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
638  "r"(ptr + width + 2 * w)
639  );
640  }
641  }
642 }
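// The code above replicates the first and last pixel of every row into the
// left/right margins (w bytes wide); the EDGE_TOP / EDGE_BOTTOM passes then
// copy the first and last image rows, side margins included, into the top and
// bottom margins, four rows per asm iteration.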
643 #endif /* HAVE_INLINE_ASM */
644 
645 
646 #if HAVE_YASM
647 #define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \
648 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
649  ptrdiff_t stride) \
650 { \
651  ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
652 } \
653  \
654 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
655  ptrdiff_t stride) \
656 { \
657  uint64_t temp[8]; \
658  uint8_t * const half = (uint8_t*)temp; \
659  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
660  stride, 8); \
661  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
662  stride, stride, 8); \
663 } \
664  \
665 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
666  ptrdiff_t stride) \
667 { \
668  ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
669  stride, 8); \
670 } \
671  \
672 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
673  ptrdiff_t stride) \
674 { \
675  uint64_t temp[8]; \
676  uint8_t * const half = (uint8_t*)temp; \
677  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
678  stride, 8); \
679  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
680  stride, 8); \
681 } \
682  \
683 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
684  ptrdiff_t stride) \
685 { \
686  uint64_t temp[8]; \
687  uint8_t * const half = (uint8_t*)temp; \
688  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
689  8, stride); \
690  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
691  stride, stride, 8); \
692 } \
693  \
694 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
695  ptrdiff_t stride) \
696 { \
697  ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
698  stride, stride); \
699 } \
700  \
701 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
702  ptrdiff_t stride) \
703 { \
704  uint64_t temp[8]; \
705  uint8_t * const half = (uint8_t*)temp; \
706  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
707  8, stride); \
708  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
709  stride, 8); \
710 } \
711  \
712 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
713  ptrdiff_t stride) \
714 { \
715  uint64_t half[8 + 9]; \
716  uint8_t * const halfH = ((uint8_t*)half) + 64; \
717  uint8_t * const halfHV = ((uint8_t*)half); \
718  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
719  stride, 9); \
720  ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
721  stride, 9); \
722  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
723  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
724  stride, 8, 8); \
725 } \
726  \
727 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
728  ptrdiff_t stride) \
729 { \
730  uint64_t half[8 + 9]; \
731  uint8_t * const halfH = ((uint8_t*)half) + 64; \
732  uint8_t * const halfHV = ((uint8_t*)half); \
733  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
734  stride, 9); \
735  ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
736  stride, 9); \
737  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
738  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
739  stride, 8, 8); \
740 } \
741  \
742 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
743  ptrdiff_t stride) \
744 { \
745  uint64_t half[8 + 9]; \
746  uint8_t * const halfH = ((uint8_t*)half) + 64; \
747  uint8_t * const halfHV = ((uint8_t*)half); \
748  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
749  stride, 9); \
750  ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
751  stride, 9); \
752  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
753  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
754  stride, 8, 8); \
755 } \
756  \
757 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
758  ptrdiff_t stride) \
759 { \
760  uint64_t half[8 + 9]; \
761  uint8_t * const halfH = ((uint8_t*)half) + 64; \
762  uint8_t * const halfHV = ((uint8_t*)half); \
763  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
764  stride, 9); \
765  ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
766  stride, 9); \
767  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
768  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
769  stride, 8, 8); \
770 } \
771  \
772 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
773  ptrdiff_t stride) \
774 { \
775  uint64_t half[8 + 9]; \
776  uint8_t * const halfH = ((uint8_t*)half) + 64; \
777  uint8_t * const halfHV = ((uint8_t*)half); \
778  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
779  stride, 9); \
780  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
781  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
782  stride, 8, 8); \
783 } \
784  \
785 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
786  ptrdiff_t stride) \
787 { \
788  uint64_t half[8 + 9]; \
789  uint8_t * const halfH = ((uint8_t*)half) + 64; \
790  uint8_t * const halfHV = ((uint8_t*)half); \
791  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
792  stride, 9); \
793  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
794  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
795  stride, 8, 8); \
796 } \
797  \
798 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
799  ptrdiff_t stride) \
800 { \
801  uint64_t half[8 + 9]; \
802  uint8_t * const halfH = ((uint8_t*)half); \
803  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
804  stride, 9); \
805  ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
806  8, stride, 9); \
807  ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
808  stride, 8); \
809 } \
810  \
811 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
812  ptrdiff_t stride) \
813 { \
814  uint64_t half[8 + 9]; \
815  uint8_t * const halfH = ((uint8_t*)half); \
816  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
817  stride, 9); \
818  ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
819  stride, 9); \
820  ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
821  stride, 8); \
822 } \
823  \
824 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
825  ptrdiff_t stride) \
826 { \
827  uint64_t half[9]; \
828  uint8_t * const halfH = ((uint8_t*)half); \
829  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
830  stride, 9); \
831  ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
832  stride, 8); \
833 } \
834  \
835 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
836  ptrdiff_t stride) \
837 { \
838  ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
839 } \
840  \
841 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
842  ptrdiff_t stride) \
843 { \
844  uint64_t temp[32]; \
845  uint8_t * const half = (uint8_t*)temp; \
846  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
847  stride, 16); \
848  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
849  stride, 16); \
850 } \
851  \
852 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
853  ptrdiff_t stride) \
854 { \
855  ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
856  stride, stride, 16);\
857 } \
858  \
859 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
860  ptrdiff_t stride) \
861 { \
862  uint64_t temp[32]; \
863  uint8_t * const half = (uint8_t*)temp; \
864  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
865  stride, 16); \
866  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
867  stride, stride, 16); \
868 } \
869  \
870 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
871  ptrdiff_t stride) \
872 { \
873  uint64_t temp[32]; \
874  uint8_t * const half = (uint8_t*)temp; \
875  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
876  stride); \
877  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
878  stride, 16); \
879 } \
880  \
881 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
882  ptrdiff_t stride) \
883 { \
884  ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
885  stride, stride); \
886 } \
887  \
888 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
889  ptrdiff_t stride) \
890 { \
891  uint64_t temp[32]; \
892  uint8_t * const half = (uint8_t*)temp; \
893  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
894  stride); \
895  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
896  stride, stride, 16); \
897 } \
898  \
899 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
900  ptrdiff_t stride) \
901 { \
902  uint64_t half[16 * 2 + 17 * 2]; \
903  uint8_t * const halfH = ((uint8_t*)half) + 256; \
904  uint8_t * const halfHV = ((uint8_t*)half); \
905  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
906  stride, 17); \
907  ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
908  stride, 17); \
909  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
910  16, 16); \
911  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
912  stride, 16, 16); \
913 } \
914  \
915 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
916  ptrdiff_t stride) \
917 { \
918  uint64_t half[16 * 2 + 17 * 2]; \
919  uint8_t * const halfH = ((uint8_t*)half) + 256; \
920  uint8_t * const halfHV = ((uint8_t*)half); \
921  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
922  stride, 17); \
923  ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
924  stride, 17); \
925  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
926  16, 16); \
927  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
928  stride, 16, 16); \
929 } \
930  \
931 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
932  ptrdiff_t stride) \
933 { \
934  uint64_t half[16 * 2 + 17 * 2]; \
935  uint8_t * const halfH = ((uint8_t*)half) + 256; \
936  uint8_t * const halfHV = ((uint8_t*)half); \
937  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
938  stride, 17); \
939  ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
940  stride, 17); \
941  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
942  16, 16); \
943  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
944  stride, 16, 16); \
945 } \
946  \
947 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
948  ptrdiff_t stride) \
949 { \
950  uint64_t half[16 * 2 + 17 * 2]; \
951  uint8_t * const halfH = ((uint8_t*)half) + 256; \
952  uint8_t * const halfHV = ((uint8_t*)half); \
953  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
954  stride, 17); \
955  ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
956  stride, 17); \
957  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
958  16, 16); \
959  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
960  stride, 16, 16); \
961 } \
962  \
963 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
964  ptrdiff_t stride) \
965 { \
966  uint64_t half[16 * 2 + 17 * 2]; \
967  uint8_t * const halfH = ((uint8_t*)half) + 256; \
968  uint8_t * const halfHV = ((uint8_t*)half); \
969  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
970  stride, 17); \
971  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
972  16, 16); \
973  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
974  stride, 16, 16); \
975 } \
976  \
977 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
978  ptrdiff_t stride) \
979 { \
980  uint64_t half[16 * 2 + 17 * 2]; \
981  uint8_t * const halfH = ((uint8_t*)half) + 256; \
982  uint8_t * const halfHV = ((uint8_t*)half); \
983  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
984  stride, 17); \
985  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
986  16, 16); \
987  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
988  stride, 16, 16); \
989 } \
990  \
991 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
992  ptrdiff_t stride) \
993 { \
994  uint64_t half[17 * 2]; \
995  uint8_t * const halfH = ((uint8_t*)half); \
996  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
997  stride, 17); \
998  ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
999  stride, 17); \
1000  ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1001  stride, 16); \
1002 } \
1003  \
1004 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1005  ptrdiff_t stride) \
1006 { \
1007  uint64_t half[17 * 2]; \
1008  uint8_t * const halfH = ((uint8_t*)half); \
1009  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1010  stride, 17); \
1011  ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1012  stride, 17); \
1013  ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1014  stride, 16); \
1015 } \
1016  \
1017 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1018  ptrdiff_t stride) \
1019 { \
1020  uint64_t half[17 * 2]; \
1021  uint8_t * const halfH = ((uint8_t*)half); \
1022  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1023  stride, 17); \
1024  ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1025  stride, 16); \
1026 }
1027 
1028 QPEL_OP(put_, ff_pw_16, _, mmxext)
1029 QPEL_OP(avg_, ff_pw_16, _, mmxext)
1030 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext)
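// In the mcXY names generated above, X is the horizontal and Y the vertical
// offset in quarter pixels: mc00 is the plain copy, mc20/mc02 are the pure
// horizontal/vertical half-pel lowpass cases, and mc22 needs both passes.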
1031 #endif /* HAVE_YASM */
1032 
1033 
1034 #if HAVE_INLINE_ASM
1035 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1036 {
1037  put_pixels8_xy2_mmx(dst, src, stride, 8);
1038 }
1039 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1040 {
1041  put_pixels16_xy2_mmx(dst, src, stride, 16);
1042 }
1043 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1044 {
1045  avg_pixels8_xy2_mmx(dst, src, stride, 8);
1046 }
1047 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1048 {
1049  avg_pixels16_xy2_mmx(dst, src, stride, 16);
1050 }
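// RV40 defines its (3/4, 3/4) sub-pel position as a plain average of the 2x2
// pixel neighbourhood, which is exactly what the *_pixels_xy2_mmx helpers
// compute, so the mc33 cases can reuse them directly.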
1051 
1052 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
1053  ptrdiff_t linesize, int block_w, int block_h,
1054  int src_x, int src_y, int w, int h);
1055 
1056 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1057  int stride, int h, int ox, int oy,
1058  int dxx, int dxy, int dyx, int dyy,
1059  int shift, int r, int width, int height,
1060  emulated_edge_mc_func *emu_edge_fn)
1061 {
1062  const int w = 8;
1063  const int ix = ox >> (16 + shift);
1064  const int iy = oy >> (16 + shift);
1065  const int oxs = ox >> 4;
1066  const int oys = oy >> 4;
1067  const int dxxs = dxx >> 4;
1068  const int dxys = dxy >> 4;
1069  const int dyxs = dyx >> 4;
1070  const int dyys = dyy >> 4;
1071  const uint16_t r4[4] = { r, r, r, r };
1072  const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1073  const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1074  const uint64_t shift2 = 2 * shift;
1075 #define MAX_STRIDE 4096U
1076 #define MAX_H 8U
1077  uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
1078  int x, y;
1079 
1080  const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1081  const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1082  const int dxh = dxy * (h - 1);
1083  const int dyw = dyx * (w - 1);
1084  int need_emu = (unsigned)ix >= width - w ||
1085  (unsigned)iy >= height - h;
1086 
1087  if ( // non-constant fullpel offset (3% of blocks)
1088  ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1089  (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1090  // uses more than 16 bits of subpel mv (only at huge resolution)
1091  || (dxx | dxy | dyx | dyy) & 15
1092  || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
1093  // FIXME could still use mmx for some of the rows
1094  ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1095  shift, r, width, height);
1096  return;
1097  }
1098 
1099  src += ix + iy * stride;
1100  if (need_emu) {
1101  emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
1102  src = edge_buf;
1103  }
1104 
1105  __asm__ volatile (
1106  "movd %0, %%mm6 \n\t"
1107  "pxor %%mm7, %%mm7 \n\t"
1108  "punpcklwd %%mm6, %%mm6 \n\t"
1109  "punpcklwd %%mm6, %%mm6 \n\t"
1110  :: "r"(1<<shift)
1111  );
1112 
1113  for (x = 0; x < w; x += 4) {
1114  uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1115  oxs - dxys + dxxs * (x + 1),
1116  oxs - dxys + dxxs * (x + 2),
1117  oxs - dxys + dxxs * (x + 3) };
1118  uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1119  oys - dyys + dyxs * (x + 1),
1120  oys - dyys + dyxs * (x + 2),
1121  oys - dyys + dyxs * (x + 3) };
1122 
1123  for (y = 0; y < h; y++) {
1124  __asm__ volatile (
1125  "movq %0, %%mm4 \n\t"
1126  "movq %1, %%mm5 \n\t"
1127  "paddw %2, %%mm4 \n\t"
1128  "paddw %3, %%mm5 \n\t"
1129  "movq %%mm4, %0 \n\t"
1130  "movq %%mm5, %1 \n\t"
1131  "psrlw $12, %%mm4 \n\t"
1132  "psrlw $12, %%mm5 \n\t"
1133  : "+m"(*dx4), "+m"(*dy4)
1134  : "m"(*dxy4), "m"(*dyy4)
1135  );
1136 
1137  __asm__ volatile (
1138  "movq %%mm6, %%mm2 \n\t"
1139  "movq %%mm6, %%mm1 \n\t"
1140  "psubw %%mm4, %%mm2 \n\t"
1141  "psubw %%mm5, %%mm1 \n\t"
1142  "movq %%mm2, %%mm0 \n\t"
1143  "movq %%mm4, %%mm3 \n\t"
1144  "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1145  "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1146  "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1147  "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
1148 
1149  "movd %4, %%mm5 \n\t"
1150  "movd %3, %%mm4 \n\t"
1151  "punpcklbw %%mm7, %%mm5 \n\t"
1152  "punpcklbw %%mm7, %%mm4 \n\t"
1153  "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1154  "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1155 
1156  "movd %2, %%mm5 \n\t"
1157  "movd %1, %%mm4 \n\t"
1158  "punpcklbw %%mm7, %%mm5 \n\t"
1159  "punpcklbw %%mm7, %%mm4 \n\t"
1160  "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1161  "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1162  "paddw %5, %%mm1 \n\t"
1163  "paddw %%mm3, %%mm2 \n\t"
1164  "paddw %%mm1, %%mm0 \n\t"
1165  "paddw %%mm2, %%mm0 \n\t"
1166 
1167  "psrlw %6, %%mm0 \n\t"
1168  "packuswb %%mm0, %%mm0 \n\t"
1169  "movd %%mm0, %0 \n\t"
1170 
1171  : "=m"(dst[x + y * stride])
1172  : "m"(src[0]), "m"(src[1]),
1173  "m"(src[stride]), "m"(src[stride + 1]),
1174  "m"(*r4), "m"(shift2)
1175  );
1176  src += stride;
1177  }
1178  src += 4 - h * stride;
1179  }
1180 }
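// Per output pixel the loop above computes, with s = 1 << shift and (dx, dy)
// the per-pixel fractional offsets kept in mm4/mm5:
//
//     dst = ( src[0]          * (s - dx) * (s - dy)
//           + src[1]          * dx       * (s - dy)
//           + src[stride]     * (s - dx) * dy
//           + src[stride + 1] * dx       * dy
//           + r ) >> (2 * shift)
//
// i.e. a bilinear blend of the four neighbouring source pixels with rounding
// constant r, after the coordinates have been advanced by the affine
// transform (dxx, dxy, dyx, dyy).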
1181 
1182 
1183 #if CONFIG_VIDEODSP
1184 #if HAVE_YASM
1185 #if ARCH_X86_32
1186 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1187  int stride, int h, int ox, int oy,
1188  int dxx, int dxy, int dyx, int dyy,
1189  int shift, int r, int width, int height)
1190 {
1191  gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1192  width, height, &ff_emulated_edge_mc_8);
1193 }
1194 #endif
1195 static void gmc_sse(uint8_t *dst, uint8_t *src,
1196  int stride, int h, int ox, int oy,
1197  int dxx, int dxy, int dyx, int dyy,
1198  int shift, int r, int width, int height)
1199 {
1200  gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1201  width, height, &ff_emulated_edge_mc_8);
1202 }
1203 #else
1204 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1205  int stride, int h, int ox, int oy,
1206  int dxx, int dxy, int dyx, int dyy,
1207  int shift, int r, int width, int height)
1208 {
1209  gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1210  width, height, &ff_emulated_edge_mc_8);
1211 }
1212 #endif
1213 #endif
1214 
1215 /* CAVS-specific */
1216 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1217 {
1218  put_pixels8_mmx(dst, src, stride, 8);
1219 }
1220 
1221 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1222 {
1223  avg_pixels8_mmx(dst, src, stride, 8);
1224 }
1225 
1226 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1227 {
1228  put_pixels16_mmx(dst, src, stride, 16);
1229 }
1230 
1231 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1232 {
1233  avg_pixels16_mmx(dst, src, stride, 16);
1234 }
1235 
1236 /* VC-1-specific */
1237 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1238  ptrdiff_t stride, int rnd)
1239 {
1240  put_pixels8_mmx(dst, src, stride, 8);
1241 }
1242 
1243 #if CONFIG_DIRAC_DECODER
1244 #define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
1245 void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1246 {\
1247  if (h&3)\
1248  ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
1249  else\
1250  OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
1251 }\
1252 void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1253 {\
1254  if (h&3)\
1255  ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
1256  else\
1257  OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
1258 }\
1259 void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1260 {\
1261  if (h&3) {\
1262  ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
1263  } else {\
1264  OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
1265  OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
1266  }\
1267 }
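// The MMX/SSE2 copy loops used here process four rows per iteration, so any
// height that is not a multiple of 4 falls back to the C implementations.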
1268 
1269 #if HAVE_MMX_INLINE
1270 DIRAC_PIXOP(put, put, mmx)
1271 DIRAC_PIXOP(avg, avg, mmx)
1272 #endif
1273 
1274 #if HAVE_YASM
1275 DIRAC_PIXOP(avg, ff_avg, mmxext)
1276 
1277 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1278 {
1279  if (h&3)
1280  ff_put_dirac_pixels16_c(dst, src, stride, h);
1281  else
1282  ff_put_pixels16_sse2(dst, src[0], stride, h);
1283 }
1284 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1285 {
1286  if (h&3)
1287  ff_avg_dirac_pixels16_c(dst, src, stride, h);
1288  else
1289  ff_avg_pixels16_sse2(dst, src[0], stride, h);
1290 }
1291 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1292 {
1293  if (h&3) {
1294  ff_put_dirac_pixels32_c(dst, src, stride, h);
1295  } else {
1296  ff_put_pixels16_sse2(dst , src[0] , stride, h);
1297  ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
1298  }
1299 }
1300 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1301 {
1302  if (h&3) {
1303  ff_avg_dirac_pixels32_c(dst, src, stride, h);
1304  } else {
1305  ff_avg_pixels16_sse2(dst , src[0] , stride, h);
1306  ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
1307  }
1308 }
1309 #endif
1310 #endif
1311 
1312 static void vector_clipf_sse(float *dst, const float *src,
1313  float min, float max, int len)
1314 {
1315  x86_reg i = (len - 16) * 4;
1316  __asm__ volatile (
1317  "movss %3, %%xmm4 \n\t"
1318  "movss %4, %%xmm5 \n\t"
1319  "shufps $0, %%xmm4, %%xmm4 \n\t"
1320  "shufps $0, %%xmm5, %%xmm5 \n\t"
1321  "1: \n\t"
1322  "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1323  "movaps 16(%2, %0), %%xmm1 \n\t"
1324  "movaps 32(%2, %0), %%xmm2 \n\t"
1325  "movaps 48(%2, %0), %%xmm3 \n\t"
1326  "maxps %%xmm4, %%xmm0 \n\t"
1327  "maxps %%xmm4, %%xmm1 \n\t"
1328  "maxps %%xmm4, %%xmm2 \n\t"
1329  "maxps %%xmm4, %%xmm3 \n\t"
1330  "minps %%xmm5, %%xmm0 \n\t"
1331  "minps %%xmm5, %%xmm1 \n\t"
1332  "minps %%xmm5, %%xmm2 \n\t"
1333  "minps %%xmm5, %%xmm3 \n\t"
1334  "movaps %%xmm0, (%1, %0) \n\t"
1335  "movaps %%xmm1, 16(%1, %0) \n\t"
1336  "movaps %%xmm2, 32(%1, %0) \n\t"
1337  "movaps %%xmm3, 48(%1, %0) \n\t"
1338  "sub $64, %0 \n\t"
1339  "jge 1b \n\t"
1340  : "+&r"(i)
1341  : "r"(dst), "r"(src), "m"(min), "m"(max)
1342  : "memory"
1343  );
1344 }
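// Scalar equivalent of the SSE loop above (sketch): dst[i] = FFMIN(FFMAX(src[i],
// min), max) for every element. The vector version handles 16 floats per
// iteration and relies on len being a multiple of 16 and on 16-byte aligned
// buffers (movaps).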
1345 
1346 #endif /* HAVE_INLINE_ASM */
1347 
1348 void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
1349 void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
1350 
1351 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1352  int order);
1353 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1354  int order);
1355 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1356  const int16_t *v3,
1357  int order, int mul);
1358 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1359  const int16_t *v3,
1360  int order, int mul);
1361 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1362  const int16_t *v3,
1363  int order, int mul);
1364 
1365 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1366  const int16_t *window, unsigned int len);
1367 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1368  const int16_t *window, unsigned int len);
1369 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1370  const int16_t *window, unsigned int len);
1371 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1372  const int16_t *window, unsigned int len);
1373 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1374  const int16_t *window, unsigned int len);
1375 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1376  const int16_t *window, unsigned int len);
1377 
1378 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1379 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
1380 
1381 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1382  const uint8_t *diff, int w,
1383  int *left, int *left_top);
1384 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1385  int w, int left);
1386 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
1387  int w, int left);
1388 
1389 void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src,
1390  int32_t min, int32_t max, unsigned int len);
1391 void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src,
1392  int32_t min, int32_t max, unsigned int len);
1393 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1394  int32_t min, int32_t max, unsigned int len);
1395 void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
1396  int32_t min, int32_t max, unsigned int len);
1397 
1398 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
1399  do { \
1400  c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
1401  c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
1402  c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
1403  c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
1404  c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
1405  c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
1406  c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
1407  c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
1408  c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
1409  c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
1410  c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
1411  c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
1412  c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
1413  c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
1414  c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
1415  c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
1416  } while (0)
1417 
1418 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
1419  int mm_flags)
1420 {
1421  const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1422 
1423 #if HAVE_INLINE_ASM
1424  c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
1425  c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
1426  c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
1427 
1428  if (!high_bit_depth) {
1429  c->clear_block = clear_block_mmx;
1430  c->clear_blocks = clear_blocks_mmx;
1431  c->draw_edges = draw_edges_mmx;
1432  }
1433 
1434 #if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
1435  c->gmc = gmc_mmx;
1436 #endif
1437 
1438  c->add_bytes = add_bytes_mmx;
1439 #endif /* HAVE_INLINE_ASM */
1440 
1441 #if HAVE_YASM
1442  if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1443  c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
1444  c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
1445  }
1446 
1447  c->vector_clip_int32 = ff_vector_clip_int32_mmx;
1448 #endif
1449 
1450 }
1451 
1452 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
1453  int mm_flags)
1454 {
1455 
1456 #if HAVE_YASM
1457  SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
1458  SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
1459 
1460  SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
1461  SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
1462  SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
1463  SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
1464 #endif /* HAVE_YASM */
1465 
1466 #if HAVE_MMXEXT_EXTERNAL
1467  /* slower than cmov version on AMD */
1468  if (!(mm_flags & AV_CPU_FLAG_3DNOW))
1469  c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
1470 
1471  c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
1472  c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
1473 
1474  if (avctx->flags & CODEC_FLAG_BITEXACT) {
1475  c->apply_window_int16 = ff_apply_window_int16_mmxext;
1476  } else {
1477  c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
1478  }
1479 #endif /* HAVE_MMXEXT_EXTERNAL */
1480 }
1481 
1482 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
1483  int mm_flags)
1484 {
1485 #if HAVE_INLINE_ASM
1486  const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1487 
1488  if (!high_bit_depth) {
1489  if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
1490  /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
1491  c->clear_block = clear_block_sse;
1492  c->clear_blocks = clear_blocks_sse;
1493  }
1494  }
1495 
1496  c->vector_clipf = vector_clipf_sse;
1497 #endif /* HAVE_INLINE_ASM */
1498 
1499 #if HAVE_YASM
1500 #if HAVE_INLINE_ASM && CONFIG_VIDEODSP
1501  c->gmc = gmc_sse;
1502 #endif
1503 #endif /* HAVE_YASM */
1504 }
1505 
1506 static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
1507  int mm_flags)
1508 {
1509  const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1510 
1511 #if HAVE_SSE2_INLINE
1512  if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1513  c->idct_put = ff_idct_xvid_sse2_put;
1514  c->idct_add = ff_idct_xvid_sse2_add;
1515  c->idct = ff_idct_xvid_sse2;
1516  c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1517  }
1518 #endif /* HAVE_SSE2_INLINE */
1519 
1520 #if HAVE_SSE2_EXTERNAL
1521  c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
1522  c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
1523  if (mm_flags & AV_CPU_FLAG_ATOM) {
1524  c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
1525  } else {
1526  c->vector_clip_int32 = ff_vector_clip_int32_sse2;
1527  }
1528  if (avctx->flags & CODEC_FLAG_BITEXACT) {
1529  c->apply_window_int16 = ff_apply_window_int16_sse2;
1530  } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1531  c->apply_window_int16 = ff_apply_window_int16_round_sse2;
1532  }
1533  c->bswap_buf = ff_bswap32_buf_sse2;
1534 #endif /* HAVE_SSE2_EXTERNAL */
1535 }
1536 
1537 static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
1538  int mm_flags)
1539 {
1540 #if HAVE_SSSE3_EXTERNAL
1541  c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
1542  if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
1543  c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
1544 
1545  if (mm_flags & AV_CPU_FLAG_ATOM)
1546  c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
1547  else
1548  c->apply_window_int16 = ff_apply_window_int16_ssse3;
1549  if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
1550  c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
1551  c->bswap_buf = ff_bswap32_buf_ssse3;
1552 #endif /* HAVE_SSSE3_EXTERNAL */
1553 }
1554 
1555 static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
1556  int mm_flags)
1557 {
1558 #if HAVE_SSE4_EXTERNAL
1559  c->vector_clip_int32 = ff_vector_clip_int32_sse4;
1560 #endif /* HAVE_SSE4_EXTERNAL */
1561 }
1562 
1563 av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
1564 {
1565  int mm_flags = av_get_cpu_flags();
1566 
1567 #if HAVE_7REGS && HAVE_INLINE_ASM
1568  if (mm_flags & AV_CPU_FLAG_CMOV)
1569  c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
1570 #endif
1571 
1572  if (mm_flags & AV_CPU_FLAG_MMX) {
1573 #if HAVE_INLINE_ASM
1574  const int idct_algo = avctx->idct_algo;
1575 
1576  if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
1577  if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
1578  c->idct_put = ff_simple_idct_put_mmx;
1579  c->idct_add = ff_simple_idct_add_mmx;
1580  c->idct = ff_simple_idct_mmx;
1581  c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
1582  } else if (idct_algo == FF_IDCT_XVIDMMX) {
1583  if (mm_flags & AV_CPU_FLAG_SSE2) {
1584  c->idct_put = ff_idct_xvid_sse2_put;
1585  c->idct_add = ff_idct_xvid_sse2_add;
1586  c->idct = ff_idct_xvid_sse2;
1587  c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1588  } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
1589  c->idct_put = ff_idct_xvid_mmxext_put;
1590  c->idct_add = ff_idct_xvid_mmxext_add;
1591  c->idct = ff_idct_xvid_mmxext;
1592  } else {
1593  c->idct_put = ff_idct_xvid_mmx_put;
1594  c->idct_add = ff_idct_xvid_mmx_add;
1595  c->idct = ff_idct_xvid_mmx;
1596  }
1597  }
1598  }
1599 #endif /* HAVE_INLINE_ASM */
1600 
1601  dsputil_init_mmx(c, avctx, mm_flags);
1602  }
1603 
1604  if (mm_flags & AV_CPU_FLAG_MMXEXT)
1605  dsputil_init_mmxext(c, avctx, mm_flags);
1606 
1607  if (mm_flags & AV_CPU_FLAG_SSE)
1608  dsputil_init_sse(c, avctx, mm_flags);
1609 
1610  if (mm_flags & AV_CPU_FLAG_SSE2)
1611  dsputil_init_sse2(c, avctx, mm_flags);
1612 
1613  if (mm_flags & AV_CPU_FLAG_SSSE3)
1614  dsputil_init_ssse3(c, avctx, mm_flags);
1615 
1616  if (mm_flags & AV_CPU_FLAG_SSE4)
1617  dsputil_init_sse4(c, avctx, mm_flags);
1618 
1619  if (CONFIG_ENCODERS)
1620  ff_dsputilenc_init_mmx(c, avctx);
1621 }
const uint64_t ff_pw_42
void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
#define MANGLE(a)
static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, int mm_flags)
Definition: dsputil_mmx.c:1537
#define CONFIG_MPEG_XVMC_DECODER
Definition: config.h:527
void ff_put_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride)
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
static int shift(int a, int b)
Definition: sonic.c:86
void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
#define CONFIG_ENCODERS
Definition: config.h:286
void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src, int32_t min, int32_t max, unsigned int len)
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input, const int16_t *window, unsigned int len)
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min, int32_t max, unsigned int len)
#define EDGE_TOP
Definition: dsputil.h:265
if max(w)>1 w=0.9 *w/max(w)
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
void ff_idct_xvid_sse2(short *block)
#define AV_CPU_FLAG_SSE
SSE functions.
Definition: cpu.h:33
void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride)
static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx, int mm_flags)
Definition: dsputil_mmx.c:1555
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w)
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
#define AV_CPU_FLAG_CMOV
supports cmov instruction
Definition: cpu.h:47
#define wrap(func)
Definition: w64xmmtest.h:70
mpegvideo header.
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
int bits_per_raw_sample
Bits per sample/pixel of internal libavcodec pixel/sample format.
void(* clear_block)(int16_t *block)
Definition: dsputil.h:145
int stride
Definition: mace.c:144
const uint64_t ff_pw_255
H.264 DSP functions.
output residual component w
Macro definitions for various function/variable attributes.
#define _(x)
static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
Definition: dsputil_mmx.c:1482
const uint64_t ff_pw_128
uint8_t
#define av_cold
Definition: attributes.h:78
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)
Definition: dsputil_mmx.c:1398
void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src, int32_t min, int32_t max, unsigned int len)
#define put(d, s)
Definition: dsputil_align.c:51
#define AV_CPU_FLAG_MMXEXT
SSE integer functions or AMD MMX ext.
Definition: cpu.h:30
void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
#define AV_CPU_FLAG_ATOM
Atom processor, some SSSE3 instructions are slower.
Definition: cpu.h:40
const uint64_t ff_pb_3F
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2, int order)
#define CODEC_FLAG_BITEXACT
Use only bitexact stuff (except (I)DCT).
int lowres
low resolution decoding, 1-> 1/2 size, 2->1/4 size
#define AV_CPU_FLAG_SSE2SLOW
SSE2 supported, but usually not faster.
Definition: cpu.h:35
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input, const int16_t *window, unsigned int len)
void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block)
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
Discrete Time axis x
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
#define AV_CPU_FLAG_SSE42
Nehalem SSE4.2 functions.
Definition: cpu.h:42
#define FF_IDCT_XVIDMMX
void(* vector_clipf)(float *dst, const float *src, float min, float max, int len)
Definition: dsputil.h:215
#define FF_SSE2_IDCT_PERM
Definition: dsputil.h:256
#define AV_CPU_FLAG_SSSE3
Conroe SSSE3 functions.
Definition: cpu.h:39
void(* add_hfyu_median_prediction)(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
Definition: dsputil.h:204
static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
Definition: dsputil_mmx.c:1418
void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
const char * r
Definition: vf_curves.c:94
void(* h263_h_loop_filter)(uint8_t *src, int stride, int qscale)
Definition: dsputil.h:212
static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, int mm_flags)
Definition: dsputil_mmx.c:1452
int flags
CODEC_FLAG_*.
these buffered frames must be flushed immediately if a new input produces new the filter must not call request_frame to get more It must just process the frame or queue it The task of requesting more frames is left to the filter s request_frame method or the application If a filter has several the filter must be ready for frames arriving randomly on any input any filter with several inputs will most likely require some kind of queuing mechanism It is perfectly acceptable to have a limited queue and to drop frames when the inputs are too unbalanced request_frame This method is called when a frame is wanted on an output For an input
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input, const int16_t *window, unsigned int len)
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
Definition: dsputil.c:546
void(* clear_blocks)(int16_t *blocks)
Definition: dsputil.h:146
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order)
void ff_simple_idct_mmx(int16_t *block)
void(* apply_window_int16)(int16_t *output, const int16_t *input, const int16_t *window, unsigned int len)
Apply symmetric window in 16-bit fixed-point.
Definition: dsputil.h:294
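As a rough illustration of what this hook computes, here is a minimal scalar sketch (my own reading of "symmetric window", not the exact FFmpeg C fallback): the window is stored as its first half and applied mirrored to both ends of the buffer, with Q15 rounding.
#include <stdint.h>

/* Illustrative scalar version of apply_window_int16 (assumptions:
 * Q15 window coefficients, len is even, window[] holds len/2 taps). */
static void apply_window_int16_sketch(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len)
{
    unsigned int i, half = len / 2;
    for (i = 0; i < half; i++) {
        int w = window[i];
        output[i]           = (int16_t)((input[i]           * w + (1 << 14)) >> 15);
        output[len - 1 - i] = (int16_t)((input[len - 1 - i] * w + (1 << 14)) >> 15);
    }
}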
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input, const int16_t *window, unsigned int len)
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size)
#define av_assert1(cond)
assert() equivalent, that does not lie in speed critical code.
Definition: avassert.h:53
int32_t(* scalarproduct_and_madd_int16)(int16_t *v1, const int16_t *v2, const int16_t *v3, int len, int mul)
Calculate scalar product of v1 and v2, and v1[i] += v3[i] * mul.
Definition: dsputil.h:281
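The brief above maps directly onto a scalar loop; the MMXEXT/SSE2/SSSE3 entry points listed on this page vectorize exactly this pattern. A minimal sketch with a hypothetical name (overflow of the updated v1[i] is assumed not to occur):
#include <stdint.h>

/* Scalar sketch of scalarproduct_and_madd_int16:
 * returns sum(v1[i] * v2[i]) while updating v1[i] += v3[i] * mul. */
static int32_t scalarproduct_and_madd_int16_sketch(int16_t *v1, const int16_t *v2,
                                                   const int16_t *v3, int len, int mul)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < len; i++) {
        res  += v1[i] * v2[i];
        v1[i] = (int16_t)(v1[i] + v3[i] * mul);
    }
    return res;
}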
void(* draw_edges)(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
Definition: dsputil.h:263
int idct_algo
IDCT algorithm, see FF_IDCT_* below.
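For context, this field is how callers select one of the IDCT implementations wired up by ff_dsputil_init_mmx(). A hedged usage sketch using the public decoder API of this FFmpeg version (the helper name is hypothetical, error handling abbreviated):
#include "libavcodec/avcodec.h"

/* Hypothetical helper: open a decoder with the Xvid IDCT requested,
 * so the MMX/MMXEXT init paths install ff_idct_xvid_* instead of the
 * simple/auto IDCT. Sketch only; free resources on failure. */
static AVCodecContext *open_with_xvid_idct(enum AVCodecID id)
{
    AVCodec        *codec = avcodec_find_decoder(id);
    AVCodecContext *avctx = codec ? avcodec_alloc_context3(codec) : NULL;

    if (!avctx)
        return NULL;
    avctx->idct_algo = FF_IDCT_XVIDMMX;   /* default is FF_IDCT_AUTO */
    if (avcodec_open2(avctx, codec, NULL) < 0)
        return NULL;
    return avctx;
}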
void(* put_signed_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size)
Definition: dsputil.h:132
void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx)
void(* add_bytes)(uint8_t *dst, uint8_t *src, int w)
Definition: dsputil.h:197
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size)
void(* put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size)
Definition: dsputil.h:131
int32_t
static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, int mm_flags)
Definition: dsputil_mmx.c:1506
void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src, int32_t min, int32_t max, unsigned int len)
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
#define EDGE_BOTTOM
Definition: dsputil.h:266
const uint64_t ff_pw_96
int xvmc_acceleration
XVideo Motion Acceleration.
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
#define AV_CPU_FLAG_3DNOW
AMD 3DNOW.
Definition: cpu.h:32
void(* add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size)
Definition: dsputil.h:133
#define FF_IDCT_AUTO
void(* vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min, int32_t max, unsigned int len)
Clip each element in an array of int32_t to a given minimum and maximum value.
Definition: dsputil.h:310
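A scalar equivalent of this hook is just a clamping loop; the ff_vector_clip_int32_mmx()/_sse4() versions above vectorize it and typically expect aligned buffers and a SIMD-friendly len. Illustrative sketch:
#include <stdint.h>

/* Scalar sketch of vector_clip_int32: clamp each element to [min, max]. */
static void vector_clip_int32_sketch(int32_t *dst, const int32_t *src,
                                     int32_t min, int32_t max, unsigned int len)
{
    unsigned int i;
    for (i = 0; i < len; i++) {
        int32_t v = src[i];
        dst[i] = v < min ? min : v > max ? max : v;
    }
}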
const uint64_t ff_pw_53
const double ff_pd_2[2]
int idct_permutation_type
Definition: dsputil.h:250
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w)
void(* idct_add)(uint8_t *dest, int line_size, int16_t *block)
block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
Definition: dsputil.h:235
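The arrow notation above describes the output stage shared by the *_add IDCT entry points on this page (ff_idct_xvid_mmx_add(), ff_simple_idct_add_mmx(), ...): after the inverse transform, the 16-bit coefficients are added to the destination pixels and clipped to the 0..255 range. A scalar sketch of that last step for an 8x8 block (the idct_put() hook stores the clipped value instead of adding):
#include <stdint.h>

/* Illustrative add-and-clamp stage of idct_add():
 * dest[x] = clip_uint8(dest[x] + block[x]) over an 8x8 block. */
static void idct_add_output_stage_sketch(uint8_t *dest, int line_size,
                                         const int16_t *block)
{
    int x, y;
    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++) {
            int v = dest[x] + block[y * 8 + x];
            dest[x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
        dest += line_size;
    }
}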
main external API structure.
#define AV_CPU_FLAG_MMX
standard MMX
Definition: cpu.h:29
#define FF_SIMPLE_IDCT_PERM
Definition: dsputil.h:253
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size)
void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
#define CONFIG_H263_DECODER
Definition: config.h:498
const uint64_t ff_pw_20
void(* bswap_buf)(uint32_t *dst, const uint32_t *src, int w)
Definition: dsputil.h:208
const uint64_t ff_pw_15
void(* gmc)(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
global motion compensation.
Definition: dsputil.h:143
#define AV_CPU_FLAG_SSE4
Penryn SSE4.1 functions.
Definition: cpu.h:41
void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block)
void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block)
int32_t(* scalarproduct_int16)(const int16_t *v1, const int16_t *v2, int len)
Calculate scalar product of two vectors.
Definition: dsputil.h:274
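For completeness, the plain scalar-product hook reduces to one multiply-accumulate loop with a 32-bit accumulator; a minimal sketch (hypothetical name):
#include <stdint.h>

/* Scalar sketch of scalarproduct_int16: sum of element-wise products. */
static int32_t scalarproduct_int16_sketch(const int16_t *v1, const int16_t *v2,
                                          int len)
{
    int32_t res = 0;
    int i;
    for (i = 0; i < len; i++)
        res += v1[i] * v2[i];
    return res;
}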
void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block)
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:30
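This is the query that the dsputil_init_mmx()/mmxext()/sse()/sse2() dispatchers documented on this page branch on. A hedged sketch of the usual runtime-dispatch pattern with the AV_CPU_FLAG_* constants listed above (branch bodies are placeholders, not the actual function-pointer assignments):
#include "libavutil/cpu.h"

/* Sketch of the runtime CPU dispatch pattern used when filling a
 * DSPContext; the comments stand in for the real pointer assignments. */
static void init_dsp_for_this_cpu(void)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        /* install MMX versions */
    }
    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
        /* install MMXEXT / SSE-integer versions */
    }
    if ((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        /* install SSE2 versions only where they are not SSE2-slow */
    }
}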
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input, const int16_t *window, unsigned int len)
void(* idct)(int16_t *block)
Definition: dsputil.h:222
header for Xvid IDCT functions
av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
Definition: dsputil_mmx.c:1563
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input, const int16_t *window, unsigned int len)
#define CONFIG_H263_ENCODER
Definition: config.h:1040
int(* add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int left)
Definition: dsputil.h:205
Core video DSP helper functions.
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride)
DSP utils.
void(* idct_put)(uint8_t *dest, int line_size, int16_t *block)
block -> idct -> clip to unsigned 8 bit -> dest.
Definition: dsputil.h:229
void(* h263_v_loop_filter)(uint8_t *src, int stride, int qscale)
Definition: dsputil.h:211
int x86_reg
simple idct header.
int len
#define avg(d, s)
Definition: dsputil_align.c:52
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left)
void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left)
const uint64_t ff_pb_FC
#define FF_IDCT_SIMPLEMMX
const xmm_reg ff_pw_16
#define AV_CPU_FLAG_SSE2
PIV SSE2 functions.
Definition: cpu.h:34
#define av_always_inline
Definition: attributes.h:41
const double ff_pd_1[2]
void ff_idct_xvid_mmx(short *block)
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15)=0
Definition: dsputil_mmx.c:41
void ff_idct_xvid_mmxext(short *block)
float min
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, ptrdiff_t stride)
DSPContext.
Definition: dsputil.h:127