/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "libavcodec/videodsp.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
#include "diracdsp_mmx.h"

//#undef NDEBUG
//#include <assert.h>

/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   = 0x0014001400140014ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   = 0x0035003500350035ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   = 0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  = 0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   = 0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };


#if HAVE_YASM
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
                                     uint8_t *src2, int dstStride,
                                     int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                      int dstStride, int src1Stride, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);

static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    ff_put_pixels8_mmxext(block,     pixels,     line_size, h);
    ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}

void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                 int dstStride, int srcStride,
                                                 int h);
void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                int dstStride, int srcStride,
                                                int h);
void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride);
void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                 int dstStride, int srcStride);
void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride);
void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                int dstStride, int srcStride);
#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
#define ff_put_no_rnd_pixels8_mmxext  ff_put_pixels8_mmxext
#endif /* HAVE_YASM */


#if HAVE_INLINE_ASM

#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "paddb %%"#regd", %%"#regd" \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// For shared libraries it is better to build the constants in registers
// rather than loading them from memory.
// pcmpeqd -> -1
#define MOVQ_BONE(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "psrlw $15, %%"#regd" \n\t" \
        "packuswb %%"#regd", %%"#regd" \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "psrlw $15, %%"#regd" \n\t" \
        "psllw $1, %%"#regd" \n\t" ::)

#endif

// regr is used as a temporary and receives the output result
// the first argument is unmodified and the second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq "#rega", "#regr" \n\t" \
    "por "#regb", "#regr" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pand "#regfe", "#regb" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psubb "#regb", "#regr" \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq "#rega", "#regr" \n\t" \
    "movq "#regc", "#regp" \n\t" \
    "por "#regb", "#regr" \n\t" \
    "por "#regd", "#regp" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pxor "#regc", "#regd" \n\t" \
    "pand %%mm6, "#regb" \n\t" \
    "pand %%mm6, "#regd" \n\t" \
    "psrlq $1, "#regd" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psubb "#regb", "#regr" \n\t" \
    "psubb "#regd", "#regp" \n\t"

/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

#endif /* HAVE_INLINE_ASM */


#if HAVE_YASM

/***********************************/
/* MMXEXT specific */

//FIXME the following could be optimized too ...
static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                   int line_size, int h)
{
    ff_avg_pixels8_mmxext(block,     pixels,     line_size, h);
    ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}

#endif /* HAVE_YASM */


#if HAVE_INLINE_ASM
/***********************************/
/* standard MMX */

void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled loop */
    __asm__ volatile (
        "movq (%3), %%mm0 \n\t"
        "movq 8(%3), %%mm1 \n\t"
        "movq 16(%3), %%mm2 \n\t"
        "movq 24(%3), %%mm3 \n\t"
        "movq 32(%3), %%mm4 \n\t"
        "movq 40(%3), %%mm5 \n\t"
        "movq 48(%3), %%mm6 \n\t"
        "movq 56(%3), %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

    // If this were an exact copy of the code above, the compiler would
    // generate some very strange code; thus we keep using "r" constraints here.
    __asm__ volatile (
        "movq (%3), %%mm0 \n\t"
        "movq 8(%3), %%mm1 \n\t"
        "movq 16(%3), %%mm2 \n\t"
        "movq 24(%3), %%mm3 \n\t"
        "movq 32(%3), %%mm4 \n\t"
        "movq 40(%3), %%mm5 \n\t"
        "movq 48(%3), %%mm6 \n\t"
        "movq 56(%3), %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}

#define put_signed_pixels_clamped_mmx_half(off) \
    "movq "#off"(%2), %%mm1 \n\t" \
    "movq 16 + "#off"(%2), %%mm2 \n\t" \
    "movq 32 + "#off"(%2), %%mm3 \n\t" \
    "movq 48 + "#off"(%2), %%mm4 \n\t" \
    "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
    "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
    "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
    "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
    "paddb %%mm0, %%mm1 \n\t" \
    "paddb %%mm0, %%mm2 \n\t" \
    "paddb %%mm0, %%mm3 \n\t" \
    "paddb %%mm0, %%mm4 \n\t" \
    "movq %%mm1, (%0) \n\t" \
    "movq %%mm2, (%0, %3) \n\t" \
    "movq %%mm3, (%0, %3, 2) \n\t" \
    "movq %%mm4, (%0, %1) \n\t"

void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea (%3, %3, 2), %1 \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea (%0, %3, 4), %0 \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}

void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq (%2), %%mm0 \n\t"
            "movq 8(%2), %%mm1 \n\t"
            "movq 16(%2), %%mm2 \n\t"
            "movq 24(%2), %%mm3 \n\t"
            "movq %0, %%mm4 \n\t"
            "movq %1, %%mm6 \n\t"
            "movq %%mm4, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm4, %%mm0 \n\t"
            "paddsw %%mm5, %%mm1 \n\t"
            "movq %%mm6, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm6 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm6, %%mm2 \n\t"
            "paddsw %%mm5, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %0 \n\t"
            "movq %%mm2, %1 \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    __asm__ volatile (
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1 ), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1 ), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    __asm__ volatile (
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1 ), %%mm0 \n\t"
        "movq 8(%1 ), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1 ), %%mm0 \n\t"
        "movq 8(%1 ), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

#define CLEAR_BLOCKS(name, n) \
static void name(int16_t *blocks) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "mov %1, %%"REG_a" \n\t" \
        "1: \n\t" \
        "movq %%mm7, (%0, %%"REG_a") \n\t" \
        "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
        "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
        "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
        "add $32, %%"REG_a" \n\t" \
        "js 1b \n\t" \
        :: "r"(((uint8_t *)blocks) + 128 * n), \
           "i"(-128 * n) \
        : "%"REG_a \
        ); \
}
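// Note: the base pointer is biased 128*n bytes past the start and REG_a counts
// up from -128*n, so the loop clears the blocks front to back and "js 1b"
// falls through exactly when the negative index reaches zero, with no
// separate compare instruction.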
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)

static void clear_block_sse(int16_t *block)
{
    __asm__ volatile (
        "xorps %%xmm0, %%xmm0 \n"
        "movaps %%xmm0, (%0) \n"
        "movaps %%xmm0, 16(%0) \n"
        "movaps %%xmm0, 32(%0) \n"
        "movaps %%xmm0, 48(%0) \n"
        "movaps %%xmm0, 64(%0) \n"
        "movaps %%xmm0, 80(%0) \n"
        "movaps %%xmm0, 96(%0) \n"
        "movaps %%xmm0, 112(%0) \n"
        :: "r"(block)
        : "memory"
        );
}

static void clear_blocks_sse(int16_t *blocks)
{
    __asm__ volatile (
        "xorps %%xmm0, %%xmm0 \n"
        "mov %1, %%"REG_a" \n"
        "1: \n"
        "movaps %%xmm0, (%0, %%"REG_a") \n"
        "movaps %%xmm0, 16(%0, %%"REG_a") \n"
        "movaps %%xmm0, 32(%0, %%"REG_a") \n"
        "movaps %%xmm0, 48(%0, %%"REG_a") \n"
        "movaps %%xmm0, 64(%0, %%"REG_a") \n"
        "movaps %%xmm0, 80(%0, %%"REG_a") \n"
        "movaps %%xmm0, 96(%0, %%"REG_a") \n"
        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
        "add $128, %%"REG_a" \n"
        "js 1b \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a
        );
}

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;
    __asm__ volatile (
        "jmp 2f \n\t"
        "1: \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq (%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%2, %0) \n\t"
        "movq 8(%1, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "2: \n\t"
        "cmp %3, %0 \n\t"
        "js 1b \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
        );
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}

#if HAVE_7REGS
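// Note: this is the HuffYUV median predictor: for every pixel the prediction
// is the median of left, top and (left + top - topleft); the cmov chain below
// selects that median without branching and then adds the stored difference.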
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile (
        "mov %7, %3 \n"
        "1: \n"
        "movzbl (%3, %4), %2 \n"
        "mov %2, %k3 \n"
        "sub %b1, %b3 \n"
        "add %b0, %b3 \n"
        "mov %2, %1 \n"
        "cmp %0, %2 \n"
        "cmovg %0, %2 \n"
        "cmovg %1, %0 \n"
        "cmp %k3, %0 \n"
        "cmovg %k3, %0 \n"
        "mov %7, %3 \n"
        "cmp %2, %0 \n"
        "cmovl %2, %0 \n"
        "add (%6, %4), %b0 \n"
        "mov %b0, (%5, %4) \n"
        "inc %4 \n"
        "jl 1b \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
        );
    *left     = l;
    *left_top = tl;
}
#endif

/* Draw the edges of width 'w' of an image of size width x height.
 * This MMX version can only handle w == 4, w == 8 or w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t"
            "movq %%mm0, -8(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movq %%mm1, (%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else if (w == 16) {
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t"
            "movq %%mm0, -8(%0) \n\t"
            "movq %%mm0, -16(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movq %%mm1, (%0, %2) \n\t"
            "movq %%mm1, 8(%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        av_assert1(w == 4);
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "movd %%mm0, -4(%0) \n\t"
            "movd -4(%0, %2), %%mm1 \n\t"
            "punpcklbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movd %%mm1, (%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1: \n\t"
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                "add $8, %0 \n\t"
                "cmp %4, %0 \n\t"
                "jb 1b \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1: \n\t"
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                "add $8, %0 \n\t"
                "cmp %4, %0 \n\t"
                "jb 1b \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}
#endif /* HAVE_INLINE_ASM */


#if HAVE_YASM
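// Note on naming: in the qpelN_mcXY functions generated below, X and Y are the
// quarter-pel offsets (0..3) of the motion vector in the horizontal and
// vertical direction, so mc00 is the integer-pel copy and mc22 the centre
// position built from both the horizontal and vertical lowpass filters.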
#define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
} \
 \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t temp[8]; \
    uint8_t * const half = (uint8_t*)temp; \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
                                                   stride, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
                                        stride, stride, 8); \
} \
 \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
                                                   stride, 8); \
} \
 \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t temp[8]; \
    uint8_t * const half = (uint8_t*)temp; \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
                                                   stride, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
                                        stride, 8); \
} \
 \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t temp[8]; \
    uint8_t * const half = (uint8_t*)temp; \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
                                                   8, stride); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
                                        stride, stride, 8); \
} \
 \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
                                                   stride, stride); \
} \
 \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t temp[8]; \
    uint8_t * const half = (uint8_t*)temp; \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
                                                   8, stride); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \
                                        stride, 8); \
} \
 \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH  = ((uint8_t*)half) + 64; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
                                        stride, 9); \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
                                        stride, 8, 8); \
} \
 \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH  = ((uint8_t*)half) + 64; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
                                        stride, 9); \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
                                        stride, 8, 8); \
} \
 \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH  = ((uint8_t*)half) + 64; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
                                        stride, 9); \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
                                        stride, 8, 8); \
} \
 \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH  = ((uint8_t*)half) + 64; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
                                        stride, 9); \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
                                        stride, 8, 8); \
} \
 \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH  = ((uint8_t*)half) + 64; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
                                        stride, 8, 8); \
} \
 \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH  = ((uint8_t*)half) + 64; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
                                        stride, 8, 8); \
} \
 \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
                                        8, stride, 9); \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
                                                   stride, 8); \
} \
 \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
                                        stride, 9); \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
                                                   stride, 8); \
} \
 \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
                                         ptrdiff_t stride) \
{ \
    uint64_t half[9]; \
    uint8_t * const halfH = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
                                                   stride, 9); \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
                                                   stride, 8); \
} \
 \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
                                           ptrdiff_t stride) \
{ \
    ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
} \
 \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t temp[32]; \
    uint8_t * const half = (uint8_t*)temp; \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
                                                    stride, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
                                         stride, 16); \
} \
 \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
                                                    stride, stride, 16); \
} \
 \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t temp[32]; \
    uint8_t * const half = (uint8_t*)temp; \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
                                                    stride, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
                                         stride, stride, 16); \
} \
 \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t temp[32]; \
    uint8_t * const half = (uint8_t*)temp; \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
                                                    stride); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
                                         stride, 16); \
} \
 \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
                                                    stride, stride); \
} \
 \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t temp[32]; \
    uint8_t * const half = (uint8_t*)temp; \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
                                                    stride); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
                                         stride, stride, 16); \
} \
 \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t * const halfH  = ((uint8_t*)half) + 256; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
                                         stride, 17); \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
                                                    16, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
                                         stride, 16, 16); \
} \
 \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t * const halfH  = ((uint8_t*)half) + 256; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
                                         stride, 17); \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
                                                    16, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
                                         stride, 16, 16); \
} \
 \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t * const halfH  = ((uint8_t*)half) + 256; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
                                         stride, 17); \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
                                                    16, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
                                         stride, 16, 16); \
} \
 \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t * const halfH  = ((uint8_t*)half) + 256; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
                                         stride, 17); \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
                                                    16, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
                                         stride, 16, 16); \
} \
 \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t * const halfH  = ((uint8_t*)half) + 256; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
                                                    16, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
                                         stride, 16, 16); \
} \
 \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t * const halfH  = ((uint8_t*)half) + 256; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
                                                    16, 16); \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
                                         stride, 16, 16); \
} \
 \
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[17 * 2]; \
    uint8_t * const halfH = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
                                         stride, 17); \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
                                                    stride, 16); \
} \
 \
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[17 * 2]; \
    uint8_t * const halfH = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
                                         stride, 17); \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
                                                    stride, 16); \
} \
 \
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
                                          ptrdiff_t stride) \
{ \
    uint64_t half[17 * 2]; \
    uint8_t * const halfH = ((uint8_t*)half); \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
                                                    stride, 17); \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
                                                    stride, 16); \
}

QPEL_OP(put_,        ff_pw_16, _,        mmxext)
QPEL_OP(avg_,        ff_pw_16, _,        mmxext)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext)
#endif /* HAVE_YASM */


#if HAVE_INLINE_ASM
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_xy2_mmx(dst, src, stride, 8);
}
void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_xy2_mmx(dst, src, stride, 16);
}
void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
}
void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
}

typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t linesize, int block_w, int block_h,
                                   int src_x, int src_y, int w, int h);

static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int ox, int oy,
                                 int dxx, int dxy, int dyx, int dyy,
                                 int shift, int r, int width, int height,
                                 emulated_edge_mc_func *emu_edge_fn)
{
    const int w    = 8;
    const int ix   = ox  >> (16 + shift);
    const int iy   = oy  >> (16 + shift);
    const int oxs  = ox  >> 4;
    const int oys  = oy  >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2  = 2 * shift;
#define MAX_STRIDE 4096U
#define MAX_H 8U
    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
    int x, y;

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    int need_emu  = (unsigned)ix >= width  - w ||
                    (unsigned)iy >= height - h;
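    // Note: the (unsigned) casts make one compare per axis reject both
    // negative fullpel offsets and offsets beyond width - w / height - h,
    // which is all that is needed to decide whether edge emulation is required.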

    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15
        || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;
    if (need_emu) {
        emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
        src = edge_buf;
    }

    __asm__ volatile (
        "movd %0, %%mm6 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(1<<shift)
    );

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
            __asm__ volatile (
                "movq %0, %%mm4 \n\t"
                "movq %1, %%mm5 \n\t"
                "paddw %2, %%mm4 \n\t"
                "paddw %3, %%mm5 \n\t"
                "movq %%mm4, %0 \n\t"
                "movq %%mm5, %1 \n\t"
                "psrlw $12, %%mm4 \n\t"
                "psrlw $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            __asm__ volatile (
                "movq %%mm6, %%mm2 \n\t"
                "movq %%mm6, %%mm1 \n\t"
                "psubw %%mm4, %%mm2 \n\t"
                "psubw %%mm5, %%mm1 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)

                "movd %4, %%mm5 \n\t"
                "movd %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd %2, %%mm5 \n\t"
                "movd %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw %5, %%mm1 \n\t"
                "paddw %%mm3, %%mm2 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm2, %%mm0 \n\t"

                "psrlw %6, %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd %%mm0, %0 \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4 - h * stride;
    }
}


#if CONFIG_VIDEODSP
#if HAVE_YASM
#if ARCH_X86_32
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#endif
static void gmc_sse(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#else
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#endif
#endif

/* CAVS-specific */
void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}

|
yading@10
|
1236 /* VC-1-specific */
|
yading@10
|
1237 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
|
yading@10
|
1238 ptrdiff_t stride, int rnd)
|
yading@10
|
1239 {
|
yading@10
|
1240 put_pixels8_mmx(dst, src, stride, 8);
|
yading@10
|
1241 }
|
yading@10
|
1242
|
yading@10
|
1243 #if CONFIG_DIRAC_DECODER
|
yading@10
|
1244 #define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
|
yading@10
|
1245 void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
|
yading@10
|
1246 {\
|
yading@10
|
1247 if (h&3)\
|
yading@10
|
1248 ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
|
yading@10
|
1249 else\
|
yading@10
|
1250 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
|
yading@10
|
1251 }\
|
yading@10
|
1252 void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
|
yading@10
|
1253 {\
|
yading@10
|
1254 if (h&3)\
|
yading@10
|
1255 ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
|
yading@10
|
1256 else\
|
yading@10
|
1257 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
|
yading@10
|
1258 }\
|
yading@10
|
1259 void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
|
yading@10
|
1260 {\
|
yading@10
|
1261 if (h&3) {\
|
yading@10
|
1262 ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
|
yading@10
|
1263 } else {\
|
yading@10
|
1264 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
|
yading@10
|
1265 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
|
yading@10
|
1266 }\
|
yading@10
|
1267 }
|
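
/* For illustration, DIRAC_PIXOP(put, put, mmx) below expands (roughly) to:
 *
 *     void ff_put_dirac_pixels8_mmx(uint8_t *dst, const uint8_t *src[5],
 *                                   int stride, int h)
 *     {
 *         if (h & 3)
 *             ff_put_dirac_pixels8_c(dst, src, stride, h);
 *         else
 *             put_pixels8_mmx(dst, src[0], stride, h);
 *     }
 *
 * plus the matching 16- and 32-pixel-wide variants: heights that are not a
 * multiple of 4 fall back to the C version, everything else goes through the
 * SIMD pixel ops. */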

#if HAVE_MMX_INLINE
DIRAC_PIXOP(put, put, mmx)
DIRAC_PIXOP(avg, avg, mmx)
#endif

#if HAVE_YASM
DIRAC_PIXOP(avg, ff_avg, mmxext)

void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h&3)
        ff_put_dirac_pixels16_c(dst, src, stride, h);
    else
        ff_put_pixels16_sse2(dst, src[0], stride, h);
}
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h&3)
        ff_avg_dirac_pixels16_c(dst, src, stride, h);
    else
        ff_avg_pixels16_sse2(dst, src[0], stride, h);
}
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h&3) {
        ff_put_dirac_pixels32_c(dst, src, stride, h);
    } else {
        ff_put_pixels16_sse2(dst   , src[0]   , stride, h);
        ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
    }
}
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h&3) {
        ff_avg_dirac_pixels32_c(dst, src, stride, h);
    } else {
        ff_avg_pixels16_sse2(dst   , src[0]   , stride, h);
        ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
    }
}
#endif
#endif

static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;
    __asm__ volatile (
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "1:                             \n\t"
        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        "sub           $64, %0          \n\t"
        "jge            1b              \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
    );
}
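
/* Rough scalar equivalent of vector_clipf_sse() above (illustrative only).
 * The asm walks the buffers backwards, 16 floats (64 bytes) per iteration,
 * with aligned loads/stores, so len is assumed to be a multiple of 16 and
 * src/dst 16-byte aligned:
 *
 *     for (i = 0; i < len; i++)
 *         dst[i] = FFMIN(FFMAX(src[i], min), max);
 */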

#endif /* HAVE_INLINE_ASM */

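/* Prototypes for routines that presumably live in the external (yasm)
 * assembly files; they are only wired up below under the matching
 * HAVE_YASM / HAVE_*_EXTERNAL guards. */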
void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);

int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);

void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
                                          const uint8_t *diff, int w,
                                          int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                       int w, int left);
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                      int w, int left);

void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);

#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)
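
/* SET_QPEL_FUNCS fills one row of the quarter-pel function-pointer table:
 * entry [IDX][x + 4 * y] handles the mcXY case, i.e. horizontal quarter-pel
 * offset x and vertical quarter-pel offset y.  As an illustration, a single
 * entry of SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, ) expands to
 *
 *     c->put_qpel_pixels_tab[0][1] = put_qpel16_mc10_mmxext;
 */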

static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                     int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

    if (!high_bit_depth) {
        c->clear_block  = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->draw_edges   = draw_edges_mmx;
    }

#if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
    c->gmc = gmc_mmx;
#endif

    c->add_bytes = add_bytes_mmx;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
        c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
    }

    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif

}

static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
                                        int mm_flags)
{

#if HAVE_YASM
    SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );

    SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
#endif /* HAVE_YASM */

#if HAVE_MMXEXT_EXTERNAL
    /* slower than cmov version on AMD */
    if (!(mm_flags & AV_CPU_FLAG_3DNOW))
        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;

    c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
    } else {
        c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
    }
#endif /* HAVE_MMXEXT_EXTERNAL */
}

static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
                                     int mm_flags)
{
#if HAVE_INLINE_ASM
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    if (!high_bit_depth) {
        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
            /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
            c->clear_block  = clear_block_sse;
            c->clear_blocks = clear_blocks_sse;
        }
    }

    c->vector_clipf = vector_clipf_sse;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
    c->gmc = gmc_sse;
#endif
#endif /* HAVE_YASM */
}

static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_SSE2_INLINE
    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
        c->idct_put              = ff_idct_xvid_sse2_put;
        c->idct_add              = ff_idct_xvid_sse2_add;
        c->idct                  = ff_idct_xvid_sse2;
        c->idct_permutation_type = FF_SSE2_IDCT_PERM;
    }
#endif /* HAVE_SSE2_INLINE */

#if HAVE_SSE2_EXTERNAL
    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    if (mm_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
    } else {
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }
    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_round_sse2;
    }
    c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}

static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                                       int mm_flags)
{
#if HAVE_SSSE3_EXTERNAL
    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
    if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

    if (mm_flags & AV_CPU_FLAG_ATOM)
        c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
    else
        c->apply_window_int16 = ff_apply_window_int16_ssse3;
    if (!(mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}

static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
{
#if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}

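/* ff_dsputil_init_mmx() below runs the per-ISA helpers above in increasing
 * order of capability (MMX, MMXEXT, SSE, SSE2, SSSE3, SSE4), each gated on
 * the corresponding CPU flag; a later helper may overwrite pointers set by
 * an earlier one, so the most capable supported version wins. */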
av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if HAVE_7REGS && HAVE_INLINE_ASM
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif

    if (mm_flags & AV_CPU_FLAG_MMX) {
#if HAVE_INLINE_ASM
        const int idct_algo = avctx->idct_algo;

        if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
            if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
                c->idct_put              = ff_simple_idct_put_mmx;
                c->idct_add              = ff_simple_idct_add_mmx;
                c->idct                  = ff_simple_idct_mmx;
                c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
            } else if (idct_algo == FF_IDCT_XVIDMMX) {
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->idct_put              = ff_idct_xvid_sse2_put;
                    c->idct_add              = ff_idct_xvid_sse2_add;
                    c->idct                  = ff_idct_xvid_sse2;
                    c->idct_permutation_type = FF_SSE2_IDCT_PERM;
                } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                    c->idct_put = ff_idct_xvid_mmxext_put;
                    c->idct_add = ff_idct_xvid_mmxext_add;
                    c->idct     = ff_idct_xvid_mmxext;
                } else {
                    c->idct_put = ff_idct_xvid_mmx_put;
                    c->idct_add = ff_idct_xvid_mmx_add;
                    c->idct     = ff_idct_xvid_mmx;
                }
            }
        }
#endif /* HAVE_INLINE_ASM */

        dsputil_init_mmx(c, avctx, mm_flags);
    }

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmxext(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}
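
/* Usage note (describing the expected call site, which is not in this file):
 * the generic ff_dsputil_init() is expected to fill in the C defaults first
 * and then call ff_dsputil_init_mmx() on x86 builds, so every pointer
 * overridden above has a working C fallback. */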