vc1dsp_mmx.c
1 /*
2  * VC-1 and WMV3 - DSP functions MMX-optimized
3  * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
4  *
5  * Permission is hereby granted, free of charge, to any person
6  * obtaining a copy of this software and associated documentation
7  * files (the "Software"), to deal in the Software without
8  * restriction, including without limitation the rights to use,
9  * copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following
12  * conditions:
13  *
14  * The above copyright notice and this permission notice shall be
15  * included in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24  * OTHER DEALINGS IN THE SOFTWARE.
25  */
26 
27 #include "libavutil/cpu.h"
28 #include "libavutil/mem.h"
29 #include "libavutil/x86/asm.h"
30 #include "libavutil/x86/cpu.h"
31 #include "dsputil_mmx.h"
32 #include "libavcodec/vc1dsp.h"
33 #include "vc1dsp.h"
34 
35 #if HAVE_INLINE_ASM
36 
37 #define OP_PUT(S,D)
38 #define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
39 
40 /** Add the rounder in mm7 to mm3 and mm4, then shift both right by SHIFT */
41 #define NORMALIZE_MMX(SHIFT) \
42  "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
43  "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
44  "psraw "SHIFT", %%mm3 \n\t" \
45  "psraw "SHIFT", %%mm4 \n\t"
46 
47 #define TRANSFER_DO_PACK(OP) \
48  "packuswb %%mm4, %%mm3 \n\t" \
49  OP((%2), %%mm3) \
50  "movq %%mm3, (%2) \n\t"
51 
52 #define TRANSFER_DONT_PACK(OP) \
53  OP(0(%2), %%mm3) \
54  OP(8(%2), %%mm4) \
55  "movq %%mm3, 0(%2) \n\t" \
56  "movq %%mm4, 8(%2) \n\t"
57 
58 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
59 #define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
60 #define DONT_UNPACK(reg)
61 
62 /** Load the rounder (32-r or 8-r) and broadcast it to all four words of mm7 */
63 #define LOAD_ROUNDER_MMX(ROUND) \
64  "movd "ROUND", %%mm7 \n\t" \
65  "punpcklwd %%mm7, %%mm7 \n\t" \
66  "punpckldq %%mm7, %%mm7 \n\t"
67 
68 #define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
69  "paddw %%mm"#R2", %%mm"#R1" \n\t" \
70  "movd (%0,%3), %%mm"#R0" \n\t" \
71  "pmullw %%mm6, %%mm"#R1" \n\t" \
72  "punpcklbw %%mm0, %%mm"#R0" \n\t" \
73  "movd (%0,%2), %%mm"#R3" \n\t" \
74  "psubw %%mm"#R0", %%mm"#R1" \n\t" \
75  "punpcklbw %%mm0, %%mm"#R3" \n\t" \
76  "paddw %%mm7, %%mm"#R1" \n\t" \
77  "psubw %%mm"#R3", %%mm"#R1" \n\t" \
78  "psraw %4, %%mm"#R1" \n\t" \
79  "movq %%mm"#R1", "#OFF"(%1) \n\t" \
80  "add %2, %0 \n\t"
81 
82 /** Sacrificing mm6 makes it possible to pipeline loads from src */
83 static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
84  const uint8_t *src, x86_reg stride,
85  int rnd, int64_t shift)
86 {
87  __asm__ volatile(
88  "mov $3, %%"REG_c" \n\t"
89  LOAD_ROUNDER_MMX("%5")
90  "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
91  "1: \n\t"
92  "movd (%0), %%mm2 \n\t"
93  "add %2, %0 \n\t"
94  "movd (%0), %%mm3 \n\t"
95  "punpcklbw %%mm0, %%mm2 \n\t"
96  "punpcklbw %%mm0, %%mm3 \n\t"
97  SHIFT2_LINE( 0, 1, 2, 3, 4)
98  SHIFT2_LINE( 24, 2, 3, 4, 1)
99  SHIFT2_LINE( 48, 3, 4, 1, 2)
100  SHIFT2_LINE( 72, 4, 1, 2, 3)
101  SHIFT2_LINE( 96, 1, 2, 3, 4)
102  SHIFT2_LINE(120, 2, 3, 4, 1)
103  SHIFT2_LINE(144, 3, 4, 1, 2)
104  SHIFT2_LINE(168, 4, 1, 2, 3)
105  "sub %6, %0 \n\t"
106  "add $8, %1 \n\t"
107  "dec %%"REG_c" \n\t"
108  "jnz 1b \n\t"
109  : "+r"(src), "+r"(dst)
110  : "r"(stride), "r"(-2*stride),
111  "m"(shift), "m"(rnd), "r"(9*stride-4)
112  : "%"REG_c, "memory"
113  );
114 }
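
#if 0
/* Illustrative scalar equivalent of vc1_put_ver_16b_shift2_mmx above: a
 * sketch of the (-1, 9, 9, -1) vertical 1/2-pel pass writing a 12x8 block
 * of 16-bit intermediates with a row stride of 12.  For reference only;
 * the block geometry is inferred from the loop structure (3 passes of 4
 * columns, 8 rows each). */
static void vc1_put_ver_16b_shift2_c(int16_t *dst, const uint8_t *src,
                                     x86_reg stride, int rnd, int64_t shift)
{
    int x, y;

    for (y = 0; y < 8; y++)
        for (x = 0; x < 12; x++)
            dst[y * 12 + x] = (-    src[(y - 1) * stride + x]
                               + 9 * src[ y      * stride + x]
                               + 9 * src[(y + 1) * stride + x]
                               -     src[(y + 2) * stride + x]
                               + rnd) >> shift;
}
#endif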
115 
116 /**
117  * Data is already unpacked, so some operations can be performed directly
118  * on memory operands.
119  */
120 #define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
121 static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
122  const int16_t *src, int rnd)\
123 {\
124  int h = 8;\
125 \
126  src -= 1;\
127  rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
128  __asm__ volatile(\
129  LOAD_ROUNDER_MMX("%4")\
130  "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
131  "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
132  "1: \n\t"\
133  "movq 2*0+0(%1), %%mm1 \n\t"\
134  "movq 2*0+8(%1), %%mm2 \n\t"\
135  "movq 2*1+0(%1), %%mm3 \n\t"\
136  "movq 2*1+8(%1), %%mm4 \n\t"\
137  "paddw 2*3+0(%1), %%mm1 \n\t"\
138  "paddw 2*3+8(%1), %%mm2 \n\t"\
139  "paddw 2*2+0(%1), %%mm3 \n\t"\
140  "paddw 2*2+8(%1), %%mm4 \n\t"\
141  "pmullw %%mm5, %%mm3 \n\t"\
142  "pmullw %%mm5, %%mm4 \n\t"\
143  "psubw %%mm1, %%mm3 \n\t"\
144  "psubw %%mm2, %%mm4 \n\t"\
145  NORMALIZE_MMX("$7")\
146  /* Remove bias */\
147  "paddw %%mm6, %%mm3 \n\t"\
148  "paddw %%mm6, %%mm4 \n\t"\
149  TRANSFER_DO_PACK(OP)\
150  "add $24, %1 \n\t"\
151  "add %3, %2 \n\t"\
152  "decl %0 \n\t"\
153  "jnz 1b \n\t"\
154  : "+r"(h), "+r" (src), "+r" (dst)\
155  : "r"(stride), "m"(rnd)\
156  : "memory"\
157  );\
158 }
159 
160 VC1_HOR_16b_SHIFT2(OP_PUT, put_)
161 VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
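
#if 0
/* Illustrative scalar equivalent of put_vc1_hor_16b_shift2_mmx above: a
 * sketch of the (-1, 9, 9, -1) horizontal 1/2-pel pass over the 16-bit
 * intermediates (row stride 12).  The -1024-per-tap bias and the +128
 * added back after the shift mirror the trick the MMX code uses so that
 * the unsigned pack clamps correctly; the avg_ variant additionally
 * averages the result with dst. */
static void put_vc1_hor_16b_shift2_c(uint8_t *dst, x86_reg stride,
                                     const int16_t *src, int rnd)
{
    int x, y;

    src -= 1;
    rnd -= (-1 + 9 + 9 - 1) * 1024;
    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++) {
            int v = (-    src[x]
                     + 9 * src[x + 1]
                     + 9 * src[x + 2]
                     -     src[x + 3] + rnd) >> 7;
            dst[x] = av_clip_uint8(v + 128);
        }
        src += 12;
        dst += stride;
    }
}
#endif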
162 
163 
164 /**
165  * Purely vertical or horizontal 1/2 shift interpolation.
166  * Sacrifice mm6 for the *9 factor.
167  */
168 #define VC1_SHIFT2(OP, OPNAME)\
169 static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
170  x86_reg stride, int rnd, x86_reg offset)\
171 {\
172  rnd = 8-rnd;\
173  __asm__ volatile(\
174  "mov $8, %%"REG_c" \n\t"\
175  LOAD_ROUNDER_MMX("%5")\
176  "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
177  "1: \n\t"\
178  "movd 0(%0 ), %%mm3 \n\t"\
179  "movd 4(%0 ), %%mm4 \n\t"\
180  "movd 0(%0,%2), %%mm1 \n\t"\
181  "movd 4(%0,%2), %%mm2 \n\t"\
182  "add %2, %0 \n\t"\
183  "punpcklbw %%mm0, %%mm3 \n\t"\
184  "punpcklbw %%mm0, %%mm4 \n\t"\
185  "punpcklbw %%mm0, %%mm1 \n\t"\
186  "punpcklbw %%mm0, %%mm2 \n\t"\
187  "paddw %%mm1, %%mm3 \n\t"\
188  "paddw %%mm2, %%mm4 \n\t"\
189  "movd 0(%0,%3), %%mm1 \n\t"\
190  "movd 4(%0,%3), %%mm2 \n\t"\
191  "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
192  "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
193  "punpcklbw %%mm0, %%mm1 \n\t"\
194  "punpcklbw %%mm0, %%mm2 \n\t"\
195  "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
196  "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
197  "movd 0(%0,%2), %%mm1 \n\t"\
198  "movd 4(%0,%2), %%mm2 \n\t"\
199  "punpcklbw %%mm0, %%mm1 \n\t"\
200  "punpcklbw %%mm0, %%mm2 \n\t"\
201  "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
202  "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
203  NORMALIZE_MMX("$4")\
204  "packuswb %%mm4, %%mm3 \n\t"\
205  OP((%1), %%mm3)\
206  "movq %%mm3, (%1) \n\t"\
207  "add %6, %0 \n\t"\
208  "add %4, %1 \n\t"\
209  "dec %%"REG_c" \n\t"\
210  "jnz 1b \n\t"\
211  : "+r"(src), "+r"(dst)\
212  : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
213  "g"(stride-offset)\
214  : "%"REG_c, "memory"\
215  );\
216 }
217 
218 VC1_SHIFT2(OP_PUT, put_)
219 VC1_SHIFT2(OP_AVG, avg_)
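
#if 0
/* Illustrative scalar equivalent of put_vc1_shift2_mmx above: the pure
 * (-1, 9, 9, -1)/16 half-pel filter on an 8x8 block.  As in the MMX
 * version, `offset' is 1 for a horizontal shift and `stride' for a
 * vertical one; the avg_ variant additionally averages with dst. */
static void put_vc1_shift2_c(uint8_t *dst, const uint8_t *src,
                             x86_reg stride, int rnd, x86_reg offset)
{
    int x, y;

    rnd = 8 - rnd;
    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = av_clip_uint8((-    src[x -     offset]
                                    + 9 * src[x             ]
                                    + 9 * src[x +     offset]
                                    -     src[x + 2 * offset]
                                    + rnd) >> 4);
        src += stride;
        dst += stride;
    }
}
#endif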
220 
221 /**
222  * Core of the 1/4 and 3/4 shift bicubic interpolation.
223  *
224  * @param UNPACK Macro unpacking arguments from 8 to 16 bits (can be empty).
225  * @param MOVQ "movd 1" to load packed 8-bit data, or "movq 2" if the data read is already unpacked.
226  * @param A1 Address of 1st tap (beware of unpacked/packed).
227  * @param A2 Address of 2nd tap
228  * @param A3 Address of 3rd tap
229  * @param A4 Address of 4th tap
230  */
231 #define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
232  MOVQ "*0+"A1", %%mm1 \n\t" \
233  MOVQ "*4+"A1", %%mm2 \n\t" \
234  UNPACK("%%mm1") \
235  UNPACK("%%mm2") \
236  "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
237  "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
238  MOVQ "*0+"A2", %%mm3 \n\t" \
239  MOVQ "*4+"A2", %%mm4 \n\t" \
240  UNPACK("%%mm3") \
241  UNPACK("%%mm4") \
242  "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
243  "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
244  "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
245  "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
246  MOVQ "*0+"A4", %%mm1 \n\t" \
247  MOVQ "*4+"A4", %%mm2 \n\t" \
248  UNPACK("%%mm1") \
249  UNPACK("%%mm2") \
250  "psllw $2, %%mm1 \n\t" /* 4* */ \
251  "psllw $2, %%mm2 \n\t" /* 4* */ \
252  "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
253  "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
254  MOVQ "*0+"A3", %%mm1 \n\t" \
255  MOVQ "*4+"A3", %%mm2 \n\t" \
256  UNPACK("%%mm1") \
257  UNPACK("%%mm2") \
258  "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
259  "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
260  "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
261  "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */
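
#if 0
/* Sketch of the accumulation performed by MSPEL_FILTER13_CORE for one
 * sample: with t1..t4 loaded from A1..A4, the core computes
 * -3*t1 + 18*t2 + 53*t3 - 4*t4 before rounding and shifting.  The
 * instantiations below order A1..A4 so that the taps land as
 * (-4, 53, 18, -3)/64 for the 1/4 shift and (-3, 18, 53, -4)/64 for the
 * 3/4 shift. */
static int mspel_filter13_core_c(int t1, int t2, int t3, int t4)
{
    return -3 * t1 + 18 * t2 + 53 * t3 - 4 * t4;
}
#endif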
262 
263 /**
264  * Macro to build the vertical 16-bit version of vc1_put_shift[13].
265  * Here, offset=src_stride. Parameters passed A1 to A4 must use
266  * %3 (src_stride) and %4 (3*src_stride).
267  *
268  * @param NAME Either 1 or 3
269  * @see MSPEL_FILTER13_CORE for information on A1->A4
270  */
271 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
272 static void \
273 vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
274  x86_reg src_stride, \
275  int rnd, int64_t shift) \
276 { \
277  int h = 8; \
278  src -= src_stride; \
279  __asm__ volatile( \
280  LOAD_ROUNDER_MMX("%5") \
281  "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
282  "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
283  ".p2align 3 \n\t" \
284  "1: \n\t" \
285  MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
286  NORMALIZE_MMX("%6") \
287  TRANSFER_DONT_PACK(OP_PUT) \
288  /* Last 3 (in fact 4) bytes on the line */ \
289  "movd 8+"A1", %%mm1 \n\t" \
290  DO_UNPACK("%%mm1") \
291  "movq %%mm1, %%mm3 \n\t" \
292  "paddw %%mm1, %%mm1 \n\t" \
293  "paddw %%mm3, %%mm1 \n\t" /* 3* */ \
294  "movd 8+"A2", %%mm3 \n\t" \
295  DO_UNPACK("%%mm3") \
296  "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
297  "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \
298  "movd 8+"A3", %%mm1 \n\t" \
299  DO_UNPACK("%%mm1") \
300  "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
301  "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
302  "movd 8+"A4", %%mm1 \n\t" \
303  DO_UNPACK("%%mm1") \
304  "psllw $2, %%mm1 \n\t" /* 4* */ \
305  "psubw %%mm1, %%mm3 \n\t" \
306  "paddw %%mm7, %%mm3 \n\t" \
307  "psraw %6, %%mm3 \n\t" \
308  "movq %%mm3, 16(%2) \n\t" \
309  "add %3, %1 \n\t" \
310  "add $24, %2 \n\t" \
311  "decl %0 \n\t" \
312  "jnz 1b \n\t" \
313  : "+r"(h), "+r" (src), "+r" (dst) \
314  : "r"(src_stride), "r"(3*src_stride), \
315  "m"(rnd), "m"(shift) \
316  : "memory" \
317  ); \
318 }
319 
320 /**
321  * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
322  * Here the data is 16-bit, so the parameters A1 to A4 can be plain constant offsets.
323  *
324  * @param NAME Either 1 or 3
325  * @see MSPEL_FILTER13_CORE for information on A1->A4
326  */
327 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
328 static void \
329 OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
330  const int16_t *src, int rnd) \
331 { \
332  int h = 8; \
333  src -= 1; \
334  rnd -= (-4+53+18-3)*256; /* Add -256 bias */ \
335  __asm__ volatile( \
336  LOAD_ROUNDER_MMX("%4") \
337  "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
338  "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
339  ".p2align 3 \n\t" \
340  "1: \n\t" \
341  MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
342  NORMALIZE_MMX("$7") \
343  /* Remove bias */ \
344  "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
345  "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
346  TRANSFER_DO_PACK(OP) \
347  "add $24, %1 \n\t" \
348  "add %3, %2 \n\t" \
349  "decl %0 \n\t" \
350  "jnz 1b \n\t" \
351  : "+r"(h), "+r" (src), "+r" (dst) \
352  : "r"(stride), "m"(rnd) \
353  : "memory" \
354  ); \
355 }
356 
357 /**
358  * Macro to build the 8-bit, any-direction version of vc1_put_shift[13].
359  * Here, offset=src_stride. Parameters passed A1 to A4 must use
360  * %3 (offset) and %4 (3*offset).
361  *
362  * @param NAME Either 1 or 3
363  * @see MSPEL_FILTER13_CORE for information on A1->A4
364  */
365 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
366 static void \
367 OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
368  x86_reg stride, int rnd, x86_reg offset) \
369 { \
370  int h = 8; \
371  src -= offset; \
372  rnd = 32-rnd; \
373  __asm__ volatile ( \
374  LOAD_ROUNDER_MMX("%6") \
375  "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
376  "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
377  ".p2align 3 \n\t" \
378  "1: \n\t" \
379  MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
380  NORMALIZE_MMX("$6") \
381  TRANSFER_DO_PACK(OP) \
382  "add %5, %1 \n\t" \
383  "add %5, %2 \n\t" \
384  "decl %0 \n\t" \
385  "jnz 1b \n\t" \
386  : "+r"(h), "+r" (src), "+r" (dst) \
387  : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
388  : "memory" \
389  ); \
390 }
391 
392 /** 1/4 shift bicubic interpolation */
393 MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
394 MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
395 MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
396 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
397 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)
398 
399 /** 3/4 shift bicubic interpolation */
400 MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
401 MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
402 MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
403 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
404 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)
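
#if 0
/* Illustrative scalar equivalent of put_vc1_shift1_mmx above: the
 * one-pass 8-bit bicubic filter with taps (-4, 53, 18, -3)/64; shift3
 * uses the mirrored taps (-3, 18, 53, -4)/64.  `offset' is 1 for a
 * horizontal shift and `stride' for a vertical one, as in the MMX code. */
static void put_vc1_shift1_c(uint8_t *dst, const uint8_t *src,
                             x86_reg stride, int rnd, x86_reg offset)
{
    int x, y;

    src -= offset;
    rnd  = 32 - rnd;
    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = av_clip_uint8((- 4 * src[x             ]
                                    + 53 * src[x +     offset]
                                    + 18 * src[x + 2 * offset]
                                    -  3 * src[x + 3 * offset]
                                    + rnd) >> 6);
        src += stride;
        dst += stride;
    }
}
#endif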
405 
406 typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
407 typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
408 typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
409 
410 /**
411  * Interpolate fractional-pel values by applying the proper vertical then
412  * horizontal filter.
413  *
414  * @param dst Destination buffer for interpolated pels.
415  * @param src Source buffer.
416  * @param stride Stride for both src and dst buffers.
417  * @param hmode Horizontal filter (expressed in quarter-pel shift).
418  * @param vmode Vertical filter (expressed in quarter-pel shift).
419  * @param rnd Rounding bias.
420  */
421 #define VC1_MSPEL_MC(OP)\
422 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
423  int hmode, int vmode, int rnd)\
424 {\
425  static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
426  { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
427  static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
428  { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
429  static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
430  { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
431 \
432  __asm__ volatile(\
433  "pxor %%mm0, %%mm0 \n\t"\
434  ::: "memory"\
435  );\
436 \
437  if (vmode) { /* Vertical filter to apply */\
438  if (hmode) { /* Horizontal filter to apply, output to tmp */\
439  static const int shift_value[] = { 0, 5, 1, 5 };\
440  int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
441  int r;\
442  DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
443 \
444  r = (1<<(shift-1)) + rnd-1;\
445  vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
446 \
447  vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
448  return;\
449  }\
450  else { /* No horizontal filter, output 8 lines to dst */\
451  vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
452  return;\
453  }\
454  }\
455 \
456  /* Horizontal mode with no vertical mode */\
457  vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
458 }
459 
460 VC1_MSPEL_MC(put_)
461 VC1_MSPEL_MC(avg_)
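
#if 0
/* Worked example of the shift bookkeeping in vc1_mspel_mc above, assuming
 * the filter gains are the tap sums: the half-pel filter sums to 16
 * (4 fractional bits) and the bicubic one to 64 (6 bits).  The horizontal
 * 16-bit pass always shifts by 7, so the vertical pass must shift by the
 * remaining amount, which is what (shift_value[h] + shift_value[v]) >> 1
 * yields: 1 for half/half, 3 for mixed, 5 for bicubic/bicubic. */
static int vc1_mspel_first_pass_shift(int hmode, int vmode)
{
    static const int shift_value[] = { 0, 5, 1, 5 };
    return (shift_value[hmode] + shift_value[vmode]) >> 1;
}
#endif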
462 
463 /** Macro to ease declaration of the bicubic filter interpolation functions */
464 #define DECLARE_FUNCTION(a, b) \
465 static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \
466  const uint8_t *src, \
467  ptrdiff_t stride, \
468  int rnd) \
469 { \
470  put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
471 }\
472 static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
473  const uint8_t *src, \
474  ptrdiff_t stride, \
475  int rnd) \
476 { \
477  avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
478 }
479 
480 DECLARE_FUNCTION(0, 1)
481 DECLARE_FUNCTION(0, 2)
482 DECLARE_FUNCTION(0, 3)
483 
484 DECLARE_FUNCTION(1, 0)
485 DECLARE_FUNCTION(1, 1)
486 DECLARE_FUNCTION(1, 2)
487 DECLARE_FUNCTION(1, 3)
488 
489 DECLARE_FUNCTION(2, 0)
490 DECLARE_FUNCTION(2, 1)
491 DECLARE_FUNCTION(2, 2)
492 DECLARE_FUNCTION(2, 3)
493 
494 DECLARE_FUNCTION(3, 0)
495 DECLARE_FUNCTION(3, 1)
496 DECLARE_FUNCTION(3, 2)
497 DECLARE_FUNCTION(3, 3)
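
#if 0
/* Sketch of how the wrappers declared above are indexed: judging from
 * ff_vc1dsp_init_mmx() below, put_vc1_mspel_pixels_tab[] is laid out as
 * 4*vmode + hmode, so DECLARE_FUNCTION(a, b) fills slot 4*b + a; slot 0
 * (no filtering in either direction) is a plain copy. */
static int vc1_mspel_tab_index(int hmode, int vmode)
{
    return 4 * vmode + hmode;
}
#endif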
498 
499 static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
500  int16_t *block)
501 {
502  int dc = block[0];
503  dc = (17 * dc + 4) >> 3;
504  dc = (17 * dc + 64) >> 7;
505  __asm__ volatile(
506  "movd %0, %%mm0 \n\t"
507  "pshufw $0, %%mm0, %%mm0 \n\t"
508  "pxor %%mm1, %%mm1 \n\t"
509  "psubw %%mm0, %%mm1 \n\t"
510  "packuswb %%mm0, %%mm0 \n\t"
511  "packuswb %%mm1, %%mm1 \n\t"
512  ::"r"(dc)
513  );
514  __asm__ volatile(
515  "movd %0, %%mm2 \n\t"
516  "movd %1, %%mm3 \n\t"
517  "movd %2, %%mm4 \n\t"
518  "movd %3, %%mm5 \n\t"
519  "paddusb %%mm0, %%mm2 \n\t"
520  "paddusb %%mm0, %%mm3 \n\t"
521  "paddusb %%mm0, %%mm4 \n\t"
522  "paddusb %%mm0, %%mm5 \n\t"
523  "psubusb %%mm1, %%mm2 \n\t"
524  "psubusb %%mm1, %%mm3 \n\t"
525  "psubusb %%mm1, %%mm4 \n\t"
526  "psubusb %%mm1, %%mm5 \n\t"
527  "movd %%mm2, %0 \n\t"
528  "movd %%mm3, %1 \n\t"
529  "movd %%mm4, %2 \n\t"
530  "movd %%mm5, %3 \n\t"
531  :"+m"(*(uint32_t*)(dest+0*linesize)),
532  "+m"(*(uint32_t*)(dest+1*linesize)),
533  "+m"(*(uint32_t*)(dest+2*linesize)),
534  "+m"(*(uint32_t*)(dest+3*linesize))
535  );
536 }
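
#if 0
/* Illustrative scalar equivalent of vc1_inv_trans_4x4_dc_mmxext above:
 * for a DC-only block the inverse transform collapses to rescaling the
 * DC coefficient and adding it to every pixel with clamping, which is
 * what the paddusb/psubusb pair implements for positive/negative dc. */
static void vc1_inv_trans_4x4_dc_c(uint8_t *dest, int linesize,
                                   int16_t *block)
{
    int i, j;
    int dc = block[0];

    dc = (17 * dc +  4) >> 3;   /* 1D row transform of a DC-only block */
    dc = (17 * dc + 64) >> 7;   /* 1D column transform */
    for (i = 0; i < 4; i++) {
        for (j = 0; j < 4; j++)
            dest[j] = av_clip_uint8(dest[j] + dc);
        dest += linesize;
    }
}
#endif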
537 
538 static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
539  int16_t *block)
540 {
541  int dc = block[0];
542  dc = (17 * dc + 4) >> 3;
543  dc = (12 * dc + 64) >> 7;
544  __asm__ volatile(
545  "movd %0, %%mm0 \n\t"
546  "pshufw $0, %%mm0, %%mm0 \n\t"
547  "pxor %%mm1, %%mm1 \n\t"
548  "psubw %%mm0, %%mm1 \n\t"
549  "packuswb %%mm0, %%mm0 \n\t"
550  "packuswb %%mm1, %%mm1 \n\t"
551  ::"r"(dc)
552  );
553  __asm__ volatile(
554  "movd %0, %%mm2 \n\t"
555  "movd %1, %%mm3 \n\t"
556  "movd %2, %%mm4 \n\t"
557  "movd %3, %%mm5 \n\t"
558  "paddusb %%mm0, %%mm2 \n\t"
559  "paddusb %%mm0, %%mm3 \n\t"
560  "paddusb %%mm0, %%mm4 \n\t"
561  "paddusb %%mm0, %%mm5 \n\t"
562  "psubusb %%mm1, %%mm2 \n\t"
563  "psubusb %%mm1, %%mm3 \n\t"
564  "psubusb %%mm1, %%mm4 \n\t"
565  "psubusb %%mm1, %%mm5 \n\t"
566  "movd %%mm2, %0 \n\t"
567  "movd %%mm3, %1 \n\t"
568  "movd %%mm4, %2 \n\t"
569  "movd %%mm5, %3 \n\t"
570  :"+m"(*(uint32_t*)(dest+0*linesize)),
571  "+m"(*(uint32_t*)(dest+1*linesize)),
572  "+m"(*(uint32_t*)(dest+2*linesize)),
573  "+m"(*(uint32_t*)(dest+3*linesize))
574  );
575  dest += 4*linesize;
576  __asm__ volatile(
577  "movd %0, %%mm2 \n\t"
578  "movd %1, %%mm3 \n\t"
579  "movd %2, %%mm4 \n\t"
580  "movd %3, %%mm5 \n\t"
581  "paddusb %%mm0, %%mm2 \n\t"
582  "paddusb %%mm0, %%mm3 \n\t"
583  "paddusb %%mm0, %%mm4 \n\t"
584  "paddusb %%mm0, %%mm5 \n\t"
585  "psubusb %%mm1, %%mm2 \n\t"
586  "psubusb %%mm1, %%mm3 \n\t"
587  "psubusb %%mm1, %%mm4 \n\t"
588  "psubusb %%mm1, %%mm5 \n\t"
589  "movd %%mm2, %0 \n\t"
590  "movd %%mm3, %1 \n\t"
591  "movd %%mm4, %2 \n\t"
592  "movd %%mm5, %3 \n\t"
593  :"+m"(*(uint32_t*)(dest+0*linesize)),
594  "+m"(*(uint32_t*)(dest+1*linesize)),
595  "+m"(*(uint32_t*)(dest+2*linesize)),
596  "+m"(*(uint32_t*)(dest+3*linesize))
597  );
598 }
599 
600 static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
601  int16_t *block)
602 {
603  int dc = block[0];
604  dc = ( 3 * dc + 1) >> 1;
605  dc = (17 * dc + 64) >> 7;
606  __asm__ volatile(
607  "movd %0, %%mm0 \n\t"
608  "pshufw $0, %%mm0, %%mm0 \n\t"
609  "pxor %%mm1, %%mm1 \n\t"
610  "psubw %%mm0, %%mm1 \n\t"
611  "packuswb %%mm0, %%mm0 \n\t"
612  "packuswb %%mm1, %%mm1 \n\t"
613  ::"r"(dc)
614  );
615  __asm__ volatile(
616  "movq %0, %%mm2 \n\t"
617  "movq %1, %%mm3 \n\t"
618  "movq %2, %%mm4 \n\t"
619  "movq %3, %%mm5 \n\t"
620  "paddusb %%mm0, %%mm2 \n\t"
621  "paddusb %%mm0, %%mm3 \n\t"
622  "paddusb %%mm0, %%mm4 \n\t"
623  "paddusb %%mm0, %%mm5 \n\t"
624  "psubusb %%mm1, %%mm2 \n\t"
625  "psubusb %%mm1, %%mm3 \n\t"
626  "psubusb %%mm1, %%mm4 \n\t"
627  "psubusb %%mm1, %%mm5 \n\t"
628  "movq %%mm2, %0 \n\t"
629  "movq %%mm3, %1 \n\t"
630  "movq %%mm4, %2 \n\t"
631  "movq %%mm5, %3 \n\t"
632  :"+m"(*(uint32_t*)(dest+0*linesize)),
633  "+m"(*(uint32_t*)(dest+1*linesize)),
634  "+m"(*(uint32_t*)(dest+2*linesize)),
635  "+m"(*(uint32_t*)(dest+3*linesize))
636  );
637 }
638 
639 static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
640  int16_t *block)
641 {
642  int dc = block[0];
643  dc = (3 * dc + 1) >> 1;
644  dc = (3 * dc + 16) >> 5;
645  __asm__ volatile(
646  "movd %0, %%mm0 \n\t"
647  "pshufw $0, %%mm0, %%mm0 \n\t"
648  "pxor %%mm1, %%mm1 \n\t"
649  "psubw %%mm0, %%mm1 \n\t"
650  "packuswb %%mm0, %%mm0 \n\t"
651  "packuswb %%mm1, %%mm1 \n\t"
652  ::"r"(dc)
653  );
654  __asm__ volatile(
655  "movq %0, %%mm2 \n\t"
656  "movq %1, %%mm3 \n\t"
657  "movq %2, %%mm4 \n\t"
658  "movq %3, %%mm5 \n\t"
659  "paddusb %%mm0, %%mm2 \n\t"
660  "paddusb %%mm0, %%mm3 \n\t"
661  "paddusb %%mm0, %%mm4 \n\t"
662  "paddusb %%mm0, %%mm5 \n\t"
663  "psubusb %%mm1, %%mm2 \n\t"
664  "psubusb %%mm1, %%mm3 \n\t"
665  "psubusb %%mm1, %%mm4 \n\t"
666  "psubusb %%mm1, %%mm5 \n\t"
667  "movq %%mm2, %0 \n\t"
668  "movq %%mm3, %1 \n\t"
669  "movq %%mm4, %2 \n\t"
670  "movq %%mm5, %3 \n\t"
671  :"+m"(*(uint32_t*)(dest+0*linesize)),
672  "+m"(*(uint32_t*)(dest+1*linesize)),
673  "+m"(*(uint32_t*)(dest+2*linesize)),
674  "+m"(*(uint32_t*)(dest+3*linesize))
675  );
676  dest += 4*linesize;
677  __asm__ volatile(
678  "movq %0, %%mm2 \n\t"
679  "movq %1, %%mm3 \n\t"
680  "movq %2, %%mm4 \n\t"
681  "movq %3, %%mm5 \n\t"
682  "paddusb %%mm0, %%mm2 \n\t"
683  "paddusb %%mm0, %%mm3 \n\t"
684  "paddusb %%mm0, %%mm4 \n\t"
685  "paddusb %%mm0, %%mm5 \n\t"
686  "psubusb %%mm1, %%mm2 \n\t"
687  "psubusb %%mm1, %%mm3 \n\t"
688  "psubusb %%mm1, %%mm4 \n\t"
689  "psubusb %%mm1, %%mm5 \n\t"
690  "movq %%mm2, %0 \n\t"
691  "movq %%mm3, %1 \n\t"
692  "movq %%mm4, %2 \n\t"
693  "movq %%mm5, %3 \n\t"
694  :"+m"(*(uint32_t*)(dest+0*linesize)),
695  "+m"(*(uint32_t*)(dest+1*linesize)),
696  "+m"(*(uint32_t*)(dest+2*linesize)),
697  "+m"(*(uint32_t*)(dest+3*linesize))
698  );
699 }
700 
701 av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
702 {
703  dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
704  dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
705  dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
706  dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
707 
708  dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
709  dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
710  dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
711  dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
712 
713  dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
714  dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
715  dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
716  dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
717 
718  dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
719  dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
720  dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
721  dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
722 }
723 
724 av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
725 {
726  dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
727  dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
728  dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;
729 
730  dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext;
731  dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext;
732  dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext;
733  dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext;
734 
735  dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext;
736  dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext;
737  dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext;
738  dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext;
739 
740  dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext;
741  dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext;
742  dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext;
743  dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext;
744 
745  dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
746  dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
747  dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
748  dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
749 }
750 #endif /* HAVE_INLINE_ASM */