x86/vf_gradfun.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2009 Loren Merritt <lorenm@u.washington.edu>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/attributes.h"
22 #include "libavutil/cpu.h"
23 #include "libavutil/mem.h"
24 #include "libavutil/x86/asm.h"
25 #include "libavfilter/gradfun.h"
26 
27 #if HAVE_INLINE_ASM
28 
/* Eight 16-bit words of 0x7F, 16-byte aligned. The filter_line kernels load it
 * into mm6/xmm6 and subtract it from the scaled abs(delta) value. */
29 DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F};
/* Eight 16-bit words of 0xFF, 16-byte aligned. gradfun_blur_line_sse2 uses it as
 * a mask that keeps the low byte of every 16-bit word. */
30 DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
31 
32 #if HAVE_MMXEXT_INLINE
/**
 * Filter one line of pixels with MMXEXT inline assembly, four pixels per step.
 *
 * The trailing (width & 3) pixels are handed to ff_gradfun_filter_line_c();
 * the assembly loop covers the leading multiple-of-4 part. The loop body is
 * unrolled twice: the first half adds dithers[0..3], the second half adds
 * dithers[4..7].
 *
 * @param dst     output pixels
 * @param src     input pixels
 * @param dc      16-bit dc values, one per pair of pixels (each loaded word
 *                is duplicated with punpcklwd)
 * @param width   number of pixels to process
 * @param thresh  multiplier used in "abs(delta) * thresh >> 16"; only its low
 *                16 bits are broadcast to the lanes
 * @param dithers eight 16-bit dither values added to the scaled pixels
 *
 * NOTE(review): the first half of the loop runs before the counter is tested,
 * so when fewer than 4 pixels remain for the assembly part (width < 4) it
 * still reads and writes 4 bytes at src/dst -- confirm callers never pass
 * such widths or that the buffers are padded.
 */
33 static void gradfun_filter_line_mmxext(uint8_t *dst, const uint8_t *src, const uint16_t *dc,
34  int width, int thresh,
35  const uint16_t *dithers)
36 {
37  intptr_t x;
38  if (width & 3) { // pixels that do not fill a group of 4 go to the C version
39  x = width & ~3;
40  ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
41  width = x;
42  }
43  x = -width; // negative index counting up towards 0; the operands below are end pointers
44  __asm__ volatile(
45  "movd %4, %%mm5 \n" // mm5 = thresh
46  "pxor %%mm7, %%mm7 \n" // mm7 = 0
47  "pshufw $0, %%mm5, %%mm5 \n" // broadcast the low word of thresh to all 4 words
48  "movq %6, %%mm6 \n" // mm6 = pw_7f
49  "movq (%5), %%mm3 \n" // mm3 = dithers[0..3]
50  "movq 8(%5), %%mm4 \n" // mm4 = dithers[4..7]
51 
52  "1: \n"
53  "movd (%2,%0), %%mm0 \n" // 4 source bytes
54  "movd (%3,%0), %%mm1 \n" // 2 dc words
55  "punpcklbw %%mm7, %%mm0 \n" // widen the pixels to 16 bits
56  "punpcklwd %%mm1, %%mm1 \n" // duplicate each dc word for its 2 pixels
57  "psllw $7, %%mm0 \n" // pix <<= 7
58  "pxor %%mm2, %%mm2 \n"
59  "psubw %%mm0, %%mm1 \n" // delta = dc - pix
60  "psubw %%mm1, %%mm2 \n" // mm2 = -delta
61  "pmaxsw %%mm1, %%mm2 \n" // mm2 = max(delta, -delta) = abs(delta)
62  "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
63  "psubw %%mm6, %%mm2 \n"
64  "pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m)
65  "pmullw %%mm2, %%mm2 \n"
66  "paddw %%mm3, %%mm0 \n" // pix += dither
67  "psllw $2, %%mm1 \n" // m = m*m*delta >> 14
68  "pmulhw %%mm2, %%mm1 \n"
69  "paddw %%mm1, %%mm0 \n" // pix += m
70  "psraw $7, %%mm0 \n"
71  "packuswb %%mm0, %%mm0 \n"
72  "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
73  "add $4, %0 \n"
74  "jnl 2f \n" // leave the loop once x is no longer negative
75 
76  "movd (%2,%0), %%mm0 \n" // second half: same steps, using dithers[4..7]
77  "movd (%3,%0), %%mm1 \n"
78  "punpcklbw %%mm7, %%mm0 \n"
79  "punpcklwd %%mm1, %%mm1 \n"
80  "psllw $7, %%mm0 \n"
81  "pxor %%mm2, %%mm2 \n"
82  "psubw %%mm0, %%mm1 \n" // delta = dc - pix
83  "psubw %%mm1, %%mm2 \n"
84  "pmaxsw %%mm1, %%mm2 \n"
85  "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
86  "psubw %%mm6, %%mm2 \n"
87  "pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m)
88  "pmullw %%mm2, %%mm2 \n"
89  "paddw %%mm4, %%mm0 \n" // pix += dither
90  "psllw $2, %%mm1 \n" // m = m*m*delta >> 14
91  "pmulhw %%mm2, %%mm1 \n"
92  "paddw %%mm1, %%mm0 \n" // pix += m
93  "psraw $7, %%mm0 \n"
94  "packuswb %%mm0, %%mm0 \n"
95  "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
96  "add $4, %0 \n"
97  "jl 1b \n" // repeat while x is still negative
98 
99  "2: \n"
100  "emms \n" // empty the MMX state before returning
101  :"+r"(x)
102  :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
103  "rm"(thresh), "r"(dithers), "m"(*pw_7f)
104  :"memory"
105  );
106 }
107 #endif
108 
109 #if HAVE_SSSE3_INLINE
/**
 * Filter one line of pixels with SSSE3 inline assembly, eight pixels per step.
 *
 * The trailing (width & 7) pixels are handed to ff_gradfun_filter_line_c();
 * the assembly loop covers the leading multiple-of-8 part. Compared with the
 * MMXEXT version, abs(delta) is computed with a single pabsw and all eight
 * dither words are held in one register.
 *
 * @param dst     output pixels
 * @param src     input pixels
 * @param dc      16-bit dc values, one per pair of pixels (each loaded word
 *                is duplicated with punpcklwd)
 * @param width   number of pixels to process
 * @param thresh  multiplier used in "abs(delta) * thresh >> 16"; only its low
 *                16 bits are broadcast to the lanes
 * @param dithers eight 16-bit dither values; loaded with movdqa, which
 *                requires a 16-byte aligned address
 *
 * NOTE(review): the loop body runs before the counter is tested, so when
 * fewer than 8 pixels remain for the assembly part (width < 8) it still reads
 * and writes 8 bytes at src/dst -- confirm callers never pass such widths or
 * that the buffers are padded.
 */
110 static void gradfun_filter_line_ssse3(uint8_t *dst, const uint8_t *src, const uint16_t *dc, int width, int thresh, const uint16_t *dithers)
111 {
112  intptr_t x;
113  if (width & 7) {
114  // could be 10% faster if I somehow eliminated this
115  x = width & ~7;
116  ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
117  width = x;
118  }
119  x = -width; // negative index counting up towards 0; the operands below are end pointers
120  __asm__ volatile(
121  "movd %4, %%xmm5 \n" // xmm5 = thresh
122  "pxor %%xmm7, %%xmm7 \n" // xmm7 = 0
123  "pshuflw $0,%%xmm5, %%xmm5 \n" // broadcast the low word of thresh to the low 4 words
124  "movdqa %6, %%xmm6 \n" // xmm6 = pw_7f
125  "punpcklqdq %%xmm5, %%xmm5 \n" // copy the low 4 words to the high half
126  "movdqa %5, %%xmm4 \n" // xmm4 = dithers[0..7]
127  "1: \n"
128  "movq (%2,%0), %%xmm0 \n" // 8 source bytes
129  "movq (%3,%0), %%xmm1 \n" // 4 dc words
130  "punpcklbw %%xmm7, %%xmm0 \n" // widen the pixels to 16 bits
131  "punpcklwd %%xmm1, %%xmm1 \n" // duplicate each dc word for its 2 pixels
132  "psllw $7, %%xmm0 \n" // pix <<= 7
133  "psubw %%xmm0, %%xmm1 \n" // delta = dc - pix
134  "pabsw %%xmm1, %%xmm2 \n" // xmm2 = abs(delta)
135  "pmulhuw %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16
136  "psubw %%xmm6, %%xmm2 \n"
137  "pminsw %%xmm7, %%xmm2 \n" // m = -max(0, 127-m)
138  "pmullw %%xmm2, %%xmm2 \n"
139  "psllw $2, %%xmm1 \n"
140  "paddw %%xmm4, %%xmm0 \n" // pix += dither
141  "pmulhw %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14
142  "paddw %%xmm1, %%xmm0 \n" // pix += m
143  "psraw $7, %%xmm0 \n"
144  "packuswb %%xmm0, %%xmm0 \n"
145  "movq %%xmm0, (%1,%0) \n" // dst = clip(pix>>7)
146  "add $8, %0 \n"
147  "jl 1b \n" // repeat while x is still negative
148  :"+&r"(x)
149  :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
150  "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
151  :"memory"
152  );
153 }
154 #endif /* HAVE_SSSE3_INLINE */
155 
156 #if HAVE_SSE2_INLINE
/**
 * Update one line of the running blur sums with SSE2 inline assembly.
 *
 * For every 16-bit output entry i the loop computes
 *     sum    = 2x2 pixel sum (two adjacent bytes of src and of the row
 *              src_linesize bytes further on) + buf1[i]
 *     dc[i]  = sum - buf[i]      (buf[i] is read before it is overwritten)
 *     buf[i] = sum
 * Eight entries (16 bytes) are handled per iteration.
 *
 * @param dc           output: difference between the new sum and the old buf value
 * @param buf          in/out: old sums are read, new sums are stored
 * @param buf1         input sums added to the 2x2 pixel sum
 * @param src          first of the two source rows; 2 * width bytes are consumed per row
 * @param src_linesize byte distance from src to the second source row
 * @param width        number of 16-bit entries to produce
 *
 * buf, buf1 and dc are accessed with movdqa / a memory-operand paddw, which
 * require 16-byte aligned addresses. src is read with movdqu when src or
 * src_linesize is not a multiple of 16, and with movdqa otherwise.
 *
 * NOTE(review): the loop advances 8 entries at a time with no tail handling
 * and runs at least once, so it touches entries up to the next multiple of 8
 * -- presumably the buffers are padded accordingly; confirm against callers.
 */
157 static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, const uint16_t *buf1, const uint8_t *src, int src_linesize, int width)
158 {
/* BLURV(load): the whole kernel, parameterised on the instruction used to load
 * the two source rows. x is a negative byte offset counting up towards 0; all
 * pointer operands are end pointers. xmm7 holds pw_ff, which splits each source
 * word into its low byte (pand) and high byte (psrlw $8). */
159 #define BLURV(load)\
160  intptr_t x = -2*width;\
161  __asm__ volatile(\
162  "movdqa %6, %%xmm7 \n"\
163  "1: \n"\
164  load" (%4,%0), %%xmm0 \n"\
165  load" (%5,%0), %%xmm1 \n"\
166  "movdqa %%xmm0, %%xmm2 \n"\
167  "movdqa %%xmm1, %%xmm3 \n"\
168  "psrlw $8, %%xmm0 \n"\
169  "psrlw $8, %%xmm1 \n"\
170  "pand %%xmm7, %%xmm2 \n"\
171  "pand %%xmm7, %%xmm3 \n"\
172  "paddw %%xmm1, %%xmm0 \n"\
173  "paddw %%xmm3, %%xmm2 \n"\
174  "paddw %%xmm2, %%xmm0 \n"\
175  "paddw (%2,%0), %%xmm0 \n"\
176  "movdqa (%1,%0), %%xmm1 \n"\
177  "movdqa %%xmm0, (%1,%0) \n"\
178  "psubw %%xmm1, %%xmm0 \n"\
179  "movdqa %%xmm0, (%3,%0) \n"\
180  "add $16, %0 \n"\
181  "jl 1b \n"\
182  :"+&r"(x)\
183  :"r"(buf+width),\
184  "r"(buf1+width),\
185  "r"(dc+width),\
186  "r"(src+width*2),\
187  "r"(src+width*2+src_linesize),\
188  "m"(*pw_ff)\
189  :"memory"\
190  );
191  if (((intptr_t) src | src_linesize) & 15) { // a source row is not 16-byte aligned: use unaligned loads
192  BLURV("movdqu");
193  } else {
194  BLURV("movdqa");
195  }
196 }
197 #endif /* HAVE_SSE2_INLINE */
198 
199 #endif /* HAVE_INLINE_ASM */
200 
/**
 * Install the x86 inline-assembly kernels into a gradfun context.
 *
 * Each kernel is selected only if it was compiled in (HAVE_*_INLINE) and the
 * matching flag is reported by av_get_cpu_flags(). Fields of gf are left
 * untouched when no matching kernel is available.
 *
 * @param gf context whose filter_line / blur_line function pointers are set
 */
201 av_cold void ff_gradfun_init_x86(GradFunContext *gf)
202 {
203  int cpu_flags = av_get_cpu_flags();
204 
205 #if HAVE_MMXEXT_INLINE
206  if (cpu_flags & AV_CPU_FLAG_MMXEXT)
207  gf->filter_line = gradfun_filter_line_mmxext;
208 #endif
209 #if HAVE_SSSE3_INLINE
210  if (cpu_flags & AV_CPU_FLAG_SSSE3) // checked after MMXEXT, so it takes precedence when both are set
211  gf->filter_line = gradfun_filter_line_ssse3;
212 #endif
213 #if HAVE_SSE2_INLINE
214  if (cpu_flags & AV_CPU_FLAG_SSE2)
215  gf->blur_line = gradfun_blur_line_sse2;
216 #endif
217 }
memory handling functions
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:59
Holds instance-specific information for gradfun.
Definition: gradfun.h:28
Macro definitions for various function/variable attributes.
uint8_t
#define av_cold
Definition: attributes.h:78
#define AV_CPU_FLAG_MMXEXT
SSE integer functions or AMD MMX ext.
Definition: cpu.h:30
Discrete Time axis x
#define AV_CPU_FLAG_SSSE3
Conroe SSSE3 functions.
Definition: cpu.h:39
static int cpu_flags
Definition: dct-test.c:77
void ff_gradfun_filter_line_c(uint8_t *dst, const uint8_t *src, const uint16_t *dc, int width, int thresh, const uint16_t *dithers)
Definition: vf_gradfun.c:58
static int width
Definition: tests/utils.c:158
AVS_Value src
Definition: avisynth_c.h:523
FIXME Range Coding of cr are mx and my are Motion Vector top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff)*mv_scale Intra DC Prediction block[y][x] dc[1]
Definition: snow.txt:392
void * buf
Definition: avisynth_c.h:594
av_cold void ff_gradfun_init_x86(GradFunContext *gf)
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:30
void(* filter_line)(uint8_t *dst, const uint8_t *src, const uint16_t *dc, int width, int thresh, const uint16_t *dithers)
DSP functions.
Definition: gradfun.h:38
const uint8_t dithers[8][8][8]
void(* blur_line)(uint16_t *dc, uint16_t *buf, const uint16_t *buf1, const uint8_t *src, int src_linesize, int width)
Definition: gradfun.h:39
else dst[i][x+y *dst_stride[i]]
Definition: vf_mcdeint.c:160
#define AV_CPU_FLAG_SSE2
PIV SSE2 functions.
Definition: cpu.h:34