/*
 * AltiVec-enhanced yuv2yuvX
 *
 * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
 * based on the equivalent C code in swscale.c
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <inttypes.h>

#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "yuv2rgb_altivec.h"

#define vzero vec_splat_s32(0)

#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {     \
        vector signed short l2  = vec_ld(((x) << 1) + 16, src); \
        vector signed short ls  = vec_perm(l1, l2, perm);       \
        vector signed int   i1  = vec_mule(filter, ls);         \
        vector signed int   i2  = vec_mulo(filter, ls);         \
        vector signed int   vf1 = vec_mergeh(i1, i2);           \
        vector signed int   vf2 = vec_mergel(i1, i2);           \
        d1 = vec_add(d1, vf1);                                  \
        d2 = vec_add(d2, vf2);                                  \
        l1 = l2;                                                \
    } while (0)

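/* For reference, one invocation of the macro above accumulates, per output
 * pixel k, the following (a scalar sketch, assuming `filter` has already
 * been splatted so every lane holds the same coefficient filter_j):
 *
 *     for (k = 0; k < 8; k++) {
 *         int32_t p = (int32_t)filter_j * src[(x) + k];   // 16x16 -> 32 bit
 *         if (k < 4) d1[k]     += p;
 *         else       d2[k - 4] += p;
 *     }
 *
 * vec_mule()/vec_mulo() produce the even/odd products, and the two merges
 * restore element order, so d1 ends up with pixels 0..3 and d2 with 4..7.
 */
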
static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
                                  const int16_t **src, uint8_t *dest,
                                  const uint8_t *dither, int offset, int x)
{
    register int i, j;
    DECLARE_ALIGNED(16, int, val)[16];
    vector signed int vo1, vo2, vo3, vo4;
    vector unsigned short vs1, vs2;
    vector unsigned char vf;
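    /* vec_splat_u32() only accepts a 5-bit literal (-16..15), so the
     * 19-bit shift count has to be built as 10 + 9. */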
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));

    for (i = 0; i < 16; i++)
        val[i] = dither[(x + i + offset) & 7] << 12;

    vo1 = vec_ld(0,  val);
    vo2 = vec_ld(16, val);
    vo3 = vec_ld(32, val);
    vo4 = vec_ld(48, val);

    for (j = 0; j < filterSize; j++) {
        vector signed short l1, vLumFilter = vec_ld(j << 1, filter);
        vector unsigned char perm, perm0 = vec_lvsl(j << 1, filter);
        vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0);
        vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] replicated into all 8 lanes

        perm = vec_lvsl(x << 1, src[j]);
        l1   = vec_ld(x << 1, src[j]);

        yuv2planeX_8(vo1, vo2, l1, src[j], x,     perm, vLumFilter);
        yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
    }

    vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
    vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
    vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
    vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
    vs1 = vec_packsu(vo1, vo2);
    vs2 = vec_packsu(vo3, vo4);
    vf  = vec_packsu(vs1, vs2);
    vec_st(vf, 0, dest);
}

static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
                                const int16_t **src, uint8_t *dest, int dstW,
                                const uint8_t *dither, int offset, int x)
{
    int i, j;

    for (i = x; i < dstW; i++) {
        int t = dither[(i + offset) & 7] << 12;
        for (j = 0; j < filterSize; j++)
            t += src[j][i] * filter[j];
        dest[i] = av_clip_uint8(t >> 19);
    }
}
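
/* In both the scalar fallback above and the AltiVec path, the >> 19 drops
 * the 12 fractional bits of the vertical filter coefficients plus the 7
 * fractional bits the horizontal scaler leaves in each int16_t sample
 * (the val >> 7 in hScale_altivec_real() below). */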

static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
                               const int16_t **src, uint8_t *dest, int dstW,
                               const uint8_t *dither, int offset)
{
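    /* -(uintptr_t)dest & 15 is the number of bytes up to the next 16-byte
     * boundary (0 if dest is already aligned): the unaligned head is done
     * in C, the aligned middle in 16-pixel vector steps, the tail in C. */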
    int dst_u = -(uintptr_t)dest & 15;
    int i;

    yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);

    for (i = dst_u; i < dstW - 15; i += 16)
        yuv2planeX_16_altivec(filter, filterSize, src, dest + i, dither,
                              offset, i);

    yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
}

static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW,
                                const uint8_t *src, const int16_t *filter,
                                const int32_t *filterPos, int filterSize)
{
    register int i;
    DECLARE_ALIGNED(16, int, tempo)[4];

    if (filterSize % 4) {
        for (i = 0; i < dstW; i++) {
            register int j;
            register int srcPos = filterPos[i];
            register int val    = 0;
            for (j = 0; j < filterSize; j++)
                val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
            dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
        }
    } else
        switch (filterSize) {
        case 4:
            for (i = 0; i < dstW; i++) {
                register int srcPos = filterPos[i];

                vector unsigned char src_v0 = vec_ld(srcPos, src);
                vector unsigned char src_v1, src_vF;
                vector signed short src_v, filter_v;
                vector signed int val_vEven, val_s;
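                /* Only the first 4 bytes of src_vF reach the multiply below,
                 * so the second 16-byte block is loaded only when those bytes
                 * straddle a boundary (misalignment of src + srcPos above 12);
                 * otherwise src_v1 stays uninitialized and its bytes never
                 * affect the result. */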
                if ((((uintptr_t)src + srcPos) % 16) > 12) {
                    src_v1 = vec_ld(srcPos + 16, src);
                }
                src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

                src_v = // vec_unpackh sign-extends...
                    (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
                // now put our elements in the even slots
                src_v = vec_mergeh(src_v, (vector signed short)vzero);

                filter_v = vec_ld(i << 3, filter);
                // The 3 above is 2 (log2 of filterSize == 4) + 1 (log2 of sizeof(int16_t) == 2).

                // The neat trick: We only care about half the elements,
                // high or low depending on (i << 3) % 16 (it's 0 or 8 here),
                // and we're going to use vec_mule, so we choose
                // carefully how to "unpack" the elements into the even slots.
                if ((i << 3) % 16)
                    filter_v = vec_mergel(filter_v, (vector signed short)vzero);
                else
                    filter_v = vec_mergeh(filter_v, (vector signed short)vzero);

                val_vEven = vec_mule(src_v, filter_v);
                val_s     = vec_sums(val_vEven, vzero);
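                // vec_sums() leaves the saturated total in element 3, hence tempo[3] below.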
                vec_st(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
            }
            break;

        case 8:
            for (i = 0; i < dstW; i++) {
                register int srcPos = filterPos[i];

                vector unsigned char src_v0 = vec_ld(srcPos, src);
                vector unsigned char src_v1, src_vF;
                vector signed short src_v, filter_v;
                vector signed int val_v, val_s;
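                /* As in case 4, but 8 source bytes are consumed, so the
                 * second block is needed once the misalignment exceeds 8. */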
                if ((((uintptr_t)src + srcPos) % 16) > 8) {
                    src_v1 = vec_ld(srcPos + 16, src);
                }
                src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

                src_v = // vec_unpackh sign-extends...
                    (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
                filter_v = vec_ld(i << 4, filter);
                // The 4 above is 3 (log2 of filterSize == 8) + 1 (log2 of sizeof(int16_t) == 2).

                val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
                val_s = vec_sums(val_v, vzero);
                vec_st(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
            }
            break;

        case 16:
            for (i = 0; i < dstW; i++) {
                register int srcPos = filterPos[i];

                vector unsigned char src_v0 = vec_ld(srcPos, src);
                vector unsigned char src_v1 = vec_ld(srcPos + 16, src);
                vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

                vector signed short src_vA = // vec_unpackh sign-extends...
                    (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
                vector signed short src_vB = // vec_unpackh sign-extends...
                    (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));

                vector signed short filter_v0 = vec_ld(i << 5, filter);
                vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
                // The 5 above is 4 (log2 of filterSize == 16) + 1 (log2 of sizeof(int16_t) == 2).

                vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
                vector signed int val_v   = vec_msums(src_vB, filter_v1, val_acc);

                vector signed int val_s = vec_sums(val_v, vzero);

                vec_st(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
            }
            break;

        default:
            for (i = 0; i < dstW; i++) {
                register int j;
                register int srcPos = filterPos[i];

                vector signed int val_s, val_v = (vector signed int)vzero;
                vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter);
                vector unsigned char permF     = vec_lvsl((i * 2 * filterSize), filter);

                vector unsigned char src_v0 = vec_ld(srcPos, src);
                vector unsigned char permS  = vec_lvsl(srcPos, src);

                for (j = 0; j < filterSize - 15; j += 16) {
                    vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src);
                    vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);

                    vector signed short src_vA = // vec_unpackh sign-extends...
                        (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
                    vector signed short src_vB = // vec_unpackh sign-extends...
                        (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));

                    vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
                    vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter);
                    vector signed short filter_v0  = vec_perm(filter_v0R, filter_v1R, permF);
                    vector signed short filter_v1  = vec_perm(filter_v1R, filter_v2R, permF);

                    vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
                    val_v = vec_msums(src_vB, filter_v1, val_acc);

                    filter_v0R = filter_v2R;
                    src_v0     = src_v1;
                }

                if (j < filterSize - 7) {
                    // src_v0 is already loaded above, no need to reload it:
                    // vector unsigned char src_v0 = vec_ld(srcPos + j, src);
                    vector unsigned char src_v1, src_vF;
                    vector signed short src_v, filter_v1R, filter_v;
                    if ((((uintptr_t)src + srcPos) % 16) > 8) {
                        src_v1 = vec_ld(srcPos + j + 16, src);
                    }
                    src_vF = vec_perm(src_v0, src_v1, permS);

                    src_v = // vec_unpackh sign-extends...
                        (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
                    // filter_v0R is already loaded above, no need to reload it:
                    // vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter);
                    filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
                    filter_v   = vec_perm(filter_v0R, filter_v1R, permF);

                    val_v = vec_msums(src_v, filter_v, val_v);
                }

                val_s = vec_sums(val_v, vzero);

                vec_st(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
            }
        }
}

av_cold void ff_sws_init_swScale_altivec(SwsContext *c)
{
    enum AVPixelFormat dstFormat = c->dstFormat;

    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
        return;

    if (c->srcBpc == 8 && c->dstBpc <= 14) {
        c->hyScale = c->hcScale = hScale_altivec_real;
    }
    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
        dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
        !c->alpPixBuf) {
        c->yuv2planeX = yuv2planeX_altivec;
    }

    /* The following list of supported dstFormat values should
     * match what's found in the body of ff_yuv2packedX_altivec() */
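    /* Presumably SWS_BITEXACT is excluded because this output is not
     * bit-identical to the C reference, and SWS_FULL_CHR_H_INT because
     * these packers do not do full-chroma horizontal interpolation. */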
    if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->alpPixBuf) {
        switch (c->dstFormat) {
        case AV_PIX_FMT_ABGR:
            c->yuv2packedX = ff_yuv2abgr_X_altivec;
            break;
        case AV_PIX_FMT_BGRA:
            c->yuv2packedX = ff_yuv2bgra_X_altivec;
            break;
        case AV_PIX_FMT_ARGB:
            c->yuv2packedX = ff_yuv2argb_X_altivec;
            break;
        case AV_PIX_FMT_RGBA:
            c->yuv2packedX = ff_yuv2rgba_X_altivec;
            break;
        case AV_PIX_FMT_BGR24:
            c->yuv2packedX = ff_yuv2bgr24_X_altivec;
            break;
        case AV_PIX_FMT_RGB24:
            c->yuv2packedX = ff_yuv2rgb24_X_altivec;
            break;
        }
    }
}