yuv2rgb_altivec.c
Go to the documentation of this file.
1 /*
2  * AltiVec acceleration for colorspace conversion
3  *
4  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 /*
24  * Convert I420 YV12 to RGB in various formats,
25  * it rejects images that are not in 420 formats,
26  * it rejects images that don't have widths of multiples of 16,
27  * it rejects images that don't have heights of multiples of 2.
28  * Reject defers to C simulation code.
29  *
30  * Lots of optimizations to be done here.
31  *
32  * 1. Need to fix saturation code. I just couldn't get it to fly with packs
33  * and adds, so we currently use max/min to clip.
34  *
35  * 2. The inefficient use of chroma loading needs a bit of brushing up.
36  *
37  * 3. Analysis of pipeline stalls needs to be done. Use shark to identify
38  * pipeline stalls.
39  *
40  *
41  * MODIFIED to calculate coeffs from currently selected color space.
42  * MODIFIED core to be a macro where you specify the output format.
43  * ADDED UYVY conversion which is never called due to some thing in swscale.
44  * CORRECTED algorithm selection to be strict on input formats.
45  * ADDED runtime detection of AltiVec.
46  *
47  * ADDED altivec_yuv2packedX vertical scl + RGB converter
48  *
49  * March 27,2004
50  * PERFORMANCE ANALYSIS
51  *
52  * The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53  * used as test.
54  * The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55  * same sequence.
56  *
57  * 720 * 480 * 30 ~10MPS
58  *
59  * so we have roughly 10 clocks per pixel. This is too high, something has
60  * to be wrong.
61  *
62  * OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63  * need for vec_min.
64  *
65  * OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to
66  * have the input video frame, it was just decompressed so it probably resides
67  * in L1 caches. However, we are creating the output video stream. This needs
68  * to use the DSTST instruction to optimize for the cache. We couple this with
69  * the fact that we are not going to be visiting the input buffer again so we
70  * mark it Least Recently Used. This shaves 25% of the processor cycles off.
71  *
72  * Now memcpy is the largest mips consumer in the system, probably due
73  * to the inefficient X11 stuff.
74  *
75  * GL libraries seem to be very slow on this machine 1.33Ghz PB running
76  * Jaguar, this is not the case for my 1Ghz PB. I thought it might be
77  * a versioning issue, however I have libGL.1.2.dylib for both
78  * machines. (We need to figure this out now.)
79  *
80  * GL2 libraries work now with patch for RGB32.
81  *
82  * NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83  *
84  * Integrated luma prescaling adjustment for saturation/contrast/brightness
85  * adjustment.
86  */
87 
88 #include <stdio.h>
89 #include <stdlib.h>
90 #include <string.h>
91 #include <inttypes.h>
92 #include <assert.h>
93 
94 #include "config.h"
95 #include "libswscale/rgb2rgb.h"
96 #include "libswscale/swscale.h"
98 #include "libavutil/attributes.h"
99 #include "libavutil/cpu.h"
100 #include "libavutil/pixdesc.h"
101 #include "yuv2rgb_altivec.h"
102 
103 #undef PROFILE_THE_BEAST
104 #undef INC_SCALING
105 
/* Shorthand names for the 8-bit sample types used by the converters. */
typedef unsigned char ubyte; /* unsigned 8-bit sample */
typedef signed char sbyte;   /* signed 8-bit value */
108 
109 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
110  * homogeneous vector registers x0,x1,x2 are interleaved with the
111  * following technique:
112  *
113  * o0 = vec_mergeh(x0, x1);
114  * o1 = vec_perm(o0, x2, perm_rgb_0);
115  * o2 = vec_perm(o0, x2, perm_rgb_1);
116  * o3 = vec_mergel(x0, x1);
117  * o4 = vec_perm(o3, o2, perm_rgb_2);
118  * o5 = vec_perm(o3, o2, perm_rgb_3);
119  *
120  * perm_rgb_0: o0(RG).h v1(B) --> o1*
121  * 0 1 2 3 4
122  * rgbr|gbrg|brgb|rgbr
123  * 0010 0100 1001 0010
124  * 0102 3145 2673 894A
125  *
126  * perm_rgb_1: o0(RG).h v1(B) --> o2
127  * 0 1 2 3 4
128  * gbrg|brgb|bbbb|bbbb
129  * 0100 1001 1111 1111
130  * B5CD 6EF7 89AB CDEF
131  *
132  * perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
133  * 0 1 2 3 4
134  * gbrg|brgb|rgbr|gbrg
135  * 1111 1111 0010 0100
136  * 89AB CDEF 0182 3945
137  *
138  * perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5*
139  * 0 1 2 3 4
140  * brgb|rgbr|gbrg|brgb
141  * 1001 0010 0100 1001
142  * a67b 89cA BdCD eEFf
143  *
144  */
145 static const vector unsigned char
146  perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05,
147  0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a },
148  perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17,
149  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
150  perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
151  0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 },
152  perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a,
153  0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f };
154 
/*
 * Interleave three 16-byte planes into three vectors of packed triplets.
 *
 * Note the parameter order: the THIRD argument (x0) supplies the first byte
 * of every triplet, the second (x1) the middle byte and the first (x2) the
 * last byte. y0, y1, y2 receive output bytes 0-15, 16-31 and 32-47.
 *
 * The input arguments are expanded more than once, so they must be free of
 * side effects.
 */
#define vec_merge3(x2, x1, x0, y0, y1, y2)          \
    do {                                            \
        __typeof__(x0) o0, o2, o3;                  \
        o0   = vec_mergeh((x0), (x1));              \
        (y0) = vec_perm(o0, (x2), perm_rgb_0);      \
        o2   = vec_perm(o0, (x2), perm_rgb_1);      \
        o3   = vec_mergel((x0), (x1));              \
        (y1) = vec_perm(o3, o2, perm_rgb_2);        \
        (y2) = vec_perm(o3, o2, perm_rgb_3);        \
    } while (0)
165 
/*
 * Store 16 pixels as packed 24-bit triplets whose memory byte order is
 * x2, x1, x0 (called with (R, G, B) this yields B G R).
 *
 * Writes three vectors through ptr and leaves ptr advanced by three
 * elements; ptr must therefore be a modifiable lvalue.
 */
#define vec_mstbgr24(x0, x1, x2, ptr)                   \
    do {                                                \
        __typeof__(x0) _0, _1, _2;                      \
        vec_merge3((x0), (x1), (x2), _0, _1, _2);       \
        vec_st(_0, 0, (ptr)++);                         \
        vec_st(_1, 0, (ptr)++);                         \
        vec_st(_2, 0, (ptr)++);                         \
    } while (0)
174 
/*
 * Store 16 pixels as packed 24-bit triplets whose memory byte order is
 * x0, x1, x2 (called with (R, G, B) this yields R G B).
 *
 * Writes three vectors through ptr and leaves ptr advanced by three
 * elements; ptr must therefore be a modifiable lvalue.
 */
#define vec_mstrgb24(x0, x1, x2, ptr)                   \
    do {                                                \
        __typeof__(x0) _0, _1, _2;                      \
        vec_merge3((x2), (x1), (x0), _0, _1, _2);       \
        vec_st(_0, 0, (ptr)++);                         \
        vec_st(_1, 0, (ptr)++);                         \
        vec_st(_2, 0, (ptr)++);                         \
    } while (0)
183 
/*
 * Pack 16 pixels into a 32-bit-per-pixel format.
 *
 * The four planes x0..x3 (all of vector type T) are interleaved so that
 * pixel i occupies the four consecutive bytes x0[i] x1[i] x2[i] x3[i].
 * Four vectors (64 bytes) are stored at ptr, and ptr is then advanced by
 * four elements, so it must be a modifiable lvalue.
 *
 * x0..x3 are expanded twice each and must be free of side effects.
 */
#define vec_mstrgb32(T, x0, x1, x2, x3, ptr)                        \
    do {                                                            \
        T _0, _1, _2, _3;                                           \
        /* pixels 0-7 */                                            \
        _0 = vec_mergeh((x0), (x1));                                \
        _1 = vec_mergeh((x2), (x3));                                \
        _2 = (T) vec_mergeh((vector unsigned short) _0,             \
                            (vector unsigned short) _1);            \
        _3 = (T) vec_mergel((vector unsigned short) _0,             \
                            (vector unsigned short) _1);            \
        vec_st(_2, 0 * 16, (T *) (ptr));                            \
        vec_st(_3, 1 * 16, (T *) (ptr));                            \
        /* pixels 8-15 */                                           \
        _0 = vec_mergel((x0), (x1));                                \
        _1 = vec_mergel((x2), (x3));                                \
        _2 = (T) vec_mergeh((vector unsigned short) _0,             \
                            (vector unsigned short) _1);            \
        _3 = (T) vec_mergel((vector unsigned short) _0,             \
                            (vector unsigned short) _1);            \
        vec_st(_2, 2 * 16, (T *) (ptr));                            \
        vec_st(_3, 3 * 16, (T *) (ptr));                            \
        (ptr) += 4;                                                 \
    } while (0)
209 
210 /*
211  * 1 0 1.4021 | | Y |
212  * 1 -0.3441 -0.7142 |x| Cb|
213  * 1 1.7718 0 | | Cr|
214  *
215  *
216  * Y: [-128 127]
217  * Cb/Cr : [-128 127]
218  *
219  * typical YUV conversion works on Y: 0-255 this version has been
220  * optimized for JPEG decoding.
221  */
222 
/*
 * Widen bytes 0-7 of x to eight 16-bit lanes: each output lane is a zero
 * byte (selector 0x10, taken from the all-zero second operand) followed by
 * one byte of x.
 */
#define vec_unh(x)                                                  \
    ((vector signed short)                                          \
     vec_perm((x), (__typeof__(x)) { 0 },                           \
              ((vector unsigned char) {                             \
                  0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03,   \
                  0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 })))
229 
/*
 * Widen bytes 8-15 of x to eight 16-bit lanes: each output lane is a zero
 * byte (selector 0x10, taken from the all-zero second operand) followed by
 * one byte of x.
 */
#define vec_unl(x)                                                  \
    ((vector signed short)                                          \
     vec_perm((x), (__typeof__(x)) { 0 },                           \
              ((vector unsigned char) {                             \
                  0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B,   \
                  0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F })))
236 
/* Clamp every signed 16-bit lane of x to the range [16, 235]. */
#define vec_clip_s16(x)                                             \
    (vec_max(vec_min((x), ((vector signed short) {                  \
                         235, 235, 235, 235, 235, 235, 235, 235 })),\
             ((vector signed short) { 16, 16, 16, 16, 16, 16, 16, 16 })))
241 
/*
 * Pack two vectors of signed 16-bit lanes into one vector of 16 unsigned
 * bytes. Negative lanes are first raised to 0 with vec_max; the saturating
 * unsigned pack (vec_packs) then handles the upper bound.
 */
#define vec_packclp(x, y)                                           \
    ((vector unsigned char)                                         \
     vec_packs((vector unsigned short)                              \
               vec_max((x), ((vector signed short) { 0 })),         \
               (vector unsigned short)                              \
               vec_max((y), ((vector signed short) { 0 }))))
248 
249 //#define out_pixels(a, b, c, ptr) vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), a, a, a, ptr)
250 
251 static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y,
252  vector signed short U, vector signed short V,
253  vector signed short *R, vector signed short *G,
254  vector signed short *B)
255 {
256  vector signed short vx, ux, uvx;
257 
258  Y = vec_mradds(Y, c->CY, c->OY);
259  U = vec_sub(U, (vector signed short)
260  vec_splat((vector signed short) { 128 }, 0));
261  V = vec_sub(V, (vector signed short)
262  vec_splat((vector signed short) { 128 }, 0));
263 
264  // ux = (CBU * (u << c->CSHIFT) + 0x4000) >> 15;
265  ux = vec_sl(U, c->CSHIFT);
266  *B = vec_mradds(ux, c->CBU, Y);
267 
268  // vx = (CRV * (v << c->CSHIFT) + 0x4000) >> 15;
269  vx = vec_sl(V, c->CSHIFT);
270  *R = vec_mradds(vx, c->CRV, Y);
271 
272  // uvx = ((CGU * u) + (CGV * v)) >> 15;
273  uvx = vec_mradds(U, c->CGU, Y);
274  *G = vec_mradds(V, c->CGV, uvx);
275 }
276 
277 /*
278  * ------------------------------------------------------------------------------
279  * CS converters
280  * ------------------------------------------------------------------------------
281  */
282 
/*
 * Define altivec_<name>(), a slice converter from planar YUV with 2x2
 * chroma subsampling to the packed format written by out_pixels().
 *
 * Each outer iteration converts two luma rows that share one chroma row;
 * each inner iteration converts 16 pixels of both rows from 16 luma bytes
 * per row and 8 U / 8 V bytes. Widths are processed in whole groups of 16
 * (w / 16) and heights in whole row pairs (h / 2). Output rows start at
 * oplanes[0] + outstrides[0] * (srcSliceY + 2 * i). Returns srcSliceH.
 *
 * out_pixels(R, G, B, ptr) must store 16 pixels at ptr and advance ptr.
 */
#define DEFCSP420_CVT(name, out_pixels) \
static int altivec_ ## name(SwsContext *c, const unsigned char **in, \
                            int *instrides, int srcSliceY, int srcSliceH, \
                            unsigned char **oplanes, int *outstrides) \
{ \
    int w = c->srcW; \
    int h = srcSliceH; \
    int i, j; \
    int instrides_scl[3]; \
    vector unsigned char y0, y1; \
 \
    vector signed char u, v; \
 \
    vector signed short Y0, Y1, Y2, Y3; \
    vector signed short U, V; \
    vector signed short vx, ux, uvx; \
    vector signed short vx0, ux0, uvx0; \
    vector signed short vx1, ux1, uvx1; \
    vector signed short R0, G0, B0; \
    vector signed short R1, G1, B1; \
    vector unsigned char R, G, B; \
 \
    const vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
    vector unsigned char align_perm; \
 \
    /* local copies of the per-context coefficient vectors */ \
    vector signed short lCY       = c->CY; \
    vector signed short lOY       = c->OY; \
    vector signed short lCRV      = c->CRV; \
    vector signed short lCBU      = c->CBU; \
    vector signed short lCGU      = c->CGU; \
    vector signed short lCGV      = c->CGV; \
    vector unsigned short lCSHIFT = c->CSHIFT; \
 \
    /* { 128 } only sets lane 0, hence the splat to all lanes */ \
    const vector signed char chroma_bias = (vector signed char) \
        vec_splat((vector signed char) { 128 }, 0); \
    /* control word for the output-row cache hints; constant per call */ \
    const int dst_ctl = 0x02000002 | (((w * 3 + 32) / 32) << 16); \
 \
    const ubyte *y1i = in[0]; \
    const ubyte *y2i = in[0] + instrides[0]; \
    const ubyte *ui  = in[1]; \
    const ubyte *vi  = in[2]; \
 \
    vector unsigned char *oute, *outo; \
 \
    /* loop moves y{1, 2}i by w */ \
    instrides_scl[0] = instrides[0] * 2 - w; \
    /* loop moves ui by w / 2 */ \
    instrides_scl[1] = instrides[1] - w / 2; \
    /* loop moves vi by w / 2 */ \
    instrides_scl[2] = instrides[2] - w / 2; \
 \
    for (i = 0; i < h / 2; i++) { \
        /* even and odd output rows of this row pair */ \
        oute = (vector unsigned char *)(oplanes[0] + outstrides[0] * \
                                        (srcSliceY + i * 2)); \
        outo = oute + (outstrides[0] >> 4); \
        vec_dstst(outo, dst_ctl, 0); \
        vec_dstst(oute, dst_ctl, 1); \
 \
        for (j = 0; j < w / 16; j++) { \
            y1ivP = (const vector unsigned char *) y1i; \
            y2ivP = (const vector unsigned char *) y2i; \
            uivP  = (const vector unsigned char *) ui; \
            vivP  = (const vector unsigned char *) vi; \
 \
            /* unaligned loads: two aligned vectors combined via lvsl */ \
            align_perm = vec_lvsl(0, y1i); \
            y0 = (vector unsigned char) \
                     vec_perm(y1ivP[0], y1ivP[1], align_perm); \
 \
            align_perm = vec_lvsl(0, y2i); \
            y1 = (vector unsigned char) \
                     vec_perm(y2ivP[0], y2ivP[1], align_perm); \
 \
            align_perm = vec_lvsl(0, ui); \
            u = (vector signed char) \
                    vec_perm(uivP[0], uivP[1], align_perm); \
 \
            align_perm = vec_lvsl(0, vi); \
            v = (vector signed char) \
                    vec_perm(vivP[0], vivP[1], align_perm); \
 \
            /* remove the +128 chroma offset, then widen 8 samples */ \
            u = (vector signed char) vec_sub(u, chroma_bias); \
            v = (vector signed char) vec_sub(v, chroma_bias); \
 \
            U = vec_unpackh(u); \
            V = vec_unpackh(v); \
 \
            Y0 = vec_unh(y0); \
            Y1 = vec_unl(y0); \
            Y2 = vec_unh(y1); \
            Y3 = vec_unl(y1); \
 \
            Y0 = vec_mradds(Y0, lCY, lOY); \
            Y1 = vec_mradds(Y1, lCY, lOY); \
            Y2 = vec_mradds(Y2, lCY, lOY); \
            Y3 = vec_mradds(Y3, lCY, lOY); \
 \
            /* ux = (CBU * (u << CSHIFT) + 0x4000) >> 15 */ \
            ux  = vec_sl(U, lCSHIFT); \
            ux  = vec_mradds(ux, lCBU, (vector signed short) { 0 }); \
            ux0 = vec_mergeh(ux, ux); \
            ux1 = vec_mergel(ux, ux); \
 \
            /* vx = (CRV * (v << CSHIFT) + 0x4000) >> 15; */ \
            vx  = vec_sl(V, lCSHIFT); \
            vx  = vec_mradds(vx, lCRV, (vector signed short) { 0 }); \
            vx0 = vec_mergeh(vx, vx); \
            vx1 = vec_mergel(vx, vx); \
 \
            /* uvx = ((CGU * u) + (CGV * v)) >> 15 */ \
            uvx  = vec_mradds(U, lCGU, (vector signed short) { 0 }); \
            uvx  = vec_mradds(V, lCGV, uvx); \
            uvx0 = vec_mergeh(uvx, uvx); \
            uvx1 = vec_mergel(uvx, uvx); \
 \
            /* even row: luma from y0 plus the shared chroma terms */ \
            R0 = vec_add(Y0, vx0); \
            G0 = vec_add(Y0, uvx0); \
            B0 = vec_add(Y0, ux0); \
            R1 = vec_add(Y1, vx1); \
            G1 = vec_add(Y1, uvx1); \
            B1 = vec_add(Y1, ux1); \
 \
            R = vec_packclp(R0, R1); \
            G = vec_packclp(G0, G1); \
            B = vec_packclp(B0, B1); \
 \
            out_pixels(R, G, B, oute); \
 \
            /* odd row: luma from y1, same chroma terms */ \
            R0 = vec_add(Y2, vx0); \
            G0 = vec_add(Y2, uvx0); \
            B0 = vec_add(Y2, ux0); \
            R1 = vec_add(Y3, vx1); \
            G1 = vec_add(Y3, uvx1); \
            B1 = vec_add(Y3, ux1); \
 \
            R = vec_packclp(R0, R1); \
            G = vec_packclp(G0, G1); \
            B = vec_packclp(B0, B1); \
 \
            out_pixels(R, G, B, outo); \
 \
            y1i += 16; \
            y2i += 16; \
            ui  += 8; \
            vi  += 8; \
        } \
 \
        /* skip row padding and, for luma, the row already consumed */ \
        ui  += instrides_scl[1]; \
        vi  += instrides_scl[2]; \
        y1i += instrides_scl[0]; \
        y2i += instrides_scl[0]; \
    } \
    return srcSliceH; \
}
438 
/*
 * Per-format store macros: write 16 pixels given as three byte planes
 * (a, b, c) = (R, G, B) at ptr and advance ptr.
 *
 * The 32-bit variants add an opaque alpha plane. A brace initializer with a
 * single value only sets lane 0 (the remaining lanes are zero), so 255 is
 * splatted to all 16 lanes; otherwise only the first pixel of every group of
 * 16 would be opaque.
 */
#define out_abgr(a, b, c, ptr) \
    vec_mstrgb32(__typeof__(a), \
                 ((__typeof__(a)) vec_splat((__typeof__(a)) { 255 }, 0)), \
                 c, b, a, ptr)
#define out_bgra(a, b, c, ptr) \
    vec_mstrgb32(__typeof__(a), c, b, a, \
                 ((__typeof__(a)) vec_splat((__typeof__(a)) { 255 }, 0)), \
                 ptr)
#define out_rgba(a, b, c, ptr) \
    vec_mstrgb32(__typeof__(a), a, b, c, \
                 ((__typeof__(a)) vec_splat((__typeof__(a)) { 255 }, 0)), \
                 ptr)
#define out_argb(a, b, c, ptr) \
    vec_mstrgb32(__typeof__(a), \
                 ((__typeof__(a)) vec_splat((__typeof__(a)) { 255 }, 0)), \
                 a, b, c, ptr)
#define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr)
#define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr)
449 
450 DEFCSP420_CVT(yuv2_abgr, out_abgr)
451 DEFCSP420_CVT(yuv2_bgra, out_bgra)
452 DEFCSP420_CVT(yuv2_rgba, out_rgba)
453 DEFCSP420_CVT(yuv2_argb, out_argb)
454 DEFCSP420_CVT(yuv2_rgb24, out_rgb24)
455 DEFCSP420_CVT(yuv2_bgr24, out_bgr24)
456 
457 // uyvy|uyvy|uyvy|uyvy
458 // 0123 4567 89ab cdef
459 static const vector unsigned char
460  demux_u = { 0x10, 0x00, 0x10, 0x00,
461  0x10, 0x04, 0x10, 0x04,
462  0x10, 0x08, 0x10, 0x08,
463  0x10, 0x0c, 0x10, 0x0c },
464  demux_v = { 0x10, 0x02, 0x10, 0x02,
465  0x10, 0x06, 0x10, 0x06,
466  0x10, 0x0A, 0x10, 0x0A,
467  0x10, 0x0E, 0x10, 0x0E },
468  demux_y = { 0x10, 0x01, 0x10, 0x03,
469  0x10, 0x05, 0x10, 0x07,
470  0x10, 0x09, 0x10, 0x0B,
471  0x10, 0x0D, 0x10, 0x0F };
472 
473 /*
474  * this is so I can play live CCIR raw video
475  */
476 static int altivec_uyvy_rgb32(SwsContext *c, const unsigned char **in,
477  int *instrides, int srcSliceY, int srcSliceH,
478  unsigned char **oplanes, int *outstrides)
479 {
480  int w = c->srcW;
481  int h = srcSliceH;
482  int i, j;
483  vector unsigned char uyvy;
484  vector signed short Y, U, V;
485  vector signed short R0, G0, B0, R1, G1, B1;
486  vector unsigned char R, G, B;
487  vector unsigned char *out;
488  const ubyte *img;
489 
490  img = in[0];
491  out = (vector unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]);
492 
493  for (i = 0; i < h; i++)
494  for (j = 0; j < w / 16; j++) {
495  uyvy = vec_ld(0, img);
496 
497  U = (vector signed short)
498  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
499  V = (vector signed short)
500  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
501  Y = (vector signed short)
502  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
503 
504  cvtyuvtoRGB(c, Y, U, V, &R0, &G0, &B0);
505 
506  uyvy = vec_ld(16, img);
507 
508  U = (vector signed short)
509  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
510  V = (vector signed short)
511  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
512  Y = (vector signed short)
513  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
514 
515  cvtyuvtoRGB(c, Y, U, V, &R1, &G1, &B1);
516 
517  R = vec_packclp(R0, R1);
518  G = vec_packclp(G0, G1);
519  B = vec_packclp(B0, B1);
520 
521  // vec_mstbgr24 (R,G,B, out);
522  out_rgba(R, G, B, out);
523 
524  img += 32;
525  }
526  return srcSliceH;
527 }
528 
529 /* Ok currently the acceleration routine only supports
530  * inputs of widths a multiple of 16
531  * and heights a multiple 2
532  *
533  * So we just fall back to the C codes for this.
534  */
536 {
538  return NULL;
539 
540  /*
541  * and this seems not to matter too much I tried a bunch of
542  * videos with abnormal widths and MPlayer crashes elsewhere.
543  * mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
544  * boom with X11 bad match.
545  *
546  */
547  if ((c->srcW & 0xf) != 0)
548  return NULL;
549 
550  switch (c->srcFormat) {
551  case AV_PIX_FMT_YUV410P:
552  case AV_PIX_FMT_YUV420P:
553  /*case IMGFMT_CLPL: ??? */
554  case AV_PIX_FMT_GRAY8:
555  case AV_PIX_FMT_NV12:
556  case AV_PIX_FMT_NV21:
557  if ((c->srcH & 0x1) != 0)
558  return NULL;
559 
560  switch (c->dstFormat) {
561  case AV_PIX_FMT_RGB24:
562  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
563  return altivec_yuv2_rgb24;
564  case AV_PIX_FMT_BGR24:
565  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
566  return altivec_yuv2_bgr24;
567  case AV_PIX_FMT_ARGB:
568  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
569  return altivec_yuv2_argb;
570  case AV_PIX_FMT_ABGR:
571  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
572  return altivec_yuv2_abgr;
573  case AV_PIX_FMT_RGBA:
574  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
575  return altivec_yuv2_rgba;
576  case AV_PIX_FMT_BGRA:
577  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
578  return altivec_yuv2_bgra;
579  default: return NULL;
580  }
581  break;
582 
583  case AV_PIX_FMT_UYVY422:
584  switch (c->dstFormat) {
585  case AV_PIX_FMT_BGR32:
586  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
587  return altivec_uyvy_rgb32;
588  default: return NULL;
589  }
590  break;
591  }
592  return NULL;
593 }
594 
596  const int inv_table[4],
597  int brightness,
598  int contrast,
599  int saturation)
600 {
601  union {
602  DECLARE_ALIGNED(16, signed short, tmp)[8];
603  vector signed short vec;
604  } buf;
605 
606  buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9; // cy
607  buf.tmp[1] = -256 * brightness; // oy
608  buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (saturation >> 16); // crv
609  buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (saturation >> 16); // cbu
610  buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgu
611  buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgv
612 
613  c->CSHIFT = (vector unsigned short) vec_splat_u16(2);
614  c->CY = vec_splat((vector signed short) buf.vec, 0);
615  c->OY = vec_splat((vector signed short) buf.vec, 1);
616  c->CRV = vec_splat((vector signed short) buf.vec, 2);
617  c->CBU = vec_splat((vector signed short) buf.vec, 3);
618  c->CGU = vec_splat((vector signed short) buf.vec, 4);
619  c->CGV = vec_splat((vector signed short) buf.vec, 5);
620  return;
621 }
622 
624  const int16_t *lumFilter,
625  const int16_t **lumSrc,
626  int lumFilterSize,
627  const int16_t *chrFilter,
628  const int16_t **chrUSrc,
629  const int16_t **chrVSrc,
630  int chrFilterSize,
631  const int16_t **alpSrc,
632  uint8_t *dest,
633  int dstW, int dstY,
634  enum AVPixelFormat target)
635 {
636  int i, j;
637  vector signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1, U, V;
638  vector signed short R0, G0, B0, R1, G1, B1;
639 
640  vector unsigned char R, G, B;
641  vector unsigned char *out, *nout;
642 
643  vector signed short RND = vec_splat_s16(1 << 3);
644  vector unsigned short SCL = vec_splat_u16(4);
645  DECLARE_ALIGNED(16, unsigned int, scratch)[16];
646 
647  vector signed short *YCoeffs, *CCoeffs;
648 
649  YCoeffs = c->vYCoeffsBank + dstY * lumFilterSize;
650  CCoeffs = c->vCCoeffsBank + dstY * chrFilterSize;
651 
652  out = (vector unsigned char *) dest;
653 
654  for (i = 0; i < dstW; i += 16) {
655  Y0 = RND;
656  Y1 = RND;
657  /* extract 16 coeffs from lumSrc */
658  for (j = 0; j < lumFilterSize; j++) {
659  X0 = vec_ld(0, &lumSrc[j][i]);
660  X1 = vec_ld(16, &lumSrc[j][i]);
661  Y0 = vec_mradds(X0, YCoeffs[j], Y0);
662  Y1 = vec_mradds(X1, YCoeffs[j], Y1);
663  }
664 
665  U = RND;
666  V = RND;
667  /* extract 8 coeffs from U,V */
668  for (j = 0; j < chrFilterSize; j++) {
669  X = vec_ld(0, &chrUSrc[j][i / 2]);
670  U = vec_mradds(X, CCoeffs[j], U);
671  X = vec_ld(0, &chrVSrc[j][i / 2]);
672  V = vec_mradds(X, CCoeffs[j], V);
673  }
674 
675  /* scale and clip signals */
676  Y0 = vec_sra(Y0, SCL);
677  Y1 = vec_sra(Y1, SCL);
678  U = vec_sra(U, SCL);
679  V = vec_sra(V, SCL);
680 
681  Y0 = vec_clip_s16(Y0);
682  Y1 = vec_clip_s16(Y1);
683  U = vec_clip_s16(U);
684  V = vec_clip_s16(V);
685 
686  /* now we have
687  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
688  * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
689  *
690  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
691  * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
692  * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
693  */
694 
695  U0 = vec_mergeh(U, U);
696  V0 = vec_mergeh(V, V);
697 
698  U1 = vec_mergel(U, U);
699  V1 = vec_mergel(V, V);
700 
701  cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
702  cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
703 
704  R = vec_packclp(R0, R1);
705  G = vec_packclp(G0, G1);
706  B = vec_packclp(B0, B1);
707 
708  switch (target) {
709  case AV_PIX_FMT_ABGR:
710  out_abgr(R, G, B, out);
711  break;
712  case AV_PIX_FMT_BGRA:
713  out_bgra(R, G, B, out);
714  break;
715  case AV_PIX_FMT_RGBA:
716  out_rgba(R, G, B, out);
717  break;
718  case AV_PIX_FMT_ARGB:
719  out_argb(R, G, B, out);
720  break;
721  case AV_PIX_FMT_RGB24:
722  out_rgb24(R, G, B, out);
723  break;
724  case AV_PIX_FMT_BGR24:
725  out_bgr24(R, G, B, out);
726  break;
727  default:
728  {
729  /* If this is reached, the caller should have called yuv2packedXinC
730  * instead. */
731  static int printed_error_message;
732  if (!printed_error_message) {
733  av_log(c, AV_LOG_ERROR,
734  "altivec_yuv2packedX doesn't support %s output\n",
736  printed_error_message = 1;
737  }
738  return;
739  }
740  }
741  }
742 
743  if (i < dstW) {
744  i -= 16;
745 
746  Y0 = RND;
747  Y1 = RND;
748  /* extract 16 coeffs from lumSrc */
749  for (j = 0; j < lumFilterSize; j++) {
750  X0 = vec_ld(0, &lumSrc[j][i]);
751  X1 = vec_ld(16, &lumSrc[j][i]);
752  Y0 = vec_mradds(X0, YCoeffs[j], Y0);
753  Y1 = vec_mradds(X1, YCoeffs[j], Y1);
754  }
755 
756  U = RND;
757  V = RND;
758  /* extract 8 coeffs from U,V */
759  for (j = 0; j < chrFilterSize; j++) {
760  X = vec_ld(0, &chrUSrc[j][i / 2]);
761  U = vec_mradds(X, CCoeffs[j], U);
762  X = vec_ld(0, &chrVSrc[j][i / 2]);
763  V = vec_mradds(X, CCoeffs[j], V);
764  }
765 
766  /* scale and clip signals */
767  Y0 = vec_sra(Y0, SCL);
768  Y1 = vec_sra(Y1, SCL);
769  U = vec_sra(U, SCL);
770  V = vec_sra(V, SCL);
771 
772  Y0 = vec_clip_s16(Y0);
773  Y1 = vec_clip_s16(Y1);
774  U = vec_clip_s16(U);
775  V = vec_clip_s16(V);
776 
777  /* now we have
778  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
779  * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
780  *
781  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
782  * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
783  * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
784  */
785 
786  U0 = vec_mergeh(U, U);
787  V0 = vec_mergeh(V, V);
788 
789  U1 = vec_mergel(U, U);
790  V1 = vec_mergel(V, V);
791 
792  cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
793  cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
794 
795  R = vec_packclp(R0, R1);
796  G = vec_packclp(G0, G1);
797  B = vec_packclp(B0, B1);
798 
799  nout = (vector unsigned char *) scratch;
800  switch (target) {
801  case AV_PIX_FMT_ABGR:
802  out_abgr(R, G, B, nout);
803  break;
804  case AV_PIX_FMT_BGRA:
805  out_bgra(R, G, B, nout);
806  break;
807  case AV_PIX_FMT_RGBA:
808  out_rgba(R, G, B, nout);
809  break;
810  case AV_PIX_FMT_ARGB:
811  out_argb(R, G, B, nout);
812  break;
813  case AV_PIX_FMT_RGB24:
814  out_rgb24(R, G, B, nout);
815  break;
816  case AV_PIX_FMT_BGR24:
817  out_bgr24(R, G, B, nout);
818  break;
819  default:
820  /* Unreachable, I think. */
821  av_log(c, AV_LOG_ERROR,
822  "altivec_yuv2packedX doesn't support %s output\n",
824  return;
825  }
826 
827  memcpy(&((uint32_t *) dest)[i], scratch, (dstW - i) / 4);
828  }
829 }
830 
/*
 * Define ff_yuv2<suffix>_X_altivec(), a public entry point that forwards
 * all of its arguments to ff_yuv2packedX_altivec() with the output format
 * fixed to pixfmt.
 */
#define YUV2PACKEDX_WRAPPER(suffix, pixfmt)                             \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c,                     \
                                     const int16_t *lumFilter,          \
                                     const int16_t **lumSrc,            \
                                     int lumFilterSize,                 \
                                     const int16_t *chrFilter,          \
                                     const int16_t **chrUSrc,           \
                                     const int16_t **chrVSrc,           \
                                     int chrFilterSize,                 \
                                     const int16_t **alpSrc,            \
                                     uint8_t *dest, int dstW, int dstY) \
{                                                                       \
    ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,         \
                           chrFilter, chrUSrc, chrVSrc,                 \
                           chrFilterSize, alpSrc,                       \
                           dest, dstW, dstY, pixfmt);                   \
}
848 
packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1
Definition: pixfmt.h:85
#define AV_CPU_FLAG_ALTIVEC
standard
Definition: cpu.h:52
#define YUV2PACKEDX_WRAPPER(suffix, pixfmt)
av_cold void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
#define out_bgra(a, b, c, ptr)
#define B
Definition: dsputil.c:2025
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:70
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:154
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:59
About Git write you should know how to use GIT properly Luckily Git comes with excellent documentation git help man git shows you the available git< command > help man git< command > shows information about the subcommand< command > The most comprehensive manual is the website Git Reference visit they are quite exhaustive You do not need a special username or password All you need is to provide a ssh public key to the Git server admin What follows now is a basic introduction to Git and some FFmpeg specific guidelines Read it at least if you are granted commit privileges to the FFmpeg project you are expected to be familiar with these rules I if not You can get git from etc no matter how small Every one of them has been saved from looking like a fool by this many times It s very easy for stray debug output or cosmetic modifications to slip in
Definition: git-howto.txt:5
x1
Definition: genspecsines3.m:7
the sinusoids Y1
Definition: lab5.m:33
static const vector unsigned char demux_v
#define vec_clip_s16(x)
output residual component w
Macro definitions for various function/variable attributes.
int srcH
Height of source luma/alpha planes.
#define B1
Definition: faandct.c:41
#define img
uint8_t
#define av_cold
Definition: attributes.h:78
#define out_abgr(a, b, c, ptr)
#define Y
Definition: vf_boxblur.c:76
unsigned char ubyte
static int altivec_uyvy_rgb32(SwsContext *c, const unsigned char **in, int *instrides, int srcSliceY, int srcSliceH, unsigned char **oplanes, int *outstrides)
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:98
av_cold SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
#define R
Definition: dsputil.c:2027
external API header
enum AVPixelFormat dstFormat
Destination pixel format.
static av_always_inline void ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY, enum AVPixelFormat target)
#define U(x)
#define R0(v, w, x, y, z, i)
Definition: sha.c:54
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:99
planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (firs...
Definition: pixfmt.h:93
void av_log(void *avcl, int level, const char *fmt,...)
Definition: log.c:246
static void cvtyuvtoRGB(SwsContext *c, vector signed short Y, vector signed short U, vector signed short V, vector signed short *R, vector signed short *G, vector signed short *B)
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:96
static const vector unsigned char perm_rgb_2
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:97
as above, but U and V bytes are swapped
Definition: pixfmt.h:94
#define V
static const vector unsigned char demux_y
Sampled sinusoid X
static const vector unsigned char perm_rgb_3
#define DEFCSP420_CVT(name, out_pixels)
static const vector unsigned char perm_rgb_0
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:71
int(* SwsFunc)(struct SwsContext *context, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[])
#define AV_PIX_FMT_BGR32
Definition: pixfmt.h:261
#define R1
Definition: simple_idct.c:154
NULL
Definition: eval.c:55
static const vector unsigned char demux_u
dest
Definition: start.py:60
#define out_rgb24(a, b, c, ptr)
#define out_argb(a, b, c, ptr)
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:148
void * buf
Definition: avisynth_c.h:594
planar YUV 4:1:0, 9bpp, (1 Cr & Cb sample per 4x4 Y samples)
Definition: pixfmt.h:74
synthesis window for stochastic i
#define vec_packclp(x, y)
signed char sbyte
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:30
#define out_bgr24(a, b, c, ptr)
planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)
Definition: pixfmt.h:68
Y , 8bpp.
Definition: pixfmt.h:76
static double c[64]
enum AVPixelFormat srcFormat
Source pixel format.
#define G
Definition: dsputil.c:2026
#define av_always_inline
Definition: attributes.h:41
#define out_rgba(a, b, c, ptr)
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31))))#define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac){}void ff_audio_convert_free(AudioConvert **ac){if(!*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);}AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, 
int apply_map){AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method!=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2){ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc){av_free(ac);return NULL;}return ac;}in_planar=av_sample_fmt_is_planar(in_fmt);out_planar=av_sample_fmt_is_planar(out_fmt);if(in_planar==out_planar){ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar?ac->channels:1;}else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;}int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in){int use_generic=1;int len=in->nb_samples;int p;if(ac->dc){av_dlog(ac->avr,"%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> out
const char * av_get_pix_fmt_name(enum AVPixelFormat pix_fmt)
Return the short name for a pixel format, NULL in case pix_fmt is unknown.
Definition: pixdesc.c:1700
static const vector unsigned char perm_rgb_1
int srcW
Width of source luma/alpha planes.
AVPixelFormat
Pixel format.
Definition: pixfmt.h:66
#define B0
Definition: faandct.c:40