vp3dsp_altivec.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2009 David Conrad
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <string.h>
22 
23 #include "config.h"
24 #include "libavutil/attributes.h"
25 #include "libavutil/cpu.h"
26 #include "libavcodec/vp3dsp.h"
27 
28 #if HAVE_ALTIVEC
29 
32 #include "dsputil_altivec.h"
33 
34 static const vec_s16 constants =
35  {0, 64277, 60547, 54491, 46341, 36410, 25080, 12785};
36 static const vec_u8 interleave_high =
37  {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
38 
/*
 * Declares and initialises every local that IDCT_1D, ADD8 and SHIFT4 refer to.
 * Must be expanded where declarations are allowed and where an identifier
 * named `block` (the coefficient array) is in scope.
 */
#define IDCT_START \
    /* eight rows of eight 16-bit coefficients, 16 bytes apart */\
    vec_s16 b0 = vec_ld(0x00, block);\
    vec_s16 b1 = vec_ld(0x10, block);\
    vec_s16 b2 = vec_ld(0x20, block);\
    vec_s16 b3 = vec_ld(0x30, block);\
    vec_s16 b4 = vec_ld(0x40, block);\
    vec_s16 b5 = vec_ld(0x50, block);\
    vec_s16 b6 = vec_ld(0x60, block);\
    vec_s16 b7 = vec_ld(0x70, block);\
\
    /* each table entry broadcast to all eight lanes */\
    vec_s16 C1 = vec_splat(constants, 1);\
    vec_s16 C2 = vec_splat(constants, 2);\
    vec_s16 C3 = vec_splat(constants, 3);\
    vec_s16 C4 = vec_splat(constants, 4);\
    vec_s16 C5 = vec_splat(constants, 5);\
    vec_s16 C6 = vec_splat(constants, 6);\
    vec_s16 C7 = vec_splat(constants, 7);\
\
    /* rounding term used by ADD8 and shift count used by SHIFT4 */\
    vec_s16 eight = vec_splat_s16(8);\
    vec_u16 four = vec_splat_u16(4);\
\
    /* scratch vectors assigned by IDCT_1D */\
    vec_s16 A, B, C, D;\
    vec_s16 Ad, Bd, Cd, Dd;\
    vec_s16 E, F, G, H;\
    vec_s16 Ed, Gd, Add, Bdd, Fd, Hd;
61 
62 // these functions do (a*C)>>16
63 // things are tricky because a is signed, but C unsigned.
64 // M15 is used if C fits in 15 bit unsigned (C6,C7)
65 // M16 is used if C requires 16 bits unsigned
66 static inline vec_s16 M15(vec_s16 a, vec_s16 C)
67 {
68  return (vec_s16)vec_perm(vec_mule(a,C), vec_mulo(a,C), interleave_high);
69 }
70 static inline vec_s16 M16(vec_s16 a, vec_s16 C)
71 {
72  return vec_add(a, M15(a, C));
73 }
74 
/*
 * One 1-D inverse DCT pass over the eight vectors b0..b7, in place.
 * Uses the locals declared by IDCT_START.
 *   ADD   - wrapper applied to E and F only (NOP, or ADD8 to add the rounding term)
 *   SHIFT - wrapper applied to each of the eight results (NOP or SHIFT4)
 * Every input b0..b7 is read before any of them is overwritten.
 */
#define IDCT_1D(ADD, SHIFT)\
    /* terms built from inputs 0, 4 and 2, 6 */\
    E = ADD(M16(vec_add(b0, b4), C4));\
    F = ADD(M16(vec_sub(b0, b4), C4));\
    G = vec_add(M16(b2, C2), M15(b6, C6));\
    H = vec_sub(M15(b2, C6), M16(b6, C2));\
\
    Gd = vec_add(E, G);\
    Ed = vec_sub(E, G);\
\
    /* terms built from inputs 1, 7 and 3, 5 */\
    A = vec_add(M16(b1, C1), M15(b7, C7));\
    B = vec_sub(M15(b1, C7), M16(b7, C1));\
    C = vec_add(M16(b3, C3), M16(b5, C5));\
    D = vec_sub(M16(b5, C3), M16(b3, C5));\
\
    Cd = vec_add(A, C);\
    Dd = vec_add(B, D);\
    Ad = M16(vec_sub(A, C), C4);\
    Bd = M16(vec_sub(B, D), C4);\
\
    /* combine the two groups */\
    Add = vec_add(F, Ad);\
    Fd = vec_sub(F, Ad);\
    Hd = vec_add(Bd, H);\
    Bdd = vec_sub(Bd, H);\
\
    /* results, written back over the inputs */\
    b0 = SHIFT(vec_add(Gd, Cd));\
    b1 = SHIFT(vec_add(Add, Hd));\
    b2 = SHIFT(vec_sub(Add, Hd));\
    b3 = SHIFT(vec_add(Ed, Dd));\
    b4 = SHIFT(vec_sub(Ed, Dd));\
    b5 = SHIFT(vec_add(Fd, Bdd));\
    b6 = SHIFT(vec_sub(Fd, Bdd));\
    b7 = SHIFT(vec_sub(Gd, Cd));
113 
/*
 * Wrappers passed as the ADD / SHIFT arguments of IDCT_1D.
 * ADD8 and SHIFT4 use the locals `eight` and `four` of the expanding scope
 * (declared by IDCT_START).
 */
#define NOP(a)    (a)                    /* leave the value unchanged        */
#define ADD8(a)   vec_add((a), eight)    /* add the rounding vector `eight`  */
#define SHIFT4(a) vec_sra((a), four)     /* arithmetic right shift by `four` */
117 
118 static void vp3_idct_put_altivec(uint8_t *dst, int stride, int16_t block[64])
119 {
120  vec_u8 t;
121  IDCT_START
122 
123  // pixels are signed; so add 128*16 in addition to the normal 8
124  vec_s16 v2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11));
125  eight = vec_add(eight, v2048);
126 
127  IDCT_1D(NOP, NOP)
128  TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
129  IDCT_1D(ADD8, SHIFT4)
130 
131 #define PUT(a)\
132  t = vec_packsu(a, a);\
133  vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
134  vec_ste((vec_u32)t, 4, (unsigned int *)dst);
135 
136  PUT(b0) dst += stride;
137  PUT(b1) dst += stride;
138  PUT(b2) dst += stride;
139  PUT(b3) dst += stride;
140  PUT(b4) dst += stride;
141  PUT(b5) dst += stride;
142  PUT(b6) dst += stride;
143  PUT(b7)
144  memset(block, 0, sizeof(*block) * 64);
145 }
146 
147 static void vp3_idct_add_altivec(uint8_t *dst, int stride, int16_t block[64])
148 {
149  LOAD_ZERO;
150  vec_u8 t, vdst;
151  vec_s16 vdst_16;
152  vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst));
153 
154  IDCT_START
155 
156  IDCT_1D(NOP, NOP)
157  TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
158  IDCT_1D(ADD8, SHIFT4)
159 
160 #define ADD(a)\
161  vdst = vec_ld(0, dst);\
162  vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);\
163  vdst_16 = vec_adds(a, vdst_16);\
164  t = vec_packsu(vdst_16, vdst_16);\
165  vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
166  vec_ste((vec_u32)t, 4, (unsigned int *)dst);
167 
168  ADD(b0) dst += stride;
169  ADD(b1) dst += stride;
170  ADD(b2) dst += stride;
171  ADD(b3) dst += stride;
172  ADD(b4) dst += stride;
173  ADD(b5) dst += stride;
174  ADD(b6) dst += stride;
175  ADD(b7)
176  memset(block, 0, sizeof(*block) * 64);
177 }
178 
179 #endif /* HAVE_ALTIVEC */
180 
182 {
183 #if HAVE_ALTIVEC
185  c->idct_put = vp3_idct_put_altivec;
186  c->idct_add = vp3_idct_add_altivec;
187  }
188 #endif
189 }
#define AV_CPU_FLAG_ALTIVEC
standard
Definition: cpu.h:52
#define C
int stride
Definition: mace.c:144
Macro definitions for various function/variable attributes.
it can be given away to ff_start_frame *A reference passed to cur_buf_copy and partial_buf are used by libavfilter internally and must not be accessed by filters Reference permissions The AVFilterBufferRef structure has a perms field that describes what the code that owns the reference is allowed to do to the buffer data Different references for the same buffer can have different permissions For video filters that implement the deprecated start_frame draw_slice end_frame the permissions only apply to the parts of the buffer that have already been covered by the draw_slice method The value is a binary OR of the following constants
void(* idct_add)(uint8_t *dest, int line_size, int16_t *block)
Definition: vp3dsp.h:42
uint8_t
#define av_cold
Definition: attributes.h:78
#define TRANSPOSE8(a, b, c, d, e, f, g, h)
Definition: util_altivec.h:63
#define vec_s16
Definition: types_altivec.h:30
#define LOAD_ZERO
Definition: types_altivec.h:38
void(* idct_put)(uint8_t *dest, int line_size, int16_t *block)
Definition: vp3dsp.h:41
t
Definition: genspecsines3.m:6
#define vec_u8
Definition: types_altivec.h:27
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:30
Contains misc utility macros and inline functions.
static int flags
Definition: cpu.c:23
static double c[64]
av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags)
else dst[i][x+y *dst_stride[i]]
Definition: vf_mcdeint.c:160
#define ADD(a, b)
Definition: dct32.c:112