h264_altivec.c
1 /*
2  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/attributes.h"
22 #include "libavutil/cpu.h"
23 #include "libavutil/intreadwrite.h"
24 #include "libavutil/ppc/types_altivec.h"
25 #include "libavutil/ppc/util_altivec.h"
26 #include "libavcodec/h264data.h"
27 #include "libavcodec/h264dsp.h"
28 
29 /****************************************************************************
30  * IDCT transform:
31  ****************************************************************************/
32 
33 #define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \
34  /* 1st stage */ \
35  vz0 = vec_add(vb0,vb2); /* temp[0] = Y[0] + Y[2] */ \
36  vz1 = vec_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \
37  vz2 = vec_sra(vb1,vec_splat_u16(1)); \
38  vz2 = vec_sub(vz2,vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \
39  vz3 = vec_sra(vb3,vec_splat_u16(1)); \
40  vz3 = vec_add(vb1,vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \
41  /* 2nd stage: output */ \
42  va0 = vec_add(vz0,vz3); /* x[0] = temp[0] + temp[3] */ \
43  va1 = vec_add(vz1,vz2); /* x[1] = temp[1] + temp[2] */ \
44  va2 = vec_sub(vz1,vz2); /* x[2] = temp[1] - temp[2] */ \
45  va3 = vec_sub(vz0,vz3) /* x[3] = temp[0] - temp[3] */
46 
47 #define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
48  b0 = vec_mergeh( a0, a0 ); \
49  b1 = vec_mergeh( a1, a0 ); \
50  b2 = vec_mergeh( a2, a0 ); \
51  b3 = vec_mergeh( a3, a0 ); \
52  a0 = vec_mergeh( b0, b2 ); \
53  a1 = vec_mergel( b0, b2 ); \
54  a2 = vec_mergeh( b1, b3 ); \
55  a3 = vec_mergel( b1, b3 ); \
56  b0 = vec_mergeh( a0, a2 ); \
57  b1 = vec_mergel( a0, a2 ); \
58  b2 = vec_mergeh( a1, a3 ); \
59  b3 = vec_mergel( a1, a3 )
60 
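/* Helper for ff_h264_idct_add_altivec() below: adds one transformed row to
 * four destination pixels.  The destination bytes are rotated to the front of
 * the vector with vec_perm, zero-extended to 16 bits, summed with the
 * residual, packed back with unsigned saturation and written with a single
 * 32-bit vec_ste(). */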
61 #define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \
62  vdst_orig = vec_ld(0, dst); \
63  vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
64  vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst); \
65  va = vec_add(va, vdst_ss); \
66  va_u8 = vec_packsu(va, zero_s16v); \
67  va_u32 = vec_splat((vec_u32)va_u8, 0); \
68  vec_ste(va_u32, element, (uint32_t*)dst);
69 
70 static void ff_h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride)
71 {
72  vec_s16 va0, va1, va2, va3;
73  vec_s16 vz0, vz1, vz2, vz3;
74  vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
75  vec_u8 va_u8;
76  vec_u32 va_u32;
77  vec_s16 vdst_ss;
78  const vec_u16 v6us = vec_splat_u16(6);
79  vec_u8 vdst, vdst_orig;
80  vec_u8 vdst_mask = vec_lvsl(0, dst);
81  int element = ((unsigned long)dst & 0xf) >> 2;
82  LOAD_ZERO;
83 
84  block[0] += 32; /* add 32 as a DC-level for rounding */
85 
86  vtmp0 = vec_ld(0,block);
87  vtmp1 = vec_sld(vtmp0, vtmp0, 8);
88  vtmp2 = vec_ld(16,block);
89  vtmp3 = vec_sld(vtmp2, vtmp2, 8);
90  memset(block, 0, 16 * sizeof(int16_t));
91 
92  VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
93  VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
94  VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
95 
96  va0 = vec_sra(va0,v6us);
97  va1 = vec_sra(va1,v6us);
98  va2 = vec_sra(va2,v6us);
99  va3 = vec_sra(va3,v6us);
100 
101  VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
102  dst += stride;
103  VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
104  dst += stride;
105  VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
106  dst += stride;
107  VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
108 }
109 
110 #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\
111  /* a0 = SRC(0) + SRC(4); */ \
112  vec_s16 a0v = vec_add(s0, s4); \
113  /* a2 = SRC(0) - SRC(4); */ \
114  vec_s16 a2v = vec_sub(s0, s4); \
115  /* a4 = (SRC(2)>>1) - SRC(6); */ \
116  vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6); \
117  /* a6 = (SRC(6)>>1) + SRC(2); */ \
118  vec_s16 a6v = vec_add(vec_sra(s6, onev), s2); \
119  /* b0 = a0 + a6; */ \
120  vec_s16 b0v = vec_add(a0v, a6v); \
121  /* b2 = a2 + a4; */ \
122  vec_s16 b2v = vec_add(a2v, a4v); \
123  /* b4 = a2 - a4; */ \
124  vec_s16 b4v = vec_sub(a2v, a4v); \
125  /* b6 = a0 - a6; */ \
126  vec_s16 b6v = vec_sub(a0v, a6v); \
127  /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
128  /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \
129  vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
130  /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
131  /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \
132  vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
133  /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
134  /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \
135  vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
136  /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \
137  vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
138  /* b1 = (a7>>2) + a1; */ \
139  vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); \
140  /* b3 = a3 + (a5>>2); */ \
141  vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov)); \
142  /* b5 = (a3>>2) - a5; */ \
143  vec_s16 b5v = vec_sub( vec_sra(a3v, twov), a5v); \
144  /* b7 = a7 - (a1>>2); */ \
145  vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
146  /* DST(0, b0 + b7); */ \
147  d0 = vec_add(b0v, b7v); \
148  /* DST(1, b2 + b5); */ \
149  d1 = vec_add(b2v, b5v); \
150  /* DST(2, b4 + b3); */ \
151  d2 = vec_add(b4v, b3v); \
152  /* DST(3, b6 + b1); */ \
153  d3 = vec_add(b6v, b1v); \
154  /* DST(4, b6 - b1); */ \
155  d4 = vec_sub(b6v, b1v); \
156  /* DST(5, b4 - b3); */ \
157  d5 = vec_sub(b4v, b3v); \
158  /* DST(6, b2 - b5); */ \
159  d6 = vec_sub(b2v, b5v); \
160  /* DST(7, b0 - b7); */ \
161  d7 = vec_sub(b0v, b7v); \
162 }
163 
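/* Read-modify-write store of one 8-pixel row at a possibly unaligned address:
 * the two aligned vectors covering dest are loaded, the IDCT row (scaled by
 * >>6) is added with saturation, and vec_perm/vec_sel with the 'sel' half-mask
 * merge only the eight result bytes back into each half before it is stored,
 * leaving the neighbouring bytes in memory untouched. */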
164 #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
165  /* unaligned load */ \
166  vec_u8 hv = vec_ld( 0, dest ); \
167  vec_u8 lv = vec_ld( 7, dest ); \
168  vec_u8 dstv = vec_perm( hv, lv, (vec_u8)perm_ldv ); \
169  vec_s16 idct_sh6 = vec_sra(idctv, sixv); \
170  vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv); \
171  vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16); \
172  vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum); \
173  vec_u8 edgehv; \
174  /* unaligned store */ \
175  vec_u8 bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\
176  vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv ); \
177  lv = vec_sel( lv, bodyv, edgelv ); \
178  vec_st( lv, 7, dest ); \
179  hv = vec_ld( 0, dest ); \
180  edgehv = vec_perm( zero_u8v, sel, perm_stv ); \
181  hv = vec_sel( hv, bodyv, edgehv ); \
182  vec_st( hv, 0, dest ); \
183  }
184 
185 static void ff_h264_idct8_add_altivec( uint8_t *dst, int16_t *dct, int stride ) {
186  vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
187  vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
188  vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
189 
190  vec_u8 perm_ldv = vec_lvsl(0, dst);
191  vec_u8 perm_stv = vec_lvsr(8, dst);
192 
193  const vec_u16 onev = vec_splat_u16(1);
194  const vec_u16 twov = vec_splat_u16(2);
195  const vec_u16 sixv = vec_splat_u16(6);
196 
197  const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1};
198  LOAD_ZERO;
199 
200  dct[0] += 32; // rounding for the >>6 at the end
201 
202  s0 = vec_ld(0x00, (int16_t*)dct);
203  s1 = vec_ld(0x10, (int16_t*)dct);
204  s2 = vec_ld(0x20, (int16_t*)dct);
205  s3 = vec_ld(0x30, (int16_t*)dct);
206  s4 = vec_ld(0x40, (int16_t*)dct);
207  s5 = vec_ld(0x50, (int16_t*)dct);
208  s6 = vec_ld(0x60, (int16_t*)dct);
209  s7 = vec_ld(0x70, (int16_t*)dct);
210  memset(dct, 0, 64 * sizeof(int16_t));
211 
212  IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
213  d0, d1, d2, d3, d4, d5, d6, d7);
214 
215  TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 );
216 
217  IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7,
218  idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);
219 
220  ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
221  ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
222  ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
223  ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
224  ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
225  ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
226  ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
227  ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
228 }
229 
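/* DC-only inverse transform: every affected pixel receives the same
 * (dc + 32) >> 6 correction.  AltiVec has no signed add to unsigned bytes, so
 * the correction is split into dcplus/dcminus (its positive and negative
 * parts) and applied with saturating vec_adds/vec_subs, four rows per loop
 * iteration; for size == 4 the splat is shifted so that only four pixels per
 * row are modified. */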
230 static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block, int stride, int size)
231 {
232  vec_s16 dc16;
233  vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
234  LOAD_ZERO;
235  DECLARE_ALIGNED(16, int, dc);
236  int i;
237 
238  dc = (block[0] + 32) >> 6;
239  block[0] = 0;
240  dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);
241 
242  if (size == 4)
243  dc16 = vec_sld(dc16, zero_s16v, 8);
244  dcplus = vec_packsu(dc16, zero_s16v);
245  dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);
246 
247  aligner = vec_lvsr(0, dst);
248  dcplus = vec_perm(dcplus, dcplus, aligner);
249  dcminus = vec_perm(dcminus, dcminus, aligner);
250 
251  for (i = 0; i < size; i += 4) {
252  v0 = vec_ld(0, dst+0*stride);
253  v1 = vec_ld(0, dst+1*stride);
254  v2 = vec_ld(0, dst+2*stride);
255  v3 = vec_ld(0, dst+3*stride);
256 
257  v0 = vec_adds(v0, dcplus);
258  v1 = vec_adds(v1, dcplus);
259  v2 = vec_adds(v2, dcplus);
260  v3 = vec_adds(v3, dcplus);
261 
262  v0 = vec_subs(v0, dcminus);
263  v1 = vec_subs(v1, dcminus);
264  v2 = vec_subs(v2, dcminus);
265  v3 = vec_subs(v3, dcminus);
266 
267  vec_st(v0, 0, dst+0*stride);
268  vec_st(v1, 0, dst+1*stride);
269  vec_st(v2, 0, dst+2*stride);
270  vec_st(v3, 0, dst+3*stride);
271 
272  dst += 4*stride;
273  }
274 }
275 
276 static void h264_idct_dc_add_altivec(uint8_t *dst, int16_t *block, int stride)
277 {
278  h264_idct_dc_add_internal(dst, block, stride, 4);
279 }
280 
281 static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, int16_t *block, int stride)
282 {
283  h264_idct_dc_add_internal(dst, block, stride, 8);
284 }
285 
286 static void ff_h264_idct_add16_altivec(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15*8]){
287  int i;
288  for(i=0; i<16; i++){
289  int nnz = nnzc[ scan8[i] ];
290  if(nnz){
291  if(nnz==1 && block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
292  else ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride);
293  }
294  }
295 }
296 
297 static void ff_h264_idct_add16intra_altivec(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15*8]){
298  int i;
299  for(i=0; i<16; i++){
300  if(nnzc[ scan8[i] ]) ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride);
301  else if(block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
302  }
303 }
304 
305 static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15*8]){
306  int i;
307  for(i=0; i<16; i+=4){
308  int nnz = nnzc[ scan8[i] ];
309  if(nnz){
310  if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
311  else ff_h264_idct8_add_altivec (dst + block_offset[i], block + i*16, stride);
312  }
313  }
314 }
315 
316 static void ff_h264_idct_add8_altivec(uint8_t **dest, const int *block_offset, int16_t *block, int stride, const uint8_t nnzc[15*8]){
317  int i, j;
318  for (j = 1; j < 3; j++) {
319  for(i = j * 16; i < j * 16 + 4; i++){
320  if(nnzc[ scan8[i] ])
321  ff_h264_idct_add_altivec(dest[j-1] + block_offset[i], block + i*16, stride);
322  else if(block[i*16])
323  h264_idct_dc_add_altivec(dest[j-1] + block_offset[i], block + i*16, stride);
324  }
325  }
326 }
327 
328 #define transpose4x16(r0, r1, r2, r3) { \
329  register vec_u8 r4; \
330  register vec_u8 r5; \
331  register vec_u8 r6; \
332  register vec_u8 r7; \
333  \
334  r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \
335  r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \
336  r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \
337  r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \
338  \
339  r0 = vec_mergeh(r4, r6); /*all set 0*/ \
340  r1 = vec_mergel(r4, r6); /*all set 1*/ \
341  r2 = vec_mergeh(r5, r7); /*all set 2*/ \
342  r3 = vec_mergel(r5, r7); /*all set 3*/ \
343 }
344 
345 static inline void write16x4(uint8_t *dst, int dst_stride,
346  register vec_u8 r0, register vec_u8 r1,
347  register vec_u8 r2, register vec_u8 r3) {
348  DECLARE_ALIGNED(16, unsigned char, result)[64];
349  uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
350  int int_dst_stride = dst_stride/4;
351 
352  vec_st(r0, 0, result);
353  vec_st(r1, 16, result);
354  vec_st(r2, 32, result);
355  vec_st(r3, 48, result);
356  /* FIXME: there has to be a better way!!!! */
357  *dst_int = *src_int;
358  *(dst_int+ int_dst_stride) = *(src_int + 1);
359  *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
360  *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
361  *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
362  *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
363  *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
364  *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
365  *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
366  *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
367  *(dst_int+10*int_dst_stride) = *(src_int + 10);
368  *(dst_int+11*int_dst_stride) = *(src_int + 11);
369  *(dst_int+12*int_dst_stride) = *(src_int + 12);
370  *(dst_int+13*int_dst_stride) = *(src_int + 13);
371  *(dst_int+14*int_dst_stride) = *(src_int + 14);
372  *(dst_int+15*int_dst_stride) = *(src_int + 15);
373 }
374 
375 /** @brief reads and transposes 16 rows of pixels starting at src, leaving
376  the six transposed rows needed by the loop filter in r8-r13
377  @todo FIXME: see if we can't spare some vec_lvsl() by factorizing them out of unaligned_load() */
378 #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
379  register vec_u8 r0 = unaligned_load(0, src); \
380  register vec_u8 r1 = unaligned_load( src_stride, src); \
381  register vec_u8 r2 = unaligned_load(2* src_stride, src); \
382  register vec_u8 r3 = unaligned_load(3* src_stride, src); \
383  register vec_u8 r4 = unaligned_load(4* src_stride, src); \
384  register vec_u8 r5 = unaligned_load(5* src_stride, src); \
385  register vec_u8 r6 = unaligned_load(6* src_stride, src); \
386  register vec_u8 r7 = unaligned_load(7* src_stride, src); \
387  register vec_u8 r14 = unaligned_load(14*src_stride, src); \
388  register vec_u8 r15 = unaligned_load(15*src_stride, src); \
389  \
390  r8 = unaligned_load( 8*src_stride, src); \
391  r9 = unaligned_load( 9*src_stride, src); \
392  r10 = unaligned_load(10*src_stride, src); \
393  r11 = unaligned_load(11*src_stride, src); \
394  r12 = unaligned_load(12*src_stride, src); \
395  r13 = unaligned_load(13*src_stride, src); \
396  \
397  /*Merge first pairs*/ \
398  r0 = vec_mergeh(r0, r8); /*0, 8*/ \
399  r1 = vec_mergeh(r1, r9); /*1, 9*/ \
400  r2 = vec_mergeh(r2, r10); /*2,10*/ \
401  r3 = vec_mergeh(r3, r11); /*3,11*/ \
402  r4 = vec_mergeh(r4, r12); /*4,12*/ \
403  r5 = vec_mergeh(r5, r13); /*5,13*/ \
404  r6 = vec_mergeh(r6, r14); /*6,14*/ \
405  r7 = vec_mergeh(r7, r15); /*7,15*/ \
406  \
407  /*Merge second pairs*/ \
408  r8 = vec_mergeh(r0, r4); /*0,4, 8,12 set 0*/ \
409  r9 = vec_mergel(r0, r4); /*0,4, 8,12 set 1*/ \
410  r10 = vec_mergeh(r1, r5); /*1,5, 9,13 set 0*/ \
411  r11 = vec_mergel(r1, r5); /*1,5, 9,13 set 1*/ \
412  r12 = vec_mergeh(r2, r6); /*2,6,10,14 set 0*/ \
413  r13 = vec_mergel(r2, r6); /*2,6,10,14 set 1*/ \
414  r14 = vec_mergeh(r3, r7); /*3,7,11,15 set 0*/ \
415  r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \
416  \
417  /*Third merge*/ \
418  r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \
419  r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \
420  r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \
421  r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \
422  r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \
423  r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \
424  /* Don't need to compute 3 and 7*/ \
425  \
426  /*Final merge*/ \
427  r8 = vec_mergeh(r0, r4); /*all set 0*/ \
428  r9 = vec_mergel(r0, r4); /*all set 1*/ \
429  r10 = vec_mergeh(r1, r5); /*all set 2*/ \
430  r11 = vec_mergel(r1, r5); /*all set 3*/ \
431  r12 = vec_mergeh(r2, r6); /*all set 4*/ \
432  r13 = vec_mergel(r2, r6); /*all set 5*/ \
433  /* Don't need to compute 14 and 15*/ \
434  \
435 }
436 
437 // out: o = |x-y| < a
438 static inline vec_u8 diff_lt_altivec ( register vec_u8 x,
439  register vec_u8 y,
440  register vec_u8 a) {
441 
442  register vec_u8 diff = vec_subs(x, y);
443  register vec_u8 diffneg = vec_subs(y, x);
444  register vec_u8 o = vec_or(diff, diffneg); /* |x-y| */
445  o = (vec_u8)vec_cmplt(o, a);
446  return o;
447 }
448 
449 static inline vec_u8 h264_deblock_mask ( register vec_u8 p0,
450  register vec_u8 p1,
451  register vec_u8 q0,
452  register vec_u8 q1,
453  register vec_u8 alpha,
454  register vec_u8 beta) {
455 
456  register vec_u8 mask;
457  register vec_u8 tempmask;
458 
459  mask = diff_lt_altivec(p0, q0, alpha);
460  tempmask = diff_lt_altivec(p1, p0, beta);
461  mask = vec_and(mask, tempmask);
462  tempmask = diff_lt_altivec(q1, q0, beta);
463  mask = vec_and(mask, tempmask);
464 
465  return mask;
466 }
467 
468 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
469 static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
470  register vec_u8 p1,
471  register vec_u8 p2,
472  register vec_u8 q0,
473  register vec_u8 tc0) {
474 
475  register vec_u8 average = vec_avg(p0, q0);
476  register vec_u8 temp;
477  register vec_u8 uncliped;
478  register vec_u8 ones;
479  register vec_u8 max;
480  register vec_u8 min;
481  register vec_u8 newp1;
482 
483  temp = vec_xor(average, p2);
484  average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */
485  ones = vec_splat_u8(1);
486  temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */
487  uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */
488  max = vec_adds(p1, tc0);
489  min = vec_subs(p1, tc0);
490  newp1 = vec_max(min, uncliped);
491  newp1 = vec_min(max, newp1);
492  return newp1;
493 }
494 
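/* p0/q0 update of the luma in-loop filter, computed entirely in unsigned
 * bytes: the signed differences are formed with vec_avg on complemented
 * operands, which keeps the intermediate in range but biased by 160 (A0v,
 * i.e. 10 << 4).  Subtracting that bias in both directions yields delta and
 * -delta, which are clipped to tc0 and applied to p0 and q0 with saturating
 * adds/subtracts. */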
495 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
496  \
497  const vec_u8 A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
498  \
499  register vec_u8 pq0bit = vec_xor(p0,q0); \
500  register vec_u8 q1minus; \
501  register vec_u8 p0minus; \
502  register vec_u8 stage1; \
503  register vec_u8 stage2; \
504  register vec_u8 vec160; \
505  register vec_u8 delta; \
506  register vec_u8 deltaneg; \
507  \
508  q1minus = vec_nor(q1, q1); /* 255 - q1 */ \
509  stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \
510  stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \
511  p0minus = vec_nor(p0, p0); /* 255 - p0 */ \
512  stage1 = vec_avg(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \
513  pq0bit = vec_and(pq0bit, vec_splat_u8(1)); \
514  stage2 = vec_avg(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
515  stage2 = vec_adds(stage2, stage1); /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \
516  vec160 = vec_ld(0, &A0v); \
517  deltaneg = vec_subs(vec160, stage2); /* -d */ \
518  delta = vec_subs(stage2, vec160); /* d */ \
519  deltaneg = vec_min(tc0masked, deltaneg); \
520  delta = vec_min(tc0masked, delta); \
521  p0 = vec_subs(p0, deltaneg); \
522  q0 = vec_subs(q0, delta); \
523  p0 = vec_adds(p0, delta); \
524  q0 = vec_adds(q0, deltaneg); \
525 }
526 
527 #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
528  DECLARE_ALIGNED(16, unsigned char, temp)[16]; \
529  register vec_u8 alphavec; \
530  register vec_u8 betavec; \
531  register vec_u8 mask; \
532  register vec_u8 p1mask; \
533  register vec_u8 q1mask; \
534  register vector signed char tc0vec; \
535  register vec_u8 finaltc0; \
536  register vec_u8 tc0masked; \
537  register vec_u8 newp1; \
538  register vec_u8 newq1; \
539  \
540  temp[0] = alpha; \
541  temp[1] = beta; \
542  alphavec = vec_ld(0, temp); \
543  betavec = vec_splat(alphavec, 0x1); \
544  alphavec = vec_splat(alphavec, 0x0); \
545  mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */ \
546  \
547  AV_COPY32(temp, tc0); \
548  tc0vec = vec_ld(0, (signed char*)temp); \
549  tc0vec = vec_mergeh(tc0vec, tc0vec); \
550  tc0vec = vec_mergeh(tc0vec, tc0vec); \
551  mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
552  finaltc0 = vec_and((vec_u8)tc0vec, mask); /* tc = tc0 */ \
553  \
554  p1mask = diff_lt_altivec(p2, p0, betavec); \
555  p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \
556  tc0masked = vec_and(p1mask, (vec_u8)tc0vec); \
557  finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \
558  newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
559  /*end if*/ \
560  \
561  q1mask = diff_lt_altivec(q2, q0, betavec); \
562  q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\
563  tc0masked = vec_and(q1mask, (vec_u8)tc0vec); \
564  finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \
565  newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
566  /*end if*/ \
567  \
568  h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \
569  p1 = newp1; \
570  q1 = newq1; \
571 }
572 
573 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
574 
575  if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
576  register vec_u8 p2 = vec_ld(-3*stride, pix);
577  register vec_u8 p1 = vec_ld(-2*stride, pix);
578  register vec_u8 p0 = vec_ld(-1*stride, pix);
579  register vec_u8 q0 = vec_ld(0, pix);
580  register vec_u8 q1 = vec_ld(stride, pix);
581  register vec_u8 q2 = vec_ld(2*stride, pix);
582  h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
583  vec_st(p1, -2*stride, pix);
584  vec_st(p0, -1*stride, pix);
585  vec_st(q0, 0, pix);
586  vec_st(q1, stride, pix);
587  }
588 }
589 
590 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
591 
592  register vec_u8 line0, line1, line2, line3, line4, line5;
593  if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
594  return;
595  readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
596  h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
597  transpose4x16(line1, line2, line3, line4);
598  write16x4(pix-2, stride, line1, line2, line3, line4);
599 }
600 
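/* Weighted prediction.  Each iteration weights one 16-byte row in place; for
 * w == 8 the eight pixels occupy either the first or the second half of the
 * aligned vector depending on whether 'block' is 16-byte aligned, so only the
 * corresponding half (v0 when aligned, v1 otherwise) is processed.  The same
 * layout trick is reused by biweight_h264_W_altivec() further down. */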
601 static av_always_inline
602 void weight_h264_W_altivec(uint8_t *block, int stride, int height,
603  int log2_denom, int weight, int offset, int w)
604 {
605  int y, aligned;
606  vec_u8 vblock;
607  vec_s16 vtemp, vweight, voffset, v0, v1;
608  vec_u16 vlog2_denom;
609  DECLARE_ALIGNED(16, int32_t, temp)[4];
610  LOAD_ZERO;
611 
612  offset <<= log2_denom;
613  if(log2_denom) offset += 1<<(log2_denom-1);
614  temp[0] = log2_denom;
615  temp[1] = weight;
616  temp[2] = offset;
617 
618  vtemp = (vec_s16)vec_ld(0, temp);
619  vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
620  vweight = vec_splat(vtemp, 3);
621  voffset = vec_splat(vtemp, 5);
622  aligned = !((unsigned long)block & 0xf);
623 
624  for (y = 0; y < height; y++) {
625  vblock = vec_ld(0, block);
626 
627  v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
628  v1 = (vec_s16)vec_mergel(zero_u8v, vblock);
629 
630  if (w == 16 || aligned) {
631  v0 = vec_mladd(v0, vweight, zero_s16v);
632  v0 = vec_adds(v0, voffset);
633  v0 = vec_sra(v0, vlog2_denom);
634  }
635  if (w == 16 || !aligned) {
636  v1 = vec_mladd(v1, vweight, zero_s16v);
637  v1 = vec_adds(v1, voffset);
638  v1 = vec_sra(v1, vlog2_denom);
639  }
640  vblock = vec_packsu(v0, v1);
641  vec_st(vblock, 0, block);
642 
643  block += stride;
644  }
645 }
646 
647 static av_always_inline
648 void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
649  int log2_denom, int weightd, int weights, int offset, int w)
650 {
651  int y, dst_aligned, src_aligned;
652  vec_u8 vsrc, vdst;
653  vec_s16 vtemp, vweights, vweightd, voffset, v0, v1, v2, v3;
654  vec_u16 vlog2_denom;
655  DECLARE_ALIGNED(16, int32_t, temp)[4];
656  LOAD_ZERO;
657 
658  offset = ((offset + 1) | 1) << log2_denom;
659  temp[0] = log2_denom+1;
660  temp[1] = weights;
661  temp[2] = weightd;
662  temp[3] = offset;
663 
664  vtemp = (vec_s16)vec_ld(0, temp);
665  vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
666  vweights = vec_splat(vtemp, 3);
667  vweightd = vec_splat(vtemp, 5);
668  voffset = vec_splat(vtemp, 7);
669  dst_aligned = !((unsigned long)dst & 0xf);
670  src_aligned = !((unsigned long)src & 0xf);
671 
672  for (y = 0; y < height; y++) {
673  vdst = vec_ld(0, dst);
674  vsrc = vec_ld(0, src);
675 
676  v0 = (vec_s16)vec_mergeh(zero_u8v, vdst);
677  v1 = (vec_s16)vec_mergel(zero_u8v, vdst);
678  v2 = (vec_s16)vec_mergeh(zero_u8v, vsrc);
679  v3 = (vec_s16)vec_mergel(zero_u8v, vsrc);
680 
681  if (w == 8) {
682  if (src_aligned)
683  v3 = v2;
684  else
685  v2 = v3;
686  }
687 
688  if (w == 16 || dst_aligned) {
689  v0 = vec_mladd(v0, vweightd, zero_s16v);
690  v2 = vec_mladd(v2, vweights, zero_s16v);
691 
692  v0 = vec_adds(v0, voffset);
693  v0 = vec_adds(v0, v2);
694  v0 = vec_sra(v0, vlog2_denom);
695  }
696  if (w == 16 || !dst_aligned) {
697  v1 = vec_mladd(v1, vweightd, zero_s16v);
698  v3 = vec_mladd(v3, vweights, zero_s16v);
699 
700  v1 = vec_adds(v1, voffset);
701  v1 = vec_adds(v1, v3);
702  v1 = vec_sra(v1, vlog2_denom);
703  }
704  vdst = vec_packsu(v0, v1);
705  vec_st(vdst, 0, dst);
706 
707  dst += stride;
708  src += stride;
709  }
710 }
711 
712 #define H264_WEIGHT(W) \
713 static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
714  int log2_denom, int weight, int offset){ \
715  weight_h264_W_altivec(block, stride, height, log2_denom, weight, offset, W); \
716 }\
717 static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
718  int log2_denom, int weightd, int weights, int offset){ \
719  biweight_h264_W_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
720 }
721 
722 H264_WEIGHT(16)
723 H264_WEIGHT( 8)
724 
725 av_cold void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth,
726  const int chroma_format_idc)
727 {
728  if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) {
729  if (bit_depth == 8) {
730  c->h264_idct_add = ff_h264_idct_add_altivec;
731  if (chroma_format_idc == 1)
732  c->h264_idct_add8 = ff_h264_idct_add8_altivec;
733  c->h264_idct_add16 = ff_h264_idct_add16_altivec;
734  c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec;
735  c->h264_idct_dc_add= h264_idct_dc_add_altivec;
736  c->h264_idct8_dc_add = ff_h264_idct8_dc_add_altivec;
737  c->h264_idct8_add = ff_h264_idct8_add_altivec;
738  c->h264_idct8_add4 = ff_h264_idct8_add4_altivec;
739  c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
740  c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
741 
742  c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec;
743  c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec;
744  c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec;
745  c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec;
746  }
747  }
748 }
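The initializer above only installs the AltiVec versions when the runtime CPU reports AltiVec support. As a minimal standalone illustration (not part of this file), the same public libavutil API can be used to check that flag; av_get_cpu_flags() and AV_CPU_FLAG_ALTIVEC are the real FFmpeg symbols, the program itself is only a sketch.

#include <stdio.h>
#include "libavutil/cpu.h"

int main(void)
{
    int flags = av_get_cpu_flags();
    /* AV_CPU_FLAG_ALTIVEC is the flag ff_h264dsp_init_ppc() tests before
     * overriding the C function pointers with the AltiVec versions. */
    printf("AltiVec available: %s\n",
           (flags & AV_CPU_FLAG_ALTIVEC) ? "yes" : "no");
    return 0;
}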