/* hpeldsp_altivec.c: AltiVec (PPC) half-pel pixel operations */
1 /*
2  * Copyright (c) 2002 Brian Foley
3  * Copyright (c) 2002 Dieter Shirley
4  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavcodec/hpeldsp.h"
26 
27 #if HAVE_ALTIVEC
28 #if HAVE_ALTIVEC_H
29 #include <altivec.h>
30 #endif
33 #include "dsputil_altivec.h"
34 
/* next one assumes that ((line_size % 16) == 0) */
/**
 * Copy a 16-byte-wide block of h rows from pixels to block.
 *
 * pixels may be unaligned: each row is assembled from two aligned
 * vec_ld loads (offsets 0 and 15 past the row start) combined via the
 * alignment permutation from vec_lvsl(0, pixels); line_size % 16 == 0
 * keeps that single permutation valid for every row.  vec_st stores to
 * the 16-byte-aligned address containing block, so block is presumably
 * 16-byte aligned — TODO confirm at call sites.
 * NOTE(review): the loop is unrolled by 4, so h is presumably a
 * multiple of 4 — verify against callers.
 */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    /* precomputed offsets of rows 1..3 relative to the current row,
       and the per-iteration advance (4 rows) */
    register ptrdiff_t line_size_2 = line_size << 1;
    register ptrdiff_t line_size_3 = line_size + line_size_2;
    register ptrdiff_t line_size_4 = line_size << 2;

// hand-unrolling the loop by 4 gains about 15%
// mininum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_ld( 0, pixels);
        pixelsv2  = vec_ld(15, pixels);
        pixelsv1B = vec_ld(line_size, pixels);
        pixelsv2B = vec_ld(15 + line_size, pixels);
        pixelsv1C = vec_ld(line_size_2, pixels);
        pixelsv2C = vec_ld(15 + line_size_2, pixels);
        pixelsv1D = vec_ld(line_size_3, pixels);
        pixelsv2D = vec_ld(15 + line_size_3, pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block  += line_size_4;
    }
}
75 
/* next one assumes that ((line_size % 16) == 0) */
/* Scalar byte-wise rounded average, (a + b + 1) >> 1 per packed byte.
 * NOTE(review): unused in the visible code (the AltiVec path uses
 * vec_avg instead) — presumably kept from the scalar reference. */
#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
78 void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
79 {
80  register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
81  register vector unsigned char perm = vec_lvsl(0, pixels);
82  int i;
83 
84  for (i = 0; i < h; i++) {
85  pixelsv1 = vec_ld( 0, pixels);
86  pixelsv2 = vec_ld(16,pixels);
87  blockv = vec_ld(0, block);
88  pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
89  blockv = vec_avg(blockv,pixelsv);
90  vec_st(blockv, 0, (unsigned char*)block);
91  pixels+=line_size;
92  block +=line_size;
93  }
94 }
95 
/* next one assumes that ((line_size % 8) == 0) */
/**
 * Rounded average of an 8-byte-wide block into block:
 * block[x] = (block[x] + pixels[x] + 1) >> 1 for each of h rows.
 *
 * AltiVec works on 16-byte vectors, so each iteration loads the aligned
 * 16-byte vector containing the 8 destination bytes, merges the 8
 * source bytes into the matching half (left or right, depending on
 * block's 16-byte alignment) and fills the other half from blockv
 * itself — vec_avg of a byte with itself is a no-op, so only the
 * intended 8 bytes change.
 */
static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    for (i = 0; i < h; i++) {
        /* block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not) */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        /* realign the (possibly unaligned) 8 source bytes */
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }
}
126 
/* next one assumes that ((line_size % 8) == 0) */
/**
 * 8-wide 2D half-pel interpolation with rounding:
 *   dst[x] = (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 2) >> 2
 * for h rows.
 *
 * The horizontal pair sum of each source row is kept as 16-bit lanes;
 * the previous row's sum (with the rounding constant 2 already folded
 * in) is carried across iterations in pixelssum1, so every source row
 * is loaded only once.  The 8 result bytes are merged into the correct
 * half of the 16-byte destination vector (same left/right trick as
 * avg_pixels8_altivec).
 */
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    /* vctwo doubles as the rounding addend (+2) and the shift count (>>2) */
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* prime pixelssum1 with row 0: (p[x] + p[x+1]) + 2, widened to u16 */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        /* the x+1 window starts exactly at the second vector, so temp2
           is already the shifted row; vec_lvsl(1, ...) would select
           bytes past temp2 */
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);   /* zero-extend low 8 bytes */
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* horizontal pair sum of the next source row */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);   /* A + B + C + D + 2 */
        temp3 = vec_sra(temp3, vctwo);             /* >> 2 */
        pixelssum1 = vec_add(pixelssum2, vctwo);   /* carry row sum forward */
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
185 
/* next one assumes that ((line_size % 8) == 0) */
/**
 * 8-wide 2D half-pel interpolation without rounding:
 *   dst[x] = (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 1) >> 2
 * for h rows.
 *
 * Identical structure to put_pixels8_xy2_altivec, except the folded-in
 * constant is 1 (vcone) instead of 2, giving the "no rounding" variant.
 */
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    /* vctwo is only the shift count here; the addend is vcone */
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* prime pixelssum1 with row 0: (p[x] + p[x+1]) + 1, widened to u16 */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        /* x+1 window starts exactly at the second vector (see
           put_pixels8_xy2_altivec) */
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* horizontal pair sum of the next source row */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);   /* A + B + C + D + 1 */
        temp3 = vec_sra(temp3, vctwo);             /* >> 2 */
        pixelssum1 = vec_add(pixelssum2, vcone);   /* carry row sum forward */
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
245 
/* next one assumes that ((line_size % 16) == 0) */
/**
 * 16-wide 2D half-pel interpolation with rounding:
 *   dst[x] = (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 2) >> 2
 * for h rows.
 *
 * Same scheme as put_pixels8_xy2_altivec, but both halves of the
 * 16-byte row are processed: vec_mergeh widens bytes 0-7 and vec_mergel
 * bytes 8-15 into separate 16-bit accumulators (pixelssum1/pixelssum3
 * hold the previous row's pair sums plus the rounding constant;
 * pixelssum2/pixelssum4 the current row's).  The two shifted results
 * are re-packed with vec_packsu into one full output vector.
 */
static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    /* vctwo doubles as the rounding addend (+2) and the shift count (>>2) */
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* prime both accumulators with row 0: (p[x] + p[x+1]) + 2 */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        /* x+1 window starts exactly at the second vector (see
           put_pixels8_xy2_altivec) */
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);   /* widen bytes 8-15 */
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);   /* widen bytes 0-7 */
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        /* horizontal pair sums of the next source row, both halves */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);   /* high half: A+B+C+D+2 */
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);   /* low half: A+B+C+D+2 */
        temp3 = vec_sra(temp3, vctwo);

        /* carry this row's sums (plus rounding) into the next iteration */
        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
313 
/* next one assumes that ((line_size % 16) == 0) */
/**
 * 16-wide 2D half-pel interpolation without rounding:
 *   dst[x] = (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 1) >> 2
 * for h rows.
 *
 * Identical structure to put_pixels16_xy2_altivec, except the folded-in
 * constant is 1 (vcone) instead of 2, giving the "no rounding" variant.
 */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    /* vctwo is only the shift count here; the addend is vcone */
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* prime both accumulators with row 0: (p[x] + p[x+1]) + 1 */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        /* x+1 window starts exactly at the second vector (see
           put_pixels8_xy2_altivec) */
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);   /* widen bytes 8-15 */
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);   /* widen bytes 0-7 */
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        /* horizontal pair sums of the next source row, both halves */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);   /* high half: A+B+C+D+1 */
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);   /* low half: A+B+C+D+1 */
        temp3 = vec_sra(temp3, vctwo);

        /* carry this row's sums (plus constant) into the next iteration */
        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
382 
/* next one assumes that ((line_size % 8) == 0) */
/**
 * 8-wide 2D half-pel interpolation with rounding, averaged into the
 * destination:
 *   dst[x] = avg(dst[x],
 *                (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 2) >> 2)
 * for h rows, where avg is the rounded byte average (a + b + 1) >> 1.
 *
 * The interpolation is identical to put_pixels8_xy2_altivec; the result
 * is then combined with the existing block bytes via vec_avg.  Bytes
 * outside the 8-byte target half are filled from blockv itself, so
 * vec_avg leaves them unchanged.
 */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
                                        vec_splat_u8(0);
    /* vctwo doubles as the rounding addend (+2) and the shift count (>>2) */
    register const vector unsigned short vctwo = (const vector unsigned short)
                                        vec_splat_u16(2);

    /* prime pixelssum1 with row 0: (p[x] + p[x+1]) + 2, widened to u16 */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        /* x+1 window starts exactly at the second vector (see
           put_pixels8_xy2_altivec) */
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* horizontal pair sum of the next source row */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);   /* A + B + C + D + 2 */
        temp3 = vec_sra(temp3, vctwo);             /* >> 2 */
        pixelssum1 = vec_add(pixelssum2, vctwo);   /* carry row sum forward */
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        /* rounded average with the existing destination bytes */
        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
445 #endif /* HAVE_ALTIVEC */
446 
448 {
449 #if HAVE_ALTIVEC
450  int mm_flags = av_get_cpu_flags();
451 
452  if (mm_flags & AV_CPU_FLAG_ALTIVEC) {
454  c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
455  c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
456 
458  c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
459  c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
460 
462  c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
463  c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
464  }
465 #endif /* HAVE_ALTIVEC */
466 }
/* (Doxygen cross-reference residue removed: this was the generated
 * symbol index — AV_CPU_FLAG_ALTIVEC, avg_pixels_tab, av_cold, vcprm,
 * s0/s1, av_get_cpu_flags, ff_hpeldsp_init_ppc, etc. — and is not part
 * of the compilable source file.) */