fft_altivec.c
Go to the documentation of this file.
1 /*
2  * FFT/IFFT transforms
3  * AltiVec-enabled
4  * Copyright (c) 2009 Loren Merritt
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
25 #include "libavcodec/fft.h"
26 
27 /**
28  * Do a complex FFT with the parameters defined in ff_fft_init(). The
29  * input data must be permuted beforehand using the s->revtab table. No
30  * 1.0/sqrt(n) normalization is done.
31  * AltiVec-enabled.
32  * This code assumes that the 'z' pointer is 16-byte aligned.
33  * It also assumes every FFTComplex is an 8-byte-aligned pair of floats.
34  */
35 
38 
39 #if HAVE_GNU_AS
/**
 * Half inverse MDCT, AltiVec version.
 *
 * With n = 1 << s->mdct_bits, the function reads input[0 .. n/2) and writes
 * output[0 .. n/2) in three phases:
 *   1. pre-rotation: input values are combined with the tcos/tsin tables and
 *      scattered into output at the complex indices taken from s->revtab;
 *   2. ff_fft_calc_altivec() is run in place on output;
 *   3. post-rotation + reordering: the vectors around output + n/4 are
 *      rotated by tcos/tsin again and written back in permuted order.
 *
 * @param s      FFT context; mdct_bits, revtab, tcos and tsin are read
 * @param output destination buffer, also used as the FFT work buffer
 * @param input  source coefficients
 *
 * Both loops are do/while loops counting down from n32-1 (n32 = n >> 5), so
 * they always execute at least once; n < 32 is not handled here. The init
 * code in this file installs this function only when s->mdct_bits >= 5.
 *
 * NOTE(review): input, output, s->tcos + n/8 and s->tsin + n/8 are accessed
 * through vec_f pointers, so they presumably must be 16-byte aligned --
 * confirm against the allocation sites.
 */
40 static void ff_imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
41 {
42  int j, k;
43  int n = 1 << s->mdct_bits;
44  int n4 = n >> 2;
45  int n8 = n >> 3;
46  int n32 = n >> 5;
 /* revtabj walks forward from the start of revtab, revtabk walks backward
  * from revtab + n4 (see the += 4 / -= 4 at the end of the first loop). */
47  const uint16_t *revtabj = s->revtab;
48  const uint16_t *revtabk = s->revtab+n4;
 /* All four vector pointers are centred on the middle of their range so
  * that they can be indexed with both positive and negative offsets. */
49  const vec_f *tcos = (const vec_f*)(s->tcos+n8);
50  const vec_f *tsin = (const vec_f*)(s->tsin+n8);
51  const vec_f *pin = (const vec_f*)(input+n4);
52  vec_f *pout = (vec_f*)(output+n4);
53 
54  /* pre rotation */
55  k = n32-1;
56  do {
57  vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d;
 /* CMULA(p, o0..o3): load one vector from each side of the centre of the
  * input (pin[k*2+p] and pin[-k*2-p-1]), gather their real/imaginary lanes,
  * pick the matching cos/sin lanes out of cos0/cos1 and sin0/sin1, and
  * leave the rotated result in r<p> / i<p> (p is pasted into the name). */
58 #define CMULA(p,o0,o1,o2,o3)\
59  a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\
60  b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\
61  re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\
62  im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\
63  cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\
64  sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\
65  r##p = im*cos - re*sin;\
66  i##p = re*cos + im*sin;
 /* STORE2(v, dst): write two consecutive floats at output[2*dst] and
  * output[2*dst + 1] (byte offsets 0 and 4 from output + dst*2). Note that
  * j is overwritten as a side effect.
  * NOTE(review): vec_ste stores the single vector element selected by the
  * target address, which is presumably why STORE8 replicates each (r, i)
  * pair into both halves of the vector first -- confirm against the
  * AltiVec programming interface manual. */
67 #define STORE2(v,dst)\
68  j = dst;\
69  vec_ste(v, 0, output+j*2);\
70  vec_ste(v, 4, output+j*2);
 /* STORE8(p): split r<p>/i<p> into four vectors, each holding one (r, i)
  * lane pair twice, and scatter them to the positions given by two revtabk
  * entries and two revtabj entries. */
71 #define STORE8(p)\
72  a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\
73  b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\
74  c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\
75  d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\
76  STORE2(a, revtabk[ p*2-4]);\
77  STORE2(b, revtabk[ p*2-3]);\
78  STORE2(c, revtabj[-p*2+2]);\
79  STORE2(d, revtabj[-p*2+3]);
80 
 /* One twiddle vector from each side of the table centre serves both
  * CMULA invocations; the o0..o3 arguments select which lanes each uses. */
81  cos0 = tcos[k];
82  sin0 = tsin[k];
83  cos1 = tcos[-k-1];
84  sin1 = tsin[-k-1];
85  CMULA(0, 0,1,2,3);
86  CMULA(1, 2,3,0,1);
87  STORE8(0);
88  STORE8(1);
 /* Each iteration consumes four revtab entries from each end. */
89  revtabj += 4;
90  revtabk -= 4;
91  k--;
92  } while(k >= 0);
93 
 /* In-place FFT on the scattered data. */
94  ff_fft_calc_altivec(s, (FFTComplex*)output);
95 
96  /* post rotation + reordering */
 /* j runs upward from -n32 while k runs downward from n32-1, so each
  * iteration handles one vector pair below and one above output + n4. */
97  j = -n32;
98  k = n32-1;
99  do {
100  vec_f cos,sin,re,im,a,b,c,d;
 /* CMULB(d0, d1, o): rotate the vector pair at pout[2*o], pout[2*o+1] by
  * tcos[o]/tsin[o] into d0 and d1.
  * NOTE(review): pout[2*o] is treated as four real parts and pout[2*o+1]
  * as four imaginary parts, presumably the layout ff_fft_calc_altivec
  * leaves behind -- confirm. */
101 #define CMULB(d0,d1,o)\
102  re = pout[o*2];\
103  im = pout[o*2+1];\
104  cos = tcos[o];\
105  sin = tsin[o];\
106  d0 = im*sin - re*cos;\
107  d1 = re*sin + im*cos;
108 
 /* Both CMULB reads happen before any of the four stores below, so the
  * stores cannot clobber data that is still needed in this iteration. */
109  CMULB(a,b,j);
110  CMULB(c,d,k);
 /* Interleave lanes of the j-side result with the lanes of the k-side
  * result taken in reverse order (and vice versa) while writing back. */
111  pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2));
112  pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0));
113  pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2));
114  pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0));
115  j++;
116  k--;
117  } while(k >= 0);
118 }
119 
120 static void ff_imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
121 {
122  int k;
123  int n = 1 << s->mdct_bits;
124  int n4 = n >> 2;
125  int n16 = n >> 4;
126  vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31};
127  vec_u32 *p0 = (vec_u32*)(output+n4);
128  vec_u32 *p1 = (vec_u32*)(output+n4*3);
129 
130  ff_imdct_half_altivec(s, output+n4, input);
131 
132  for (k = 0; k < n16; k++) {
133  vec_u32 a = p0[k] ^ sign;
134  vec_u32 b = p1[-k-1];
135  p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0));
136  p1[k] = vec_perm(b, b, vcprm(3,2,1,0));
137  }
138 }
139 #endif /* HAVE_GNU_AS */
140 
/* NOTE(review): the signature line for this body is absent from this
 * listing. The cross-reference index further down gives it as
 * "av_cold void ff_fft_init_altivec(FFTContext *s)", defined at
 * fft_altivec.c:141 -- restore it from the real source file. */
142 {
143 #if HAVE_GNU_AS
 /* NOTE(review): the embedded numbering jumps from 143 to 145, so one
  * source line may be missing here as well -- check the real file. */
 /* Install the AltiVec IMDCT routines only for transforms with
  * mdct_bits >= 5; otherwise imdct_calc/imdct_half are left untouched. */
145  if (s->mdct_bits >= 5) {
146  s->imdct_calc = ff_imdct_calc_altivec;
147  s->imdct_half = ff_imdct_half_altivec;
148  }
149 #endif
150 }
const char * s
Definition: avisynth_c.h:668
set threshold d
#define av_cold
Definition: attributes.h:78
#define b
Definition: input.c:42
#define U(x)
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
Do a complex FFT with the parameters defined in ff_fft_init().
#define s2
Definition: regdef.h:39
#define s0
Definition: regdef.h:37
these buffered frames must be flushed immediately if a new input produces new the filter must not call request_frame to get more It must just process the frame or queue it The task of requesting more frames is left to the filter s request_frame method or the application If a filter has several the filter must be ready for frames arriving randomly on any input any filter with several inputs will most likely require some kind of queuing mechanism It is perfectly acceptable to have a limited queue and to drop frames when the inputs are too unbalanced request_frame This method is called when a frame is wanted on an output For an input
void(* imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input)
Definition: fft.h:81
float FFTSample
Definition: avfft.h:35
Definition: fft.h:62
FFTSample * tsin
Definition: fft.h:71
#define vec_u32
Definition: types_altivec.h:31
av_cold void ff_fft_init_altivec(FFTContext *s)
Definition: fft_altivec.c:141
#define s3
Definition: regdef.h:40
for k
float im
Definition: fft-test.c:64
#define vcprm(a, b, c, d)
Definition: util_altivec.h:48
int mdct_bits
Definition: fft.h:68
#define s1
Definition: regdef.h:38
void(* imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input)
Definition: fft.h:82
Contains misc utility macros and inline functions.
static double c[64]
void(* fft_calc)(struct FFTContext *s, FFTComplex *z)
Do a complex FFT with the parameters defined in ff_fft_init().
Definition: fft.h:80
these buffered frames must be flushed immediately if a new input produces new output(Example:frame rate-doubling filter:filter_frame must(1) flush the second copy of the previous frame, if it is still there,(2) push the first copy of the incoming frame,(3) keep the second copy for later.) If the input frame is not enough to produce output
FFTSample * tcos
Definition: fft.h:70
float re
Definition: fft-test.c:64
half analysis window size pin
uint16_t * revtab
Definition: fft.h:65
#define vec_f
Definition: types_altivec.h:33
void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z)