fft_mips.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2012
3  * MIPS Technologies, Inc., California.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14  * contributors may be used to endorse or promote products derived from
15  * this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * Author: Stanislav Ocovaj (socovaj@mips.com)
30  * Author: Zoran Lukic (zoranl@mips.com)
31  *
32  * Optimized MDCT/IMDCT and FFT transforms
33  *
34  * This file is part of FFmpeg.
35  *
36  * FFmpeg is free software; you can redistribute it and/or
37  * modify it under the terms of the GNU Lesser General Public
38  * License as published by the Free Software Foundation; either
39  * version 2.1 of the License, or (at your option) any later version.
40  *
41  * FFmpeg is distributed in the hope that it will be useful,
42  * but WITHOUT ANY WARRANTY; without even the implied warranty of
43  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
44  * Lesser General Public License for more details.
45  *
46  * You should have received a copy of the GNU Lesser General Public
47  * License along with FFmpeg; if not, write to the Free Software
48  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
49  */
50 #include "config.h"
51 #include "libavcodec/fft.h"
52 #include "fft_table.h"
53 
54 /**
55  * FFT transform
56  */
57 
58 #if HAVE_INLINE_ASM
/**
 * In-place complex FFT of z for a transform of (1 << s->nbits) points,
 * implemented with MIPS inline assembly.
 *
 * The work is done in three phases:
 *  1. 4-point passes written in C, on the groups that start at
 *     z + (fft_offsets_lut[n] << 2).
 *  2. 8-point passes in inline asm, on the groups that start at
 *     z + (fft_offsets_lut[n] << 3). This phase and the next one are skipped
 *     when the transform has fewer than 8 points.
 *  3. For nbits = 4 .. s->nbits, combining passes on the groups that start at
 *     z + (fft_offsets_lut[n] << nbits), using factors read from ff_cos_65536.
 *
 * The asm addresses the buffer with byte offsets; according to the inline
 * comments 8 bytes correspond to one FFTComplex (re at +0, im at +4).
 *
 * @param s context; only s->nbits is read by this function
 * @param z buffer that is transformed in place
 */
59 static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z)
60 {
61  int nbits, i, n, num_transforms, offset, step;
62  int n4, n2, n34;
63  FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
64  FFTComplex *tmpz;
65  float w_re, w_im;
66  float *w_re_ptr, *w_im_ptr;
67  const int fft_size = (1 << s->nbits);
68  int s_n = s->nbits;
69  int tem1, tem2;
70  float pom, pom1, pom2, pom3;
71  float temp, temp1, temp3, temp4;
72  FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
73  FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
74 
75  /**
76  *num_transforms = (0x2aab >> (16 - s->nbits)) | 1;
77  */
 /* 10923 below is 0x2aab in decimal. */
78  __asm__ volatile (
79  "li %[tem1], 16 \n\t"
80  "sub %[s_n], %[tem1], %[s_n] \n\t"
81  "li %[tem2], 10923 \n\t"
82  "srav %[tem2], %[tem2], %[s_n] \n\t"
83  "ori %[num_t],%[tem2], 1 \n\t"
84  : [num_t]"=r"(num_transforms), [s_n]"+r"(s_n),
85  [tem1]"=&r"(tem1), [tem2]"=&r"(tem2)
86  );
 /* s_n is an in/out operand: it now holds 16 - s->nbits and is not used again. */
87 
88 
 /* Phase 1: combine the four complex values tmpz[0..3] of every group, in place. */
89  for (n=0; n<num_transforms; n++) {
90  offset = fft_offsets_lut[n] << 2;
91  tmpz = z + offset;
92 
93  tmp1 = tmpz[0].re + tmpz[1].re;
94  tmp5 = tmpz[2].re + tmpz[3].re;
95  tmp2 = tmpz[0].im + tmpz[1].im;
96  tmp6 = tmpz[2].im + tmpz[3].im;
97  tmp3 = tmpz[0].re - tmpz[1].re;
98  tmp8 = tmpz[2].im - tmpz[3].im;
99  tmp4 = tmpz[0].im - tmpz[1].im;
100  tmp7 = tmpz[2].re - tmpz[3].re;
101 
102  tmpz[0].re = tmp1 + tmp5;
103  tmpz[2].re = tmp1 - tmp5;
104  tmpz[0].im = tmp2 + tmp6;
105  tmpz[2].im = tmp2 - tmp6;
106  tmpz[1].re = tmp3 + tmp8;
107  tmpz[3].re = tmp3 - tmp8;
108  tmpz[1].im = tmp4 - tmp7;
109  tmpz[3].im = tmp4 + tmp7;
110 
111  }
112 
 /* Nothing more to do for transforms of fewer than 8 points. */
113  if (fft_size < 8)
114  return;
115 
116  num_transforms = (num_transforms >> 1) | 1;
117 
 /*
  * Phase 2: 8-point pass on tmpz[0..7]. Loads, arithmetic and stores are
  * interleaved by hand; the trailing comments give the C statement each
  * instruction (or instruction group) implements. The constant loaded with
  * li.s, 0.7071067812, is approximately 1/sqrt(2).
  */
118  for (n=0; n<num_transforms; n++) {
119  offset = fft_offsets_lut[n] << 3;
120  tmpz = z + offset;
121 
122  __asm__ volatile (
123  "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
124  "lwc1 %[pom], 40(%[tmpz]) \n\t"
125  "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
126  "lwc1 %[pom1], 56(%[tmpz]) \n\t"
127  "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
128  "lwc1 %[pom2], 44(%[tmpz]) \n\t"
129  "lwc1 %[pom3], 60(%[tmpz]) \n\t"
130  "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
131  "add.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re + tmpz[5].re;
132  "add.s %[tmp3], %[tmp3], %[pom1] \n\t" // tmp3 = tmpz[6].re + tmpz[7].re;
133  "add.s %[tmp2], %[tmp2], %[pom2] \n\t" // tmp2 = tmpz[4].im + tmpz[5].im;
134  "lwc1 %[pom], 40(%[tmpz]) \n\t"
135  "add.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im + tmpz[7].im;
136  "add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
137  "sub.s %[tmp7], %[tmp1], %[tmp3] \n\t" // tmp7 = tmp1 - tmp3;
138  "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
139  "lwc1 %[pom1], 44(%[tmpz]) \n\t"
140  "add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
141  "sub.s %[tmp8], %[tmp2], %[tmp4] \n\t" // tmp8 = tmp2 - tmp4;
142  "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
143  "lwc1 %[pom2], 56(%[tmpz]) \n\t"
144  "lwc1 %[pom3], 60(%[tmpz]) \n\t"
145  "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
146  "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
147  "sub.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re - tmpz[5].re;
148  "lwc1 %[pom], 0(%[tmpz]) \n\t"
149  "sub.s %[tmp2], %[tmp2], %[pom1] \n\t" // tmp2 = tmpz[4].im - tmpz[5].im;
150  "sub.s %[tmp3], %[tmp3], %[pom2] \n\t" // tmp3 = tmpz[6].re - tmpz[7].re;
151  "lwc1 %[pom2], 4(%[tmpz]) \n\t"
152  "sub.s %[pom1], %[pom], %[tmp5] \n\t"
153  "sub.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im - tmpz[7].im;
154  "add.s %[pom3], %[pom], %[tmp5] \n\t"
155  "sub.s %[pom], %[pom2], %[tmp6] \n\t"
156  "add.s %[pom2], %[pom2], %[tmp6] \n\t"
157  "swc1 %[pom1], 32(%[tmpz]) \n\t" // tmpz[4].re = tmpz[0].re - tmp5;
158  "swc1 %[pom3], 0(%[tmpz]) \n\t" // tmpz[0].re = tmpz[0].re + tmp5;
159  "swc1 %[pom], 36(%[tmpz]) \n\t" // tmpz[4].im = tmpz[0].im - tmp6;
160  "swc1 %[pom2], 4(%[tmpz]) \n\t" // tmpz[0].im = tmpz[0].im + tmp6;
161  "lwc1 %[pom1], 16(%[tmpz]) \n\t"
162  "lwc1 %[pom3], 20(%[tmpz]) \n\t"
163  "li.s %[pom], 0.7071067812 \n\t" // float pom = 0.7071067812f;
164  "add.s %[temp1],%[tmp1], %[tmp2] \n\t"
165  "sub.s %[temp], %[pom1], %[tmp8] \n\t"
166  "add.s %[pom2], %[pom3], %[tmp7] \n\t"
167  "sub.s %[temp3],%[tmp3], %[tmp4] \n\t"
168  "sub.s %[temp4],%[tmp2], %[tmp1] \n\t"
169  "swc1 %[temp], 48(%[tmpz]) \n\t" // tmpz[6].re = tmpz[2].re - tmp8;
170  "swc1 %[pom2], 52(%[tmpz]) \n\t" // tmpz[6].im = tmpz[2].im + tmp7;
171  "add.s %[pom1], %[pom1], %[tmp8] \n\t"
172  "sub.s %[pom3], %[pom3], %[tmp7] \n\t"
173  "add.s %[tmp3], %[tmp3], %[tmp4] \n\t"
174  "mul.s %[tmp5], %[pom], %[temp1] \n\t" // tmp5 = pom * (tmp1 + tmp2);
175  "mul.s %[tmp7], %[pom], %[temp3] \n\t" // tmp7 = pom * (tmp3 - tmp4);
176  "mul.s %[tmp6], %[pom], %[temp4] \n\t" // tmp6 = pom * (tmp2 - tmp1);
177  "mul.s %[tmp8], %[pom], %[tmp3] \n\t" // tmp8 = pom * (tmp3 + tmp4);
178  "swc1 %[pom1], 16(%[tmpz]) \n\t" // tmpz[2].re = tmpz[2].re + tmp8;
179  "swc1 %[pom3], 20(%[tmpz]) \n\t" // tmpz[2].im = tmpz[2].im - tmp7;
180  "add.s %[tmp1], %[tmp5], %[tmp7] \n\t" // tmp1 = tmp5 + tmp7;
181  "sub.s %[tmp3], %[tmp5], %[tmp7] \n\t" // tmp3 = tmp5 - tmp7;
182  "add.s %[tmp2], %[tmp6], %[tmp8] \n\t" // tmp2 = tmp6 + tmp8;
183  "sub.s %[tmp4], %[tmp6], %[tmp8] \n\t" // tmp4 = tmp6 - tmp8;
184  "lwc1 %[temp], 8(%[tmpz]) \n\t"
185  "lwc1 %[temp1],12(%[tmpz]) \n\t"
186  "lwc1 %[pom], 24(%[tmpz]) \n\t"
187  "lwc1 %[pom2], 28(%[tmpz]) \n\t"
188  "sub.s %[temp4],%[temp], %[tmp1] \n\t"
189  "sub.s %[temp3],%[temp1], %[tmp2] \n\t"
190  "add.s %[temp], %[temp], %[tmp1] \n\t"
191  "add.s %[temp1],%[temp1], %[tmp2] \n\t"
192  "sub.s %[pom1], %[pom], %[tmp4] \n\t"
193  "add.s %[pom3], %[pom2], %[tmp3] \n\t"
194  "add.s %[pom], %[pom], %[tmp4] \n\t"
195  "sub.s %[pom2], %[pom2], %[tmp3] \n\t"
196  "swc1 %[temp4],40(%[tmpz]) \n\t" // tmpz[5].re = tmpz[1].re - tmp1;
197  "swc1 %[temp3],44(%[tmpz]) \n\t" // tmpz[5].im = tmpz[1].im - tmp2;
198  "swc1 %[temp], 8(%[tmpz]) \n\t" // tmpz[1].re = tmpz[1].re + tmp1;
199  "swc1 %[temp1],12(%[tmpz]) \n\t" // tmpz[1].im = tmpz[1].im + tmp2;
200  "swc1 %[pom1], 56(%[tmpz]) \n\t" // tmpz[7].re = tmpz[3].re - tmp4;
201  "swc1 %[pom3], 60(%[tmpz]) \n\t" // tmpz[7].im = tmpz[3].im + tmp3;
202  "swc1 %[pom], 24(%[tmpz]) \n\t" // tmpz[3].re = tmpz[3].re + tmp4;
203  "swc1 %[pom2], 28(%[tmpz]) \n\t" // tmpz[3].im = tmpz[3].im - tmp3;
204  : [tmp1]"=&f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
205  [tmp3]"=&f"(tmp3), [tmp2]"=&f"(tmp2), [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp7]"=&f"(tmp7),
206  [tmp6]"=&f"(tmp6), [tmp8]"=&f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
207  [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
208  : [tmpz]"r"(tmpz)
209  : "memory"
210  );
211  }
212 
 /*
  * Phase 3: one pass per value of nbits from 4 up to s->nbits. n4 is the
  * distance between the four sub-blocks combined by a pass (tmpz, tmpz + n4,
  * tmpz + n2, tmpz + n34); it doubles after every pass while the table
  * stride `step` is halved.
  */
213  step = 1 << (MAX_LOG2_NFFT - 4);
214  n4 = 4;
215 
216  for (nbits=4; nbits<=s->nbits; nbits++) {
217  /*
218  * num_transforms = (num_transforms >> 1) | 1;
219  */
220  __asm__ volatile (
221  "sra %[num_t], %[num_t], 1 \n\t"
222  "ori %[num_t], %[num_t], 1 \n\t"
223 
224  : [num_t] "+r" (num_transforms)
225  );
226  n2 = 2 * n4;
227  n34 = 3 * n4;
228 
229  for (n=0; n<num_transforms; n++) {
230  offset = fft_offsets_lut[n] << nbits;
231  tmpz = z + offset;
232 
233  tmpz_n2 = tmpz + n2;
234  tmpz_n4 = tmpz + n4;
235  tmpz_n34 = tmpz + n34;
236 
 /* Element 0 of each sub-block: combined without any w_re / w_im multiplication. */
237  __asm__ volatile (
238  "lwc1 %[pom1], 0(%[tmpz_n2]) \n\t"
239  "lwc1 %[pom], 0(%[tmpz_n34]) \n\t"
240  "lwc1 %[pom2], 4(%[tmpz_n2]) \n\t"
241  "lwc1 %[pom3], 4(%[tmpz_n34]) \n\t"
242  "lwc1 %[temp1],0(%[tmpz]) \n\t"
243  "lwc1 %[temp3],4(%[tmpz]) \n\t"
244  "add.s %[tmp5], %[pom1], %[pom] \n\t" // tmp5 = tmpz[ n2].re + tmpz[n34].re;
245  "sub.s %[tmp1], %[pom1], %[pom] \n\t" // tmp1 = tmpz[ n2].re - tmpz[n34].re;
246  "add.s %[tmp6], %[pom2], %[pom3] \n\t" // tmp6 = tmpz[ n2].im + tmpz[n34].im;
247  "sub.s %[tmp2], %[pom2], %[pom3] \n\t" // tmp2 = tmpz[ n2].im - tmpz[n34].im;
248  "sub.s %[temp], %[temp1], %[tmp5] \n\t"
249  "add.s %[temp1],%[temp1], %[tmp5] \n\t"
250  "sub.s %[temp4],%[temp3], %[tmp6] \n\t"
251  "add.s %[temp3],%[temp3], %[tmp6] \n\t"
252  "swc1 %[temp], 0(%[tmpz_n2]) \n\t" // tmpz[ n2].re = tmpz[ 0].re - tmp5;
253  "swc1 %[temp1],0(%[tmpz]) \n\t" // tmpz[ 0].re = tmpz[ 0].re + tmp5;
254  "lwc1 %[pom1], 0(%[tmpz_n4]) \n\t"
255  "swc1 %[temp4],4(%[tmpz_n2]) \n\t" // tmpz[ n2].im = tmpz[ 0].im - tmp6;
256  "lwc1 %[temp], 4(%[tmpz_n4]) \n\t"
257  "swc1 %[temp3],4(%[tmpz]) \n\t" // tmpz[ 0].im = tmpz[ 0].im + tmp6;
258  "sub.s %[pom], %[pom1], %[tmp2] \n\t"
259  "add.s %[pom1], %[pom1], %[tmp2] \n\t"
260  "add.s %[temp1],%[temp], %[tmp1] \n\t"
261  "sub.s %[temp], %[temp], %[tmp1] \n\t"
262  "swc1 %[pom], 0(%[tmpz_n34]) \n\t" // tmpz[n34].re = tmpz[n4].re - tmp2;
263  "swc1 %[pom1], 0(%[tmpz_n4]) \n\t" // tmpz[ n4].re = tmpz[n4].re + tmp2;
264  "swc1 %[temp1],4(%[tmpz_n34]) \n\t" // tmpz[n34].im = tmpz[n4].im + tmp1;
265  "swc1 %[temp], 4(%[tmpz_n4]) \n\t" // tmpz[ n4].im = tmpz[n4].im - tmp1;
266  : [tmp5]"=&f"(tmp5),
267  [tmp1]"=&f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
268  [tmp2]"=&f"(tmp2), [tmp6]"=&f"(tmp6), [pom3]"=&f"(pom3),
269  [temp]"=&f"(temp), [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
270  : [tmpz]"r"(tmpz), [tmpz_n2]"r"(tmpz_n2), [tmpz_n34]"r"(tmpz_n34), [tmpz_n4]"r"(tmpz_n4)
271  : "memory"
272  );
273 
 /*
  * Elements 1 .. n4-1: w_re is read walking forward from
  * ff_cos_65536 + step, w_im walking backward from
  * ff_cos_65536 + MAX_FFT_SIZE/4 - step, both in strides of step.
  */
274  w_re_ptr = (float*)(ff_cos_65536 + step);
275  w_im_ptr = (float*)(ff_cos_65536 + MAX_FFT_SIZE/4 - step);
276 
277  for (i=1; i<n4; i++) {
278  w_re = w_re_ptr[0];
279  w_im = w_im_ptr[0];
280  tmpz_n2_i = tmpz_n2 + i;
281  tmpz_n4_i = tmpz_n4 + i;
282  tmpz_n34_i= tmpz_n34 + i;
283  tmpz_i = tmpz + i;
284 
285  __asm__ volatile (
286  "lwc1 %[temp], 0(%[tmpz_n2_i]) \n\t"
287  "lwc1 %[temp1], 4(%[tmpz_n2_i]) \n\t"
288  "lwc1 %[pom], 0(%[tmpz_n34_i]) \n\t"
289  "lwc1 %[pom1], 4(%[tmpz_n34_i]) \n\t"
290  "mul.s %[temp3], %[w_im], %[temp] \n\t"
291  "mul.s %[temp4], %[w_im], %[temp1] \n\t"
292  "mul.s %[pom2], %[w_im], %[pom1] \n\t"
293  "mul.s %[pom3], %[w_im], %[pom] \n\t"
294  "msub.s %[tmp2], %[temp3], %[w_re], %[temp1] \n\t" // tmp2 = w_re * tmpz[ n2+i].im - w_im * tmpz[ n2+i].re;
295  "madd.s %[tmp1], %[temp4], %[w_re], %[temp] \n\t" // tmp1 = w_re * tmpz[ n2+i].re + w_im * tmpz[ n2+i].im;
296  "msub.s %[tmp3], %[pom2], %[w_re], %[pom] \n\t" // tmp3 = w_re * tmpz[n34+i].re - w_im * tmpz[n34+i].im;
297  "madd.s %[tmp4], %[pom3], %[w_re], %[pom1] \n\t" // tmp4 = w_re * tmpz[n34+i].im + w_im * tmpz[n34+i].re;
298  "lwc1 %[temp], 0(%[tmpz_i]) \n\t"
299  "lwc1 %[pom], 4(%[tmpz_i]) \n\t"
300  "add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
301  "sub.s %[tmp1], %[tmp1], %[tmp3] \n\t" // tmp1 = tmp1 - tmp3;
302  "add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
303  "sub.s %[tmp2], %[tmp2], %[tmp4] \n\t" // tmp2 = tmp2 - tmp4;
304  "sub.s %[temp1], %[temp], %[tmp5] \n\t"
305  "add.s %[temp], %[temp], %[tmp5] \n\t"
306  "sub.s %[pom1], %[pom], %[tmp6] \n\t"
307  "add.s %[pom], %[pom], %[tmp6] \n\t"
308  "lwc1 %[temp3], 0(%[tmpz_n4_i]) \n\t"
309  "lwc1 %[pom2], 4(%[tmpz_n4_i]) \n\t"
310  "swc1 %[temp1], 0(%[tmpz_n2_i]) \n\t" // tmpz[ n2+i].re = tmpz[ i].re - tmp5;
311  "swc1 %[temp], 0(%[tmpz_i]) \n\t" // tmpz[ i].re = tmpz[ i].re + tmp5;
312  "swc1 %[pom1], 4(%[tmpz_n2_i]) \n\t" // tmpz[ n2+i].im = tmpz[ i].im - tmp6;
313  "swc1 %[pom] , 4(%[tmpz_i]) \n\t" // tmpz[ i].im = tmpz[ i].im + tmp6;
314  "sub.s %[temp4], %[temp3], %[tmp2] \n\t"
315  "add.s %[pom3], %[pom2], %[tmp1] \n\t"
316  "add.s %[temp3], %[temp3], %[tmp2] \n\t"
317  "sub.s %[pom2], %[pom2], %[tmp1] \n\t"
318  "swc1 %[temp4], 0(%[tmpz_n34_i]) \n\t" // tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
319  "swc1 %[pom3], 4(%[tmpz_n34_i]) \n\t" // tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
320  "swc1 %[temp3], 0(%[tmpz_n4_i]) \n\t" // tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
321  "swc1 %[pom2], 4(%[tmpz_n4_i]) \n\t" // tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
322  : [tmp1]"=&f"(tmp1), [tmp2]"=&f" (tmp2), [temp]"=&f"(temp), [tmp3]"=&f"(tmp3),
323  [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp6]"=&f"(tmp6),
324  [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
325  [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2), [pom3]"=&f"(pom3)
326  : [w_re]"f"(w_re), [w_im]"f"(w_im),
327  [tmpz_i]"r"(tmpz_i),[tmpz_n2_i]"r"(tmpz_n2_i),
328  [tmpz_n34_i]"r"(tmpz_n34_i), [tmpz_n4_i]"r"(tmpz_n4_i)
329  : "memory"
330  );
331  w_re_ptr += step;
332  w_im_ptr -= step;
333  }
334  }
 /* Next pass: sub-blocks twice as long, table stride half as large. */
335  step >>= 1;
336  n4 <<= 1;
337  }
338 }
339 
340 /**
341  * MDCT/IMDCT transforms.
342  */
343 
/**
 * "Half" inverse MDCT step, implemented with MIPS inline assembly.
 *
 * With n = 1 << s->mdct_bits the function
 *  - pre-rotates the input with s->tcos / s->tsin, two complex results per
 *    iteration, and stores them in output (accessed as FFTComplex) at the
 *    indices given by s->revtab;
 *  - calls s->fft_calc() on that buffer;
 *  - post-rotates the result in place, again with s->tcos / s->tsin.
 *
 * @param s      context providing mdct_bits, revtab, tcos, tsin and fft_calc
 * @param output destination buffer, accessed as an array of FFTComplex
 * @param input  source samples
 */
344 static void ff_imdct_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
345 {
346  int k, n8, n4, n2, n, j;
347  const uint16_t *revtab = s->revtab;
348  const FFTSample *tcos = s->tcos;
349  const FFTSample *tsin = s->tsin;
350  const FFTSample *in1, *in2, *in3, *in4;
351  FFTComplex *z = (FFTComplex *)output;
352 
353  int j1;
354  const float *tcos1, *tsin1, *tcos2, *tsin2;
355  float temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
356  temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
357  FFTComplex *z1, *z2;
358 
359  n = 1 << s->mdct_bits;
360  n2 = n >> 1;
361  n4 = n >> 2;
362  n8 = n >> 3;
363 
364  /* pre rotation */
 /* in1/in3 start near the front of the input and in2/in4 near index n2; the asm below moves them towards each other. */
365  in1 = input;
366  in2 = input + n2 - 1;
367  in3 = input + 2;
368  in4 = input + n2 - 3;
369 
370  tcos1 = tcos;
371  tsin1 = tsin;
372 
373  /* n4 = 64 or 128 */
374  for(k = 0; k < n4; k += 2) {
375  j = revtab[k ];
376  j1 = revtab[k + 1];
377 
 /*
  * Two rotations per iteration (nmsub.s d, r, s, t computes r - s * t,
  * madd.s d, r, s, t computes r + s * t):
  *   temp9  = in2[0] * tcos1[0] - in1[0] * tsin1[0]
  *   temp10 = in2[0] * tsin1[0] + in1[0] * tcos1[0]
  *   temp11 = in4[0] * tcos1[1] - in3[0] * tsin1[1]
  *   temp12 = in4[0] * tsin1[1] + in3[0] * tcos1[1]
  * The asm also advances the pointer operands: tcos1/tsin1 by +8 bytes,
  * in1/in3 by +16 bytes and in2/in4 by -16 bytes.
  * NOTE(review): addiu is a 32-bit add applied to pointer operands here;
  * presumably this assumes a 32-bit pointer ABI -- confirm before building
  * for a 64-bit target.
  */
378  __asm__ volatile (
379  "lwc1 %[temp1], 0(%[in2]) \t\n"
380  "lwc1 %[temp2], 0(%[tcos1]) \t\n"
381  "lwc1 %[temp3], 0(%[tsin1]) \t\n"
382  "lwc1 %[temp4], 0(%[in1]) \t\n"
383  "lwc1 %[temp5], 0(%[in4]) \t\n"
384  "mul.s %[temp9], %[temp1], %[temp2] \t\n"
385  "mul.s %[temp10], %[temp1], %[temp3] \t\n"
386  "lwc1 %[temp6], 4(%[tcos1]) \t\n"
387  "lwc1 %[temp7], 4(%[tsin1]) \t\n"
388  "nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n"
389  "madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n"
390  "mul.s %[temp11], %[temp5], %[temp6] \t\n"
391  "mul.s %[temp12], %[temp5], %[temp7] \t\n"
392  "lwc1 %[temp8], 0(%[in3]) \t\n"
393  "addiu %[tcos1], %[tcos1], 8 \t\n"
394  "addiu %[tsin1], %[tsin1], 8 \t\n"
395  "addiu %[in1], %[in1], 16 \t\n"
396  "nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n"
397  "madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n"
398  "addiu %[in2], %[in2], -16 \t\n"
399  "addiu %[in3], %[in3], 16 \t\n"
400  "addiu %[in4], %[in4], -16 \t\n"
401 
402  : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
403  [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
404  [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
405  [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
406  [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
407  [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
408  [tsin1]"+r"(tsin1), [tcos1]"+r"(tcos1),
409  [in1]"+r"(in1), [in2]"+r"(in2),
410  [in3]"+r"(in3), [in4]"+r"(in4)
411  :
412  : "memory"
413  );
414 
 /* Scatter the two results to the positions taken from revtab. */
415  z[j ].re = temp9;
416  z[j ].im = temp10;
417  z[j1].re = temp11;
418  z[j1].im = temp12;
419  }
420 
421  s->fft_calc(s, z);
422 
423  /* post rotation + reordering */
424  /* n8 = 32 or 64 */
 /*
  * Each iteration rotates z1[0], z1[1] (below index n8) and z2[0], z2[1]
  * (from index n8 upwards). All eight results are computed in registers
  * before any of them is stored, because the stores after the asm exchange
  * the imaginary parts between the z1 and z2 elements.
  */
425  for(k = 0; k < n8; k += 2) {
426  tcos1 = &tcos[n8 - k - 2];
427  tsin1 = &tsin[n8 - k - 2];
428  tcos2 = &tcos[n8 + k];
429  tsin2 = &tsin[n8 + k];
430  z1 = &z[n8 - k - 2];
431  z2 = &z[n8 + k ];
432 
433  __asm__ volatile (
434  "lwc1 %[temp1], 12(%[z1]) \t\n"
435  "lwc1 %[temp2], 4(%[tsin1]) \t\n"
436  "lwc1 %[temp3], 4(%[tcos1]) \t\n"
437  "lwc1 %[temp4], 8(%[z1]) \t\n"
438  "lwc1 %[temp5], 4(%[z1]) \t\n"
439  "mul.s %[temp9], %[temp1], %[temp2] \t\n"
440  "mul.s %[temp10], %[temp1], %[temp3] \t\n"
441  "lwc1 %[temp6], 0(%[tsin1]) \t\n"
442  "lwc1 %[temp7], 0(%[tcos1]) \t\n"
443  "nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n"
444  "madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n"
445  "mul.s %[temp11], %[temp5], %[temp6] \t\n"
446  "mul.s %[temp12], %[temp5], %[temp7] \t\n"
447  "lwc1 %[temp8], 0(%[z1]) \t\n"
448  "lwc1 %[temp1], 4(%[z2]) \t\n"
449  "lwc1 %[temp2], 0(%[tsin2]) \t\n"
450  "lwc1 %[temp3], 0(%[tcos2]) \t\n"
451  "nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n"
452  "madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n"
453  "mul.s %[temp13], %[temp1], %[temp2] \t\n"
454  "mul.s %[temp14], %[temp1], %[temp3] \t\n"
455  "lwc1 %[temp4], 0(%[z2]) \t\n"
456  "lwc1 %[temp5], 12(%[z2]) \t\n"
457  "lwc1 %[temp6], 4(%[tsin2]) \t\n"
458  "lwc1 %[temp7], 4(%[tcos2]) \t\n"
459  "nmsub.s %[temp13], %[temp13], %[temp4], %[temp3] \t\n"
460  "madd.s %[temp14], %[temp14], %[temp4], %[temp2] \t\n"
461  "mul.s %[temp15], %[temp5], %[temp6] \t\n"
462  "mul.s %[temp16], %[temp5], %[temp7] \t\n"
463  "lwc1 %[temp8], 8(%[z2]) \t\n"
464  "nmsub.s %[temp15], %[temp15], %[temp8], %[temp7] \t\n"
465  "madd.s %[temp16], %[temp16], %[temp8], %[temp6] \t\n"
466  : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
467  [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
468  [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
469  [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
470  [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
471  [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
472  [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
473  [temp15]"=&f"(temp15), [temp16]"=&f"(temp16)
474  : [z1]"r"(z1), [z2]"r"(z2),
475  [tsin1]"r"(tsin1), [tcos1]"r"(tcos1),
476  [tsin2]"r"(tsin2), [tcos2]"r"(tcos2)
477  : "memory"
478  );
479 
 /* temp9/temp10 come from z1[1], temp13/temp14 from z2[0]: the .im values are exchanged. */
480  z1[1].re = temp9;
481  z1[1].im = temp14;
482  z2[0].re = temp13;
483  z2[0].im = temp10;
484 
 /* temp11/temp12 come from z1[0], temp15/temp16 from z2[1]: same exchange. */
485  z1[0].re = temp11;
486  z1[0].im = temp16;
487  z2[1].re = temp15;
488  z2[1].im = temp12;
489  }
490 }
491 
492 /**
493  * Compute inverse MDCT of size N = 2^nbits
494  * @param output N samples
495  * @param input N/2 samples
496  */
497 static void ff_imdct_calc_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
498 {
499  int k;
500  int n = 1 << s->mdct_bits;
501  int n2 = n >> 1;
502  int n4 = n >> 2;
503 
504  ff_imdct_half_mips(s, output+n4, input);
505 
506  for(k = 0; k < n4; k+=4) {
507  output[k] = -output[n2-k-1];
508  output[k+1] = -output[n2-k-2];
509  output[k+2] = -output[n2-k-3];
510  output[k+3] = -output[n2-k-4];
511 
512  output[n-k-1] = output[n2+k];
513  output[n-k-2] = output[n2+k+1];
514  output[n-k-3] = output[n2+k+2];
515  output[n-k-4] = output[n2+k+3];
516  }
517 }
518 #endif /* HAVE_INLINE_ASM */
519 
521 {
 /*
  * NOTE(review): this listing jumps from line 519 to 521 and from 524 to
  * 526. The cross-reference index at the end of the page gives line 520 as
  * "av_cold void ff_fft_init_mips(FFTContext *s)". Line 525 is not shown
  * anywhere; presumably it is the ff_init_ff_cos_tabs() call that the index
  * also lists -- confirm against the real source file.
  */
522  int n=0;
523 
 /* Fill fft_offsets_lut (off = 0, size = 1 << 16); n is passed by address as the running index. */
524  ff_fft_lut_init(fft_offsets_lut, 0, 1 << 16, &n);
526 
 /* Install the MIPS routines only when inline asm is available; the IMDCT entries additionally require CONFIG_MDCT. */
527 #if HAVE_INLINE_ASM
528  s->fft_calc = ff_fft_calc_mips;
529 #if CONFIG_MDCT
530  s->imdct_calc = ff_imdct_calc_mips;
531  s->imdct_half = ff_imdct_half_mips;
532 #endif
533 #endif
534 }
#define MAX_FFT_SIZE
Definition: fft_table.h:58
const char * s
Definition: avisynth_c.h:668
else temp
Definition: vf_mcdeint.c:148
FFTSample re
Definition: avfft.h:38
#define MAX_LOG2_NFFT
Specifies the maximum allowed FFT size.
Definition: fft_table.h:57
void ff_fft_lut_init(uint16_t *table, int off, int size, int *index)
#define av_cold
Definition: attributes.h:78
av_cold void ff_fft_init_mips(FFTContext *s)
FFT transform.
Definition: fft_mips.c:520
these buffered frames must be flushed immediately if a new input produces new the filter must not call request_frame to get more It must just process the frame or queue it The task of requesting more frames is left to the filter s request_frame method or the application If a filter has several the filter must be ready for frames arriving randomly on any input any filter with several inputs will most likely require some kind of queuing mechanism It is perfectly acceptable to have a limited queue and to drop frames when the inputs are too unbalanced request_frame This method is called when a frame is wanted on an output For an input
void(* imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input)
Definition: fft.h:81
static const uint8_t offset[127][2]
Definition: vf_spp.c:70
float FFTSample
Definition: avfft.h:35
Definition: fft.h:62
FFTSample * tsin
Definition: fft.h:71
int nbits
Definition: fft.h:63
for k
uint16_t fft_offsets_lut[0x2aab]
synthesis window for stochastic i
int mdct_bits
Definition: fft.h:68
void(* imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input)
Definition: fft.h:82
#define ff_init_ff_cos_tabs
Definition: fft.h:118
FFTSample im
Definition: avfft.h:38
void(* fft_calc)(struct FFTContext *s, FFTComplex *z)
Do a complex FFT with the parameters defined in ff_fft_init().
Definition: fft.h:80
these buffered frames must be flushed immediately if a new input produces new output(Example:frame rate-doubling filter:filter_frame must(1) flush the second copy of the previous frame, if it is still there,(2) push the first copy of the incoming frame,(3) keep the second copy for later.) If the input frame is not enough to produce output
FFTSample * tcos
Definition: fft.h:70
uint16_t * revtab
Definition: fft.h:65
definitions and LUT table for MIPS FFT
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step