mpegaudiodsp_mips_float.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2012
3  * MIPS Technologies, Inc., California.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14  * contributors may be used to endorse or promote products derived from
15  * this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * Author: Bojan Zivkovic (bojan@mips.com)
30  *
31  * MPEG Audio decoder optimized for MIPS floating-point architecture
32  *
33  * This file is part of FFmpeg.
34  *
35  * FFmpeg is free software; you can redistribute it and/or
36  * modify it under the terms of the GNU Lesser General Public
37  * License as published by the Free Software Foundation; either
38  * version 2.1 of the License, or (at your option) any later version.
39  *
40  * FFmpeg is distributed in the hope that it will be useful,
41  * but WITHOUT ANY WARRANTY; without even the implied warranty of
42  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
43  * Lesser General Public License for more details.
44  *
45  * You should have received a copy of the GNU Lesser General Public
46  * License along with FFmpeg; if not, write to the Free Software
47  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
48  */
49 
50 /**
51  * @file
52  * Reference: libavcodec/mpegaudiodsp_template.c
53  * libavcodec/dct32.c
54  */
55 
56 #include <string.h>
57 
59 
/**
 * Window one block of synthesis data and store 32 float output samples,
 * using a single hand-scheduled MIPS FPU inline-assembly statement.
 *
 * @param synth_buf    input samples; elements 0..31 are first duplicated to
 *                     synth_buf[512..543], then elements in the range
 *                     16..496 are read by the assembly
 * @param window       window coefficients, read at indices 0..511
 * @param dither_state 32-bit word loaded with lwc1 as the initial accumulator
 *                     of the first sample, then overwritten with zero
 * @param samples      output; samples[k * incr] is written for k = 0..31
 * @param incr         distance, in floats, between consecutive output samples
 */
60 static void ff_mpadsp_apply_window_mips_float(float *synth_buf, float *window,
61  int *dither_state, float *samples, int incr)
62 {
63  register const float *w, *w2, *p;
64  int j;
65  float *samples2;
66  float sum, sum2;
67  /* temporary variables */
68  int incr1 = incr << 2; /* output stride in bytes: incr * 4 */
69  int t_sample;
70  float in1, in2, in3, in4, in5, in6, in7, in8;
71  float *p2;
72
73  /* copy to avoid wrap */
74  memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));
75
76  /**
77  * Instructions are scheduled to minimize pipeline stalls.
78  * The round_sample function used by the original code is
79  * replaced by the corresponding assembly instructions.
80  */
81
82  __asm__ volatile (
 /*
  * First sample, stored at samples[0]:
  *   sum = (word at dither_state)
  *       + sum(window[64k]      * synth_buf[16 + 64k])
  *       - sum(window[32 + 64k] * synth_buf[48 + 64k]),  k = 0..7
  * Interleaved with it: t_sample = 31 * incr1, j = 4,
  * samples2 = samples + 31 * incr1 bytes, and dither_state is zeroed.
  */
83  "lwc1 %[sum], 0(%[dither_state]) \t\n"
84  "sll %[t_sample], %[incr1], 5 \t\n"
85  "sub %[t_sample], %[t_sample], %[incr1] \n\t"
86  "li %[j], 4 \t\n"
87  "lwc1 %[in1], 0(%[window]) \t\n"
88  "lwc1 %[in2], 16*4(%[synth_buf]) \t\n"
89  "sw $zero, 0(%[dither_state]) \t\n"
90  "lwc1 %[in3], 64*4(%[window]) \t\n"
91  "lwc1 %[in4], 80*4(%[synth_buf]) \t\n"
92  "addu %[samples2], %[samples], %[t_sample] \t\n"
93  "madd.s %[sum], %[sum], %[in1], %[in2] \t\n"
94  "lwc1 %[in5], 128*4(%[window]) \t\n"
95  "lwc1 %[in6], 144*4(%[synth_buf]) \t\n"
96  "lwc1 %[in7], 192*4(%[window]) \t\n"
97  "madd.s %[sum], %[sum], %[in3], %[in4] \t\n"
98  "lwc1 %[in8], 208*4(%[synth_buf]) \t\n"
99  "lwc1 %[in1], 256*4(%[window]) \t\n"
100  "lwc1 %[in2], 272*4(%[synth_buf]) \t\n"
101  "madd.s %[sum], %[sum], %[in5], %[in6] \t\n"
102  "lwc1 %[in3], 320*4(%[window]) \t\n"
103  "lwc1 %[in4], 336*4(%[synth_buf]) \t\n"
104  "lwc1 %[in5], 384*4(%[window]) \t\n"
105  "madd.s %[sum], %[sum], %[in7], %[in8] \t\n"
106  "lwc1 %[in6], 400*4(%[synth_buf]) \t\n"
107  "lwc1 %[in7], 448*4(%[window]) \t\n"
108  "lwc1 %[in8], 464*4(%[synth_buf]) \t\n"
109  "madd.s %[sum], %[sum], %[in1], %[in2] \t\n"
110  "lwc1 %[in1], 32*4(%[window]) \t\n"
111  "lwc1 %[in2], 48*4(%[synth_buf]) \t\n"
112  "madd.s %[sum], %[sum], %[in3], %[in4] \t\n"
113  "lwc1 %[in3], 96*4(%[window]) \t\n"
114  "lwc1 %[in4], 112*4(%[synth_buf]) \t\n"
115  "madd.s %[sum], %[sum], %[in5], %[in6] \t\n"
116  "lwc1 %[in5], 160*4(%[window]) \t\n"
117  "lwc1 %[in6], 176*4(%[synth_buf]) \t\n"
118  "madd.s %[sum], %[sum], %[in7], %[in8] \t\n"
119  "lwc1 %[in7], 224*4(%[window]) \t\n"
120  "lwc1 %[in8], 240*4(%[synth_buf]) \t\n"
121  "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n"
122  "lwc1 %[in1], 288*4(%[window]) \t\n"
123  "lwc1 %[in2], 304*4(%[synth_buf]) \t\n"
124  "nmsub.s %[sum], %[sum], %[in3], %[in4] \t\n"
125  "lwc1 %[in3], 352*4(%[window]) \t\n"
126  "lwc1 %[in4], 368*4(%[synth_buf]) \t\n"
127  "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n"
128  "lwc1 %[in5], 416*4(%[window]) \t\n"
129  "lwc1 %[in6], 432*4(%[synth_buf]) \t\n"
130  "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n"
131  "lwc1 %[in7], 480*4(%[window]) \t\n"
132  "lwc1 %[in8], 496*4(%[synth_buf]) \t\n"
133  "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n"
 /* loop pointers: w = window + 1, w2 = window + 31,
    p = synth_buf + 17, p2 = synth_buf + 47 (byte offsets 4/124/68/188) */
134  "addu %[w], %[window], 4 \t\n"
135  "nmsub.s %[sum], %[sum], %[in3], %[in4] \t\n"
136  "addu %[w2], %[window], 124 \t\n"
137  "addiu %[p], %[synth_buf], 68 \t\n"
138  "addiu %[p2], %[synth_buf], 188 \t\n"
139  "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n"
140  "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n"
141  "swc1 %[sum], 0(%[samples]) \t\n"
142  "addu %[samples], %[samples], %[incr1] \t\n"
143
144  /* calculate two samples at the same time to avoid one memory
145  access per two sample */
 /*
  * Each iteration stores sum at samples (which then moves forward by
  * incr1 bytes) and sum2 at samples2 (which then moves backward by
  * incr1 bytes). Both accumulators share the values loaded through
  * p and p2; sum uses the coefficients at w, sum2 those at w2.
  * j goes 4 -> 64 in steps of 4, i.e. the body runs 15 times.
  */
146
147  "ff_mpadsp_apply_window_loop%=: \t\n"
148  "lwc1 %[in1], 0(%[w]) \t\n"
149  "lwc1 %[in2], 0(%[p]) \t\n"
150  "lwc1 %[in3], 0(%[w2]) \t\n"
151  "lwc1 %[in4], 64*4(%[w]) \t\n"
152  "lwc1 %[in5], 64*4(%[p]) \t\n"
153  "lwc1 %[in6], 64*4(%[w2]) \t\n"
154  "mul.s %[sum], %[in1], %[in2] \t\n"
155  "mul.s %[sum2], %[in2], %[in3] \t\n"
156  "lwc1 %[in1], 128*4(%[w]) \t\n"
157  "lwc1 %[in2], 128*4(%[p]) \t\n"
158  "madd.s %[sum], %[sum], %[in4], %[in5] \t\n"
 /* nmadd.s negates the running total, so from here on sum2 holds the
    negated sum of products and the following nmsub.s keep subtracting */
159  "nmadd.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
160  "lwc1 %[in3], 128*4(%[w2]) \t\n"
161  "lwc1 %[in4], 192*4(%[w]) \t\n"
162  "madd.s %[sum], %[sum], %[in1], %[in2] \t\n"
163  "lwc1 %[in5], 192*4(%[p]) \t\n"
164  "lwc1 %[in6], 192*4(%[w2]) \t\n"
165  "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n"
166  "lwc1 %[in1], 256*4(%[w]) \t\n"
167  "lwc1 %[in2], 256*4(%[p]) \t\n"
168  "madd.s %[sum], %[sum], %[in4], %[in5] \t\n"
169  "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
170  "lwc1 %[in3], 256*4(%[w2]) \t\n"
171  "lwc1 %[in4], 320*4(%[w]) \t\n"
172  "madd.s %[sum], %[sum], %[in1], %[in2] \t\n"
173  "lwc1 %[in5], 320*4(%[p]) \t\n"
174  "lwc1 %[in6], 320*4(%[w2]) \t\n"
175  "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n"
176  "lwc1 %[in1], 384*4(%[w]) \t\n"
177  "lwc1 %[in2], 384*4(%[p]) \t\n"
178  "madd.s %[sum], %[sum], %[in4], %[in5] \t\n"
179  "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
180  "lwc1 %[in3], 384*4(%[w2]) \t\n"
181  "lwc1 %[in4], 448*4(%[w]) \t\n"
182  "madd.s %[sum], %[sum], %[in1], %[in2] \t\n"
183  "lwc1 %[in5], 448*4(%[p]) \t\n"
184  "lwc1 %[in6], 448*4(%[w2]) \t\n"
185  "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n"
186  "madd.s %[sum], %[sum], %[in4], %[in5] \t\n"
 /* second half: coefficients at w + 32 / w2 + 32, data through p2;
    these products are subtracted from both sum and sum2 */
187  "lwc1 %[in1], 32*4(%[w]) \t\n"
188  "lwc1 %[in2], 0(%[p2]) \t\n"
189  "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
190  "lwc1 %[in3], 32*4(%[w2]) \t\n"
191  "lwc1 %[in4], 96*4(%[w]) \t\n"
192  "lwc1 %[in5], 64*4(%[p2]) \t\n"
193  "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n"
194  "lwc1 %[in6], 96*4(%[w2]) \t\n"
195  "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n"
196  "lwc1 %[in1], 160*4(%[w]) \t\n"
197  "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n"
198  "lwc1 %[in2], 128*4(%[p2]) \t\n"
199  "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
200  "lwc1 %[in3], 160*4(%[w2]) \t\n"
201  "lwc1 %[in4], 224*4(%[w]) \t\n"
202  "lwc1 %[in5], 192*4(%[p2]) \t\n"
203  "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n"
204  "lwc1 %[in6], 224*4(%[w2]) \t\n"
205  "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n"
206  "lwc1 %[in1], 288*4(%[w]) \t\n"
207  "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n"
208  "lwc1 %[in2], 256*4(%[p2]) \t\n"
209  "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
210  "lwc1 %[in3], 288*4(%[w2]) \t\n"
211  "lwc1 %[in4], 352*4(%[w]) \t\n"
212  "lwc1 %[in5], 320*4(%[p2]) \t\n"
213  "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n"
214  "lwc1 %[in6], 352*4(%[w2]) \t\n"
215  "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n"
216  "lwc1 %[in1], 416*4(%[w]) \t\n"
217  "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n"
218  "lwc1 %[in2], 384*4(%[p2]) \t\n"
219  "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
220  "lwc1 %[in3], 416*4(%[w2]) \t\n"
221  "lwc1 %[in4], 480*4(%[w]) \t\n"
222  "lwc1 %[in5], 448*4(%[p2]) \t\n"
223  "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n"
224  "lwc1 %[in6], 480*4(%[w2]) \t\n"
225  "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n"
226  "addiu %[w], %[w], 4 \t\n"
227  "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n"
228  "addiu %[w2], %[w2], -4 \t\n"
229  "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
230  "addu %[j], %[j], 4 \t\n"
231  "addiu %[p], 4 \t\n"
232  "swc1 %[sum], 0(%[samples]) \t\n"
233  "addiu %[p2], -4 \t\n"
234  "swc1 %[sum2], 0(%[samples2]) \t\n"
235  "addu %[samples], %[samples], %[incr1] \t\n"
236  "subu %[samples2], %[samples2], %[incr1] \t\n"
237  "bne %[j], 64, ff_mpadsp_apply_window_loop%= \t\n"
238
 /*
  * Last sample, stored at the position samples reached after the loop:
  *   sum = -sum(window[48 + 64k] * synth_buf[32 + 64k]),  k = 0..7
  */
239  "lwc1 %[in1], 48*4(%[window]) \t\n"
240  "lwc1 %[in2], 32*4(%[synth_buf]) \t\n"
241  "lwc1 %[in3], 112*4(%[window]) \t\n"
242  "lwc1 %[in4], 96*4(%[synth_buf]) \t\n"
243  "lwc1 %[in5], 176*4(%[window]) \t\n"
244  "lwc1 %[in6], 160*4(%[synth_buf]) \t\n"
245  "mul.s %[sum], %[in1], %[in2] \t\n"
246  "lwc1 %[in7], 240*4(%[window]) \t\n"
247  "lwc1 %[in8], 224*4(%[synth_buf]) \t\n"
248  "lwc1 %[in1], 304*4(%[window]) \t\n"
249  "nmadd.s %[sum], %[sum], %[in3], %[in4] \t\n"
250  "lwc1 %[in2], 288*4(%[synth_buf]) \t\n"
251  "lwc1 %[in3], 368*4(%[window]) \t\n"
252  "lwc1 %[in4], 352*4(%[synth_buf]) \t\n"
253  "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n"
254  "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n"
255  "lwc1 %[in5], 432*4(%[window]) \t\n"
256  "lwc1 %[in6], 416*4(%[synth_buf]) \t\n"
257  "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n"
258  "lwc1 %[in7], 496*4(%[window]) \t\n"
259  "lwc1 %[in8], 480*4(%[synth_buf]) \t\n"
260  "nmsub.s %[sum], %[sum], %[in3], %[in4] \t\n"
261  "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n"
262  "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n"
263  "swc1 %[sum], 0(%[samples]) \t\n"
264
 /* every scratch operand is early-clobber ("=&"): it is written while
    the input registers are still needed; samples is updated in place */
265  : [sum] "=&f" (sum), [sum2] "=&f" (sum2),
266  [w2] "=&r" (w2), [w] "=&r" (w),
267  [p] "=&r" (p), [p2] "=&r" (p2), [j] "=&r" (j),
268  [samples] "+r" (samples), [samples2] "=&r" (samples2),
269  [in1] "=&f" (in1), [in2] "=&f" (in2),
270  [in3] "=&f" (in3), [in4] "=&f" (in4),
271  [in5] "=&f" (in5), [in6] "=&f" (in6),
272  [in7] "=&f" (in7), [in8] "=&f" (in8),
273  [t_sample] "=&r" (t_sample)
274  : [synth_buf] "r" (synth_buf), [window] "r" (window),
275  [dither_state] "r" (dither_state), [incr1] "r" (incr1)
 /* the asm loads from the buffers and stores to samples and
    dither_state, none of which are memory operands */
276  : "memory"
277  );
278 }
279 
280 static void ff_dct32_mips_float(float *out, const float *tab)
281 {
282  float val0 , val1 , val2 , val3 , val4 , val5 , val6 , val7,
283  val8 , val9 , val10, val11, val12, val13, val14, val15,
284  val16, val17, val18, val19, val20, val21, val22, val23,
285  val24, val25, val26, val27, val28, val29, val30, val31;
286  float fTmp1, fTmp2, fTmp3, fTmp4, fTmp5, fTmp6, fTmp7, fTmp8,
287  fTmp9, fTmp10, fTmp11;
288 
289  /**
290  * instructions are scheduled to minimize pipeline stall.
291  */
292  __asm__ volatile (
293  "lwc1 %[fTmp1], 0*4(%[tab]) \n\t"
294  "lwc1 %[fTmp2], 31*4(%[tab]) \n\t"
295  "lwc1 %[fTmp3], 15*4(%[tab]) \n\t"
296  "lwc1 %[fTmp4], 16*4(%[tab]) \n\t"
297  "li.s %[fTmp7], 0.50241928618815570551 \n\t"
298  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
299  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
300  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
301  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
302  "li.s %[fTmp10], 0.50060299823519630134 \n\t"
303  "li.s %[fTmp11], 10.19000812354805681150 \n\t"
304  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
305  "add.s %[val0], %[fTmp5], %[fTmp6] \n\t"
306  "sub.s %[val15], %[fTmp5], %[fTmp6] \n\t"
307  "lwc1 %[fTmp1], 7*4(%[tab]) \n\t"
308  "lwc1 %[fTmp2], 24*4(%[tab]) \n\t"
309  "madd.s %[val16], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
310  "nmsub.s %[val31], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
311  "mul.s %[val15], %[val15], %[fTmp7] \n\t"
312  "lwc1 %[fTmp3], 8*4(%[tab]) \n\t"
313  "lwc1 %[fTmp4], 23*4(%[tab]) \n\t"
314  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
315  "mul.s %[val31], %[val31], %[fTmp7] \n\t"
316  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
317  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
318  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
319  "li.s %[fTmp7], 5.10114861868916385802 \n\t"
320  "li.s %[fTmp10], 0.67480834145500574602 \n\t"
321  "li.s %[fTmp11], 0.74453627100229844977 \n\t"
322  "add.s %[val7], %[fTmp5], %[fTmp6] \n\t"
323  "sub.s %[val8], %[fTmp5], %[fTmp6] \n\t"
324  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
325  "li.s %[fTmp1], 0.50979557910415916894 \n\t"
326  "sub.s %[fTmp2], %[val0], %[val7] \n\t"
327  "mul.s %[val8], %[val8], %[fTmp7] \n\t"
328  "madd.s %[val23], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
329  "nmsub.s %[val24], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
330  "add.s %[val0], %[val0], %[val7] \n\t"
331  "mul.s %[val7], %[fTmp1], %[fTmp2] \n\t"
332  "sub.s %[fTmp2], %[val15], %[val8] \n\t"
333  "add.s %[val8], %[val15], %[val8] \n\t"
334  "mul.s %[val24], %[val24], %[fTmp7] \n\t"
335  "sub.s %[fTmp3], %[val16], %[val23] \n\t"
336  "add.s %[val16], %[val16], %[val23] \n\t"
337  "mul.s %[val15], %[fTmp1], %[fTmp2] \n\t"
338  "sub.s %[fTmp4], %[val31], %[val24] \n\t"
339  "mul.s %[val23], %[fTmp1], %[fTmp3] \n\t"
340  "add.s %[val24], %[val31], %[val24] \n\t"
341  "mul.s %[val31], %[fTmp1], %[fTmp4] \n\t"
342 
343  : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
344  [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
345  [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
346  [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
347  [val0] "=f" (val0), [val7] "=f" (val7),
348  [val8] "=f" (val8), [val15] "=f" (val15),
349  [val16] "=f" (val16), [val23] "=f" (val23),
350  [val24] "=f" (val24), [val31] "=f" (val31)
351  : [tab] "r" (tab)
352  : "memory"
353  );
354 
355  __asm__ volatile (
356  "lwc1 %[fTmp1], 3*4(%[tab]) \n\t"
357  "lwc1 %[fTmp2], 28*4(%[tab]) \n\t"
358  "lwc1 %[fTmp3], 12*4(%[tab]) \n\t"
359  "lwc1 %[fTmp4], 19*4(%[tab]) \n\t"
360  "li.s %[fTmp7], 0.64682178335999012954 \n\t"
361  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
362  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
363  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
364  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
365  "li.s %[fTmp10], 0.53104259108978417447 \n\t"
366  "li.s %[fTmp11], 1.48416461631416627724 \n\t"
367  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
368  "add.s %[val3], %[fTmp5], %[fTmp6] \n\t"
369  "sub.s %[val12], %[fTmp5], %[fTmp6] \n\t"
370  "lwc1 %[fTmp1], 4*4(%[tab]) \n\t"
371  "lwc1 %[fTmp2], 27*4(%[tab]) \n\t"
372  "madd.s %[val19], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
373  "nmsub.s %[val28], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
374  "mul.s %[val12], %[val12], %[fTmp7] \n\t"
375  "lwc1 %[fTmp3], 11*4(%[tab]) \n\t"
376  "lwc1 %[fTmp4], 20*4(%[tab]) \n\t"
377  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
378  "mul.s %[val28], %[val28], %[fTmp7] \n\t"
379  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
380  "li.s %[fTmp7], 0.78815462345125022473 \n\t"
381  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
382  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
383  "li.s %[fTmp10], 0.55310389603444452782 \n\t"
384  "li.s %[fTmp11], 1.16943993343288495515 \n\t"
385  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
386  "add.s %[val4], %[fTmp5], %[fTmp6] \n\t"
387  "sub.s %[val11], %[fTmp5], %[fTmp6] \n\t"
388  "li.s %[fTmp1], 2.56291544774150617881 \n\t"
389  "madd.s %[val20], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
390  "nmsub.s %[val27], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
391  "mul.s %[val11], %[val11], %[fTmp7] \n\t"
392  "sub.s %[fTmp2], %[val3], %[val4] \n\t"
393  "add.s %[val3], %[val3], %[val4] \n\t"
394  "sub.s %[fTmp4], %[val19], %[val20] \n\t"
395  "mul.s %[val27], %[val27], %[fTmp7] \n\t"
396  "sub.s %[fTmp3], %[val12], %[val11] \n\t"
397  "mul.s %[val4], %[fTmp1], %[fTmp2] \n\t"
398  "add.s %[val11], %[val12], %[val11] \n\t"
399  "add.s %[val19], %[val19], %[val20] \n\t"
400  "mul.s %[val20], %[fTmp1], %[fTmp4] \n\t"
401  "mul.s %[val12], %[fTmp1], %[fTmp3] \n\t"
402  "sub.s %[fTmp2], %[val28], %[val27] \n\t"
403  "add.s %[val27], %[val28], %[val27] \n\t"
404  "mul.s %[val28], %[fTmp1], %[fTmp2] \n\t"
405 
406  : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
407  [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
408  [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
409  [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
410  [val3] "=f" (val3), [val4] "=f" (val4),
411  [val11] "=f" (val11), [val12] "=f" (val12),
412  [val19] "=f" (val19), [val20] "=f" (val20),
413  [val27] "=f" (val27), [val28] "=f" (val28)
414  : [tab] "r" (tab)
415  : "memory"
416  );
417 
418  __asm__ volatile (
419  "li.s %[fTmp1], 0.54119610014619698439 \n\t"
420  "sub.s %[fTmp2], %[val0], %[val3] \n\t"
421  "add.s %[val0], %[val0], %[val3] \n\t"
422  "sub.s %[fTmp3], %[val7], %[val4] \n\t"
423  "add.s %[val4], %[val7], %[val4] \n\t"
424  "sub.s %[fTmp4], %[val8], %[val11] \n\t"
425  "mul.s %[val3], %[fTmp1], %[fTmp2] \n\t"
426  "add.s %[val8], %[val8], %[val11] \n\t"
427  "mul.s %[val7], %[fTmp1], %[fTmp3] \n\t"
428  "sub.s %[fTmp2], %[val15], %[val12] \n\t"
429  "mul.s %[val11], %[fTmp1], %[fTmp4] \n\t"
430  "add.s %[val12], %[val15], %[val12] \n\t"
431  "mul.s %[val15], %[fTmp1], %[fTmp2] \n\t"
432 
433  : [val0] "+f" (val0), [val3] "+f" (val3),
434  [val4] "+f" (val4), [val7] "+f" (val7),
435  [val8] "+f" (val8), [val11] "+f" (val11),
436  [val12] "+f" (val12), [val15] "+f" (val15),
437  [fTmp1] "=f" (fTmp1), [fTmp2] "=&f" (fTmp2),
438  [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4)
439  :
440  );
441 
442  __asm__ volatile (
443  "sub.s %[fTmp2], %[val16], %[val19] \n\t"
444  "add.s %[val16], %[val16], %[val19] \n\t"
445  "sub.s %[fTmp3], %[val23], %[val20] \n\t"
446  "add.s %[val20], %[val23], %[val20] \n\t"
447  "sub.s %[fTmp4], %[val24], %[val27] \n\t"
448  "mul.s %[val19], %[fTmp1], %[fTmp2] \n\t"
449  "add.s %[val24], %[val24], %[val27] \n\t"
450  "mul.s %[val23], %[fTmp1], %[fTmp3] \n\t"
451  "sub.s %[fTmp2], %[val31], %[val28] \n\t"
452  "mul.s %[val27], %[fTmp1], %[fTmp4] \n\t"
453  "add.s %[val28], %[val31], %[val28] \n\t"
454  "mul.s %[val31], %[fTmp1], %[fTmp2] \n\t"
455 
456  : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
457  [val16] "+f" (val16), [val19] "+f" (val19), [val20] "+f" (val20),
458  [val23] "+f" (val23), [val24] "+f" (val24), [val27] "+f" (val27),
459  [val28] "+f" (val28), [val31] "+f" (val31)
460  : [fTmp1] "f" (fTmp1)
461  );
462 
463  __asm__ volatile (
464  "lwc1 %[fTmp1], 1*4(%[tab]) \n\t"
465  "lwc1 %[fTmp2], 30*4(%[tab]) \n\t"
466  "lwc1 %[fTmp3], 14*4(%[tab]) \n\t"
467  "lwc1 %[fTmp4], 17*4(%[tab]) \n\t"
468  "li.s %[fTmp7], 0.52249861493968888062 \n\t"
469  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
470  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
471  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
472  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
473  "li.s %[fTmp10], 0.50547095989754365998 \n\t"
474  "li.s %[fTmp11], 3.40760841846871878570 \n\t"
475  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
476  "add.s %[val1], %[fTmp5], %[fTmp6] \n\t"
477  "sub.s %[val14], %[fTmp5], %[fTmp6] \n\t"
478  "lwc1 %[fTmp1], 6*4(%[tab]) \n\t"
479  "lwc1 %[fTmp2], 25*4(%[tab]) \n\t"
480  "madd.s %[val17], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
481  "nmsub.s %[val30], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
482  "mul.s %[val14], %[val14], %[fTmp7] \n\t"
483  "lwc1 %[fTmp3], 9*4(%[tab]) \n\t"
484  "lwc1 %[fTmp4], 22*4(%[tab]) \n\t"
485  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
486  "mul.s %[val30], %[val30], %[fTmp7] \n\t"
487  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
488  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
489  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
490  "li.s %[fTmp7], 1.72244709823833392782 \n\t"
491  "li.s %[fTmp10], 0.62250412303566481615 \n\t"
492  "li.s %[fTmp11], 0.83934964541552703873 \n\t"
493  "add.s %[val6], %[fTmp5], %[fTmp6] \n\t"
494  "sub.s %[val9], %[fTmp5], %[fTmp6] \n\t"
495  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
496  "li.s %[fTmp1], 0.60134488693504528054 \n\t"
497  "sub.s %[fTmp2], %[val1], %[val6] \n\t"
498  "add.s %[val1], %[val1], %[val6] \n\t"
499  "mul.s %[val9], %[val9], %[fTmp7] \n\t"
500  "madd.s %[val22], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
501  "nmsub.s %[val25], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
502  "mul.s %[val6], %[fTmp1], %[fTmp2] \n\t"
503  "sub.s %[fTmp2], %[val14], %[val9] \n\t"
504  "add.s %[val9], %[val14], %[val9] \n\t"
505  "mul.s %[val25], %[val25], %[fTmp7] \n\t"
506  "sub.s %[fTmp3], %[val17], %[val22] \n\t"
507  "add.s %[val17], %[val17], %[val22] \n\t"
508  "mul.s %[val14], %[fTmp1], %[fTmp2] \n\t"
509  "sub.s %[fTmp2], %[val30], %[val25] \n\t"
510  "mul.s %[val22], %[fTmp1], %[fTmp3] \n\t"
511  "add.s %[val25], %[val30], %[val25] \n\t"
512  "mul.s %[val30], %[fTmp1], %[fTmp2] \n\t"
513 
514  : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
515  [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
516  [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
517  [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
518  [val1] "=f" (val1), [val6] "=f" (val6),
519  [val9] "=f" (val9), [val14] "=f" (val14),
520  [val17] "=f" (val17), [val22] "=f" (val22),
521  [val25] "=f" (val25), [val30] "=f" (val30)
522  : [tab] "r" (tab)
523  : "memory"
524  );
525 
526  __asm__ volatile (
527  "lwc1 %[fTmp1], 2*4(%[tab]) \n\t"
528  "lwc1 %[fTmp2], 29*4(%[tab]) \n\t"
529  "lwc1 %[fTmp3], 13*4(%[tab]) \n\t"
530  "lwc1 %[fTmp4], 18*4(%[tab]) \n\t"
531  "li.s %[fTmp7], 0.56694403481635770368 \n\t"
532  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
533  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
534  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
535  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
536  "li.s %[fTmp10], 0.51544730992262454697 \n\t"
537  "li.s %[fTmp11], 2.05778100995341155085 \n\t"
538  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
539  "add.s %[val2], %[fTmp5], %[fTmp6] \n\t"
540  "sub.s %[val13], %[fTmp5], %[fTmp6] \n\t"
541  "lwc1 %[fTmp1], 5*4(%[tab]) \n\t"
542  "lwc1 %[fTmp2], 26*4(%[tab]) \n\t"
543  "madd.s %[val18], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
544  "nmsub.s %[val29], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
545  "mul.s %[val13], %[val13], %[fTmp7] \n\t"
546  "lwc1 %[fTmp3], 10*4(%[tab]) \n\t"
547  "lwc1 %[fTmp4], 21*4(%[tab]) \n\t"
548  "mul.s %[val29], %[val29], %[fTmp7] \n\t"
549  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
550  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
551  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
552  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
553  "li.s %[fTmp7], 1.06067768599034747134 \n\t"
554  "li.s %[fTmp10], 0.58293496820613387367 \n\t"
555  "li.s %[fTmp11], 0.97256823786196069369 \n\t"
556  "add.s %[val5], %[fTmp5], %[fTmp6] \n\t"
557  "sub.s %[val10], %[fTmp5], %[fTmp6] \n\t"
558  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
559  "li.s %[fTmp1], 0.89997622313641570463 \n\t"
560  "sub.s %[fTmp2], %[val2], %[val5] \n\t"
561  "mul.s %[val10], %[val10], %[fTmp7] \n\t"
562  "madd.s %[val21], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
563  "nmsub.s %[val26], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
564  "add.s %[val2], %[val2], %[val5] \n\t"
565  "mul.s %[val5], %[fTmp1], %[fTmp2] \n\t"
566  "sub.s %[fTmp3], %[val13], %[val10] \n\t"
567  "add.s %[val10], %[val13], %[val10] \n\t"
568  "mul.s %[val26], %[val26], %[fTmp7] \n\t"
569  "sub.s %[fTmp4], %[val18], %[val21] \n\t"
570  "add.s %[val18], %[val18], %[val21] \n\t"
571  "mul.s %[val13], %[fTmp1], %[fTmp3] \n\t"
572  "sub.s %[fTmp2], %[val29], %[val26] \n\t"
573  "add.s %[val26], %[val29], %[val26] \n\t"
574  "mul.s %[val21], %[fTmp1], %[fTmp4] \n\t"
575  "mul.s %[val29], %[fTmp1], %[fTmp2] \n\t"
576 
577  : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
578  [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
579  [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
580  [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
581  [val2] "=f" (val2), [val5] "=f" (val5),
582  [val10] "=f" (val10), [val13] "=f" (val13),
583  [val18] "=f" (val18), [val21] "=f" (val21),
584  [val26] "=f" (val26), [val29] "=f" (val29)
585  : [tab] "r" (tab)
586  : "memory"
587  );
588 
589  __asm__ volatile (
590  "li.s %[fTmp1], 1.30656296487637652785 \n\t"
591  "sub.s %[fTmp2], %[val1], %[val2] \n\t"
592  "add.s %[val1], %[val1], %[val2] \n\t"
593  "sub.s %[fTmp3], %[val6], %[val5] \n\t"
594  "add.s %[val5], %[val6], %[val5] \n\t"
595  "sub.s %[fTmp4], %[val9], %[val10] \n\t"
596  "mul.s %[val2], %[fTmp1], %[fTmp2] \n\t"
597  "add.s %[val9], %[val9], %[val10] \n\t"
598  "mul.s %[val6], %[fTmp1], %[fTmp3] \n\t"
599  "sub.s %[fTmp2], %[val14], %[val13] \n\t"
600  "mul.s %[val10], %[fTmp1], %[fTmp4] \n\t"
601  "add.s %[val13], %[val14], %[val13] \n\t"
602  "mul.s %[val14], %[fTmp1], %[fTmp2] \n\t"
603 
604  : [fTmp1] "=f" (fTmp1), [fTmp2] "=&f" (fTmp2),
605  [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
606  [val1] "+f" (val1), [val2] "+f" (val2),
607  [val5] "+f" (val5), [val6] "+f" (val6),
608  [val9] "+f" (val9), [val10] "+f" (val10),
609  [val13] "+f" (val13), [val14] "+f" (val14)
610  :
611  );
612 
613  __asm__ volatile (
614  "sub.s %[fTmp2], %[val17], %[val18] \n\t"
615  "add.s %[val17], %[val17], %[val18] \n\t"
616  "sub.s %[fTmp3], %[val22], %[val21] \n\t"
617  "add.s %[val21], %[val22], %[val21] \n\t"
618  "sub.s %[fTmp4], %[val25], %[val26] \n\t"
619  "mul.s %[val18], %[fTmp1], %[fTmp2] \n\t"
620  "add.s %[val25], %[val25], %[val26] \n\t"
621  "mul.s %[val22], %[fTmp1], %[fTmp3] \n\t"
622  "sub.s %[fTmp2], %[val30], %[val29] \n\t"
623  "mul.s %[val26], %[fTmp1], %[fTmp4] \n\t"
624  "add.s %[val29], %[val30], %[val29] \n\t"
625  "mul.s %[val30], %[fTmp1], %[fTmp2] \n\t"
626 
627  : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
628  [val17] "+f" (val17), [val18] "+f" (val18), [val21] "+f" (val21),
629  [val22] "+f" (val22), [val25] "+f" (val25), [val26] "+f" (val26),
630  [val29] "+f" (val29), [val30] "+f" (val30)
631  : [fTmp1] "f" (fTmp1)
632  );
633 
634  __asm__ volatile (
635  "li.s %[fTmp1], 0.70710678118654752439 \n\t"
636  "sub.s %[fTmp2], %[val0], %[val1] \n\t"
637  "add.s %[val0], %[val0], %[val1] \n\t"
638  "sub.s %[fTmp3], %[val3], %[val2] \n\t"
639  "add.s %[val2], %[val3], %[val2] \n\t"
640  "sub.s %[fTmp4], %[val4], %[val5] \n\t"
641  "mul.s %[val1], %[fTmp1], %[fTmp2] \n\t"
642  "swc1 %[val0], 0(%[out]) \n\t"
643  "mul.s %[val3], %[fTmp3], %[fTmp1] \n\t"
644  "add.s %[val4], %[val4], %[val5] \n\t"
645  "mul.s %[val5], %[fTmp1], %[fTmp4] \n\t"
646  "swc1 %[val1], 16*4(%[out]) \n\t"
647  "sub.s %[fTmp2], %[val7], %[val6] \n\t"
648  "add.s %[val2], %[val2], %[val3] \n\t"
649  "swc1 %[val3], 24*4(%[out]) \n\t"
650  "add.s %[val6], %[val7], %[val6] \n\t"
651  "mul.s %[val7], %[fTmp1], %[fTmp2] \n\t"
652  "swc1 %[val2], 8*4(%[out]) \n\t"
653  "add.s %[val6], %[val6], %[val7] \n\t"
654  "swc1 %[val7], 28*4(%[out]) \n\t"
655  "add.s %[val4], %[val4], %[val6] \n\t"
656  "add.s %[val6], %[val6], %[val5] \n\t"
657  "add.s %[val5], %[val5], %[val7] \n\t"
658  "swc1 %[val4], 4*4(%[out]) \n\t"
659  "swc1 %[val5], 20*4(%[out]) \n\t"
660  "swc1 %[val6], 12*4(%[out]) \n\t"
661 
662  : [fTmp1] "=f" (fTmp1), [fTmp2] "=&f" (fTmp2),
663  [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
664  [val0] "+f" (val0), [val1] "+f" (val1),
665  [val2] "+f" (val2), [val3] "+f" (val3),
666  [val4] "+f" (val4), [val5] "+f" (val5),
667  [val6] "+f" (val6), [val7] "+f" (val7)
668  : [out] "r" (out)
669  );
670 
671  __asm__ volatile (
672  "sub.s %[fTmp2], %[val8], %[val9] \n\t"
673  "add.s %[val8], %[val8], %[val9] \n\t"
674  "sub.s %[fTmp3], %[val11], %[val10] \n\t"
675  "add.s %[val10], %[val11], %[val10] \n\t"
676  "sub.s %[fTmp4], %[val12], %[val13] \n\t"
677  "mul.s %[val9], %[fTmp1], %[fTmp2] \n\t"
678  "add.s %[val12], %[val12], %[val13] \n\t"
679  "mul.s %[val11], %[fTmp1], %[fTmp3] \n\t"
680  "sub.s %[fTmp2], %[val15], %[val14] \n\t"
681  "mul.s %[val13], %[fTmp1], %[fTmp4] \n\t"
682  "add.s %[val14], %[val15], %[val14] \n\t"
683  "add.s %[val10], %[val10], %[val11] \n\t"
684  "mul.s %[val15], %[fTmp1], %[fTmp2] \n\t"
685  "add.s %[val14], %[val14], %[val15] \n\t"
686  "add.s %[val12], %[val12], %[val14] \n\t"
687  "add.s %[val14], %[val14], %[val13] \n\t"
688  "add.s %[val13], %[val13], %[val15] \n\t"
689  "add.s %[val8], %[val8], %[val12] \n\t"
690  "add.s %[val12], %[val12], %[val10] \n\t"
691  "add.s %[val10], %[val10], %[val14] \n\t"
692  "add.s %[val14], %[val14], %[val9] \n\t"
693  "add.s %[val9], %[val9], %[val13] \n\t"
694  "add.s %[val13], %[val13], %[val11] \n\t"
695  "add.s %[val11], %[val11], %[val15] \n\t"
696  "swc1 %[val8], 2*4(%[out]) \n\t"
697  "swc1 %[val9], 18*4(%[out]) \n\t"
698  "swc1 %[val10], 10*4(%[out]) \n\t"
699  "swc1 %[val11], 26*4(%[out]) \n\t"
700  "swc1 %[val12], 6*4(%[out]) \n\t"
701  "swc1 %[val13], 22*4(%[out]) \n\t"
702  "swc1 %[val14], 14*4(%[out]) \n\t"
703  "swc1 %[val15], 30*4(%[out]) \n\t"
704 
705  : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
706  [val8] "+f" (val8), [val9] "+f" (val9), [val10] "+f" (val10),
707  [val11] "+f" (val11), [val12] "+f" (val12), [val13] "+f" (val13),
708  [val14] "+f" (val14), [val15] "+f" (val15)
709  : [fTmp1] "f" (fTmp1), [out] "r" (out)
710  );
711 
712  __asm__ volatile (
713  "sub.s %[fTmp2], %[val16], %[val17] \n\t"
714  "add.s %[val16], %[val16], %[val17] \n\t"
715  "sub.s %[fTmp3], %[val19], %[val18] \n\t"
716  "add.s %[val18], %[val19], %[val18] \n\t"
717  "sub.s %[fTmp4], %[val20], %[val21] \n\t"
718  "mul.s %[val17], %[fTmp1], %[fTmp2] \n\t"
719  "add.s %[val20], %[val20], %[val21] \n\t"
720  "mul.s %[val19], %[fTmp1], %[fTmp3] \n\t"
721  "sub.s %[fTmp2], %[val23], %[val22] \n\t"
722  "mul.s %[val21], %[fTmp1], %[fTmp4] \n\t"
723  "add.s %[val22], %[val23], %[val22] \n\t"
724  "add.s %[val18], %[val18], %[val19] \n\t"
725  "mul.s %[val23], %[fTmp1], %[fTmp2] \n\t"
726  "add.s %[val22], %[val22], %[val23] \n\t"
727  "add.s %[val20], %[val20], %[val22] \n\t"
728  "add.s %[val22], %[val22], %[val21] \n\t"
729  "add.s %[val21], %[val21], %[val23] \n\t"
730 
731  : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
732  [val16] "+f" (val16), [val17] "+f" (val17), [val18] "+f" (val18),
733  [val19] "+f" (val19), [val20] "+f" (val20), [val21] "+f" (val21),
734  [val22] "+f" (val22), [val23] "+f" (val23)
735  : [fTmp1] "f" (fTmp1)
736  );
737 
738  __asm__ volatile (
739  "sub.s %[fTmp2], %[val24], %[val25] \n\t"
740  "add.s %[val24], %[val24], %[val25] \n\t"
741  "sub.s %[fTmp3], %[val27], %[val26] \n\t"
742  "add.s %[val26], %[val27], %[val26] \n\t"
743  "sub.s %[fTmp4], %[val28], %[val29] \n\t"
744  "mul.s %[val25], %[fTmp1], %[fTmp2] \n\t"
745  "add.s %[val28], %[val28], %[val29] \n\t"
746  "mul.s %[val27], %[fTmp1], %[fTmp3] \n\t"
747  "sub.s %[fTmp2], %[val31], %[val30] \n\t"
748  "mul.s %[val29], %[fTmp1], %[fTmp4] \n\t"
749  "add.s %[val30], %[val31], %[val30] \n\t"
750  "add.s %[val26], %[val26], %[val27] \n\t"
751  "mul.s %[val31], %[fTmp1], %[fTmp2] \n\t"
752  "add.s %[val30], %[val30], %[val31] \n\t"
753  "add.s %[val28], %[val28], %[val30] \n\t"
754  "add.s %[val30], %[val30], %[val29] \n\t"
755  "add.s %[val29], %[val29], %[val31] \n\t"
756  "add.s %[val24], %[val24], %[val28] \n\t"
757  "add.s %[val28], %[val28], %[val26] \n\t"
758  "add.s %[val26], %[val26], %[val30] \n\t"
759  "add.s %[val30], %[val30], %[val25] \n\t"
760  "add.s %[val25], %[val25], %[val29] \n\t"
761  "add.s %[val29], %[val29], %[val27] \n\t"
762  "add.s %[val27], %[val27], %[val31] \n\t"
763 
764  : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
765  [val24] "+f" (val24), [val25] "+f" (val25), [val26] "+f" (val26),
766  [val27] "+f" (val27), [val28] "+f" (val28), [val29] "+f" (val29),
767  [val30] "+f" (val30), [val31] "+f" (val31)
768  : [fTmp1] "f" (fTmp1)
769  );
770 
771  out[ 1] = val16 + val24;
772  out[17] = val17 + val25;
773  out[ 9] = val18 + val26;
774  out[25] = val19 + val27;
775  out[ 5] = val20 + val28;
776  out[21] = val21 + val29;
777  out[13] = val22 + val30;
778  out[29] = val23 + val31;
779  out[ 3] = val24 + val20;
780  out[19] = val25 + val21;
781  out[11] = val26 + val22;
782  out[27] = val27 + val23;
783  out[ 7] = val28 + val18;
784  out[23] = val29 + val19;
785  out[15] = val30 + val17;
786  out[31] = val31;
787 }
788 
/**
 * 36-point IMDCT for a single block (going by the function name), written
 * as three hand-scheduled MIPS FPU inline-assembly sections.
 *
 * @param out  output samples; 18 values are stored at out[32 * i], i = 0..17
 * @param buf  overlap buffer; buf[4 * i], i = 0..17, is added into the
 *             output and then overwritten with a new windowed value
 * @param in   18 input coefficients; modified in place by the first section
 * @param win  window coefficients; win[0..17] and win[20..37] are read
 */
789 static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
790 {
791  float t0, t1, t2, t3, s0, s1, s2, s3;
    /* scratch array: filled by the "loop 3" section, consumed by "loop 4" */
792  float tmp[18];
793  /* temporary variables */
794  float in1, in2, in3, in4, in5, in6;
795  float out1, out2, out3, out4, out5;
    /* c1..c9 hold floating-point immediates loaded with li.s inside the asm */
796  float c1, c2, c3, c4, c5, c6, c7, c8, c9;
797 
798  /**
799  * all loops are unrolled totally, and instructions are scheduled to
800  * minimize pipeline stall. instructions of the first two loops are
801  * reorganized, in order to eliminate unnecessary readings and
802  * writings into array. values defined in macros and tables are
803  * eliminated - they are directly loaded in appropriate variables
804  */
805 
806  /* loop 1 and 2 */
    /*
     * Equivalent to in[i] += in[i - 1] for i = 17..1 followed by
     * in[i] += in[i - 2] for odd i = 17..3, fused so that every element of
     * in[] is loaded exactly once. in[0] is left untouched.
     * All operands are early-clobber outputs; the stores go through the
     * "in" pointer, hence the "memory" clobber.
     */
807  __asm__ volatile (
808  "lwc1 %[in1], 17*4(%[in]) \t\n"
809  "lwc1 %[in2], 16*4(%[in]) \t\n"
810  "lwc1 %[in3], 15*4(%[in]) \t\n"
811  "lwc1 %[in4], 14*4(%[in]) \t\n"
812  "lwc1 %[in5], 13*4(%[in]) \t\n"
813  "lwc1 %[in6], 12*4(%[in]) \t\n"
814  "add.s %[out1], %[in1], %[in2] \t\n"
815  "add.s %[out2], %[in2], %[in3] \t\n"
816  "add.s %[out3], %[in3], %[in4] \t\n"
817  "add.s %[out4], %[in4], %[in5] \t\n"
818  "add.s %[out5], %[in5], %[in6] \t\n"
819  "lwc1 %[in1], 11*4(%[in]) \t\n"
820  "swc1 %[out2], 16*4(%[in]) \t\n"
821  "add.s %[out1], %[out1], %[out3] \t\n"
822  "swc1 %[out4], 14*4(%[in]) \t\n"
823  "add.s %[out3], %[out3], %[out5] \t\n"
824  "lwc1 %[in2], 10*4(%[in]) \t\n"
825  "lwc1 %[in3], 9*4(%[in]) \t\n"
826  "swc1 %[out1], 17*4(%[in]) \t\n"
827  "lwc1 %[in4], 8*4(%[in]) \t\n"
828  "swc1 %[out3], 15*4(%[in]) \t\n"
829  "add.s %[out1], %[in6], %[in1] \t\n"
830  "add.s %[out2], %[in1], %[in2] \t\n"
831  "add.s %[out3], %[in2], %[in3] \t\n"
832  "add.s %[out4], %[in3], %[in4] \t\n"
833  "lwc1 %[in5], 7*4(%[in]) \t\n"
834  "swc1 %[out1], 12*4(%[in]) \t\n"
835  "add.s %[out5], %[out5], %[out2] \t\n"
836  "swc1 %[out3], 10*4(%[in]) \t\n"
837  "add.s %[out2], %[out2], %[out4] \t\n"
838  "lwc1 %[in6], 6*4(%[in]) \t\n"
839  "lwc1 %[in1], 5*4(%[in]) \t\n"
840  "swc1 %[out5], 13*4(%[in]) \t\n"
841  "lwc1 %[in2], 4*4(%[in]) \t\n"
842  "swc1 %[out2], 11*4(%[in]) \t\n"
843  "add.s %[out5], %[in4], %[in5] \t\n"
844  "add.s %[out1], %[in5], %[in6] \t\n"
845  "add.s %[out2], %[in6], %[in1] \t\n"
846  "add.s %[out3], %[in1], %[in2] \t\n"
847  "lwc1 %[in3], 3*4(%[in]) \t\n"
848  "swc1 %[out5], 8*4(%[in]) \t\n"
849  "add.s %[out4], %[out4], %[out1] \t\n"
850  "swc1 %[out2], 6*4(%[in]) \t\n"
851  "add.s %[out1], %[out1], %[out3] \t\n"
852  "lwc1 %[in4], 2*4(%[in]) \t\n"
853  "lwc1 %[in5], 1*4(%[in]) \t\n"
854  "swc1 %[out4], 9*4(%[in]) \t\n"
855  "lwc1 %[in6], 0(%[in]) \t\n"
856  "swc1 %[out1], 7*4(%[in]) \t\n"
857  "add.s %[out4], %[in2], %[in3] \t\n"
858  "add.s %[out5], %[in3], %[in4] \t\n"
859  "add.s %[out1], %[in4], %[in5] \t\n"
860  "add.s %[out2], %[in5], %[in6] \t\n"
861  "swc1 %[out4], 4*4(%[in]) \t\n"
862  "add.s %[out3], %[out3], %[out5] \t\n"
863  "swc1 %[out1], 2*4(%[in]) \t\n"
864  "add.s %[out5], %[out5], %[out2] \t\n"
865  "swc1 %[out2], 1*4(%[in]) \t\n"
866  "swc1 %[out3], 5*4(%[in]) \t\n"
867  "swc1 %[out5], 3*4(%[in]) \t\n"
868 
869  : [in1] "=&f" (in1), [in2] "=&f" (in2),
870  [in3] "=&f" (in3), [in4] "=&f" (in4),
871  [in5] "=&f" (in5), [in6] "=&f" (in6),
872  [out1] "=&f" (out1), [out2] "=&f" (out2),
873  [out3] "=&f" (out3), [out4] "=&f" (out4),
874  [out5] "=&f" (out5)
875  : [in] "r" (in)
876  : "memory"
877  );
878 
879  /* loop 3 */
    /*
     * Fills tmp[0..17] from the pre-added in[]. The first half of the block
     * reads the even-indexed in[] entries (0, 2, ..., 16) and writes the
     * even-indexed tmp[] slots; the second half repeats the same computation
     * on the odd-indexed entries and writes the odd-indexed slots.
     * The constants c1..c9 are each loaded once with li.s and shared by
     * both halves.
     * Instruction semantics used here: madd.s fd, fr, fs, ft gives
     * fd = fs * ft + fr, and nmsub.s fd, fr, fs, ft gives fd = fr - fs * ft.
     */
880  __asm__ volatile (
881  "li.s %[c1], 0.5 \t\n"
882  "lwc1 %[in1], 8*4(%[in]) \t\n"
883  "lwc1 %[in2], 16*4(%[in]) \t\n"
884  "lwc1 %[in3], 4*4(%[in]) \t\n"
885  "lwc1 %[in4], 0(%[in]) \t\n"
886  "lwc1 %[in5], 12*4(%[in]) \t\n"
887  "li.s %[c2], 0.93969262078590838405 \t\n"
888  "add.s %[t2], %[in1], %[in2] \t\n"
889  "add.s %[t0], %[in1], %[in3] \t\n"
890  "li.s %[c3], -0.76604444311897803520 \t\n"
891  "madd.s %[t3], %[in4], %[in5], %[c1] \t\n"
892  "sub.s %[t1], %[in4], %[in5] \t\n"
893  "sub.s %[t2], %[t2], %[in3] \t\n"
894  "mul.s %[t0], %[t0], %[c2] \t\n"
895  "li.s %[c4], -0.17364817766693034885 \t\n"
896  "li.s %[c5], -0.86602540378443864676 \t\n"
897  "li.s %[c6], 0.98480775301220805936 \t\n"
898  "nmsub.s %[out1], %[t1], %[t2], %[c1] \t\n"
899  "add.s %[out2], %[t1], %[t2] \t\n"
900  "add.s %[t2], %[in2], %[in3] \t\n"
901  "sub.s %[t1], %[in1], %[in2] \t\n"
902  "sub.s %[out3], %[t3], %[t0] \t\n"
903  "swc1 %[out1], 6*4(%[tmp]) \t\n"
904  "swc1 %[out2], 16*4(%[tmp]) \t\n"
905  "mul.s %[t2], %[t2], %[c3] \t\n"
906  "mul.s %[t1], %[t1], %[c4] \t\n"
907  "add.s %[out1], %[t3], %[t0] \t\n"
908  "lwc1 %[in1], 10*4(%[in]) \t\n"
909  "lwc1 %[in2], 14*4(%[in]) \t\n"
910  "sub.s %[out3], %[out3], %[t2] \t\n"
911  "add.s %[out2], %[t3], %[t2] \t\n"
912  "add.s %[out1], %[out1], %[t1] \t\n"
913  "lwc1 %[in3], 2*4(%[in]) \t\n"
914  "lwc1 %[in4], 6*4(%[in]) \t\n"
915  "swc1 %[out3], 10*4(%[tmp]) \t\n"
916  "sub.s %[out2], %[out2], %[t1] \t\n"
917  "swc1 %[out1], 2*4(%[tmp]) \t\n"
918  "add.s %[out1], %[in1], %[in2] \t\n"
919  "add.s %[t2], %[in1], %[in3] \t\n"
920  "sub.s %[t3], %[in1], %[in2] \t\n"
921  "swc1 %[out2], 14*4(%[tmp]) \t\n"
922  "li.s %[c7], -0.34202014332566873304 \t\n"
923  "sub.s %[out1], %[out1], %[in3] \t\n"
924  "mul.s %[t2], %[t2], %[c6] \t\n"
925  "mul.s %[t3], %[t3], %[c7] \t\n"
926  "li.s %[c8], 0.86602540378443864676 \t\n"
927  "mul.s %[t0], %[in4], %[c8] \t\n"
928  "mul.s %[out1], %[out1], %[c5] \t\n"
929  "add.s %[t1], %[in2], %[in3] \t\n"
930  "li.s %[c9], -0.64278760968653932632 \t\n"
931  "add.s %[out2], %[t2], %[t3] \t\n"
932  "lwc1 %[in1], 9*4(%[in]) \t\n"
933  "swc1 %[out1], 4*4(%[tmp]) \t\n"
934  "mul.s %[t1], %[t1], %[c9] \t\n"
935  "lwc1 %[in2], 17*4(%[in]) \t\n"
936  "add.s %[out2], %[out2], %[t0] \t\n"
937  "lwc1 %[in3], 5*4(%[in]) \t\n"
938  "lwc1 %[in4], 1*4(%[in]) \t\n"
939  "add.s %[out3], %[t2], %[t1] \t\n"
940  "sub.s %[out1], %[t3], %[t1] \t\n"
941  "swc1 %[out2], 0(%[tmp]) \t\n"
942  "lwc1 %[in5], 13*4(%[in]) \t\n"
943  "add.s %[t2], %[in1], %[in2] \t\n"
944  "sub.s %[out3], %[out3], %[t0] \t\n"
945  "sub.s %[out1], %[out1], %[t0] \t\n"
946  "add.s %[t0], %[in1], %[in3] \t\n"
947  "madd.s %[t3], %[in4], %[in5], %[c1] \t\n"
948  "sub.s %[t2], %[t2], %[in3] \t\n"
949  "swc1 %[out3], 12*4(%[tmp]) \t\n"
950  "swc1 %[out1], 8*4(%[tmp]) \t\n"
951  "sub.s %[t1], %[in4], %[in5] \t\n"
952  "mul.s %[t0], %[t0], %[c2] \t\n"
953  "nmsub.s %[out1], %[t1], %[t2], %[c1] \t\n"
954  "add.s %[out2], %[t1], %[t2] \t\n"
955  "add.s %[t2], %[in2], %[in3] \t\n"
956  "sub.s %[t1], %[in1], %[in2] \t\n"
957  "sub.s %[out3], %[t3], %[t0] \t\n"
958  "swc1 %[out1], 7*4(%[tmp]) \t\n"
959  "swc1 %[out2], 17*4(%[tmp]) \t\n"
960  "mul.s %[t2], %[t2], %[c3] \t\n"
961  "mul.s %[t1], %[t1], %[c4] \t\n"
962  "add.s %[out1], %[t3], %[t0] \t\n"
963  "lwc1 %[in1], 11*4(%[in]) \t\n"
964  "lwc1 %[in2], 15*4(%[in]) \t\n"
965  "sub.s %[out3], %[out3], %[t2] \t\n"
966  "add.s %[out2], %[t3], %[t2] \t\n"
967  "add.s %[out1], %[out1], %[t1] \t\n"
968  "lwc1 %[in3], 3*4(%[in]) \t\n"
969  "lwc1 %[in4], 7*4(%[in]) \t\n"
970  "swc1 %[out3], 11*4(%[tmp]) \t\n"
971  "sub.s %[out2], %[out2], %[t1] \t\n"
972  "swc1 %[out1], 3*4(%[tmp]) \t\n"
973  "add.s %[out3], %[in1], %[in2] \t\n"
974  "add.s %[t2], %[in1], %[in3] \t\n"
975  "sub.s %[t3], %[in1], %[in2] \t\n"
976  "swc1 %[out2], 15*4(%[tmp]) \t\n"
977  "mul.s %[t0], %[in4], %[c8] \t\n"
978  "sub.s %[out3], %[out3], %[in3] \t\n"
979  "mul.s %[t2], %[t2], %[c6] \t\n"
980  "mul.s %[t3], %[t3], %[c7] \t\n"
981  "add.s %[t1], %[in2], %[in3] \t\n"
982  "mul.s %[out3], %[out3], %[c5] \t\n"
983  "add.s %[out1], %[t2], %[t3] \t\n"
984  "mul.s %[t1], %[t1], %[c9] \t\n"
985  "swc1 %[out3], 5*4(%[tmp]) \t\n"
986  "add.s %[out1], %[out1], %[t0] \t\n"
987  "add.s %[out2], %[t2], %[t1] \t\n"
988  "sub.s %[out3], %[t3], %[t1] \t\n"
989  "swc1 %[out1], 1*4(%[tmp]) \t\n"
990  "sub.s %[out2], %[out2], %[t0] \t\n"
991  "sub.s %[out3], %[out3], %[t0] \t\n"
992  "swc1 %[out2], 13*4(%[tmp]) \t\n"
993  "swc1 %[out3], 9*4(%[tmp]) \t\n"
994 
995  : [t0] "=&f" (t0), [t1] "=&f" (t1),
996  [t2] "=&f" (t2), [t3] "=&f" (t3),
997  [in1] "=&f" (in1), [in2] "=&f" (in2),
998  [in3] "=&f" (in3), [in4] "=&f" (in4),
999  [in5] "=&f" (in5),
1000  [out1] "=&f" (out1), [out2] "=&f" (out2),
1001  [out3] "=&f" (out3),
1002  [c1] "=&f" (c1), [c2] "=&f" (c2),
1003  [c3] "=&f" (c3), [c4] "=&f" (c4),
1004  [c5] "=&f" (c5), [c6] "=&f" (c6),
1005  [c7] "=&f" (c7), [c8] "=&f" (c8),
1006  [c9] "=&f" (c9)
1007  : [in] "r" (in), [tmp] "r" (tmp)
1008  : "memory"
1009  );
1010 
1011  /* loop 4 */
    /*
     * Butterfly, windowing and overlap-add. For j = 0..3:
     *   s0 = tmp[4j+2] + tmp[4j],   s1 = (tmp[4j+3] + tmp[4j+1]) * c1,
     *   s2 = tmp[4j+2] - tmp[4j],   s3 = (tmp[4j+3] - tmp[4j+1]) * c2,
     * where c1 and c2 are reloaded with new immediates for each j.
     * Two (t0, t1) pairs are then formed:
     *   t0 = s0 + s1, t1 = s0 - s1   used for indices 9 + j and 8 - j
     *   t0 = s2 + s3, t1 = s2 - s3   used for indices 17 - j and j
     * and for every such index i:
     *   out[32 * i] = t1 * win[i] + buf[4 * i];
     *   buf[4 * i]  = t0 * win[20 + i];
     * The tail of the block handles tmp[16] and tmp[17] (s0 = tmp[16],
     * s1 = tmp[17] * c1, no s2 or s3) for indices 13 and 4.
     * Loads for the next j are interleaved with the stores of the current
     * one, so the statement order below must be kept as is.
     * t2 and t3 are listed as outputs but are not referenced by this block.
     */
1012  __asm__ volatile (
1013  "lwc1 %[in1], 2*4(%[tmp]) \t\n"
1014  "lwc1 %[in2], 0(%[tmp]) \t\n"
1015  "lwc1 %[in3], 3*4(%[tmp]) \t\n"
1016  "lwc1 %[in4], 1*4(%[tmp]) \t\n"
1017  "li.s %[c1], 0.50190991877167369479 \t\n"
1018  "li.s %[c2], 5.73685662283492756461 \t\n"
1019  "add.s %[s0], %[in1], %[in2] \t\n"
1020  "sub.s %[s2], %[in1], %[in2] \t\n"
1021  "add.s %[s1], %[in3], %[in4] \t\n"
1022  "sub.s %[s3], %[in3], %[in4] \t\n"
1023  "lwc1 %[in1], 9*4(%[win]) \t\n"
1024  "lwc1 %[in2], 4*9*4(%[buf]) \t\n"
1025  "lwc1 %[in3], 8*4(%[win]) \t\n"
1026  "mul.s %[s1], %[s1], %[c1] \t\n"
1027  "mul.s %[s3], %[s3], %[c2] \t\n"
1028  "lwc1 %[in4], 4*8*4(%[buf]) \t\n"
1029  "lwc1 %[in5], 29*4(%[win]) \t\n"
1030  "lwc1 %[in6], 28*4(%[win]) \t\n"
1031  "add.s %[t0], %[s0], %[s1] \t\n"
1032  "sub.s %[t1], %[s0], %[s1] \t\n"
1033  "li.s %[c1], 0.51763809020504152469 \t\n"
1034  "li.s %[c2], 1.93185165257813657349 \t\n"
1035  "mul.s %[out3], %[in5], %[t0] \t\n"
1036  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1037  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1038  "mul.s %[out4], %[in6], %[t0] \t\n"
1039  "add.s %[t0], %[s2], %[s3] \t\n"
1040  "swc1 %[out3], 4*9*4(%[buf]) \t\n"
1041  "swc1 %[out1], 288*4(%[out]) \t\n"
1042  "swc1 %[out2], 256*4(%[out]) \t\n"
1043  "swc1 %[out4], 4*8*4(%[buf]) \t\n"
1044  "sub.s %[t1], %[s2], %[s3] \t\n"
1045  "lwc1 %[in1], 17*4(%[win]) \t\n"
1046  "lwc1 %[in2], 4*17*4(%[buf]) \t\n"
1047  "lwc1 %[in3], 0(%[win]) \t\n"
1048  "lwc1 %[in4], 0(%[buf]) \t\n"
1049  "lwc1 %[in5], 37*4(%[win]) \t\n"
1050  "lwc1 %[in6], 20*4(%[win]) \t\n"
1051  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1052  "lwc1 %[in1], 6*4(%[tmp]) \t\n"
1053  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1054  "mul.s %[out3], %[t0], %[in5] \t\n"
1055  "mul.s %[out4], %[t0], %[in6] \t\n"
1056  "swc1 %[out1], 544*4(%[out]) \t\n"
1057  "lwc1 %[in2], 4*4(%[tmp]) \t\n"
1058  "swc1 %[out2], 0(%[out]) \t\n"
1059  "swc1 %[out3], 4*17*4(%[buf]) \t\n"
1060  "swc1 %[out4], 0(%[buf]) \t\n"
1061  "lwc1 %[in3], 7*4(%[tmp]) \t\n"
1062  "add.s %[s0], %[in1], %[in2] \t\n"
1063  "sub.s %[s2], %[in1], %[in2] \t\n"
1064  "lwc1 %[in4], 5*4(%[tmp]) \t\n"
1065  "add.s %[s1], %[in3], %[in4] \t\n"
1066  "sub.s %[s3], %[in3], %[in4] \t\n"
1067  "lwc1 %[in1], 10*4(%[win]) \t\n"
1068  "lwc1 %[in2], 4*10*4(%[buf]) \t\n"
1069  "lwc1 %[in3], 7*4(%[win]) \t\n"
1070  "mul.s %[s1], %[s1], %[c1] \t\n"
1071  "mul.s %[s3], %[s3], %[c2] \t\n"
1072  "add.s %[t0], %[s0], %[s1] \t\n"
1073  "sub.s %[t1], %[s0], %[s1] \t\n"
1074  "lwc1 %[in4], 4*7*4(%[buf]) \t\n"
1075  "lwc1 %[in5], 30*4(%[win]) \t\n"
1076  "lwc1 %[in6], 27*4(%[win]) \t\n"
1077  "li.s %[c1], 0.55168895948124587824 \t\n"
1078  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1079  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1080  "mul.s %[out3], %[t0], %[in5] \t\n"
1081  "mul.s %[out4], %[t0], %[in6] \t\n"
1082  "add.s %[t0], %[s2], %[s3] \t\n"
1083  "swc1 %[out1], 320*4(%[out]) \t\n"
1084  "swc1 %[out2], 224*4(%[out]) \t\n"
1085  "swc1 %[out3], 4*10*4(%[buf]) \t\n"
1086  "swc1 %[out4], 4*7*4(%[buf]) \t\n"
1087  "sub.s %[t1], %[s2], %[s3] \t\n"
1088  "lwc1 %[in1], 16*4(%[win]) \t\n"
1089  "lwc1 %[in2], 4*16*4(%[buf]) \t\n"
1090  "lwc1 %[in3], 1*4(%[win]) \t\n"
1091  "lwc1 %[in4], 4*1*4(%[buf]) \t\n"
1092  "lwc1 %[in5], 36*4(%[win]) \t\n"
1093  "lwc1 %[in6], 21*4(%[win]) \t\n"
1094  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1095  "lwc1 %[in1], 10*4(%[tmp]) \t\n"
1096  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1097  "mul.s %[out3], %[in5], %[t0] \t\n"
1098  "mul.s %[out4], %[in6], %[t0] \t\n"
1099  "swc1 %[out1], 512*4(%[out]) \t\n"
1100  "lwc1 %[in2], 8*4(%[tmp]) \t\n"
1101  "swc1 %[out2], 32*4(%[out]) \t\n"
1102  "swc1 %[out3], 4*16*4(%[buf]) \t\n"
1103  "swc1 %[out4], 4*1*4(%[buf]) \t\n"
1104  "li.s %[c2], 1.18310079157624925896 \t\n"
1105  "add.s %[s0], %[in1], %[in2] \t\n"
1106  "sub.s %[s2], %[in1], %[in2] \t\n"
1107  "lwc1 %[in3], 11*4(%[tmp]) \t\n"
1108  "lwc1 %[in4], 9*4(%[tmp]) \t\n"
1109  "add.s %[s1], %[in3], %[in4] \t\n"
1110  "sub.s %[s3], %[in3], %[in4] \t\n"
1111  "lwc1 %[in1], 11*4(%[win]) \t\n"
1112  "lwc1 %[in2], 4*11*4(%[buf]) \t\n"
1113  "lwc1 %[in3], 6*4(%[win]) \t\n"
1114  "mul.s %[s1], %[s1], %[c1] \t\n"
1115  "mul.s %[s3], %[s3], %[c2] \t\n"
1116  "lwc1 %[in4], 4*6*4(%[buf]) \t\n"
1117  "lwc1 %[in5], 31*4(%[win]) \t\n"
1118  "lwc1 %[in6], 26*4(%[win]) \t\n"
1119  "add.s %[t0], %[s0], %[s1] \t\n"
1120  "sub.s %[t1], %[s0], %[s1] \t\n"
1121  "mul.s %[out3], %[t0], %[in5] \t\n"
1122  "mul.s %[out4], %[t0], %[in6] \t\n"
1123  "add.s %[t0], %[s2], %[s3] \t\n"
1124  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1125  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1126  "swc1 %[out3], 4*11*4(%[buf]) \t\n"
1127  "swc1 %[out4], 4*6*4(%[buf]) \t\n"
1128  "sub.s %[t1], %[s2], %[s3] \t\n"
1129  "swc1 %[out1], 352*4(%[out]) \t\n"
1130  "swc1 %[out2], 192*4(%[out]) \t\n"
1131  "lwc1 %[in1], 15*4(%[win]) \t\n"
1132  "lwc1 %[in2], 4*15*4(%[buf]) \t\n"
1133  "lwc1 %[in3], 2*4(%[win]) \t\n"
1134  "lwc1 %[in4], 4*2*4(%[buf]) \t\n"
1135  "lwc1 %[in5], 35*4(%[win]) \t\n"
1136  "lwc1 %[in6], 22*4(%[win]) \t\n"
1137  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1138  "lwc1 %[in1], 14*4(%[tmp]) \t\n"
1139  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1140  "mul.s %[out3], %[t0], %[in5] \t\n"
1141  "mul.s %[out4], %[t0], %[in6] \t\n"
1142  "swc1 %[out1], 480*4(%[out]) \t\n"
1143  "lwc1 %[in2], 12*4(%[tmp]) \t\n"
1144  "swc1 %[out2], 64*4(%[out]) \t\n"
1145  "swc1 %[out3], 4*15*4(%[buf]) \t\n"
1146  "swc1 %[out4], 4*2*4(%[buf]) \t\n"
1147  "lwc1 %[in3], 15*4(%[tmp]) \t\n"
1148  "add.s %[s0], %[in1], %[in2] \t\n"
1149  "sub.s %[s2], %[in1], %[in2] \t\n"
1150  "lwc1 %[in4], 13*4(%[tmp]) \t\n"
1151  "li.s %[c1], 0.61038729438072803416 \t\n"
1152  "li.s %[c2], 0.87172339781054900991 \t\n"
1153  "add.s %[s1], %[in3], %[in4] \t\n"
1154  "sub.s %[s3], %[in3], %[in4] \t\n"
1155  "lwc1 %[in1], 12*4(%[win]) \t\n"
1156  "lwc1 %[in2], 4*12*4(%[buf]) \t\n"
1157  "lwc1 %[in3], 5*4(%[win]) \t\n"
1158  "mul.s %[s1], %[s1], %[c1] \t\n"
1159  "mul.s %[s3], %[s3], %[c2] \t\n"
1160  "lwc1 %[in4], 4*5*4(%[buf]) \t\n"
1161  "lwc1 %[in5], 32*4(%[win]) \t\n"
1162  "lwc1 %[in6], 25*4(%[win]) \t\n"
1163  "add.s %[t0], %[s0], %[s1] \t\n"
1164  "sub.s %[t1], %[s0], %[s1] \t\n"
1165  "lwc1 %[s0], 16*4(%[tmp]) \t\n"
1166  "lwc1 %[s1], 17*4(%[tmp]) \t\n"
1167  "li.s %[c1], 0.70710678118654752439 \t\n"
1168  "mul.s %[out3], %[t0], %[in5] \t\n"
1169  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1170  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1171  "mul.s %[out4], %[t0], %[in6] \t\n"
1172  "add.s %[t0], %[s2], %[s3] \t\n"
1173  "swc1 %[out3], 4*12*4(%[buf]) \t\n"
1174  "swc1 %[out1], 384*4(%[out]) \t\n"
1175  "swc1 %[out2], 160*4(%[out]) \t\n"
1176  "swc1 %[out4], 4*5*4(%[buf]) \t\n"
1177  "sub.s %[t1], %[s2], %[s3] \t\n"
1178  "lwc1 %[in1], 14*4(%[win]) \t\n"
1179  "lwc1 %[in2], 4*14*4(%[buf]) \t\n"
1180  "lwc1 %[in3], 3*4(%[win]) \t\n"
1181  "lwc1 %[in4], 4*3*4(%[buf]) \t\n"
1182  "lwc1 %[in5], 34*4(%[win]) \t\n"
1183  "lwc1 %[in6], 23*4(%[win]) \t\n"
1184  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1185  "mul.s %[s1], %[s1], %[c1] \t\n"
1186  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1187  "mul.s %[out3], %[in5], %[t0] \t\n"
1188  "mul.s %[out4], %[in6], %[t0] \t\n"
1189  "swc1 %[out1], 448*4(%[out]) \t\n"
1190  "add.s %[t0], %[s0], %[s1] \t\n"
1191  "swc1 %[out2], 96*4(%[out]) \t\n"
1192  "swc1 %[out3], 4*14*4(%[buf]) \t\n"
1193  "swc1 %[out4], 4*3*4(%[buf]) \t\n"
1194  "sub.s %[t1], %[s0], %[s1] \t\n"
1195  "lwc1 %[in1], 13*4(%[win]) \t\n"
1196  "lwc1 %[in2], 4*13*4(%[buf]) \t\n"
1197  "lwc1 %[in3], 4*4(%[win]) \t\n"
1198  "lwc1 %[in4], 4*4*4(%[buf]) \t\n"
1199  "lwc1 %[in5], 33*4(%[win]) \t\n"
1200  "lwc1 %[in6], 24*4(%[win]) \t\n"
1201  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1202  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1203  "mul.s %[out3], %[t0], %[in5] \t\n"
1204  "mul.s %[out4], %[t0], %[in6] \t\n"
1205  "swc1 %[out1], 416*4(%[out]) \t\n"
1206  "swc1 %[out2], 128*4(%[out]) \t\n"
1207  "swc1 %[out3], 4*13*4(%[buf]) \t\n"
1208  "swc1 %[out4], 4*4*4(%[buf]) \t\n"
1209 
1210  : [c1] "=&f" (c1), [c2] "=&f" (c2),
1211  [in1] "=&f" (in1), [in2] "=&f" (in2),
1212  [in3] "=&f" (in3), [in4] "=&f" (in4),
1213  [in5] "=&f" (in5), [in6] "=&f" (in6),
1214  [out1] "=&f" (out1), [out2] "=&f" (out2),
1215  [out3] "=&f" (out3), [out4] "=&f" (out4),
1216  [t0] "=&f" (t0), [t1] "=&f" (t1),
1217  [t2] "=&f" (t2), [t3] "=&f" (t3),
1218  [s0] "=&f" (s0), [s1] "=&f" (s1),
1219  [s2] "=&f" (s2), [s3] "=&f" (s3)
1220  : [tmp] "r" (tmp), [win] "r" (win),
1221  [buf] "r" (buf), [out] "r" (out)
1222  : "memory"
1223  );
1224 }
1225 
1226 static void ff_imdct36_blocks_mips_float(float *out, float *buf, float *in,
1227  int count, int switch_point, int block_type)
1228 {
1229  int j;
1230  for (j=0 ; j < count; j++) {
1231  /* apply window & overlap with previous buffer */
1232 
1233  /* select window */
1234  int win_idx = (switch_point && j < 2) ? 0 : block_type;
1235  float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];
1236 
1237  imdct36_mips_float(out, buf, in, win);
1238 
1239  in += 18;
1240  buf += ((j&3) != 3 ? 1 : (72-3));
1241  out++;
1242  }
1243 }
1244 
1246 {
1250 }
const char * s
Definition: avisynth_c.h:668
#define c2
Definition: idct_sh4.c:27
About Git write you should know how to use GIT properly Luckily Git comes with excellent documentation git help man git shows you the available git< command > help man git< command > shows information about the subcommand< command > The most comprehensive manual is the website Git Reference visit they are quite exhaustive You do not need a special username or password All you need is to provide a ssh public key to the Git server admin What follows now is a basic introduction to Git and some FFmpeg specific guidelines Read it at least if you are granted commit privileges to the FFmpeg project you are expected to be familiar with these rules I if not You can get git from etc no matter how small Every one of them has been saved from looking like a fool by this many times It s very easy for stray debug output or cosmetic modifications to slip in
Definition: git-howto.txt:5
float ff_mdct_win_float[8][MDCT_BUF_SIZE]
output residual component w
#define c6
Definition: idct_sh4.c:31
#define t0
Definition: regdef.h:28
static void ff_mpadsp_apply_window_mips_float(float *synth_buf, float *window, int *dither_state, float *samples, int incr)
#define s2
Definition: regdef.h:39
static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
#define s0
Definition: regdef.h:37
#define t1
Definition: regdef.h:29
overlapping window(triangular window to avoid too much overlapping) ovidx
#define t3
Definition: regdef.h:31
#define c5
Definition: idct_sh4.c:30
void(* imdct36_blocks_float)(float *out, float *buf, float *in, int count, int switch_point, int block_type)
Definition: mpegaudiodsp.h:33
#define s3
Definition: regdef.h:40
void * buf
Definition: avisynth_c.h:594
static void ff_dct32_mips_float(float *out, const float *tab)
void(* dct32_float)(float *dst, const float *src)
Definition: mpegaudiodsp.h:30
static void ff_imdct36_blocks_mips_float(float *out, float *buf, float *in, int count, int switch_point, int block_type)
#define s1
Definition: regdef.h:38
void ff_mpadsp_init_mipsfpu(MPADSPContext *s)
#define c4
Definition: idct_sh4.c:29
void(* apply_window_float)(float *synth_buf, float *window, int *dither_state, float *samples, int incr)
Definition: mpegaudiodsp.h:26
static const struct twinvq_data tab
Filter the word “frame” indicates either a video frame or a group of audio samples
void INT64 INT64 count
Definition: avisynth_c.h:594
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31))))#define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac){}void ff_audio_convert_free(AudioConvert **ac){if(!*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);}AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, 
int apply_map){AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method!=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2){ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc){av_free(ac);return NULL;}return ac;}in_planar=av_sample_fmt_is_planar(in_fmt);out_planar=av_sample_fmt_is_planar(out_fmt);if(in_planar==out_planar){ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar?ac->channels:1;}else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;}int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in){int use_generic=1;int len=in->nb_samples;int p;if(ac->dc){av_dlog(ac->avr,"%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> out
#define c3
Definition: idct_sh4.c:28
#define c7
Definition: idct_sh4.c:32
#define t2
Definition: regdef.h:30
#define c1
Definition: idct_sh4.c:26