mpegaudiodsp_mips_fixed.c
Go to the documentation of this file.
1  /*
2  * Copyright (c) 2012
3  * MIPS Technologies, Inc., California.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14  * contributors may be used to endorse or promote products derived from
15  * this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * Author: Bojan Zivkovic (bojan@mips.com)
30  *
31  * MPEG Audio decoder optimized for MIPS fixed-point architecture
32  *
33  * This file is part of FFmpeg.
34  *
35  * FFmpeg is free software; you can redistribute it and/or
36  * modify it under the terms of the GNU Lesser General Public
37  * License as published by the Free Software Foundation; either
38  * version 2.1 of the License, or (at your option) any later version.
39  *
40  * FFmpeg is distributed in the hope that it will be useful,
41  * but WITHOUT ANY WARRANTY; without even the implied warranty of
42  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
43  * Lesser General Public License for more details.
44  *
45  * You should have received a copy of the GNU Lesser General Public
46  * License along with FFmpeg; if not, write to the Free Software
47  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
48  */
49 
50 /**
51  * @file
52  * Reference: libavcodec/mpegaudiodsp_template.c
53  */
54 
55 #include <string.h>
56 
58 
60  int *dither_state, int16_t *samples, int incr)
61 {
62  register const int32_t *w, *w2, *p;
63  int j;
64  int16_t *samples2;
65  int w_asm, p_asm, w_asm1, p_asm1, w_asm2, p_asm2;
66  int w2_asm, w2_asm1, *p_temp1, *p_temp2;
67  int sum1 = 0;
68  int const min_asm = -32768, max_asm = 32767;
69  int temp1, temp2 = 0, temp3 = 0;
70  int64_t sum;
71 
72  /* copy to avoid wrap */
73  memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));
74  samples2 = samples + 31 * incr;
75  w = window;
76  w2 = window + 31;
77  sum = *dither_state;
78  p = synth_buf + 16;
79  p_temp1 = synth_buf + 16;
80  p_temp2 = synth_buf + 48;
81  temp1 = sum;
82 
83  /**
84  * The round_sample() call from the original code is removed here and
85  * replaced with equivalent assembly instructions.
86  */
87  __asm__ volatile (
88  "mthi $zero \n\t"
89  "mtlo %[temp1] \n\t"
90  "lw %[w_asm], 0(%[w]) \n\t"
91  "lw %[p_asm], 0(%[p]) \n\t"
92  "lw %[w_asm1], 64*4(%[w]) \n\t"
93  "lw %[p_asm1], 64*4(%[p]) \n\t"
94  "lw %[w_asm2], 128*4(%[w]) \n\t"
95  "lw %[p_asm2], 128*4(%[p]) \n\t"
96  "madd %[w_asm], %[p_asm] \n\t"
97  "madd %[w_asm1], %[p_asm1] \n\t"
98  "madd %[w_asm2], %[p_asm2] \n\t"
99  "lw %[w_asm], 192*4(%[w]) \n\t"
100  "lw %[p_asm], 192*4(%[p]) \n\t"
101  "lw %[w_asm1], 256*4(%[w]) \n\t"
102  "lw %[p_asm1], 256*4(%[p]) \n\t"
103  "lw %[w_asm2], 320*4(%[w]) \n\t"
104  "lw %[p_asm2], 320*4(%[p]) \n\t"
105  "madd %[w_asm], %[p_asm] \n\t"
106  "madd %[w_asm1], %[p_asm1] \n\t"
107  "madd %[w_asm2], %[p_asm2] \n\t"
108  "lw %[w_asm], 384*4(%[w]) \n\t"
109  "lw %[p_asm], 384*4(%[p]) \n\t"
110  "lw %[w_asm1], 448*4(%[w]) \n\t"
111  "lw %[p_asm1], 448*4(%[p]) \n\t"
112  "lw %[w_asm2], 32*4(%[w]) \n\t"
113  "lw %[p_asm2], 32*4(%[p]) \n\t"
114  "madd %[w_asm], %[p_asm] \n\t"
115  "madd %[w_asm1], %[p_asm1] \n\t"
116  "msub %[w_asm2], %[p_asm2] \n\t"
117  "lw %[w_asm], 96*4(%[w]) \n\t"
118  "lw %[p_asm], 96*4(%[p]) \n\t"
119  "lw %[w_asm1], 160*4(%[w]) \n\t"
120  "lw %[p_asm1], 160*4(%[p]) \n\t"
121  "lw %[w_asm2], 224*4(%[w]) \n\t"
122  "lw %[p_asm2], 224*4(%[p]) \n\t"
123  "msub %[w_asm], %[p_asm] \n\t"
124  "msub %[w_asm1], %[p_asm1] \n\t"
125  "msub %[w_asm2], %[p_asm2] \n\t"
126  "lw %[w_asm], 288*4(%[w]) \n\t"
127  "lw %[p_asm], 288*4(%[p]) \n\t"
128  "lw %[w_asm1], 352*4(%[w]) \n\t"
129  "lw %[p_asm1], 352*4(%[p]) \n\t"
130  "msub %[w_asm], %[p_asm] \n\t"
131  "lw %[w_asm], 480*4(%[w]) \n\t"
132  "lw %[p_asm], 480*4(%[p]) \n\t"
133  "lw %[w_asm2], 416*4(%[w]) \n\t"
134  "lw %[p_asm2], 416*4(%[p]) \n\t"
135  "msub %[w_asm], %[p_asm] \n\t"
136  "msub %[w_asm1], %[p_asm1] \n\t"
137  "msub %[w_asm2], %[p_asm2] \n\t"
138 
139  /* The round_sample() call from the original code is removed and
140  * replaced with equivalent assembly instructions;
141  * code example:
142 
143  "extr.w %[sum1],$ac0,24 \n\t"
144  "mflo %[temp3], $ac0 \n\t"
145  "and %[temp1], %[temp3], 0x00ffffff \n\t"
146  "slt %[temp2], %[sum1], %[min_asm] \n\t"
147  "movn %[sum1], %[min_asm],%[temp2] \n\t"
148  "slt %[temp2], %[max_asm],%[sum1] \n\t"
149  "movn %[sum1], %[max_asm],%[temp2] \n\t"
150  "sh %[sum1], 0(%[samples]) \n\t"
151  */
152 
153  "extr.w %[sum1], $ac0, 24 \n\t"
154  "mflo %[temp3] \n\t"
155  "addi %[w], %[w], 4 \n\t"
156  "and %[temp1], %[temp3], 0x00ffffff \n\t"
157  "slt %[temp2], %[sum1], %[min_asm] \n\t"
158  "movn %[sum1], %[min_asm], %[temp2] \n\t"
159  "slt %[temp2], %[max_asm], %[sum1] \n\t"
160  "movn %[sum1], %[max_asm], %[temp2] \n\t"
161  "sh %[sum1], 0(%[samples]) \n\t"
162 
163  : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
164  [p_asm1] "=&r" (p_asm1), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
165  [w_asm2] "=&r" (w_asm2), [p_asm2] "=&r" (p_asm2),
166  [sum1] "+r" (sum1), [w] "+r" (w), [temp3] "+r" (temp3)
167  : [p] "r" (p), [samples] "r" (samples), [min_asm] "r" (min_asm),
168  [max_asm] "r" (max_asm)
169  : "memory", "hi","lo"
170  );
171 
172  samples += incr;
173 
174  /* We calculate two samples at the same time to avoid one memory
175  access per two samples */
176 
177  for(j = 1; j < 16; j++) {
178  __asm__ volatile (
179  "mthi $0, $ac1 \n\t"
180  "mtlo $0, $ac1 \n\t"
181  "mthi $0 \n\t"
182  "mtlo %[temp1] \n\t"
183  "addi %[p_temp1], %[p_temp1], 4 \n\t"
184  "lw %[w_asm], 0(%[w]) \n\t"
185  "lw %[p_asm], 0(%[p_temp1]) \n\t"
186  "lw %[w2_asm], 0(%[w2]) \n\t"
187  "lw %[w_asm1], 64*4(%[w]) \n\t"
188  "lw %[p_asm1], 64*4(%[p_temp1]) \n\t"
189  "lw %[w2_asm1], 64*4(%[w2]) \n\t"
190  "madd %[w_asm], %[p_asm] \n\t"
191  "msub $ac1, %[w2_asm], %[p_asm] \n\t"
192  "madd %[w_asm1], %[p_asm1] \n\t"
193  "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
194  "lw %[w_asm], 128*4(%[w]) \n\t"
195  "lw %[p_asm], 128*4(%[p_temp1]) \n\t"
196  "lw %[w2_asm], 128*4(%[w2]) \n\t"
197  "lw %[w_asm1], 192*4(%[w]) \n\t"
198  "lw %[p_asm1], 192*4(%[p_temp1]) \n\t"
199  "lw %[w2_asm1], 192*4(%[w2]) \n\t"
200  "madd %[w_asm], %[p_asm] \n\t"
201  "msub $ac1, %[w2_asm], %[p_asm] \n\t"
202  "madd %[w_asm1], %[p_asm1] \n\t"
203  "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
204  "lw %[w_asm], 256*4(%[w]) \n\t"
205  "lw %[p_asm], 256*4(%[p_temp1]) \n\t"
206  "lw %[w2_asm], 256*4(%[w2]) \n\t"
207  "lw %[w_asm1], 320*4(%[w]) \n\t"
208  "lw %[p_asm1], 320*4(%[p_temp1]) \n\t"
209  "lw %[w2_asm1], 320*4(%[w2]) \n\t"
210  "madd %[w_asm], %[p_asm] \n\t"
211  "msub $ac1, %[w2_asm], %[p_asm] \n\t"
212  "madd %[w_asm1], %[p_asm1] \n\t"
213  "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
214  "lw %[w_asm], 384*4(%[w]) \n\t"
215  "lw %[p_asm], 384*4(%[p_temp1]) \n\t"
216  "lw %[w2_asm], 384*4(%[w2]) \n\t"
217  "lw %[w_asm1], 448*4(%[w]) \n\t"
218  "lw %[p_asm1], 448*4(%[p_temp1]) \n\t"
219  "lw %[w2_asm1], 448*4(%[w2]) \n\t"
220  "madd %[w_asm], %[p_asm] \n\t"
221  "msub $ac1, %[w2_asm], %[p_asm] \n\t"
222  "madd %[w_asm1], %[p_asm1] \n\t"
223  "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
224  "addi %[p_temp2], %[p_temp2], -4 \n\t"
225  "lw %[w_asm], 32*4(%[w]) \n\t"
226  "lw %[p_asm], 0(%[p_temp2]) \n\t"
227  "lw %[w2_asm], 32*4(%[w2]) \n\t"
228  "lw %[w_asm1], 96*4(%[w]) \n\t"
229  "lw %[p_asm1], 64*4(%[p_temp2]) \n\t"
230  "lw %[w2_asm1], 96*4(%[w2]) \n\t"
231  "msub %[w_asm], %[p_asm] \n\t"
232  "msub $ac1, %[w2_asm], %[p_asm] \n\t"
233  "msub %[w_asm1], %[p_asm1] \n\t"
234  "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
235  "lw %[w_asm], 160*4(%[w]) \n\t"
236  "lw %[p_asm], 128*4(%[p_temp2]) \n\t"
237  "lw %[w2_asm], 160*4(%[w2]) \n\t"
238  "lw %[w_asm1], 224*4(%[w]) \n\t"
239  "lw %[p_asm1], 192*4(%[p_temp2]) \n\t"
240  "lw %[w2_asm1], 224*4(%[w2]) \n\t"
241  "msub %[w_asm], %[p_asm] \n\t"
242  "msub $ac1, %[w2_asm], %[p_asm] \n\t"
243  "msub %[w_asm1], %[p_asm1] \n\t"
244  "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
245  "lw %[w_asm], 288*4(%[w]) \n\t"
246  "lw %[p_asm], 256*4(%[p_temp2]) \n\t"
247  "lw %[w2_asm], 288*4(%[w2]) \n\t"
248  "lw %[w_asm1], 352*4(%[w]) \n\t"
249  "lw %[p_asm1], 320*4(%[p_temp2]) \n\t"
250  "lw %[w2_asm1], 352*4(%[w2]) \n\t"
251  "msub %[w_asm], %[p_asm] \n\t"
252  "msub $ac1, %[w2_asm], %[p_asm] \n\t"
253  "msub %[w_asm1], %[p_asm1] \n\t"
254  "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
255  "lw %[w_asm], 416*4(%[w]) \n\t"
256  "lw %[p_asm], 384*4(%[p_temp2]) \n\t"
257  "lw %[w2_asm], 416*4(%[w2]) \n\t"
258  "lw %[w_asm1], 480*4(%[w]) \n\t"
259  "lw %[p_asm1], 448*4(%[p_temp2]) \n\t"
260  "lw %[w2_asm1], 480*4(%[w2]) \n\t"
261  "msub %[w_asm], %[p_asm] \n\t"
262  "msub %[w_asm1], %[p_asm1] \n\t"
263  "msub $ac1, %[w2_asm], %[p_asm] \n\t"
264  "msub $ac1, %[w2_asm1], %[p_asm1] \n\t"
265  "addi %[w], %[w], 4 \n\t"
266  "addi %[w2], %[w2], -4 \n\t"
267  "mflo %[temp2] \n\t"
268  "extr.w %[sum1], $ac0, 24 \n\t"
269  "li %[temp3], 1 \n\t"
270  "and %[temp1], %[temp2], 0x00ffffff \n\t"
271  "madd $ac1, %[temp1], %[temp3] \n\t"
272  "slt %[temp2], %[sum1], %[min_asm] \n\t"
273  "movn %[sum1], %[min_asm], %[temp2] \n\t"
274  "slt %[temp2], %[max_asm], %[sum1] \n\t"
275  "movn %[sum1], %[max_asm], %[temp2] \n\t"
276  "sh %[sum1], 0(%[samples]) \n\t"
277  "mflo %[temp3], $ac1 \n\t"
278  "extr.w %[sum1], $ac1, 24 \n\t"
279  "and %[temp1], %[temp3], 0x00ffffff \n\t"
280  "slt %[temp2], %[sum1], %[min_asm] \n\t"
281  "movn %[sum1], %[min_asm], %[temp2] \n\t"
282  "slt %[temp2], %[max_asm], %[sum1] \n\t"
283  "movn %[sum1], %[max_asm], %[temp2] \n\t"
284  "sh %[sum1], 0(%[samples2]) \n\t"
285 
286  : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
287  [p_asm1] "=&r" (p_asm1), [w2_asm1] "=&r" (w2_asm1),
288  [w2_asm] "=&r" (w2_asm), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
289  [p_temp1] "+r" (p_temp1), [p_temp2] "+r" (p_temp2), [sum1] "+r" (sum1),
290  [w] "+r" (w), [w2] "+r" (w2), [samples] "+r" (samples),
291  [samples2] "+r" (samples2), [temp3] "+r" (temp3)
292  : [min_asm] "r" (min_asm), [max_asm] "r" (max_asm)
293  : "memory", "hi", "lo", "$ac1hi", "$ac1lo"
294  );
295 
296  samples += incr;
297  samples2 -= incr;
298  }
299 
300  p = synth_buf + 32;
301 
302  __asm__ volatile (
303  "mthi $0 \n\t"
304  "mtlo %[temp1] \n\t"
305  "lw %[w_asm], 32*4(%[w]) \n\t"
306  "lw %[p_asm], 0(%[p]) \n\t"
307  "lw %[w_asm1], 96*4(%[w]) \n\t"
308  "lw %[p_asm1], 64*4(%[p]) \n\t"
309  "lw %[w_asm2], 160*4(%[w]) \n\t"
310  "lw %[p_asm2], 128*4(%[p]) \n\t"
311  "msub %[w_asm], %[p_asm] \n\t"
312  "msub %[w_asm1], %[p_asm1] \n\t"
313  "msub %[w_asm2], %[p_asm2] \n\t"
314  "lw %[w_asm], 224*4(%[w]) \n\t"
315  "lw %[p_asm], 192*4(%[p]) \n\t"
316  "lw %[w_asm1], 288*4(%[w]) \n\t"
317  "lw %[p_asm1], 256*4(%[p]) \n\t"
318  "lw %[w_asm2], 352*4(%[w]) \n\t"
319  "lw %[p_asm2], 320*4(%[p]) \n\t"
320  "msub %[w_asm], %[p_asm] \n\t"
321  "msub %[w_asm1], %[p_asm1] \n\t"
322  "msub %[w_asm2], %[p_asm2] \n\t"
323  "lw %[w_asm], 416*4(%[w]) \n\t"
324  "lw %[p_asm], 384*4(%[p]) \n\t"
325  "lw %[w_asm1], 480*4(%[w]) \n\t"
326  "lw %[p_asm1], 448*4(%[p]) \n\t"
327  "msub %[w_asm], %[p_asm] \n\t"
328  "msub %[w_asm1], %[p_asm1] \n\t"
329  "extr.w %[sum1], $ac0, 24 \n\t"
330  "mflo %[temp2] \n\t"
331  "and %[temp1], %[temp2], 0x00ffffff \n\t"
332  "slt %[temp2], %[sum1], %[min_asm] \n\t"
333  "movn %[sum1], %[min_asm], %[temp2] \n\t"
334  "slt %[temp2], %[max_asm], %[sum1] \n\t"
335  "movn %[sum1], %[max_asm], %[temp2] \n\t"
336  "sh %[sum1], 0(%[samples]) \n\t"
337 
338  : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1),
339  [p_asm1] "=&r" (p_asm1), [temp1] "+r" (temp1), [temp2] "+r" (temp2),
340  [w_asm2] "=&r" (w_asm2), [p_asm2] "=&r" (p_asm2), [sum1] "+r" (sum1)
341  : [w] "r" (w), [p] "r" (p), [samples] "r" (samples), [min_asm] "r" (min_asm),
342  [max_asm] "r" (max_asm)
343  : "memory", "hi", "lo", "$ac1hi", "$ac1lo"
344  );
345 
346  *dither_state= temp1;
347 }
348 
/**
 * Fixed-point imdct36 step for a single block, written as hand-scheduled
 * MIPS inline assembly that uses the additional accumulators $ac1..$ac3
 * and the extr.w instruction.
 *
 * The work is done in three stages:
 *  1. in[1..17] are replaced in place by running sums of neighbouring inputs;
 *  2. two passes (j = 0, 1) combine the even-offset elements of in + j into
 *     the even-offset elements of tmp + j, filling the local tmp[18];
 *  3. tmp[] is combined with the window and the overlap buffer:
 *     out[k * 32] = buf[4 * k] + high word of (value * win[k]) and
 *     buf[4 * k]  = high word of (value * win[20 + k]), for k = 0..17.
 *
 * @param out output samples, written at a stride of 32 ints
 * @param buf overlap buffer, read and then overwritten at a stride of 4 ints
 * @param in  input coefficients; in[0..17] are read, in[1..17] are overwritten
 * @param win window table; entries 0..17 and 20..37 are read
 */
349 static void imdct36_mips_fixed(int *out, int *buf, int *in, int *win)
350 {
351  int j;
352  int t0, t1, t2, t3, s0, s1, s2, s3;
353  int tmp[18], *tmp1, *in1;
354  /* temporary variables */
355  int temp_reg1, temp_reg2, temp_reg3, temp_reg4, temp_reg5, temp_reg6;
356  int t4, t5, t6, t8, t7;
357 
358  /* values defined in macros and tables are
359  * eliminated - they are directly loaded in appropriate variables
360  */
 /* NOTE(review): C_1, C_2 and C_3A are larger than the maximum of a 32-bit
  * int, so these initializers rely on implementation-defined conversion to
  * int. All three are only used as multu operands below, i.e. as raw 32-bit
  * patterns. An unsigned type may be more appropriate - confirm with the
  * maintainers before changing.
  */
361  int const C_1 = 4229717092; /* cos(pi*1/18)*2 */
362  int const C_2 = 4035949074; /* cos(pi*2/18)*2 */
363  int const C_3 = 575416510; /* -cos(pi*3/18)*2 */
364  int const C_3A = 3719550786; /* cos(pi*3/18)*2 */
365  int const C_4 = 1004831466; /* -cos(pi*4/18)*2 */
366  int const C_5 = 1534215534; /* -cos(pi*5/18)*2 */
367  int const C_7 = -1468965330; /* -cos(pi*7/18)*2 */
368  int const C_8 = -745813244; /* -cos(pi*8/18)*2 */
369 
370  /*
371  * instructions of the first two loops are reorganized and loops are unrolled,
372  * in order to eliminate unnecessary readings and writings in array
373  */
 /*
  * Net effect of the block below, working from in[17] down to in[1]:
  *   in[i] += in[i - 1]   for i = 17 .. 1, followed by
  *   in[i] += in[i - 2]   for odd i = 17 .. 3,
  * with both passes fused so that each input word is loaded only once.
  * in[0] is read but never written.
  */
374 
375  __asm__ volatile (
376  "lw %[t1], 17*4(%[in]) \n\t"
377  "lw %[t2], 16*4(%[in]) \n\t"
378  "lw %[t3], 15*4(%[in]) \n\t"
379  "lw %[t4], 14*4(%[in]) \n\t"
380  "addu %[t1], %[t1], %[t2] \n\t"
381  "addu %[t2], %[t2], %[t3] \n\t"
382  "addu %[t3], %[t3], %[t4] \n\t"
383  "lw %[t5], 13*4(%[in]) \n\t"
384  "addu %[t1], %[t1], %[t3] \n\t"
385  "sw %[t2], 16*4(%[in]) \n\t"
386  "lw %[t6], 12*4(%[in]) \n\t"
387  "sw %[t1], 17*4(%[in]) \n\t"
388  "addu %[t4], %[t4], %[t5] \n\t"
389  "addu %[t5], %[t5], %[t6] \n\t"
390  "lw %[t7], 11*4(%[in]) \n\t"
391  "addu %[t3], %[t3], %[t5] \n\t"
392  "sw %[t4], 14*4(%[in]) \n\t"
393  "lw %[t8], 10*4(%[in]) \n\t"
394  "sw %[t3], 15*4(%[in]) \n\t"
395  "addu %[t6], %[t6], %[t7] \n\t"
396  "addu %[t7], %[t7], %[t8] \n\t"
397  "sw %[t6], 12*4(%[in]) \n\t"
398  "addu %[t5], %[t5], %[t7] \n\t"
399  "lw %[t1], 9*4(%[in]) \n\t"
400  "lw %[t2], 8*4(%[in]) \n\t"
401  "sw %[t5], 13*4(%[in]) \n\t"
402  "addu %[t8], %[t8], %[t1] \n\t"
403  "addu %[t1], %[t1], %[t2] \n\t"
404  "sw %[t8], 10*4(%[in]) \n\t"
405  "addu %[t7], %[t7], %[t1] \n\t"
406  "lw %[t3], 7*4(%[in]) \n\t"
407  "lw %[t4], 6*4(%[in]) \n\t"
408  "sw %[t7], 11*4(%[in]) \n\t"
409  "addu %[t2], %[t2], %[t3] \n\t"
410  "addu %[t3], %[t3], %[t4] \n\t"
411  "sw %[t2], 8*4(%[in]) \n\t"
412  "addu %[t1], %[t1], %[t3] \n\t"
413  "lw %[t5], 5*4(%[in]) \n\t"
414  "lw %[t6], 4*4(%[in]) \n\t"
415  "sw %[t1], 9*4(%[in]) \n\t"
416  "addu %[t4], %[t4], %[t5] \n\t"
417  "addu %[t5], %[t5], %[t6] \n\t"
418  "sw %[t4], 6*4(%[in]) \n\t"
419  "addu %[t3], %[t3], %[t5] \n\t"
420  "lw %[t7], 3*4(%[in]) \n\t"
421  "lw %[t8], 2*4(%[in]) \n\t"
422  "sw %[t3], 7*4(%[in]) \n\t"
423  "addu %[t6], %[t6], %[t7] \n\t"
424  "addu %[t7], %[t7], %[t8] \n\t"
425  "sw %[t6], 4*4(%[in]) \n\t"
426  "addu %[t5], %[t5], %[t7] \n\t"
427  "lw %[t1], 1*4(%[in]) \n\t"
428  "lw %[t2], 0*4(%[in]) \n\t"
429  "sw %[t5], 5*4(%[in]) \n\t"
430  "addu %[t8], %[t8], %[t1] \n\t"
431  "addu %[t1], %[t1], %[t2] \n\t"
432  "sw %[t8], 2*4(%[in]) \n\t"
433  "addu %[t7], %[t7], %[t1] \n\t"
434  "sw %[t7], 3*4(%[in]) \n\t"
435  "sw %[t1], 1*4(%[in]) \n\t"
436 
437  : [in] "+r" (in), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3),
438  [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6),
439  [t7] "=&r" (t7), [t8] "=&r" (t8)
440  :
441  : "memory"
442  );
443 
 /* Two passes: the asm below only touches even offsets relative to in1 and
  * tmp1, so j = 0 handles the even-indexed elements and j = 1 the odd ones.
  * Each pass stores nine values (tmp1[0], [2], ..., [16]).
  */
444  for(j = 0; j < 2; j++) {
445 
446  tmp1 = tmp + j;
447  in1 = in + j;
448 
449  /**
450  * Original constants are multiplied by two in advance
451  * for assembly optimization (e.g. C_2 = 2 * C2).
452  * That can lead to overflow in operations where they are used.
  *
  * The workaround used throughout: the product is formed with the
  * unsigned multiply (multu). multu sees a negative sum as sum + 2^32,
  * which makes the high word too large by exactly the constant, so the
  * constant is subtracted again whenever the sum is negative
  * (sra ..., 31 yields non-zero for a negative sum, movn then selects
  * the constant, sub applies the correction).
453  *
454  * Example of the solution:
455  *
456  * in original code:
457  * t0 = ((int64_t)(in1[2*2] + in1[2*4]) * (int64_t)(2*C2))>>32
458  *
459  * in assembly:
460  * C_2 = 2 * C2;
461  * .
462  * .
463  * "lw %[t7], 4*4(%[in1]) \n\t"
464  * "lw %[t8], 8*4(%[in1]) \n\t"
465  * "addu %[temp_reg2],%[t7], %[t8] \n\t"
466  * "multu %[C_2], %[temp_reg2] \n\t"
467  * "mfhi %[temp_reg1] \n\t"
468  * "sra %[temp_reg2],%[temp_reg2],31 \n\t"
469  * "move %[t0], $0 \n\t"
470  * "movn %[t0], %[C_2], %[temp_reg2] \n\t"
471  * "sub %[t0], %[temp_reg1],%[t0] \n\t"
472  */
473 
474  __asm__ volatile (
475  "lw %[t7], 4*4(%[in1]) \n\t"
476  "lw %[t8], 8*4(%[in1]) \n\t"
477  "lw %[t6], 16*4(%[in1]) \n\t"
478  "lw %[t4], 0*4(%[in1]) \n\t"
479  "addu %[temp_reg2], %[t7], %[t8] \n\t"
480  "addu %[t2], %[t6], %[t8] \n\t"
481  "multu %[C_2], %[temp_reg2] \n\t"
482  "lw %[t5], 12*4(%[in1]) \n\t"
483  "sub %[t2], %[t2], %[t7] \n\t"
484  "sub %[t1], %[t4], %[t5] \n\t"
485  "sra %[t3], %[t5], 1 \n\t"
486  "sra %[temp_reg1], %[t2], 1 \n\t"
487  "addu %[t3], %[t3], %[t4] \n\t"
488  "sub %[temp_reg1], %[t1], %[temp_reg1] \n\t"
489  "sra %[temp_reg2], %[temp_reg2], 31 \n\t"
490  "sw %[temp_reg1], 6*4(%[tmp1]) \n\t"
491  "move %[t0], $0 \n\t"
492  "movn %[t0], %[C_2], %[temp_reg2] \n\t"
493  "mfhi %[temp_reg1] \n\t"
494  "addu %[t1], %[t1], %[t2] \n\t"
495  "sw %[t1], 16*4(%[tmp1]) \n\t"
496  "sub %[temp_reg4], %[t8], %[t6] \n\t"
497  "add %[temp_reg2], %[t7], %[t6] \n\t"
498  "mult $ac1, %[C_8], %[temp_reg4] \n\t"
499  "multu $ac2, %[C_4], %[temp_reg2] \n\t"
500  "sub %[t0], %[temp_reg1], %[t0] \n\t"
501  "sra %[temp_reg1], %[temp_reg2], 31 \n\t"
502  "move %[t2], $0 \n\t"
503  "movn %[t2], %[C_4], %[temp_reg1] \n\t"
504  "mfhi %[t1], $ac1 \n\t"
505  "mfhi %[temp_reg1], $ac2 \n\t"
506  "lw %[t6], 10*4(%[in1]) \n\t"
507  "lw %[t8], 14*4(%[in1]) \n\t"
508  "lw %[t7], 2*4(%[in1]) \n\t"
509  "lw %[t4], 6*4(%[in1]) \n\t"
510  "sub %[temp_reg3], %[t3], %[t0] \n\t"
511  "add %[temp_reg4], %[t3], %[t0] \n\t"
512  "sub %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
513  "add %[temp_reg4], %[temp_reg4], %[t1] \n\t"
514  "sub %[t2], %[temp_reg1], %[t2] \n\t"
515  "sw %[temp_reg4], 2*4(%[tmp1]) \n\t"
516  "sub %[temp_reg3], %[temp_reg3], %[t2] \n\t"
517  "add %[temp_reg1], %[t3], %[t2] \n\t"
518  "sw %[temp_reg3], 10*4(%[tmp1]) \n\t"
519  "sub %[temp_reg1], %[temp_reg1], %[t1] \n\t"
520  "addu %[temp_reg2], %[t6], %[t8] \n\t"
521  "sw %[temp_reg1], 14*4(%[tmp1]) \n\t"
522  "sub %[temp_reg2], %[temp_reg2], %[t7] \n\t"
523  "addu %[temp_reg3], %[t7], %[t6] \n\t"
524  "multu $ac3, %[C_3], %[temp_reg2] \n\t"
525  "multu %[C_1], %[temp_reg3] \n\t"
526  "sra %[temp_reg1], %[temp_reg2], 31 \n\t"
527  "move %[t1], $0 \n\t"
528  "sra %[temp_reg3], %[temp_reg3], 31 \n\t"
529  "movn %[t1], %[C_3], %[temp_reg1] \n\t"
530  "mfhi %[temp_reg1], $ac3 \n\t"
531  "mfhi %[temp_reg4] \n\t"
532  "move %[t2], $0 \n\t"
533  "movn %[t2], %[C_1], %[temp_reg3] \n\t"
534  "sub %[temp_reg3], %[t6], %[t8] \n\t"
535  "sub %[t2], %[temp_reg4], %[t2] \n\t"
536  "multu $ac1, %[C_7], %[temp_reg3] \n\t"
537  "sub %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
538  "sra %[temp_reg4], %[temp_reg3], 31 \n\t"
539  "sub %[t1], %[temp_reg1], %[t1] \n\t"
540  "move %[t3], $0 \n\t"
541  "sw %[t1], 4*4(%[tmp1]) \n\t"
542  "movn %[t3], %[C_7], %[temp_reg4] \n\t"
543  "multu $ac2, %[C_3A], %[t4] \n\t"
544  "add %[temp_reg2], %[t7], %[t8] \n\t"
545  "move %[t1], $0 \n\t"
546  "mfhi %[temp_reg4], $ac1 \n\t"
547  "multu $ac3,%[C_5], %[temp_reg2] \n\t"
548  "move %[t0], $0 \n\t"
549  "sra %[temp_reg1], %[temp_reg2], 31 \n\t"
550  "movn %[t1],%[C_5], %[temp_reg1] \n\t"
551  "sub %[temp_reg4], %[temp_reg4], %[temp_reg3] \n\t"
552  "mfhi %[temp_reg1], $ac3 \n\t"
553  "sra %[temp_reg3], %[t4], 31 \n\t"
554  "movn %[t0], %[C_3A], %[temp_reg3] \n\t"
555  "mfhi %[temp_reg3], $ac2 \n\t"
556  "sub %[t3], %[temp_reg4], %[t3] \n\t"
557  "add %[temp_reg4], %[t3], %[t2] \n\t"
558  "sub %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
559  "sub %[t1], %[temp_reg1], %[t1] \n\t"
560  "sub %[t0], %[temp_reg3], %[t0] \n\t"
561  "add %[temp_reg1], %[t2], %[t1] \n\t"
562  "add %[temp_reg4], %[temp_reg4], %[t0] \n\t"
563  "sub %[temp_reg2], %[t3], %[t1] \n\t"
564  "sw %[temp_reg4], 0*4(%[tmp1]) \n\t"
565  "sub %[temp_reg1], %[temp_reg1], %[t0] \n\t"
566  "sub %[temp_reg2], %[temp_reg2], %[t0] \n\t"
567  "sw %[temp_reg1], 12*4(%[tmp1]) \n\t"
568  "sw %[temp_reg2], 8*4(%[tmp1]) \n\t"
569 
570  : [t7] "=&r" (t7), [temp_reg1] "=&r" (temp_reg1),
571  [temp_reg2] "=&r" (temp_reg2), [temp_reg4] "=&r" (temp_reg4),
572  [temp_reg3] "=&r" (temp_reg3), [t8] "=&r" (t8), [t0] "=&r" (t0),
573  [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r"(t6), [t2] "=&r" (t2),
574  [t3] "=&r" (t3), [t1] "=&r" (t1)
575  : [C_2] "r" (C_2), [in1] "r" (in1), [tmp1] "r" (tmp1), [C_8] "r" (C_8),
576  [C_4] "r" (C_4), [C_3] "r" (C_3), [C_1] "r" (C_1), [C_7] "r" (C_7),
577  [C_3A] "r" (C_3A), [C_5] "r" (C_5)
578  : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
579  "$ac3hi", "$ac3lo"
580  );
581  }
582 
583  /**
584  * loop is unrolled four times
585  *
586  * values defined in tables(icos36[] and icos36h[]) are not loaded from
587  * these tables - they are directly loaded in appropriate registers
588  *
589  */
 /*
  * The block below consumes tmp[0..17] four entries at a time (plus
  * tmp[16], tmp[17] at the end). For every k in 0..17 it performs
  *   out[k * 32] = buf[4 * k] + high word of (value * win[k])
  *   buf[4 * k]  = high word of (value * win[20 + k])
  * The old buf[4 * k] is always loaded before the new value is stored.
  * The immediates loaded with li are the table constants mentioned above.
  */
590 
591  __asm__ volatile (
592  "lw %[t2], 1*4(%[tmp]) \n\t"
593  "lw %[t3], 3*4(%[tmp]) \n\t"
594  "lw %[t0], 0*4(%[tmp]) \n\t"
595  "lw %[t1], 2*4(%[tmp]) \n\t"
596  "addu %[temp_reg1], %[t3], %[t2] \n\t"
597  "li %[temp_reg2], 0x807D2B1E \n\t"
598  "move %[s1], $0 \n\t"
599  "multu %[temp_reg2], %[temp_reg1] \n\t"
600  "sra %[temp_reg1], %[temp_reg1], 31 \n\t"
601  "movn %[s1], %[temp_reg2], %[temp_reg1] \n\t"
602  "sub %[temp_reg3], %[t3], %[t2] \n\t"
603  "li %[temp_reg4], 0x2de5151 \n\t"
604  "mfhi %[temp_reg2] \n\t"
605  "addu %[s0], %[t1], %[t0] \n\t"
606  "lw %[temp_reg5], 9*4(%[win]) \n\t"
607  "mult $ac1, %[temp_reg4], %[temp_reg3] \n\t"
608  "lw %[temp_reg6], 4*9*4(%[buf]) \n\t"
609  "sub %[s2], %[t1], %[t0] \n\t"
610  "lw %[temp_reg3], 29*4(%[win]) \n\t"
611  "subu %[s1], %[temp_reg2], %[s1] \n\t"
612  "lw %[temp_reg4], 28*4(%[win]) \n\t"
613  "add %[t0], %[s0], %[s1] \n\t"
614  "extr.w %[s3], $ac1,23 \n\t"
615  "mult $ac2, %[t0], %[temp_reg3] \n\t"
616  "sub %[t1], %[s0], %[s1] \n\t"
617  "lw %[temp_reg1], 4*8*4(%[buf]) \n\t"
618  "mult %[t1], %[temp_reg5] \n\t"
619  "lw %[temp_reg2], 8*4(%[win]) \n\t"
620  "mfhi %[temp_reg3], $ac2 \n\t"
621  "mult $ac3, %[t0], %[temp_reg4] \n\t"
622  "add %[t0], %[s2], %[s3] \n\t"
623  "mfhi %[temp_reg5] \n\t"
624  "mult $ac1, %[t1], %[temp_reg2] \n\t"
625  "sub %[t1], %[s2], %[s3] \n\t"
626  "sw %[temp_reg3], 4*9*4(%[buf]) \n\t"
627  "mfhi %[temp_reg4], $ac3 \n\t"
628  "lw %[temp_reg3], 37*4(%[win]) \n\t"
629  "mfhi %[temp_reg2], $ac1 \n\t"
630  "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t"
631  "lw %[temp_reg6], 17*4(%[win]) \n\t"
632  "sw %[temp_reg5], 32*9*4(%[out]) \n\t"
633  "sw %[temp_reg4], 4*8*4(%[buf]) \n\t"
634  "mult %[t1], %[temp_reg6] \n\t"
635  "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
636  "lw %[temp_reg2], 0*4(%[win]) \n\t"
637  "lw %[temp_reg5], 4*17*4(%[buf]) \n\t"
638  "sw %[temp_reg1], 8*32*4(%[out]) \n\t"
639  "mfhi %[temp_reg6] \n\t"
640  "mult $ac1, %[t1], %[temp_reg2] \n\t"
641  "lw %[temp_reg4], 20*4(%[win]) \n\t"
642  "lw %[temp_reg1], 0(%[buf]) \n\t"
643  "mult $ac2, %[t0], %[temp_reg3] \n\t"
644  "mult %[t0], %[temp_reg4] \n\t"
645  "mfhi %[temp_reg2], $ac1 \n\t"
646  "lw %[t0], 4*4(%[tmp]) \n\t"
647  "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t"
648  "mfhi %[temp_reg3], $ac2 \n\t"
649  "mfhi %[temp_reg4] \n\t"
650  "sw %[temp_reg5], 17*32*4(%[out]) \n\t"
651  "lw %[t1], 6*4(%[tmp]) \n\t"
652  "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
653  "lw %[t2], 5*4(%[tmp]) \n\t"
654  "sw %[temp_reg1], 0*32*4(%[out]) \n\t"
655  "addu %[s0], %[t1], %[t0] \n\t"
656  "sw %[temp_reg3], 4*17*4(%[buf]) \n\t"
657  "lw %[t3], 7*4(%[tmp]) \n\t"
658  "sub %[s2], %[t1], %[t0] \n\t"
659  "sw %[temp_reg4], 0(%[buf]) \n\t"
660  "addu %[temp_reg5], %[t3], %[t2] \n\t"
661  "li %[temp_reg6], 0x8483EE0C \n\t"
662  "move %[s1], $0 \n\t"
663  "multu %[temp_reg6], %[temp_reg5] \n\t"
664  "sub %[temp_reg1], %[t3], %[t2] \n\t"
665  "li %[temp_reg2], 0xf746ea \n\t"
666  "sra %[temp_reg5], %[temp_reg5], 31 \n\t"
667  "mult $ac1, %[temp_reg2], %[temp_reg1] \n\t"
668  "movn %[s1], %[temp_reg6], %[temp_reg5] \n\t"
669  "mfhi %[temp_reg5] \n\t"
670  "lw %[temp_reg3], 10*4(%[win]) \n\t"
671  "lw %[temp_reg4], 4*10*4(%[buf]) \n\t"
672  "extr.w %[s3], $ac1, 23 \n\t"
673  "lw %[temp_reg1], 4*7*4(%[buf]) \n\t"
674  "lw %[temp_reg2], 7*4(%[win]) \n\t"
675  "lw %[temp_reg6], 30*4(%[win]) \n\t"
676  "subu %[s1], %[temp_reg5], %[s1] \n\t"
677  "sub %[t1], %[s0], %[s1] \n\t"
678  "add %[t0], %[s0], %[s1] \n\t"
679  "mult $ac2, %[t1], %[temp_reg3] \n\t"
680  "mult $ac3, %[t1], %[temp_reg2] \n\t"
681  "mult %[t0], %[temp_reg6] \n\t"
682  "lw %[temp_reg5], 27*4(%[win]) \n\t"
683  "mult $ac1, %[t0], %[temp_reg5] \n\t"
684  "mfhi %[temp_reg3], $ac2 \n\t"
685  "mfhi %[temp_reg2], $ac3 \n\t"
686  "mfhi %[temp_reg6] \n\t"
687  "add %[t0], %[s2], %[s3] \n\t"
688  "sub %[t1], %[s2], %[s3] \n\t"
689  "add %[temp_reg3], %[temp_reg3], %[temp_reg4] \n\t"
690  "lw %[temp_reg4], 16*4(%[win]) \n\t"
691  "mfhi %[temp_reg5], $ac1 \n\t"
692  "sw %[temp_reg3], 32*10*4(%[out]) \n\t"
693  "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
694  "lw %[temp_reg3], 4*16*4(%[buf]) \n\t"
695  "sw %[temp_reg6], 4*10*4(%[buf]) \n\t"
696  "sw %[temp_reg1], 7*32*4(%[out]) \n\t"
697  "mult $ac2, %[t1], %[temp_reg4] \n\t"
698  "sw %[temp_reg5], 4*7*4(%[buf]) \n\t"
699  "lw %[temp_reg6], 1*4(%[win]) \n\t"
700  "lw %[temp_reg5], 4*1*4(%[buf]) \n\t"
701  "lw %[temp_reg1], 36*4(%[win]) \n\t"
702  "mult $ac3, %[t1], %[temp_reg6] \n\t"
703  "lw %[temp_reg2], 21*4(%[win]) \n\t"
704  "mfhi %[temp_reg4], $ac2 \n\t"
705  "mult %[t0], %[temp_reg1] \n\t"
706  "mult $ac1, %[t0],%[temp_reg2] \n\t"
707  "lw %[t0], 8*4(%[tmp]) \n\t"
708  "mfhi %[temp_reg6], $ac3 \n\t"
709  "lw %[t1], 10*4(%[tmp]) \n\t"
710  "lw %[t3], 11*4(%[tmp]) \n\t"
711  "mfhi %[temp_reg1] \n\t"
712  "add %[temp_reg3], %[temp_reg3], %[temp_reg4] \n\t"
713  "lw %[t2], 9*4(%[tmp]) \n\t"
714  "mfhi %[temp_reg2], $ac1 \n\t"
715  "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t"
716  "sw %[temp_reg3], 16*32*4(%[out]) \n\t"
717  "sw %[temp_reg5], 1*32*4(%[out]) \n\t"
718  "sw %[temp_reg1], 4*16*4(%[buf]) \n\t"
719  "addu %[temp_reg3], %[t3], %[t2] \n\t"
720  "li %[temp_reg4], 0x8D3B7CD6 \n\t"
721  "sw %[temp_reg2], 4*1*4(%[buf]) \n\t"
722  "multu %[temp_reg4],%[temp_reg3] \n\t"
723  "sra %[temp_reg3], %[temp_reg3], 31 \n\t"
724  "move %[s1], $0 \n\t"
725  "movn %[s1], %[temp_reg4], %[temp_reg3] \n\t"
726  "addu %[s0], %[t1], %[t0] \n\t"
727  "mfhi %[temp_reg3] \n\t"
728  "sub %[s2], %[t1], %[t0] \n\t"
729  "sub %[temp_reg5], %[t3], %[t2] \n\t"
730  "li %[temp_reg6], 0x976fd9 \n\t"
731  "lw %[temp_reg2], 11*4(%[win]) \n\t"
732  "lw %[temp_reg1], 4*11*4(%[buf]) \n\t"
733  "mult $ac1, %[temp_reg6], %[temp_reg5] \n\t"
734  "subu %[s1], %[temp_reg3], %[s1] \n\t"
735  "lw %[temp_reg5], 31*4(%[win]) \n\t"
736  "sub %[t1], %[s0], %[s1] \n\t"
737  "add %[t0], %[s0], %[s1] \n\t"
738  "mult $ac2, %[t1], %[temp_reg2] \n\t"
739  "mult %[t0], %[temp_reg5] \n\t"
740  "lw %[temp_reg4], 6*4(%[win]) \n\t"
741  "extr.w %[s3], $ac1, 23 \n\t"
742  "lw %[temp_reg3], 4*6*4(%[buf]) \n\t"
743  "mfhi %[temp_reg2], $ac2 \n\t"
744  "lw %[temp_reg6], 26*4(%[win]) \n\t"
745  "mfhi %[temp_reg5] \n\t"
746  "mult $ac3, %[t1], %[temp_reg4] \n\t"
747  "mult $ac1, %[t0], %[temp_reg6] \n\t"
748  "add %[t0], %[s2], %[s3] \n\t"
749  "sub %[t1], %[s2], %[s3] \n\t"
750  "add %[temp_reg2], %[temp_reg2], %[temp_reg1] \n\t"
751  "mfhi %[temp_reg4], $ac3 \n\t"
752  "mfhi %[temp_reg6], $ac1 \n\t"
753  "sw %[temp_reg5], 4*11*4(%[buf]) \n\t"
754  "sw %[temp_reg2], 32*11*4(%[out]) \n\t"
755  "lw %[temp_reg1], 4*15*4(%[buf]) \n\t"
756  "add %[temp_reg3], %[temp_reg3], %[temp_reg4] \n\t"
757  "lw %[temp_reg2], 15*4(%[win]) \n\t"
758  "sw %[temp_reg3], 6*32*4(%[out]) \n\t"
759  "sw %[temp_reg6], 4*6*4(%[buf]) \n\t"
760  "mult %[t1], %[temp_reg2] \n\t"
761  "lw %[temp_reg3], 2*4(%[win]) \n\t"
762  "lw %[temp_reg4], 4*2*4(%[buf]) \n\t"
763  "lw %[temp_reg5], 35*4(%[win]) \n\t"
764  "mult $ac1, %[t1], %[temp_reg3] \n\t"
765  "mfhi %[temp_reg2] \n\t"
766  "lw %[temp_reg6], 22*4(%[win]) \n\t"
767  "mult $ac2, %[t0], %[temp_reg5] \n\t"
768  "lw %[t1], 14*4(%[tmp]) \n\t"
769  "mult $ac3, %[t0], %[temp_reg6] \n\t"
770  "lw %[t0], 12*4(%[tmp]) \n\t"
771  "mfhi %[temp_reg3], $ac1 \n\t"
772  "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
773  "mfhi %[temp_reg5], $ac2 \n\t"
774  "sw %[temp_reg1], 15*32*4(%[out]) \n\t"
775  "mfhi %[temp_reg6], $ac3 \n\t"
776  "lw %[t2], 13*4(%[tmp]) \n\t"
777  "lw %[t3], 15*4(%[tmp]) \n\t"
778  "add %[temp_reg4], %[temp_reg4], %[temp_reg3] \n\t"
779  "sw %[temp_reg5], 4*15*4(%[buf]) \n\t"
780  "addu %[temp_reg1], %[t3], %[t2] \n\t"
781  "li %[temp_reg2], 0x9C42577C \n\t"
782  "move %[s1], $0 \n\t"
783  "multu %[temp_reg2], %[temp_reg1] \n\t"
784  "sw %[temp_reg4], 2*32*4(%[out]) \n\t"
785  "sra %[temp_reg1], %[temp_reg1], 31 \n\t"
786  "movn %[s1], %[temp_reg2], %[temp_reg1] \n\t"
787  "sub %[temp_reg3], %[t3], %[t2] \n\t"
788  "li %[temp_reg4], 0x6f94a2 \n\t"
789  "mfhi %[temp_reg1] \n\t"
790  "addu %[s0], %[t1], %[t0] \n\t"
791  "sw %[temp_reg6], 4*2*4(%[buf]) \n\t"
792  "mult $ac1, %[temp_reg4], %[temp_reg3] \n\t"
793  "sub %[s2], %[t1], %[t0] \n\t"
794  "lw %[temp_reg5], 12*4(%[win]) \n\t"
795  "lw %[temp_reg6], 4*12*4(%[buf]) \n\t"
796  "subu %[s1], %[temp_reg1], %[s1] \n\t"
797  "sub %[t1], %[s0], %[s1] \n\t"
798  "lw %[temp_reg3], 32*4(%[win]) \n\t"
799  "mult $ac2, %[t1], %[temp_reg5] \n\t"
800  "add %[t0], %[s0], %[s1] \n\t"
801  "extr.w %[s3], $ac1, 23 \n\t"
802  "lw %[temp_reg2], 5*4(%[win]) \n\t"
803  "mult %[t0], %[temp_reg3] \n\t"
804  "mfhi %[temp_reg5], $ac2 \n\t"
805  "lw %[temp_reg4], 25*4(%[win]) \n\t"
806  "lw %[temp_reg1], 4*5*4(%[buf]) \n\t"
807  "mult $ac3, %[t1], %[temp_reg2] \n\t"
808  "mult $ac1, %[t0], %[temp_reg4] \n\t"
809  "mfhi %[temp_reg3] \n\t"
810  "add %[t0], %[s2], %[s3] \n\t"
811  "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t"
812  "mfhi %[temp_reg2], $ac3 \n\t"
813  "mfhi %[temp_reg4], $ac1 \n\t"
814  "sub %[t1], %[s2], %[s3] \n\t"
815  "sw %[temp_reg5], 32*12*4(%[out]) \n\t"
816  "sw %[temp_reg3], 4*12*4(%[buf]) \n\t"
817  "lw %[temp_reg6], 14*4(%[win]) \n\t"
818  "lw %[temp_reg5], 4*14*4(%[buf]) \n\t"
819  "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
820  "sw %[temp_reg4], 4*5*4(%[buf]) \n\t"
821  "sw %[temp_reg1], 5*32*4(%[out]) \n\t"
822  "mult %[t1], %[temp_reg6] \n\t"
823  "lw %[temp_reg4], 34*4(%[win]) \n\t"
824  "lw %[temp_reg2], 3*4(%[win]) \n\t"
825  "lw %[temp_reg1], 4*3*4(%[buf]) \n\t"
826  "mult $ac2, %[t0], %[temp_reg4] \n\t"
827  "mfhi %[temp_reg6] \n\t"
828  "mult $ac1, %[t1], %[temp_reg2] \n\t"
829  "lw %[temp_reg3], 23*4(%[win]) \n\t"
830  "lw %[s0], 16*4(%[tmp]) \n\t"
831  "mfhi %[temp_reg4], $ac2 \n\t"
832  "lw %[t1], 17*4(%[tmp]) \n\t"
833  "mult $ac3, %[t0], %[temp_reg3] \n\t"
834  "move %[s1], $0 \n\t"
835  "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t"
836  "mfhi %[temp_reg2], $ac1 \n\t"
837  "sw %[temp_reg5], 14*32*4(%[out]) \n\t"
838  "sw %[temp_reg4], 4*14*4(%[buf]) \n\t"
839  "mfhi %[temp_reg3], $ac3 \n\t"
840  "li %[temp_reg5], 0xB504F334 \n\t"
841  "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t"
842  "multu %[temp_reg5], %[t1] \n\t"
843  "lw %[temp_reg2], 4*13*4(%[buf]) \n\t"
844  "sw %[temp_reg1], 3*32*4(%[out]) \n\t"
845  "sra %[t1], %[t1], 31 \n\t"
846  "mfhi %[temp_reg6] \n\t"
847  "movn %[s1], %[temp_reg5], %[t1] \n\t"
848  "sw %[temp_reg3], 4*3*4(%[buf]) \n\t"
849  "lw %[temp_reg1], 13*4(%[win]) \n\t"
850  "lw %[temp_reg4], 4*4*4(%[buf]) \n\t"
851  "lw %[temp_reg3], 4*4(%[win]) \n\t"
852  "lw %[temp_reg5], 33*4(%[win]) \n\t"
853  "subu %[s1], %[temp_reg6], %[s1] \n\t"
854  "lw %[temp_reg6], 24*4(%[win]) \n\t"
855  "sub %[t1], %[s0], %[s1] \n\t"
856  "add %[t0], %[s0], %[s1] \n\t"
857  "mult $ac1, %[t1], %[temp_reg1] \n\t"
858  "mult $ac2, %[t1], %[temp_reg3] \n\t"
859  "mult $ac3, %[t0], %[temp_reg5] \n\t"
860  "mult %[t0], %[temp_reg6] \n\t"
861  "mfhi %[temp_reg1], $ac1 \n\t"
862  "mfhi %[temp_reg3], $ac2 \n\t"
863  "mfhi %[temp_reg5], $ac3 \n\t"
864  "mfhi %[temp_reg6] \n\t"
865  "add %[temp_reg2], %[temp_reg2], %[temp_reg1] \n\t"
866  "add %[temp_reg4], %[temp_reg4], %[temp_reg3] \n\t"
867  "sw %[temp_reg2], 13*32*4(%[out]) \n\t"
868  "sw %[temp_reg4], 4*32*4(%[out]) \n\t"
869  "sw %[temp_reg5], 4*13*4(%[buf]) \n\t"
870  "sw %[temp_reg6], 4*4*4(%[buf]) \n\t"
871 
872  : [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3),
873  [s0] "=&r" (s0), [s2] "=&r" (s2), [temp_reg1] "=&r" (temp_reg1),
874  [temp_reg2] "=&r" (temp_reg2), [s1] "=&r" (s1), [s3] "=&r" (s3),
875  [temp_reg3] "=&r" (temp_reg3), [temp_reg4] "=&r" (temp_reg4),
876  [temp_reg5] "=&r" (temp_reg5), [temp_reg6] "=&r" (temp_reg6),
877  [out] "+r" (out)
878  : [tmp] "r" (tmp), [win] "r" (win), [buf] "r" (buf)
879  : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
880  "$ac3hi", "$ac3lo"
881  );
882 }
883 
/**
 * Run imdct36_mips_fixed() on `count` consecutive blocks, choosing the
 * window for each block and stepping the three pointers between blocks.
 *
 * @param out          output pointer; advanced by one int per block
 * @param buf          overlap buffer; advanced by 1 int per block, except
 *                     after every fourth block where it advances by 72 - 3
 * @param in           input coefficients; advanced by 18 ints per block
 * @param count        number of blocks to process
 * @param switch_point when non-zero, blocks 0 and 1 use window index 0
 * @param block_type   window index used for all other blocks
 */
884 static void ff_imdct36_blocks_mips_fixed(int *out, int *buf, int *in,
885  int count, int switch_point, int block_type)
886 {
887  int j;
888  for (j=0 ; j < count; j++) {
889  /* apply window & overlap with previous buffer */
890 
891  /* select window */
892  int win_idx = (switch_point && j < 2) ? 0 : block_type;
 /* (4 & -(j & 1)) is 4 for odd j and 0 for even j, so odd-numbered
  * blocks take the table row four entries after win_idx */
893  int *win = ff_mdct_win_fixed[win_idx + (4 & -(j & 1))];
894 
895  imdct36_mips_fixed(out, buf, in, win);
896 
897  in += 18;
 /* after blocks with (j & 3) == 3 the buffer pointer jumps by 72 - 3
  * instead of 1 */
898  buf += ((j&3) != 3 ? 1 : (72-3));
899  out++;
900  }
901 }
902 
904 {
907 }
const char * s
Definition: avisynth_c.h:668
About Git write you should know how to use GIT properly Luckily Git comes with excellent documentation git help man git shows you the available git< command > help man git< command > shows information about the subcommand< command > The most comprehensive manual is the website Git Reference visit they are quite exhaustive You do not need a special username or password All you need is to provide a ssh public key to the Git server admin What follows now is a basic introduction to Git and some FFmpeg specific guidelines Read it at least if you are granted commit privileges to the FFmpeg project you are expected to be familiar with these rules I if not You can get git from etc no matter how small Every one of them has been saved from looking like a fool by this many times It s very easy for stray debug output or cosmetic modifications to slip in
Definition: git-howto.txt:5
#define t8
Definition: regdef.h:53
static void ff_imdct36_blocks_mips_fixed(int *out, int *buf, int *in, int count, int switch_point, int block_type)
output residual component w
#define t7
Definition: regdef.h:35
void ff_mpadsp_init_mipsdspr1(MPADSPContext *s)
static void imdct36_mips_fixed(int *out, int *buf, int *in, int *win)
#define t0
Definition: regdef.h:28
#define s2
Definition: regdef.h:39
#define s0
Definition: regdef.h:37
#define t1
Definition: regdef.h:29
overlapping window(triangular window to avoid too much overlapping) ovidx
#define t3
Definition: regdef.h:31
void(* apply_window_fixed)(int32_t *synth_buf, int32_t *window, int *dither_state, int16_t *samples, int incr)
Definition: mpegaudiodsp.h:28
int32_t
#define s3
Definition: regdef.h:40
void * buf
Definition: avisynth_c.h:594
int ff_mdct_win_fixed[8][MDCT_BUF_SIZE]
#define t5
Definition: regdef.h:33
#define s1
Definition: regdef.h:38
#define t6
Definition: regdef.h:34
#define t4
Definition: regdef.h:32
static void ff_mpadsp_apply_window_mips_fixed(int32_t *synth_buf, int32_t *window, int *dither_state, int16_t *samples, int incr)
Filter the word “frame” indicates either a video frame or a group of audio samples
void INT64 INT64 count
Definition: avisynth_c.h:594
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31))))#define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac){}void ff_audio_convert_free(AudioConvert **ac){if(!*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);}AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, 
int apply_map){AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method!=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2){ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc){av_free(ac);return NULL;}return ac;}in_planar=av_sample_fmt_is_planar(in_fmt);out_planar=av_sample_fmt_is_planar(out_fmt);if(in_planar==out_planar){ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar?ac->channels:1;}else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;}int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in){int use_generic=1;int len=in->nb_samples;int p;if(ac->dc){av_dlog(ac->avr,"%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> out
#define t2
Definition: regdef.h:30
void(* imdct36_blocks_fixed)(int *out, int *buf, int *in, int count, int switch_point, int block_type)
Definition: mpegaudiodsp.h:35