fdct.c
Go to the documentation of this file.
1 /*
2  * MMX optimized forward DCT
3  * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
4  * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
6  *
7  * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
8  *
9  * Intel Application Note AP-922 - fast, precise implementation of DCT
10  * http://developer.intel.com/vtune/cbts/appnotes.htm
11  *
12  * Also of inspiration:
13  * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
14  * Skal's fdct at http://skal.planet-d.net/coding/dct.html
15  *
16  * This file is part of FFmpeg.
17  *
18  * FFmpeg is free software; you can redistribute it and/or
19  * modify it under the terms of the GNU Lesser General Public
20  * License as published by the Free Software Foundation; either
21  * version 2.1 of the License, or (at your option) any later version.
22  *
23  * FFmpeg is distributed in the hope that it will be useful,
24  * but WITHOUT ANY WARRANTY; without even the implied warranty of
25  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26  * Lesser General Public License for more details.
27  *
28  * You should have received a copy of the GNU Lesser General Public
29  * License along with FFmpeg; if not, write to the Free Software
30  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
31  */
32 
33 #include "libavutil/common.h"
34 #include "libavutil/x86/asm.h"
35 #include "libavcodec/dct.h"
36 
37 #if HAVE_INLINE_ASM
38 
39 //////////////////////////////////////////////////////////////////////
40 //
41 // constants for the forward DCT
42 // -----------------------------
43 //
44 // Be sure to check that your compiler is aligning all constants to QWORD
45 // (8-byte) memory boundaries! Otherwise the unaligned memory access will
46 // severely stall MMX execution.
47 //
48 //////////////////////////////////////////////////////////////////////
49 
/* Fixed-point precision knob for the forward DCT; the shifts below derive from it. */
50 #define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
/* Left shift (psllw) applied to the inputs of the column pass in FDCT_COL. */
51 #define SHIFT_FRW_COL BITS_FRW_ACC
/* Right shift (psrad) applied to the 32-bit sums of the row pass; 17 when BITS_FRW_ACC is 3. */
52 #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
/* Rounding term added before the row shift: half of (1 << SHIFT_FRW_ROW). */
53 #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
54 //#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
55 
/* Expands to eight comma-separated copies of x; used to fill 8-lane constant vectors. */
56 #define X8(x) x,x,x,x,x,x,x,x
57 
58 //concatenated table, for forward DCT transformation
59 DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
60  X8(13036), // tg * (2<<16) + 0.5
61  X8(27146), // tg * (2<<16) + 0.5
62  X8(-21746) // tg * (2<<16) + 0.5
63 };
64 
65 DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
66  X8(23170) //cos * (2<<15) + 0.5
67 };
68 
69 DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
70 
71 DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
72 
73 static const struct
74 {
75  DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
76 } fdct_r_row_sse2 =
77 {{
78  RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
79 }};
80 //DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
81 
82 DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table
83  16384, 16384, 22725, 19266,
84  16384, 16384, 12873, 4520,
85  21407, 8867, 19266, -4520,
86  -8867, -21407, -22725, -12873,
87  16384, -16384, 12873, -22725,
88  -16384, 16384, 4520, 19266,
89  8867, -21407, 4520, -12873,
90  21407, -8867, 19266, -22725,
91 
92  22725, 22725, 31521, 26722,
93  22725, 22725, 17855, 6270,
94  29692, 12299, 26722, -6270,
95  -12299, -29692, -31521, -17855,
96  22725, -22725, 17855, -31521,
97  -22725, 22725, 6270, 26722,
98  12299, -29692, 6270, -17855,
99  29692, -12299, 26722, -31521,
100 
101  21407, 21407, 29692, 25172,
102  21407, 21407, 16819, 5906,
103  27969, 11585, 25172, -5906,
104  -11585, -27969, -29692, -16819,
105  21407, -21407, 16819, -29692,
106  -21407, 21407, 5906, 25172,
107  11585, -27969, 5906, -16819,
108  27969, -11585, 25172, -29692,
109 
110  19266, 19266, 26722, 22654,
111  19266, 19266, 15137, 5315,
112  25172, 10426, 22654, -5315,
113  -10426, -25172, -26722, -15137,
114  19266, -19266, 15137, -26722,
115  -19266, 19266, 5315, 22654,
116  10426, -25172, 5315, -15137,
117  25172, -10426, 22654, -26722,
118 
119  16384, 16384, 22725, 19266,
120  16384, 16384, 12873, 4520,
121  21407, 8867, 19266, -4520,
122  -8867, -21407, -22725, -12873,
123  16384, -16384, 12873, -22725,
124  -16384, 16384, 4520, 19266,
125  8867, -21407, 4520, -12873,
126  21407, -8867, 19266, -22725,
127 
128  19266, 19266, 26722, 22654,
129  19266, 19266, 15137, 5315,
130  25172, 10426, 22654, -5315,
131  -10426, -25172, -26722, -15137,
132  19266, -19266, 15137, -26722,
133  -19266, 19266, 5315, 22654,
134  10426, -25172, 5315, -15137,
135  25172, -10426, 22654, -26722,
136 
137  21407, 21407, 29692, 25172,
138  21407, 21407, 16819, 5906,
139  27969, 11585, 25172, -5906,
140  -11585, -27969, -29692, -16819,
141  21407, -21407, 16819, -29692,
142  -21407, 21407, 5906, 25172,
143  11585, -27969, 5906, -16819,
144  27969, -11585, 25172, -29692,
145 
146  22725, 22725, 31521, 26722,
147  22725, 22725, 17855, 6270,
148  29692, 12299, 26722, -6270,
149  -12299, -29692, -31521, -17855,
150  22725, -22725, 17855, -31521,
151  -22725, 22725, 6270, 26722,
152  12299, -29692, 6270, -17855,
153  29692, -12299, 26722, -31521,
154 };
155 
156 static const struct
157 {
158  DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
159 } tab_frw_01234567_sse2 =
160 {{
161 //DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table
162 #define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
163  C4, C4, C5, C7, C2, C6, C3, -C7, \
164  -C4, C4, C7, C3, C6, -C2, C7, -C5, \
165  C4, -C4, C5, -C1, C2, -C6, C3, -C1,
166 // c1..c7 * cos(pi/4) * 2^15
167 #define C1 22725
168 #define C2 21407
169 #define C3 19266
170 #define C4 16384
171 #define C5 12873
172 #define C6 8867
173 #define C7 4520
174 TABLE_SSE2
175 
176 #undef C1
177 #undef C2
178 #undef C3
179 #undef C4
180 #undef C5
181 #undef C6
182 #undef C7
183 #define C1 31521
184 #define C2 29692
185 #define C3 26722
186 #define C4 22725
187 #define C5 17855
188 #define C6 12299
189 #define C7 6270
190 TABLE_SSE2
191 
192 #undef C1
193 #undef C2
194 #undef C3
195 #undef C4
196 #undef C5
197 #undef C6
198 #undef C7
199 #define C1 29692
200 #define C2 27969
201 #define C3 25172
202 #define C4 21407
203 #define C5 16819
204 #define C6 11585
205 #define C7 5906
206 TABLE_SSE2
207 
208 #undef C1
209 #undef C2
210 #undef C3
211 #undef C4
212 #undef C5
213 #undef C6
214 #undef C7
215 #define C1 26722
216 #define C2 25172
217 #define C3 22654
218 #define C4 19266
219 #define C5 15137
220 #define C6 10426
221 #define C7 5315
222 TABLE_SSE2
223 
224 #undef C1
225 #undef C2
226 #undef C3
227 #undef C4
228 #undef C5
229 #undef C6
230 #undef C7
231 #define C1 22725
232 #define C2 21407
233 #define C3 19266
234 #define C4 16384
235 #define C5 12873
236 #define C6 8867
237 #define C7 4520
238 TABLE_SSE2
239 
240 #undef C1
241 #undef C2
242 #undef C3
243 #undef C4
244 #undef C5
245 #undef C6
246 #undef C7
247 #define C1 26722
248 #define C2 25172
249 #define C3 22654
250 #define C4 19266
251 #define C5 15137
252 #define C6 10426
253 #define C7 5315
254 TABLE_SSE2
255 
256 #undef C1
257 #undef C2
258 #undef C3
259 #undef C4
260 #undef C5
261 #undef C6
262 #undef C7
263 #define C1 29692
264 #define C2 27969
265 #define C3 25172
266 #define C4 21407
267 #define C5 16819
268 #define C6 11585
269 #define C7 5906
270 TABLE_SSE2
271 
272 #undef C1
273 #undef C2
274 #undef C3
275 #undef C4
276 #undef C5
277 #undef C6
278 #undef C7
279 #define C1 31521
280 #define C2 29692
281 #define C3 26722
282 #define C4 22725
283 #define C5 17855
284 #define C6 12299
285 #define C7 6270
286 TABLE_SSE2
287 }};
288 
/*
 * Short alias for AV_TOSTRING: turns a macro's expansion into a string literal
 * so that numeric constants such as SHIFT_FRW_COL can be pasted into the asm
 * templates below as immediates.
 */
289 #define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
290 
/*
 * FDCT_COL(cpu, mm, mov) defines
 *     static void fdct_col_<cpu>(const int16_t *in, int16_t *out, int offset)
 * which performs the column pass of the 8x8 forward DCT as one inline-asm
 * block.
 *
 *   cpu - suffix of the generated function name
 *   mm  - register name prefix pasted into the template (mm or xmm)
 *   mov - move instruction used for plain loads, copies and stores
 *         (movq or movdqa)
 *
 * The generated function reads the eight rows located at byte offsets
 * 0, 16, ..., 112 from (in + offset) and stores eight result rows at the same
 * byte offsets from (out + offset). One call handles as many 16-bit columns
 * as one register of the selected kind holds; `offset` (in int16_t elements)
 * selects the first column.
 *
 * Asm operands: %0 = in + offset, %1 = fdct_tg_all_16, %2 = fdct_one_corr,
 *               %3 = out + offset, %4 = ocos_4_16.
 *
 * Sums and differences of the inputs are scaled up with psllw by
 * SHIFT_FRW_COL (SHIFT_FRW_COL+1 for two of the differences) before the
 * pmulhw multiplications; all additions and subtractions saturate
 * (paddsw/psubsw). The instruction order is hand-scheduled, so do not
 * reorder lines without re-verifying register lifetimes.
 *
 * With mov = movdqa every memory operand must be 16-byte aligned.
 *
 * NOTE(review): the asm statement lists no outputs and no clobbers although
 * it stores through %3 and overwrites registers 0-7 of the selected bank;
 * it relies on `volatile` only. Confirm this is intentional before changing
 * surrounding code.
 */
291 #define FDCT_COL(cpu, mm, mov)\
292 static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
293 {\
294  __asm__ volatile (\
295  #mov" 16(%0), %%"#mm"0 \n\t" \
296  #mov" 96(%0), %%"#mm"1 \n\t" \
297  #mov" %%"#mm"0, %%"#mm"2 \n\t" \
298  #mov" 32(%0), %%"#mm"3 \n\t" \
299  "paddsw %%"#mm"1, %%"#mm"0 \n\t" \
300  #mov" 80(%0), %%"#mm"4 \n\t" \
301  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
302  #mov" (%0), %%"#mm"5 \n\t" \
303  "paddsw %%"#mm"3, %%"#mm"4 \n\t" \
304  "paddsw 112(%0), %%"#mm"5 \n\t" \
305  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
306  #mov" %%"#mm"0, %%"#mm"6 \n\t" \
307  "psubsw %%"#mm"1, %%"#mm"2 \n\t" \
308  #mov" 16(%1), %%"#mm"1 \n\t" \
309  "psubsw %%"#mm"4, %%"#mm"0 \n\t" \
310  #mov" 48(%0), %%"#mm"7 \n\t" \
311  "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
312  "paddsw 64(%0), %%"#mm"7 \n\t" \
313  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
314  "paddsw %%"#mm"4, %%"#mm"6 \n\t" \
315  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
316  #mov" %%"#mm"5, %%"#mm"4 \n\t" \
317  "psubsw %%"#mm"7, %%"#mm"5 \n\t" \
318  "paddsw %%"#mm"5, %%"#mm"1 \n\t" \
319  "paddsw %%"#mm"7, %%"#mm"4 \n\t" \
320  "por (%2), %%"#mm"1 \n\t" \
321  "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
322  "pmulhw 16(%1), %%"#mm"5 \n\t" \
323  #mov" %%"#mm"4, %%"#mm"7 \n\t" \
324  "psubsw 80(%0), %%"#mm"3 \n\t" \
325  "psubsw %%"#mm"6, %%"#mm"4 \n\t" \
326  #mov" %%"#mm"1, 32(%3) \n\t" \
327  "paddsw %%"#mm"6, %%"#mm"7 \n\t" \
328  #mov" 48(%0), %%"#mm"1 \n\t" \
329  "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
330  "psubsw 64(%0), %%"#mm"1 \n\t" \
331  #mov" %%"#mm"2, %%"#mm"6 \n\t" \
332  #mov" %%"#mm"4, 64(%3) \n\t" \
333  "paddsw %%"#mm"3, %%"#mm"2 \n\t" \
334  "pmulhw (%4), %%"#mm"2 \n\t" \
335  "psubsw %%"#mm"3, %%"#mm"6 \n\t" \
336  "pmulhw (%4), %%"#mm"6 \n\t" \
337  "psubsw %%"#mm"0, %%"#mm"5 \n\t" \
338  "por (%2), %%"#mm"5 \n\t" \
339  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
340  "por (%2), %%"#mm"2 \n\t" \
341  #mov" %%"#mm"1, %%"#mm"4 \n\t" \
342  #mov" (%0), %%"#mm"3 \n\t" \
343  "paddsw %%"#mm"6, %%"#mm"1 \n\t" \
344  "psubsw 112(%0), %%"#mm"3 \n\t" \
345  "psubsw %%"#mm"6, %%"#mm"4 \n\t" \
346  #mov" (%1), %%"#mm"0 \n\t" \
347  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
348  #mov" 32(%1), %%"#mm"6 \n\t" \
349  "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
350  #mov" %%"#mm"7, (%3) \n\t" \
351  "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
352  #mov" %%"#mm"5, 96(%3) \n\t" \
353  #mov" %%"#mm"3, %%"#mm"7 \n\t" \
354  #mov" 32(%1), %%"#mm"5 \n\t" \
355  "psubsw %%"#mm"2, %%"#mm"7 \n\t" \
356  "paddsw %%"#mm"2, %%"#mm"3 \n\t" \
357  "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
358  "paddsw %%"#mm"3, %%"#mm"0 \n\t" \
359  "paddsw %%"#mm"4, %%"#mm"6 \n\t" \
360  "pmulhw (%1), %%"#mm"3 \n\t" \
361  "por (%2), %%"#mm"0 \n\t" \
362  "paddsw %%"#mm"7, %%"#mm"5 \n\t" \
363  "psubsw %%"#mm"6, %%"#mm"7 \n\t" \
364  #mov" %%"#mm"0, 16(%3) \n\t" \
365  "paddsw %%"#mm"4, %%"#mm"5 \n\t" \
366  #mov" %%"#mm"7, 48(%3) \n\t" \
367  "psubsw %%"#mm"1, %%"#mm"3 \n\t" \
368  #mov" %%"#mm"5, 80(%3) \n\t" \
369  #mov" %%"#mm"3, 112(%3) \n\t" \
370  : \
371  : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
372  "r" (out + offset), "r" (ocos_4_16)); \
373 }
374 
/*
 * Column-pass instances:
 *   fdct_col_mmx  - mm registers with movq (8 bytes, i.e. four int16_t columns
 *                   per call; the callers invoke it with offset 0 and 4)
 *   fdct_col_sse2 - xmm registers with movdqa (16 bytes, i.e. all eight
 *                   columns in one call)
 */
375 FDCT_COL(mmx, mm, movq)
376 FDCT_COL(sse2, xmm, movdqa)
377 
/**
 * SSE2 row pass of the forward DCT: transforms all eight rows in one
 * inline-asm block.
 *
 * @param in   eight rows of eight int16_t values (row stride 16 bytes)
 * @param out  receives eight rows of eight int16_t results; written with
 *             movdqa, so it must be 16-byte aligned
 *
 * Asm operands: %0 = in, %1 = tab_frw_01234567_sse2, %2 = fdct_r_row_sse2,
 *               %3 = SHIFT_FRW_ROW as an immediate, %4 = out.
 */
378 static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
379 {
380  __asm__ volatile(
/*
 * H1(i,t): load the row at byte offset i of the input (two 8-byte halves into
 * xmm2/xmm0) and all four 16-byte coefficient vectors found at byte offset t
 * of the table (xmm4, xmm5, xmm3, xmm7).
 */
381 #define FDCT_ROW_SSE2_H1(i,t) \
382  "movq " #i "(%0), %%xmm2 \n\t" \
383  "movq " #i "+8(%0), %%xmm0 \n\t" \
384  "movdqa " #t "+32(%1), %%xmm3 \n\t" \
385  "movdqa " #t "+48(%1), %%xmm7 \n\t" \
386  "movdqa " #t "(%1), %%xmm4 \n\t" \
387  "movdqa " #t "+16(%1), %%xmm5 \n\t"
388 
/*
 * H2(i,t): same as H1 for a second row that shares the coefficient group, but
 * only xmm3/xmm7 are reloaded: FDCT_ROW_SSE2 overwrites those two as pmaddwd
 * destinations, while xmm4/xmm5 are used as sources only and stay valid.
 */
389 #define FDCT_ROW_SSE2_H2(i,t) \
390  "movq " #i "(%0), %%xmm2 \n\t" \
391  "movq " #i "+8(%0), %%xmm0 \n\t" \
392  "movdqa " #t "+32(%1), %%xmm3 \n\t" \
393  "movdqa " #t "+48(%1), %%xmm7 \n\t"
394 
/*
 * FDCT_ROW_SSE2(i): transform the row loaded by H1/H2 and store it at byte
 * offset i of the output. pshuflw $27 reverses the four low words of the
 * second half; saturating sum and difference with the first half are then
 * multiplied against the coefficient vectors with pmaddwd, the rounding
 * vector in xmm6 is added, and the 32-bit results are shifted right by %3 and
 * packed back to int16_t with signed saturation.
 */
395 #define FDCT_ROW_SSE2(i) \
396  "movq %%xmm2, %%xmm1 \n\t" \
397  "pshuflw $27, %%xmm0, %%xmm0 \n\t" \
398  "paddsw %%xmm0, %%xmm1 \n\t" \
399  "psubsw %%xmm0, %%xmm2 \n\t" \
400  "punpckldq %%xmm2, %%xmm1 \n\t" \
401  "pshufd $78, %%xmm1, %%xmm2 \n\t" \
402  "pmaddwd %%xmm2, %%xmm3 \n\t" \
403  "pmaddwd %%xmm1, %%xmm7 \n\t" \
404  "pmaddwd %%xmm5, %%xmm2 \n\t" \
405  "pmaddwd %%xmm4, %%xmm1 \n\t" \
406  "paddd %%xmm7, %%xmm3 \n\t" \
407  "paddd %%xmm2, %%xmm1 \n\t" \
408  "paddd %%xmm6, %%xmm3 \n\t" \
409  "paddd %%xmm6, %%xmm1 \n\t" \
410  "psrad %3, %%xmm3 \n\t" \
411  "psrad %3, %%xmm1 \n\t" \
412  "packssdw %%xmm3, %%xmm1 \n\t" \
413  "movdqa %%xmm1, " #i "(%4) \n\t"
414 
/* xmm6 holds the rounding vector for the whole block. */
415  "movdqa (%2), %%xmm6 \n\t"
/* Rows at byte offsets 0 and 64 use the coefficient group at table offset 0. */
416  FDCT_ROW_SSE2_H1(0,0)
417  FDCT_ROW_SSE2(0)
418  FDCT_ROW_SSE2_H2(64,0)
419  FDCT_ROW_SSE2(64)
420 
/* Rows at byte offsets 16 and 112 use the group at table offset 64. */
421  FDCT_ROW_SSE2_H1(16,64)
422  FDCT_ROW_SSE2(16)
423  FDCT_ROW_SSE2_H2(112,64)
424  FDCT_ROW_SSE2(112)
425 
/* Rows at byte offsets 32 and 96 use the group at table offset 128. */
426  FDCT_ROW_SSE2_H1(32,128)
427  FDCT_ROW_SSE2(32)
428  FDCT_ROW_SSE2_H2(96,128)
429  FDCT_ROW_SSE2(96)
430 
/* Rows at byte offsets 48 and 80 use the group at table offset 192. */
431  FDCT_ROW_SSE2_H1(48,192)
432  FDCT_ROW_SSE2(48)
433  FDCT_ROW_SSE2_H2(80,192)
434  FDCT_ROW_SSE2(80)
435  :
436  : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
437  "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
/* NOTE(review): only xmm registers are declared clobbered; the stores through %4 are not declared. */
438  XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
439  "%xmm4", "%xmm5", "%xmm6", "%xmm7")
440  );
441 }
442 
/**
 * MMXEXT row pass of the forward DCT for a single row.
 *
 * @param in     one row of eight int16_t values (16 bytes are read)
 * @param out    receives the eight int16_t results of the row (16 bytes)
 * @param table  64 bytes (32 int16_t) of coefficients for this row
 *
 * Asm operands: %0 = in, %1 = table, %2 = fdct_r_row, %3 = out.
 * NOTE(review): no outputs or clobbers are declared although the block
 * stores through %3 and uses mm0-mm7, and no emms is issued here.
 */
443 static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out,
444  const int16_t *table)
445 {
446  __asm__ volatile (
/* pshufw $0x1B loads the second half of the row with its four words reversed. */
447  "pshufw $0x1B, 8(%0), %%mm5 \n\t"
448  "movq (%0), %%mm0 \n\t"
449  "movq %%mm0, %%mm1 \n\t"
/* Saturating sum (mm0) and difference (mm1) of the first half and the reversed second half. */
450  "paddsw %%mm5, %%mm0 \n\t"
451  "psubsw %%mm5, %%mm1 \n\t"
452  "movq %%mm0, %%mm2 \n\t"
/* Interleave sums and differences as 32-bit pairs: low pairs in mm0, high pairs in mm2. */
453  "punpckldq %%mm1, %%mm0 \n\t"
454  "punpckhdq %%mm1, %%mm2 \n\t"
/* Multiply-accumulate against the eight 8-byte coefficient vectors of the table. */
455  "movq (%1), %%mm1 \n\t"
456  "movq 8(%1), %%mm3 \n\t"
457  "movq 16(%1), %%mm4 \n\t"
458  "movq 24(%1), %%mm5 \n\t"
459  "movq 32(%1), %%mm6 \n\t"
460  "movq 40(%1), %%mm7 \n\t"
461  "pmaddwd %%mm0, %%mm1 \n\t"
462  "pmaddwd %%mm2, %%mm3 \n\t"
463  "pmaddwd %%mm0, %%mm4 \n\t"
464  "pmaddwd %%mm2, %%mm5 \n\t"
465  "pmaddwd %%mm0, %%mm6 \n\t"
466  "pmaddwd %%mm2, %%mm7 \n\t"
467  "pmaddwd 48(%1), %%mm0 \n\t"
468  "pmaddwd 56(%1), %%mm2 \n\t"
469  "paddd %%mm1, %%mm3 \n\t"
470  "paddd %%mm4, %%mm5 \n\t"
471  "paddd %%mm6, %%mm7 \n\t"
472  "paddd %%mm0, %%mm2 \n\t"
/* Add the rounding term, then scale the 32-bit sums down by SHIFT_FRW_ROW. */
473  "movq (%2), %%mm0 \n\t"
474  "paddd %%mm0, %%mm3 \n\t"
475  "paddd %%mm0, %%mm5 \n\t"
476  "paddd %%mm0, %%mm7 \n\t"
477  "paddd %%mm0, %%mm2 \n\t"
478  "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
479  "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
480  "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
481  "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
/* Pack to int16_t with signed saturation and store the two halves of the row. */
482  "packssdw %%mm5, %%mm3 \n\t"
483  "packssdw %%mm2, %%mm7 \n\t"
484  "movq %%mm3, (%3) \n\t"
485  "movq %%mm7, 8(%3) \n\t"
486  :
487  : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
488 }
489 
/**
 * Plain-MMX row pass of the forward DCT for a single row.
 *
 * Same computation as fdct_row_mmxext(), except that the word reversal of the
 * second half of the row is built from movd/punpcklwd/psrlq instead of the
 * single pshufw used there.
 *
 * @param in     one row of eight int16_t values (16 bytes are read)
 * @param out    receives the eight int16_t results of the row (16 bytes)
 * @param table  64 bytes (32 int16_t) of coefficients for this row
 *
 * Asm operands: %0 = in, %1 = table, %2 = fdct_r_row, %3 = out.
 * NOTE(review): no outputs or clobbers are declared although the block
 * stores through %3 and uses mm0-mm7, and no emms is issued here.
 */
490 static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
491 {
492  //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
493  __asm__ volatile(
/* The next five shuffles leave words 7,6,5,4 of the row (in that order) in mm1. */
494  "movd 12(%0), %%mm1 \n\t"
495  "punpcklwd 8(%0), %%mm1 \n\t"
496  "movq %%mm1, %%mm2 \n\t"
497  "psrlq $0x20, %%mm1 \n\t"
498  "movq 0(%0), %%mm0 \n\t"
499  "punpcklwd %%mm2, %%mm1 \n\t"
500  "movq %%mm0, %%mm5 \n\t"
/* Saturating sum (mm0) and difference (mm5) of the first half and the reversed second half. */
501  "paddsw %%mm1, %%mm0 \n\t"
502  "psubsw %%mm1, %%mm5 \n\t"
503  "movq %%mm0, %%mm2 \n\t"
/* Interleave sums and differences as 32-bit pairs: low pairs in mm0, high pairs in mm2. */
504  "punpckldq %%mm5, %%mm0 \n\t"
505  "punpckhdq %%mm5, %%mm2 \n\t"
/* Multiply-accumulate against the eight 8-byte coefficient vectors of the table. */
506  "movq 0(%1), %%mm1 \n\t"
507  "movq 8(%1), %%mm3 \n\t"
508  "movq 16(%1), %%mm4 \n\t"
509  "movq 24(%1), %%mm5 \n\t"
510  "movq 32(%1), %%mm6 \n\t"
511  "movq 40(%1), %%mm7 \n\t"
512  "pmaddwd %%mm0, %%mm1 \n\t"
513  "pmaddwd %%mm2, %%mm3 \n\t"
514  "pmaddwd %%mm0, %%mm4 \n\t"
515  "pmaddwd %%mm2, %%mm5 \n\t"
516  "pmaddwd %%mm0, %%mm6 \n\t"
517  "pmaddwd %%mm2, %%mm7 \n\t"
518  "pmaddwd 48(%1), %%mm0 \n\t"
519  "pmaddwd 56(%1), %%mm2 \n\t"
520  "paddd %%mm1, %%mm3 \n\t"
521  "paddd %%mm4, %%mm5 \n\t"
522  "paddd %%mm6, %%mm7 \n\t"
523  "paddd %%mm0, %%mm2 \n\t"
/* Add the rounding term, then scale the 32-bit sums down by SHIFT_FRW_ROW. */
524  "movq (%2), %%mm0 \n\t"
525  "paddd %%mm0, %%mm3 \n\t"
526  "paddd %%mm0, %%mm5 \n\t"
527  "paddd %%mm0, %%mm7 \n\t"
528  "paddd %%mm0, %%mm2 \n\t"
529  "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
530  "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
531  "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
532  "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
/* Pack to int16_t with signed saturation and store the two halves of the row. */
533  "packssdw %%mm5, %%mm3 \n\t"
534  "packssdw %%mm2, %%mm7 \n\t"
535  "movq %%mm3, 0(%3) \n\t"
536  "movq %%mm7, 8(%3) \n\t"
537  :
538  : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
539 }
540 
541 void ff_fdct_mmx(int16_t *block)
542 {
543  DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
544  int16_t * block1= (int16_t*)align_tmp;
545  const int16_t *table= tab_frw_01234567;
546  int i;
547 
548  fdct_col_mmx(block, block1, 0);
549  fdct_col_mmx(block, block1, 4);
550 
551  for(i=8;i>0;i--) {
552  fdct_row_mmx(block1, block, table);
553  block1 += 8;
554  table += 32;
555  block += 8;
556  }
557 }
558 
559 void ff_fdct_mmxext(int16_t *block)
560 {
561  DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
562  int16_t *block1= (int16_t*)align_tmp;
563  const int16_t *table= tab_frw_01234567;
564  int i;
565 
566  fdct_col_mmx(block, block1, 0);
567  fdct_col_mmx(block, block1, 4);
568 
569  for(i=8;i>0;i--) {
570  fdct_row_mmxext(block1, block, table);
571  block1 += 8;
572  table += 32;
573  block += 8;
574  }
575 }
576 
577 void ff_fdct_sse2(int16_t *block)
578 {
579  DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
580  int16_t * const block1= (int16_t*)align_tmp;
581 
582  fdct_col_sse2(block, block1, 0);
583  fdct_row_sse2(block1, block);
584 }
585 
586 #endif /* HAVE_INLINE_ASM */
About Git write you should know how to use GIT properly Luckily Git comes with excellent documentation git help man git shows you the available git< command > help man git< command > shows information about the subcommand< command > The most comprehensive manual is the website Git Reference visit they are quite exhaustive You do not need a special username or password All you need is to provide a ssh public key to the Git server admin What follows now is a basic introduction to Git and some FFmpeg specific guidelines Read it at least if you are granted commit privileges to the FFmpeg project you are expected to be familiar with these rules I if not You can get git from etc no matter how small Every one of them has been saved from looking like a fool by this many times It s very easy for stray debug output or cosmetic modifications to slip in
Definition: git-howto.txt:5
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:59
void ff_fdct_mmxext(int16_t *block)
#define S(s, c, i)
static const struct endianess table[]
const char * r
Definition: vf_curves.c:94
void ff_fdct_mmx(int16_t *block)
void ff_fdct_sse2(int16_t *block)
int32_t
synthesis window for stochastic i
#define XMM_CLOBBERS_ONLY(...)
common internal and external API header
#define av_always_inline
Definition: attributes.h:41
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31))))#define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac){}void ff_audio_convert_free(AudioConvert **ac){if(!*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);}AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, 
int apply_map){AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method!=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2){ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc){av_free(ac);return NULL;}return ac;}in_planar=av_sample_fmt_is_planar(in_fmt);out_planar=av_sample_fmt_is_planar(out_fmt);if(in_planar==out_planar){ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar?ac->channels:1;}else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;}int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in){int use_generic=1;int len=in->nb_samples;int p;if(ac->dc){av_dlog(ac->avr,"%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> out