yading@10
|
1 /*
|
yading@10
|
2 * Simple IDCT (Alpha optimized)
|
yading@10
|
3 *
|
yading@10
|
4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
|
yading@10
|
5 *
|
yading@10
|
6 * based upon some outcommented C code from mpeg2dec (idct_mmx.c
|
yading@10
|
7 * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
|
yading@10
|
8 *
|
yading@10
|
9 * Alpha optimizations by Måns Rullgård <mans@mansr.com>
|
yading@10
|
10 * and Falk Hueffner <falk@debian.org>
|
yading@10
|
11 *
|
yading@10
|
12 * This file is part of FFmpeg.
|
yading@10
|
13 *
|
yading@10
|
14 * FFmpeg is free software; you can redistribute it and/or
|
yading@10
|
15 * modify it under the terms of the GNU Lesser General Public
|
yading@10
|
16 * License as published by the Free Software Foundation; either
|
yading@10
|
17 * version 2.1 of the License, or (at your option) any later version.
|
yading@10
|
18 *
|
yading@10
|
19 * FFmpeg is distributed in the hope that it will be useful,
|
yading@10
|
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
yading@10
|
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
yading@10
|
22 * Lesser General Public License for more details.
|
yading@10
|
23 *
|
yading@10
|
24 * You should have received a copy of the GNU Lesser General Public
|
yading@10
|
25 * License along with FFmpeg; if not, write to the Free Software
|
yading@10
|
26 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
yading@10
|
27 */
|
yading@10
|
28
|
yading@10
|
29 #include "dsputil_alpha.h"
|
yading@10
|
30 #include "asm.h"
|
yading@10
|
31
|
yading@10
|
/*
 * Fixed-point IDCT coefficients:
 *     Wi = cos(i * M_PI / 16) * sqrt(2) * (1 << 14)
 * W4 is actually exactly 16384, but using 16383 works around
 * accumulating rounding errors for some encoders.
 */
#define W1 22725
#define W2 21407
#define W3 19266
#define W4 16383
#define W5 12873
#define W6  8867
#define W7  4520

/* Right shifts applied to the results of the row pass and the column pass. */
#define ROW_SHIFT 11
#define COL_SHIFT 20
|
yading@10
|
44
|
yading@10
|
45 /* 0: all entries 0, 1: only first entry nonzero, 2: otherwise */
|
yading@10
|
46 static inline int idct_row(int16_t *row)
|
yading@10
|
47 {
|
yading@10
|
48 int a0, a1, a2, a3, b0, b1, b2, b3, t;
|
yading@10
|
49 uint64_t l, r, t2;
|
yading@10
|
50 l = ldq(row);
|
yading@10
|
51 r = ldq(row + 4);
|
yading@10
|
52
|
yading@10
|
53 if (l == 0 && r == 0)
|
yading@10
|
54 return 0;
|
yading@10
|
55
|
yading@10
|
56 a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1));
|
yading@10
|
57
|
yading@10
|
58 if (((l & ~0xffffUL) | r) == 0) {
|
yading@10
|
59 a0 >>= ROW_SHIFT;
|
yading@10
|
60 t2 = (uint16_t) a0;
|
yading@10
|
61 t2 |= t2 << 16;
|
yading@10
|
62 t2 |= t2 << 32;
|
yading@10
|
63
|
yading@10
|
64 stq(t2, row);
|
yading@10
|
65 stq(t2, row + 4);
|
yading@10
|
66 return 1;
|
yading@10
|
67 }
|
yading@10
|
68
|
yading@10
|
69 a1 = a0;
|
yading@10
|
70 a2 = a0;
|
yading@10
|
71 a3 = a0;
|
yading@10
|
72
|
yading@10
|
73 t = extwl(l, 4); /* row[2] */
|
yading@10
|
74 if (t != 0) {
|
yading@10
|
75 t = sextw(t);
|
yading@10
|
76 a0 += W2 * t;
|
yading@10
|
77 a1 += W6 * t;
|
yading@10
|
78 a2 -= W6 * t;
|
yading@10
|
79 a3 -= W2 * t;
|
yading@10
|
80 }
|
yading@10
|
81
|
yading@10
|
82 t = extwl(r, 0); /* row[4] */
|
yading@10
|
83 if (t != 0) {
|
yading@10
|
84 t = sextw(t);
|
yading@10
|
85 a0 += W4 * t;
|
yading@10
|
86 a1 -= W4 * t;
|
yading@10
|
87 a2 -= W4 * t;
|
yading@10
|
88 a3 += W4 * t;
|
yading@10
|
89 }
|
yading@10
|
90
|
yading@10
|
91 t = extwl(r, 4); /* row[6] */
|
yading@10
|
92 if (t != 0) {
|
yading@10
|
93 t = sextw(t);
|
yading@10
|
94 a0 += W6 * t;
|
yading@10
|
95 a1 -= W2 * t;
|
yading@10
|
96 a2 += W2 * t;
|
yading@10
|
97 a3 -= W6 * t;
|
yading@10
|
98 }
|
yading@10
|
99
|
yading@10
|
100 t = extwl(l, 2); /* row[1] */
|
yading@10
|
101 if (t != 0) {
|
yading@10
|
102 t = sextw(t);
|
yading@10
|
103 b0 = W1 * t;
|
yading@10
|
104 b1 = W3 * t;
|
yading@10
|
105 b2 = W5 * t;
|
yading@10
|
106 b3 = W7 * t;
|
yading@10
|
107 } else {
|
yading@10
|
108 b0 = 0;
|
yading@10
|
109 b1 = 0;
|
yading@10
|
110 b2 = 0;
|
yading@10
|
111 b3 = 0;
|
yading@10
|
112 }
|
yading@10
|
113
|
yading@10
|
114 t = extwl(l, 6); /* row[3] */
|
yading@10
|
115 if (t) {
|
yading@10
|
116 t = sextw(t);
|
yading@10
|
117 b0 += W3 * t;
|
yading@10
|
118 b1 -= W7 * t;
|
yading@10
|
119 b2 -= W1 * t;
|
yading@10
|
120 b3 -= W5 * t;
|
yading@10
|
121 }
|
yading@10
|
122
|
yading@10
|
123
|
yading@10
|
124 t = extwl(r, 2); /* row[5] */
|
yading@10
|
125 if (t) {
|
yading@10
|
126 t = sextw(t);
|
yading@10
|
127 b0 += W5 * t;
|
yading@10
|
128 b1 -= W1 * t;
|
yading@10
|
129 b2 += W7 * t;
|
yading@10
|
130 b3 += W3 * t;
|
yading@10
|
131 }
|
yading@10
|
132
|
yading@10
|
133 t = extwl(r, 6); /* row[7] */
|
yading@10
|
134 if (t) {
|
yading@10
|
135 t = sextw(t);
|
yading@10
|
136 b0 += W7 * t;
|
yading@10
|
137 b1 -= W5 * t;
|
yading@10
|
138 b2 += W3 * t;
|
yading@10
|
139 b3 -= W1 * t;
|
yading@10
|
140 }
|
yading@10
|
141
|
yading@10
|
142 row[0] = (a0 + b0) >> ROW_SHIFT;
|
yading@10
|
143 row[1] = (a1 + b1) >> ROW_SHIFT;
|
yading@10
|
144 row[2] = (a2 + b2) >> ROW_SHIFT;
|
yading@10
|
145 row[3] = (a3 + b3) >> ROW_SHIFT;
|
yading@10
|
146 row[4] = (a3 - b3) >> ROW_SHIFT;
|
yading@10
|
147 row[5] = (a2 - b2) >> ROW_SHIFT;
|
yading@10
|
148 row[6] = (a1 - b1) >> ROW_SHIFT;
|
yading@10
|
149 row[7] = (a0 - b0) >> ROW_SHIFT;
|
yading@10
|
150
|
yading@10
|
151 return 2;
|
yading@10
|
152 }
|
yading@10
|
153
|
yading@10
|
154 static inline void idct_col(int16_t *col)
|
yading@10
|
155 {
|
yading@10
|
156 int a0, a1, a2, a3, b0, b1, b2, b3;
|
yading@10
|
157
|
yading@10
|
158 col[0] += (1 << (COL_SHIFT - 1)) / W4;
|
yading@10
|
159
|
yading@10
|
160 a0 = W4 * col[8 * 0];
|
yading@10
|
161 a1 = W4 * col[8 * 0];
|
yading@10
|
162 a2 = W4 * col[8 * 0];
|
yading@10
|
163 a3 = W4 * col[8 * 0];
|
yading@10
|
164
|
yading@10
|
165 if (col[8 * 2]) {
|
yading@10
|
166 a0 += W2 * col[8 * 2];
|
yading@10
|
167 a1 += W6 * col[8 * 2];
|
yading@10
|
168 a2 -= W6 * col[8 * 2];
|
yading@10
|
169 a3 -= W2 * col[8 * 2];
|
yading@10
|
170 }
|
yading@10
|
171
|
yading@10
|
172 if (col[8 * 4]) {
|
yading@10
|
173 a0 += W4 * col[8 * 4];
|
yading@10
|
174 a1 -= W4 * col[8 * 4];
|
yading@10
|
175 a2 -= W4 * col[8 * 4];
|
yading@10
|
176 a3 += W4 * col[8 * 4];
|
yading@10
|
177 }
|
yading@10
|
178
|
yading@10
|
179 if (col[8 * 6]) {
|
yading@10
|
180 a0 += W6 * col[8 * 6];
|
yading@10
|
181 a1 -= W2 * col[8 * 6];
|
yading@10
|
182 a2 += W2 * col[8 * 6];
|
yading@10
|
183 a3 -= W6 * col[8 * 6];
|
yading@10
|
184 }
|
yading@10
|
185
|
yading@10
|
186 if (col[8 * 1]) {
|
yading@10
|
187 b0 = W1 * col[8 * 1];
|
yading@10
|
188 b1 = W3 * col[8 * 1];
|
yading@10
|
189 b2 = W5 * col[8 * 1];
|
yading@10
|
190 b3 = W7 * col[8 * 1];
|
yading@10
|
191 } else {
|
yading@10
|
192 b0 = 0;
|
yading@10
|
193 b1 = 0;
|
yading@10
|
194 b2 = 0;
|
yading@10
|
195 b3 = 0;
|
yading@10
|
196 }
|
yading@10
|
197
|
yading@10
|
198 if (col[8 * 3]) {
|
yading@10
|
199 b0 += W3 * col[8 * 3];
|
yading@10
|
200 b1 -= W7 * col[8 * 3];
|
yading@10
|
201 b2 -= W1 * col[8 * 3];
|
yading@10
|
202 b3 -= W5 * col[8 * 3];
|
yading@10
|
203 }
|
yading@10
|
204
|
yading@10
|
205 if (col[8 * 5]) {
|
yading@10
|
206 b0 += W5 * col[8 * 5];
|
yading@10
|
207 b1 -= W1 * col[8 * 5];
|
yading@10
|
208 b2 += W7 * col[8 * 5];
|
yading@10
|
209 b3 += W3 * col[8 * 5];
|
yading@10
|
210 }
|
yading@10
|
211
|
yading@10
|
212 if (col[8 * 7]) {
|
yading@10
|
213 b0 += W7 * col[8 * 7];
|
yading@10
|
214 b1 -= W5 * col[8 * 7];
|
yading@10
|
215 b2 += W3 * col[8 * 7];
|
yading@10
|
216 b3 -= W1 * col[8 * 7];
|
yading@10
|
217 }
|
yading@10
|
218
|
yading@10
|
219 col[8 * 0] = (a0 + b0) >> COL_SHIFT;
|
yading@10
|
220 col[8 * 7] = (a0 - b0) >> COL_SHIFT;
|
yading@10
|
221 col[8 * 1] = (a1 + b1) >> COL_SHIFT;
|
yading@10
|
222 col[8 * 6] = (a1 - b1) >> COL_SHIFT;
|
yading@10
|
223 col[8 * 2] = (a2 + b2) >> COL_SHIFT;
|
yading@10
|
224 col[8 * 5] = (a2 - b2) >> COL_SHIFT;
|
yading@10
|
225 col[8 * 3] = (a3 + b3) >> COL_SHIFT;
|
yading@10
|
226 col[8 * 4] = (a3 - b3) >> COL_SHIFT;
|
yading@10
|
227 }
|
yading@10
|
228
|
yading@10
|
229 /* If all rows but the first one are zero after row transformation,
|
yading@10
|
230 all rows will be identical after column transformation. */
|
yading@10
|
231 static inline void idct_col2(int16_t *col)
|
yading@10
|
232 {
|
yading@10
|
233 int i;
|
yading@10
|
234 uint64_t l, r;
|
yading@10
|
235
|
yading@10
|
236 for (i = 0; i < 8; ++i) {
|
yading@10
|
237 int a0 = col[i] + (1 << (COL_SHIFT - 1)) / W4;
|
yading@10
|
238
|
yading@10
|
239 a0 *= W4;
|
yading@10
|
240 col[i] = a0 >> COL_SHIFT;
|
yading@10
|
241 }
|
yading@10
|
242
|
yading@10
|
243 l = ldq(col + 0 * 4); r = ldq(col + 1 * 4);
|
yading@10
|
244 stq(l, col + 2 * 4); stq(r, col + 3 * 4);
|
yading@10
|
245 stq(l, col + 4 * 4); stq(r, col + 5 * 4);
|
yading@10
|
246 stq(l, col + 6 * 4); stq(r, col + 7 * 4);
|
yading@10
|
247 stq(l, col + 8 * 4); stq(r, col + 9 * 4);
|
yading@10
|
248 stq(l, col + 10 * 4); stq(r, col + 11 * 4);
|
yading@10
|
249 stq(l, col + 12 * 4); stq(r, col + 13 * 4);
|
yading@10
|
250 stq(l, col + 14 * 4); stq(r, col + 15 * 4);
|
yading@10
|
251 }
|
yading@10
|
252
|
yading@10
|
/*
 * In-place 8x8 inverse DCT of block[0..63]: a row pass over the eight rows
 * followed by a column pass.  The codes returned by idct_row() are used to
 * select a cheaper column pass when the block is sparse.
 */
void ff_simple_idct_axp(int16_t *block)
{
    int only_first_row = 1;     /* all rows except row 0 zero */
    int rows_uniform   = 1;     /* all rows consist of a constant value */
    int i;

    for (i = 0; i < 8; i++) {
        const int kind = idct_row(block + 8 * i);

        if (kind <= 0)
            continue;           /* row was entirely zero */
        if (i > 0)
            only_first_row = 0;
        if (kind == 2)
            rows_uniform = 0;
    }

    if (only_first_row) {
        idct_col2(block);
        return;
    }

    if (!rows_uniform) {
        /* General case: transform every column. */
        for (i = 0; i < 8; i++)
            idct_col(block + i);
        return;
    }

    /* Every row holds a single repeated value, so all eight columns are
       identical: transform the first column only, then spread each of its
       results across the corresponding row. */
    idct_col(block);
    for (i = 0; i < 8; i++) {
        uint64_t fill = (uint16_t) block[8 * i];

        fill |= fill << 16;
        fill |= fill << 32;
        stq(fill, block + 8 * i);
        stq(fill, block + 8 * i + 4);
    }
}
|
yading@10
|
292
|
yading@10
|
/*
 * Run the in-place IDCT on block, then pass the result to
 * put_pixels_clamped_axp_p() together with dest and line_size.
 * NOTE(review): judging by its name, that helper stores the clamped
 * values to dest; confirm in dsputil_alpha.h.
 */
void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block)
{
    ff_simple_idct_axp(block);
    put_pixels_clamped_axp_p(block, dest, line_size);
}
|
yading@10
|
298
|
yading@10
|
/*
 * Run the in-place IDCT on block, then pass the result to
 * add_pixels_clamped_axp_p() together with dest and line_size.
 * NOTE(review): judging by its name, that helper adds the values to the
 * pixels already in dest and clamps; confirm in dsputil_alpha.h.
 */
void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block)
{
    ff_simple_idct_axp(block);
    add_pixels_clamped_axp_p(block, dest, line_size);
}
|