/*
 * SPARC VIS optimized inverse DCT
 * Copyright (c) 2007 Denes Balatoni < dbalatoni XatX interware XdotX hu >
 *
 * I did consult the following fine web page about dct
 * http://www.geocities.com/ssavekar/dct.htm
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
yading@10
|
24
|
yading@10
|
25 #include <stdint.h>
|
yading@10
|
26
|
yading@10
|
27 #include "dsputil_vis.h"
|
yading@10
|
28 #include "libavutil/mem.h"
|
yading@10
|
29
|
yading@10
|
30 static const DECLARE_ALIGNED(8, int16_t, coeffs)[28] = {
|
yading@10
|
31 - 1259,- 1259,- 1259,- 1259,
|
yading@10
|
32 - 4989,- 4989,- 4989,- 4989,
|
yading@10
|
33 -11045,-11045,-11045,-11045,
|
yading@10
|
34 -19195,-19195,-19195,-19195,
|
yading@10
|
35 -29126,-29126,-29126,-29126,
|
yading@10
|
36 25080, 25080, 25080, 25080,
|
yading@10
|
37 12785, 12785, 12785, 12785
|
yading@10
|
38 };
|
yading@10
|
39 static const DECLARE_ALIGNED(8, uint16_t, scale)[4] = {
|
yading@10
|
40 65536>>6, 65536>>6, 65536>>6, 65536>>6
|
yading@10
|
41 };
|
yading@10
|
42 static const DECLARE_ALIGNED(8, uint16_t, rounder)[4] = {
|
yading@10
|
43 1<<5, 1<<5, 1<<5, 1<<5
|
yading@10
|
44 };
|
yading@10
|
45 static const DECLARE_ALIGNED(8, uint16_t, expand)[4] = {
|
yading@10
|
46 1<<14, 1<<14, 1<<14, 1<<14
|
yading@10
|
47 };
|
yading@10
|
48
|
yading@10
|
/*
 * Common asm prologue.  Expects operand %1 to point at the coefficient table
 * and operand %0 at a 4-lane constant (scale or rounder, depending on the
 * caller).  After it runs:
 *   %f32..%f44  the seven 8-byte groups found at [%1] .. [%1+48]
 *   %f46        the 8 bytes found at [%0]
 *   %f62        all zero bits (compared against by IDCT4ROWS)
 */
#define INIT_IDCT \
    "ldd [%1], %%f32 \n\t"\
    "ldd [%1+8], %%f34 \n\t"\
    "ldd [%1+16], %%f36 \n\t"\
    "ldd [%1+24], %%f38 \n\t"\
    "ldd [%1+32], %%f40 \n\t"\
    "ldd [%1+40], %%f42 \n\t"\
    "ldd [%1+48], %%f44 \n\t"\
    "ldd [%0], %%f46 \n\t"\
    "fzero %%f62 \n\t"
|
yading@10
|
59
|
yading@10
|
/* Doubles every 16-bit lane of %f0, %f2, ..., %f14 once (x + x). */
#define LOADSCALE_DOUBLE_ALL \
    "fpadd16 %%f0, %%f0, %%f0 \n\t"\
    "fpadd16 %%f2, %%f2, %%f2 \n\t"\
    "fpadd16 %%f4, %%f4, %%f4 \n\t"\
    "fpadd16 %%f6, %%f6, %%f6 \n\t"\
    "fpadd16 %%f8, %%f8, %%f8 \n\t"\
    "fpadd16 %%f10, %%f10, %%f10 \n\t"\
    "fpadd16 %%f12, %%f12, %%f12 \n\t"\
    "fpadd16 %%f14, %%f14, %%f14 \n\t"

/*
 * Loads eight 8-byte groups, 16 bytes apart, starting at address expression
 * `in` (a string such as "%2+8") into %f0, %f2, ..., %f14, then doubles every
 * 16-bit lane four times, i.e. multiplies the loaded values by 16.
 * With 8 int16_t per row (16 bytes) this picks up four adjacent values from
 * each of the eight rows of the block.
 */
#define LOADSCALE(in) \
    "ldd [" in "], %%f0 \n\t"\
    "ldd [" in "+16], %%f2 \n\t"\
    "ldd [" in "+32], %%f4 \n\t"\
    "ldd [" in "+48], %%f6 \n\t"\
    "ldd [" in "+64], %%f8 \n\t"\
    "ldd [" in "+80], %%f10 \n\t"\
    "ldd [" in "+96], %%f12 \n\t"\
    "ldd [" in "+112], %%f14 \n\t"\
    LOADSCALE_DOUBLE_ALL /* x2  */\
    LOADSCALE_DOUBLE_ALL /* x4  */\
    LOADSCALE_DOUBLE_ALL /* x8  */\
    LOADSCALE_DOUBLE_ALL /* x16 */
|
yading@10
|
104
|
yading@10
|
/*
 * Loads 64 contiguous bytes starting at address expression `in` (a string
 * such as "%3+64") into %f16, %f18, ..., %f30, without any scaling.
 * Used to fetch the intermediate rows that TRANSPOSE consumes.
 */
#define LOAD(in) \
    "ldd [" in "], %%f16 \n\t"\
    "ldd [" in "+8], %%f18 \n\t"\
    "ldd [" in "+16], %%f20 \n\t"\
    "ldd [" in "+24], %%f22 \n\t"\
    "ldd [" in "+32], %%f24 \n\t"\
    "ldd [" in "+40], %%f26 \n\t"\
    "ldd [" in "+48], %%f28 \n\t"\
    "ldd [" in "+56], %%f30 \n\t"
|
yading@10
|
114
|
yading@10
|
/*
 * Rearranges the sixteen 32-bit halves held in %f16..%f31 into %f0..%f15
 * using three rounds of fpmerge (byte interleave), with %f16..%f31 reused as
 * scratch for the middle round.
 * NOTE(review): the three interleave rounds appear to transpose the 16-bit
 * elements so that each of %f0, %f2, ..., %f14 ends up holding one element
 * position from four different input rows, which is the layout IDCT4ROWS
 * expects -- confirm against the fpmerge definition in the VIS manual.
 */
#define TRANSPOSE \
    "fpmerge %%f16, %%f24, %%f0 \n\t"\
    "fpmerge %%f20, %%f28, %%f2 \n\t"\
    "fpmerge %%f17, %%f25, %%f4 \n\t"\
    "fpmerge %%f21, %%f29, %%f6 \n\t"\
    "fpmerge %%f18, %%f26, %%f8 \n\t"\
    "fpmerge %%f22, %%f30, %%f10 \n\t"\
    "fpmerge %%f19, %%f27, %%f12 \n\t"\
    "fpmerge %%f23, %%f31, %%f14 \n\t"\
    \
    "fpmerge %%f0, %%f2, %%f16 \n\t"\
    "fpmerge %%f1, %%f3, %%f18 \n\t"\
    "fpmerge %%f4, %%f6, %%f20 \n\t"\
    "fpmerge %%f5, %%f7, %%f22 \n\t"\
    "fpmerge %%f8, %%f10, %%f24 \n\t"\
    "fpmerge %%f9, %%f11, %%f26 \n\t"\
    "fpmerge %%f12, %%f14, %%f28 \n\t"\
    "fpmerge %%f13, %%f15, %%f30 \n\t"\
    \
    "fpmerge %%f16, %%f17, %%f0 \n\t"\
    "fpmerge %%f18, %%f19, %%f2 \n\t"\
    "fpmerge %%f20, %%f21, %%f4 \n\t"\
    "fpmerge %%f22, %%f23, %%f6 \n\t"\
    "fpmerge %%f24, %%f25, %%f8 \n\t"\
    "fpmerge %%f26, %%f27, %%f10 \n\t"\
    "fpmerge %%f28, %%f29, %%f12 \n\t"\
    "fpmerge %%f30, %%f31, %%f14 \n\t"
|
yading@10
|
142
|
yading@10
|
143 #define IDCT4ROWS \
|
yading@10
|
144 /* 1. column */\
|
yading@10
|
145 "fmul8ulx16 %%f0, %%f38, %%f28 \n\t"\
|
yading@10
|
146 "for %%f4, %%f6, %%f60 \n\t"\
|
yading@10
|
147 "fmul8ulx16 %%f2, %%f32, %%f18 \n\t"\
|
yading@10
|
148 "fmul8ulx16 %%f2, %%f36, %%f22 \n\t"\
|
yading@10
|
149 "fmul8ulx16 %%f2, %%f40, %%f26 \n\t"\
|
yading@10
|
150 "fmul8ulx16 %%f2, %%f44, %%f30 \n\t"\
|
yading@10
|
151 \
|
yading@10
|
152 ADDROUNDER\
|
yading@10
|
153 \
|
yading@10
|
154 "fmul8sux16 %%f0, %%f38, %%f48 \n\t"\
|
yading@10
|
155 "fcmpd %%fcc0, %%f62, %%f60 \n\t"\
|
yading@10
|
156 "for %%f8, %%f10, %%f60 \n\t"\
|
yading@10
|
157 "fmul8sux16 %%f2, %%f32, %%f50 \n\t"\
|
yading@10
|
158 "fmul8sux16 %%f2, %%f36, %%f52 \n\t"\
|
yading@10
|
159 "fmul8sux16 %%f2, %%f40, %%f54 \n\t"\
|
yading@10
|
160 "fmul8sux16 %%f2, %%f44, %%f56 \n\t"\
|
yading@10
|
161 \
|
yading@10
|
162 "fpadd16 %%f48, %%f28, %%f28 \n\t"\
|
yading@10
|
163 "fcmpd %%fcc1, %%f62, %%f60 \n\t"\
|
yading@10
|
164 "for %%f12, %%f14, %%f60 \n\t"\
|
yading@10
|
165 "fpadd16 %%f50, %%f18, %%f18 \n\t"\
|
yading@10
|
166 "fpadd16 %%f52, %%f22, %%f22 \n\t"\
|
yading@10
|
167 "fpadd16 %%f54, %%f26, %%f26 \n\t"\
|
yading@10
|
168 "fpadd16 %%f56, %%f30, %%f30 \n\t"\
|
yading@10
|
169 \
|
yading@10
|
170 "fpadd16 %%f28, %%f0, %%f16 \n\t"\
|
yading@10
|
171 "fcmpd %%fcc2, %%f62, %%f60 \n\t"\
|
yading@10
|
172 "fpadd16 %%f28, %%f0, %%f20 \n\t"\
|
yading@10
|
173 "fpadd16 %%f28, %%f0, %%f24 \n\t"\
|
yading@10
|
174 "fpadd16 %%f28, %%f0, %%f28 \n\t"\
|
yading@10
|
175 "fpadd16 %%f18, %%f2, %%f18 \n\t"\
|
yading@10
|
176 "fpadd16 %%f22, %%f2, %%f22 \n\t"\
|
yading@10
|
177 /* 2. column */\
|
yading@10
|
178 "fbe %%fcc0, 3f \n\t"\
|
yading@10
|
179 "fpadd16 %%f26, %%f2, %%f26 \n\t"\
|
yading@10
|
180 "fmul8ulx16 %%f4, %%f34, %%f48 \n\t"\
|
yading@10
|
181 "fmul8ulx16 %%f4, %%f42, %%f50 \n\t"\
|
yading@10
|
182 "fmul8ulx16 %%f6, %%f36, %%f52 \n\t"\
|
yading@10
|
183 "fmul8ulx16 %%f6, %%f44, %%f54 \n\t"\
|
yading@10
|
184 "fmul8ulx16 %%f6, %%f32, %%f56 \n\t"\
|
yading@10
|
185 "fmul8ulx16 %%f6, %%f40, %%f58 \n\t"\
|
yading@10
|
186 \
|
yading@10
|
187 "fpadd16 %%f16, %%f48, %%f16 \n\t"\
|
yading@10
|
188 "fpadd16 %%f20, %%f50, %%f20 \n\t"\
|
yading@10
|
189 "fpsub16 %%f24, %%f50, %%f24 \n\t"\
|
yading@10
|
190 "fpsub16 %%f28, %%f48, %%f28 \n\t"\
|
yading@10
|
191 "fpadd16 %%f18, %%f52, %%f18 \n\t"\
|
yading@10
|
192 "fpsub16 %%f22, %%f54, %%f22 \n\t"\
|
yading@10
|
193 "fpsub16 %%f26, %%f56, %%f26 \n\t"\
|
yading@10
|
194 "fpsub16 %%f30, %%f58, %%f30 \n\t"\
|
yading@10
|
195 \
|
yading@10
|
196 "fmul8sux16 %%f4, %%f34, %%f48 \n\t"\
|
yading@10
|
197 "fmul8sux16 %%f4, %%f42, %%f50 \n\t"\
|
yading@10
|
198 "fmul8sux16 %%f6, %%f36, %%f52 \n\t"\
|
yading@10
|
199 "fmul8sux16 %%f6, %%f44, %%f54 \n\t"\
|
yading@10
|
200 "fmul8sux16 %%f6, %%f32, %%f56 \n\t"\
|
yading@10
|
201 "fmul8sux16 %%f6, %%f40, %%f58 \n\t"\
|
yading@10
|
202 \
|
yading@10
|
203 "fpadd16 %%f16, %%f48, %%f16 \n\t"\
|
yading@10
|
204 "fpadd16 %%f20, %%f50, %%f20 \n\t"\
|
yading@10
|
205 "fpsub16 %%f24, %%f50, %%f24 \n\t"\
|
yading@10
|
206 "fpsub16 %%f28, %%f48, %%f28 \n\t"\
|
yading@10
|
207 "fpadd16 %%f18, %%f52, %%f18 \n\t"\
|
yading@10
|
208 "fpsub16 %%f22, %%f54, %%f22 \n\t"\
|
yading@10
|
209 "fpsub16 %%f26, %%f56, %%f26 \n\t"\
|
yading@10
|
210 "fpsub16 %%f30, %%f58, %%f30 \n\t"\
|
yading@10
|
211 \
|
yading@10
|
212 "fpadd16 %%f16, %%f4, %%f16 \n\t"\
|
yading@10
|
213 "fpsub16 %%f28, %%f4, %%f28 \n\t"\
|
yading@10
|
214 "fpadd16 %%f18, %%f6, %%f18 \n\t"\
|
yading@10
|
215 "fpsub16 %%f26, %%f6, %%f26 \n\t"\
|
yading@10
|
216 /* 3. column */\
|
yading@10
|
217 "3: \n\t"\
|
yading@10
|
218 "fbe %%fcc1, 4f \n\t"\
|
yading@10
|
219 "fpsub16 %%f30, %%f6, %%f30 \n\t"\
|
yading@10
|
220 "fmul8ulx16 %%f8, %%f38, %%f48 \n\t"\
|
yading@10
|
221 "fmul8ulx16 %%f10, %%f40, %%f50 \n\t"\
|
yading@10
|
222 "fmul8ulx16 %%f10, %%f32, %%f52 \n\t"\
|
yading@10
|
223 "fmul8ulx16 %%f10, %%f44, %%f54 \n\t"\
|
yading@10
|
224 "fmul8ulx16 %%f10, %%f36, %%f56 \n\t"\
|
yading@10
|
225 \
|
yading@10
|
226 "fpadd16 %%f16, %%f48, %%f16 \n\t"\
|
yading@10
|
227 "fpsub16 %%f20, %%f48, %%f20 \n\t"\
|
yading@10
|
228 "fpsub16 %%f24, %%f48, %%f24 \n\t"\
|
yading@10
|
229 "fpadd16 %%f28, %%f48, %%f28 \n\t"\
|
yading@10
|
230 "fpadd16 %%f18, %%f50, %%f18 \n\t"\
|
yading@10
|
231 "fpsub16 %%f22, %%f52, %%f22 \n\t"\
|
yading@10
|
232 "fpadd16 %%f26, %%f54, %%f26 \n\t"\
|
yading@10
|
233 "fpadd16 %%f30, %%f56, %%f30 \n\t"\
|
yading@10
|
234 \
|
yading@10
|
235 "fmul8sux16 %%f8, %%f38, %%f48 \n\t"\
|
yading@10
|
236 "fmul8sux16 %%f10, %%f40, %%f50 \n\t"\
|
yading@10
|
237 "fmul8sux16 %%f10, %%f32, %%f52 \n\t"\
|
yading@10
|
238 "fmul8sux16 %%f10, %%f44, %%f54 \n\t"\
|
yading@10
|
239 "fmul8sux16 %%f10, %%f36, %%f56 \n\t"\
|
yading@10
|
240 \
|
yading@10
|
241 "fpadd16 %%f16, %%f48, %%f16 \n\t"\
|
yading@10
|
242 "fpsub16 %%f20, %%f48, %%f20 \n\t"\
|
yading@10
|
243 "fpsub16 %%f24, %%f48, %%f24 \n\t"\
|
yading@10
|
244 "fpadd16 %%f28, %%f48, %%f28 \n\t"\
|
yading@10
|
245 "fpadd16 %%f18, %%f50, %%f18 \n\t"\
|
yading@10
|
246 "fpsub16 %%f22, %%f52, %%f22 \n\t"\
|
yading@10
|
247 "fpadd16 %%f26, %%f54, %%f26 \n\t"\
|
yading@10
|
248 "fpadd16 %%f30, %%f56, %%f30 \n\t"\
|
yading@10
|
249 \
|
yading@10
|
250 "fpadd16 %%f16, %%f8, %%f16 \n\t"\
|
yading@10
|
251 "fpsub16 %%f20, %%f8, %%f20 \n\t"\
|
yading@10
|
252 "fpsub16 %%f24, %%f8, %%f24 \n\t"\
|
yading@10
|
253 "fpadd16 %%f28, %%f8, %%f28 \n\t"\
|
yading@10
|
254 "fpadd16 %%f18, %%f10, %%f18 \n\t"\
|
yading@10
|
255 "fpsub16 %%f22, %%f10, %%f22 \n\t"\
|
yading@10
|
256 /* 4. column */\
|
yading@10
|
257 "4: \n\t"\
|
yading@10
|
258 "fbe %%fcc2, 5f \n\t"\
|
yading@10
|
259 "fpadd16 %%f30, %%f10, %%f30 \n\t"\
|
yading@10
|
260 "fmul8ulx16 %%f12, %%f42, %%f48 \n\t"\
|
yading@10
|
261 "fmul8ulx16 %%f12, %%f34, %%f50 \n\t"\
|
yading@10
|
262 "fmul8ulx16 %%f14, %%f44, %%f52 \n\t"\
|
yading@10
|
263 "fmul8ulx16 %%f14, %%f40, %%f54 \n\t"\
|
yading@10
|
264 "fmul8ulx16 %%f14, %%f36, %%f56 \n\t"\
|
yading@10
|
265 "fmul8ulx16 %%f14, %%f32, %%f58 \n\t"\
|
yading@10
|
266 \
|
yading@10
|
267 "fpadd16 %%f16, %%f48, %%f16 \n\t"\
|
yading@10
|
268 "fpsub16 %%f20, %%f50, %%f20 \n\t"\
|
yading@10
|
269 "fpadd16 %%f24, %%f50, %%f24 \n\t"\
|
yading@10
|
270 "fpsub16 %%f28, %%f48, %%f28 \n\t"\
|
yading@10
|
271 "fpadd16 %%f18, %%f52, %%f18 \n\t"\
|
yading@10
|
272 "fpsub16 %%f22, %%f54, %%f22 \n\t"\
|
yading@10
|
273 "fpadd16 %%f26, %%f56, %%f26 \n\t"\
|
yading@10
|
274 "fpsub16 %%f30, %%f58, %%f30 \n\t"\
|
yading@10
|
275 \
|
yading@10
|
276 "fmul8sux16 %%f12, %%f42, %%f48 \n\t"\
|
yading@10
|
277 "fmul8sux16 %%f12, %%f34, %%f50 \n\t"\
|
yading@10
|
278 "fmul8sux16 %%f14, %%f44, %%f52 \n\t"\
|
yading@10
|
279 "fmul8sux16 %%f14, %%f40, %%f54 \n\t"\
|
yading@10
|
280 "fmul8sux16 %%f14, %%f36, %%f56 \n\t"\
|
yading@10
|
281 "fmul8sux16 %%f14, %%f32, %%f58 \n\t"\
|
yading@10
|
282 \
|
yading@10
|
283 "fpadd16 %%f16, %%f48, %%f16 \n\t"\
|
yading@10
|
284 "fpsub16 %%f20, %%f50, %%f20 \n\t"\
|
yading@10
|
285 "fpadd16 %%f24, %%f50, %%f24 \n\t"\
|
yading@10
|
286 "fpsub16 %%f28, %%f48, %%f28 \n\t"\
|
yading@10
|
287 "fpadd16 %%f18, %%f52, %%f18 \n\t"\
|
yading@10
|
288 "fpsub16 %%f22, %%f54, %%f22 \n\t"\
|
yading@10
|
289 "fpadd16 %%f26, %%f56, %%f26 \n\t"\
|
yading@10
|
290 "fpsub16 %%f30, %%f58, %%f30 \n\t"\
|
yading@10
|
291 \
|
yading@10
|
292 "fpsub16 %%f20, %%f12, %%f20 \n\t"\
|
yading@10
|
293 "fpadd16 %%f24, %%f12, %%f24 \n\t"\
|
yading@10
|
294 "fpsub16 %%f22, %%f14, %%f22 \n\t"\
|
yading@10
|
295 "fpadd16 %%f26, %%f14, %%f26 \n\t"\
|
yading@10
|
296 "fpsub16 %%f30, %%f14, %%f30 \n\t"\
|
yading@10
|
297 /* final butterfly */\
|
yading@10
|
298 "5: \n\t"\
|
yading@10
|
299 "fpsub16 %%f16, %%f18, %%f48 \n\t"\
|
yading@10
|
300 "fpsub16 %%f20, %%f22, %%f50 \n\t"\
|
yading@10
|
301 "fpsub16 %%f24, %%f26, %%f52 \n\t"\
|
yading@10
|
302 "fpsub16 %%f28, %%f30, %%f54 \n\t"\
|
yading@10
|
303 "fpadd16 %%f16, %%f18, %%f16 \n\t"\
|
yading@10
|
304 "fpadd16 %%f20, %%f22, %%f20 \n\t"\
|
yading@10
|
305 "fpadd16 %%f24, %%f26, %%f24 \n\t"\
|
yading@10
|
306 "fpadd16 %%f28, %%f30, %%f28 \n\t"\
|
yading@10
|
307
|
yading@10
|
308 #define STOREROWS(out) \
|
yading@10
|
309 "std %%f48, [" out "+112] \n\t"\
|
yading@10
|
310 "std %%f50, [" out "+96] \n\t"\
|
yading@10
|
311 "std %%f52, [" out "+80] \n\t"\
|
yading@10
|
312 "std %%f54, [" out "+64] \n\t"\
|
yading@10
|
313 "std %%f16, [" out "] \n\t"\
|
yading@10
|
314 "std %%f20, [" out "+16] \n\t"\
|
yading@10
|
315 "std %%f24, [" out "+32] \n\t"\
|
yading@10
|
316 "std %%f28, [" out "+48] \n\t"\
|
yading@10
|
317
|
yading@10
|
/*
 * Scales all eight IDCT4ROWS result vectors in place by multiplying each with
 * %f46 via fmul8sux16.  Only ff_simple_idct_vis expands this, with %f46
 * holding the `scale` constant.
 * NOTE(review): with 0x0400 in every lane of %f46 this is presumably a
 * rounded ">> 6" per element -- confirm against the VIS manual.
 */
#define SCALEROWS \
    "fmul8sux16 %%f46, %%f48, %%f48 \n\t"\
    "fmul8sux16 %%f46, %%f50, %%f50 \n\t"\
    "fmul8sux16 %%f46, %%f52, %%f52 \n\t"\
    "fmul8sux16 %%f46, %%f54, %%f54 \n\t"\
    "fmul8sux16 %%f46, %%f16, %%f16 \n\t"\
    "fmul8sux16 %%f46, %%f20, %%f20 \n\t"\
    "fmul8sux16 %%f46, %%f24, %%f24 \n\t"\
    "fmul8sux16 %%f46, %%f28, %%f28 \n\t"
|
yading@10
|
327
|
yading@10
|
/*
 * Packs the eight IDCT4ROWS result vectors to bytes with fpack16 and stores
 * four bytes per row at byte offset `dest` (a string, "0" or "4") from the
 * eight row pointers held in asm operands %3, %5, %6, ..., %11.
 * Row order: %f16, %f20, %f24, %f28, %f54, %f52, %f50, %f48.
 * Clobbers %f0, %f2, ..., %f14.
 * NOTE(review): fpack16 uses the scale currently in %gsr and is what
 * presumably provides the clamping to 0..255 -- confirm against the VIS
 * manual.
 */
#define PUTPIXELSCLAMPED(dest) \
    "fpack16 %%f48, %%f14 \n\t"\
    "fpack16 %%f50, %%f12 \n\t"\
    "fpack16 %%f16, %%f0 \n\t"\
    "fpack16 %%f20, %%f2 \n\t"\
    "fpack16 %%f24, %%f4 \n\t"\
    "fpack16 %%f28, %%f6 \n\t"\
    "fpack16 %%f54, %%f8 \n\t"\
    "fpack16 %%f52, %%f10 \n\t"\
    "st %%f0, [%3+" dest "] \n\t"\
    "st %%f2, [%5+" dest "] \n\t"\
    "st %%f4, [%6+" dest "] \n\t"\
    "st %%f6, [%7+" dest "] \n\t"\
    "st %%f8, [%8+" dest "] \n\t"\
    "st %%f10, [%9+" dest "] \n\t"\
    "st %%f12, [%10+" dest "] \n\t"\
    "st %%f14, [%11+" dest "] \n\t"
|
yading@10
|
345
|
yading@10
|
/*
 * Adds the eight IDCT4ROWS result vectors to the pixels already stored at
 * byte offset `dest` (a string, "0" or "4") from the eight row pointers held
 * in asm operands %3, %6, %7, ..., %12, and writes the packed result back.
 *
 * Steps: load the 8-byte constant at [%5] into %f18, load four pixels per
 * row, multiply them by %f18 with fmul8x16, add the IDCT vectors (row order
 * %f16, %f20, %f24, %f28, %f54, %f52, %f50, %f48), fpack16 back to bytes and
 * store.  Clobbers %f0, %f2, ..., %f14 and %f18.
 * NOTE(review): fpack16 uses the scale currently in %gsr and is what
 * presumably provides the clamping to 0..255 -- confirm against the VIS
 * manual.
 */
#define ADDPIXELSCLAMPED(dest) \
    "ldd [%5], %%f18 \n\t"\
    "ld [%3+" dest"], %%f0 \n\t"\
    "ld [%6+" dest"], %%f2 \n\t"\
    "ld [%7+" dest"], %%f4 \n\t"\
    "ld [%8+" dest"], %%f6 \n\t"\
    "ld [%9+" dest"], %%f8 \n\t"\
    "ld [%10+" dest"], %%f10 \n\t"\
    "ld [%11+" dest"], %%f12 \n\t"\
    "ld [%12+" dest"], %%f14 \n\t"\
    "fmul8x16 %%f0, %%f18, %%f0 \n\t"\
    "fmul8x16 %%f2, %%f18, %%f2 \n\t"\
    "fmul8x16 %%f4, %%f18, %%f4 \n\t"\
    "fmul8x16 %%f6, %%f18, %%f6 \n\t"\
    "fmul8x16 %%f8, %%f18, %%f8 \n\t"\
    "fmul8x16 %%f10, %%f18, %%f10 \n\t"\
    "fmul8x16 %%f12, %%f18, %%f12 \n\t"\
    "fmul8x16 %%f14, %%f18, %%f14 \n\t"\
    "fpadd16 %%f0, %%f16, %%f0 \n\t"\
    "fpadd16 %%f2, %%f20, %%f2 \n\t"\
    "fpadd16 %%f4, %%f24, %%f4 \n\t"\
    "fpadd16 %%f6, %%f28, %%f6 \n\t"\
    "fpadd16 %%f8, %%f54, %%f8 \n\t"\
    "fpadd16 %%f10, %%f52, %%f10 \n\t"\
    "fpadd16 %%f12, %%f50, %%f12 \n\t"\
    "fpadd16 %%f14, %%f48, %%f14 \n\t"\
    "fpack16 %%f0, %%f0 \n\t"\
    "fpack16 %%f2, %%f2 \n\t"\
    "fpack16 %%f4, %%f4 \n\t"\
    "fpack16 %%f6, %%f6 \n\t"\
    "fpack16 %%f8, %%f8 \n\t"\
    "fpack16 %%f10, %%f10 \n\t"\
    "fpack16 %%f12, %%f12 \n\t"\
    "fpack16 %%f14, %%f14 \n\t"\
    "st %%f0, [%3+" dest "] \n\t"\
    "st %%f2, [%6+" dest "] \n\t"\
    "st %%f4, [%7+" dest "] \n\t"\
    "st %%f6, [%8+" dest "] \n\t"\
    "st %%f8, [%9+" dest "] \n\t"\
    "st %%f10, [%10+" dest "] \n\t"\
    "st %%f12, [%11+" dest "] \n\t"\
    "st %%f14, [%12+" dest "] \n\t"
|
yading@10
|
388
|
yading@10
|
389
|
yading@10
|
390 void ff_simple_idct_vis(int16_t *data) {
|
yading@10
|
391 int out1, out2, out3, out4;
|
yading@10
|
392 DECLARE_ALIGNED(8, int16_t, temp)[8*8];
|
yading@10
|
393
|
yading@10
|
394 __asm__ volatile(
|
yading@10
|
395 INIT_IDCT
|
yading@10
|
396
|
yading@10
|
397 #define ADDROUNDER
|
yading@10
|
398
|
yading@10
|
399 // shift right 16-4=12
|
yading@10
|
400 LOADSCALE("%2+8")
|
yading@10
|
401 IDCT4ROWS
|
yading@10
|
402 STOREROWS("%3+8")
|
yading@10
|
403 LOADSCALE("%2+0")
|
yading@10
|
404 IDCT4ROWS
|
yading@10
|
405 "std %%f48, [%3+112] \n\t"
|
yading@10
|
406 "std %%f50, [%3+96] \n\t"
|
yading@10
|
407 "std %%f52, [%3+80] \n\t"
|
yading@10
|
408 "std %%f54, [%3+64] \n\t"
|
yading@10
|
409
|
yading@10
|
410 // shift right 16+4
|
yading@10
|
411 "ldd [%3+8], %%f18 \n\t"
|
yading@10
|
412 "ldd [%3+24], %%f22 \n\t"
|
yading@10
|
413 "ldd [%3+40], %%f26 \n\t"
|
yading@10
|
414 "ldd [%3+56], %%f30 \n\t"
|
yading@10
|
415 TRANSPOSE
|
yading@10
|
416 IDCT4ROWS
|
yading@10
|
417 SCALEROWS
|
yading@10
|
418 STOREROWS("%2+0")
|
yading@10
|
419 LOAD("%3+64")
|
yading@10
|
420 TRANSPOSE
|
yading@10
|
421 IDCT4ROWS
|
yading@10
|
422 SCALEROWS
|
yading@10
|
423 STOREROWS("%2+8")
|
yading@10
|
424
|
yading@10
|
425 : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4)
|
yading@10
|
426 : "0" (scale), "1" (coeffs), "2" (data), "3" (temp)
|
yading@10
|
427 );
|
yading@10
|
428 }
|
yading@10
|
429
|
yading@10
|
430 void ff_simple_idct_put_vis(uint8_t *dest, int line_size, int16_t *data) {
|
yading@10
|
431 int out1, out2, out3, out4, out5;
|
yading@10
|
432 int r1, r2, r3, r4, r5, r6, r7;
|
yading@10
|
433
|
yading@10
|
434 __asm__ volatile(
|
yading@10
|
435 "wr %%g0, 0x8, %%gsr \n\t"
|
yading@10
|
436
|
yading@10
|
437 INIT_IDCT
|
yading@10
|
438
|
yading@10
|
439 "add %3, %4, %5 \n\t"
|
yading@10
|
440 "add %5, %4, %6 \n\t"
|
yading@10
|
441 "add %6, %4, %7 \n\t"
|
yading@10
|
442 "add %7, %4, %8 \n\t"
|
yading@10
|
443 "add %8, %4, %9 \n\t"
|
yading@10
|
444 "add %9, %4, %10 \n\t"
|
yading@10
|
445 "add %10, %4, %11 \n\t"
|
yading@10
|
446
|
yading@10
|
447 // shift right 16-4=12
|
yading@10
|
448 LOADSCALE("%2+8")
|
yading@10
|
449 IDCT4ROWS
|
yading@10
|
450 STOREROWS("%2+8")
|
yading@10
|
451 LOADSCALE("%2+0")
|
yading@10
|
452 IDCT4ROWS
|
yading@10
|
453 "std %%f48, [%2+112] \n\t"
|
yading@10
|
454 "std %%f50, [%2+96] \n\t"
|
yading@10
|
455 "std %%f52, [%2+80] \n\t"
|
yading@10
|
456 "std %%f54, [%2+64] \n\t"
|
yading@10
|
457
|
yading@10
|
458 #undef ADDROUNDER
|
yading@10
|
459 #define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t"
|
yading@10
|
460
|
yading@10
|
461 // shift right 16+4
|
yading@10
|
462 "ldd [%2+8], %%f18 \n\t"
|
yading@10
|
463 "ldd [%2+24], %%f22 \n\t"
|
yading@10
|
464 "ldd [%2+40], %%f26 \n\t"
|
yading@10
|
465 "ldd [%2+56], %%f30 \n\t"
|
yading@10
|
466 TRANSPOSE
|
yading@10
|
467 IDCT4ROWS
|
yading@10
|
468 PUTPIXELSCLAMPED("0")
|
yading@10
|
469 LOAD("%2+64")
|
yading@10
|
470 TRANSPOSE
|
yading@10
|
471 IDCT4ROWS
|
yading@10
|
472 PUTPIXELSCLAMPED("4")
|
yading@10
|
473
|
yading@10
|
474 : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5),
|
yading@10
|
475 "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7)
|
yading@10
|
476 : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size)
|
yading@10
|
477 );
|
yading@10
|
478 }
|
yading@10
|
479
|
yading@10
|
480 void ff_simple_idct_add_vis(uint8_t *dest, int line_size, int16_t *data) {
|
yading@10
|
481 int out1, out2, out3, out4, out5, out6;
|
yading@10
|
482 int r1, r2, r3, r4, r5, r6, r7;
|
yading@10
|
483
|
yading@10
|
484 __asm__ volatile(
|
yading@10
|
485 "wr %%g0, 0x8, %%gsr \n\t"
|
yading@10
|
486
|
yading@10
|
487 INIT_IDCT
|
yading@10
|
488
|
yading@10
|
489 "add %3, %4, %6 \n\t"
|
yading@10
|
490 "add %6, %4, %7 \n\t"
|
yading@10
|
491 "add %7, %4, %8 \n\t"
|
yading@10
|
492 "add %8, %4, %9 \n\t"
|
yading@10
|
493 "add %9, %4, %10 \n\t"
|
yading@10
|
494 "add %10, %4, %11 \n\t"
|
yading@10
|
495 "add %11, %4, %12 \n\t"
|
yading@10
|
496
|
yading@10
|
497 #undef ADDROUNDER
|
yading@10
|
498 #define ADDROUNDER
|
yading@10
|
499
|
yading@10
|
500 // shift right 16-4=12
|
yading@10
|
501 LOADSCALE("%2+8")
|
yading@10
|
502 IDCT4ROWS
|
yading@10
|
503 STOREROWS("%2+8")
|
yading@10
|
504 LOADSCALE("%2+0")
|
yading@10
|
505 IDCT4ROWS
|
yading@10
|
506 "std %%f48, [%2+112] \n\t"
|
yading@10
|
507 "std %%f50, [%2+96] \n\t"
|
yading@10
|
508 "std %%f52, [%2+80] \n\t"
|
yading@10
|
509 "std %%f54, [%2+64] \n\t"
|
yading@10
|
510
|
yading@10
|
511 #undef ADDROUNDER
|
yading@10
|
512 #define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t"
|
yading@10
|
513
|
yading@10
|
514 // shift right 16+4
|
yading@10
|
515 "ldd [%2+8], %%f18 \n\t"
|
yading@10
|
516 "ldd [%2+24], %%f22 \n\t"
|
yading@10
|
517 "ldd [%2+40], %%f26 \n\t"
|
yading@10
|
518 "ldd [%2+56], %%f30 \n\t"
|
yading@10
|
519 TRANSPOSE
|
yading@10
|
520 IDCT4ROWS
|
yading@10
|
521 ADDPIXELSCLAMPED("0")
|
yading@10
|
522 LOAD("%2+64")
|
yading@10
|
523 TRANSPOSE
|
yading@10
|
524 IDCT4ROWS
|
yading@10
|
525 ADDPIXELSCLAMPED("4")
|
yading@10
|
526
|
yading@10
|
527 : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6),
|
yading@10
|
528 "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7)
|
yading@10
|
529 : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size), "5" (expand)
|
yading@10
|
530 );
|
yading@10
|
531 }
|