/*
 * Copyright (C) 2003 David S. Miller <davem@redhat.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* The *no_round* functions have been added by James A. Morrison, 2003,2004.
   The vis code from libmpeg2 was adapted for libavcodec by James A. Morrison.
 */

#include <stddef.h>
#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavcodec/hpeldsp.h"
#include "vis.h"

/* The trick used in some of this file is the formula from the MMX
 * motion comp code, which is:
 *
 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
 *
 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
 * We avoid overflows by masking before we do the shift, and we
 * implement the shift by multiplying by 1/2 using mul8x16.  So in
 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
 * the value 0x80808080 is in f8):
 *
 * fxor f0, f2, f10
 * fand f10, f4, f10
 * fmul8x16 f8, f10, f10
 * fand f10, f6, f10
 * for f0, f2, f12
 * fpsub16 f12, f10, f10
 */
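
/* For illustration only (the code below never calls this): the same
 * identity written as portable C on one 64-bit word holding eight
 * packed bytes.  The helper name is hypothetical; the VIS sequence
 * above computes the same result with the masks held in FPU registers.
 *
 *     static inline uint64_t avg8_rnd(uint64_t x, uint64_t y)
 *     {
 *         // Mask with 0xfe before shifting so a byte's low bit cannot
 *         // leak into the byte to its right.
 *         uint64_t half = ((x ^ y) & 0xfefefefefefefefeULL) >> 1;
 *         return (x | y) - half;    // per-byte (x + y + 1) >> 1
 *     }
 */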

#define DUP4(x) {x, x, x, x}
#define DUP8(x) {x, x, x, x, x, x, x, x}
DECLARE_ALIGNED(8, static const int16_t, constants1)[] = DUP4 (1);
DECLARE_ALIGNED(8, static const int16_t, constants2)[] = DUP4 (2);
DECLARE_ALIGNED(8, static const int16_t, constants3)[] = DUP4 (3);
DECLARE_ALIGNED(8, static const int16_t, constants6)[] = DUP4 (6);
DECLARE_ALIGNED(8, static const int8_t, constants_fe)[] = DUP8 (0xfe);
DECLARE_ALIGNED(8, static const int8_t, constants_7f)[] = DUP8 (0x7f);
DECLARE_ALIGNED(8, static const int8_t, constants128)[] = DUP8 (128);
DECLARE_ALIGNED(8, static const int16_t, constants256_512)[] =
        {256, 512, 256, 512};
DECLARE_ALIGNED(8, static const int16_t, constants256_1024)[] =
        {256, 1024, 256, 1024};

#define REF_0           0
#define REF_0_1         1
#define REF_2           2
#define REF_2_1         3
#define REF_4           4
#define REF_4_1         5
#define REF_6           6
#define REF_6_1         7
#define REF_S0          8
#define REF_S0_1        9
#define REF_S2         10
#define REF_S2_1       11
#define REF_S4         12
#define REF_S4_1       13
#define REF_S6         14
#define REF_S6_1       15
#define DST_0          16
#define DST_1          17
#define DST_2          18
#define DST_3          19
#define CONST_1        20
#define CONST_2        20
#define CONST_3        20
#define CONST_6        20
#define MASK_fe        20
#define CONST_128      22
#define CONST_256      22
#define CONST_512      22
#define CONST_1024     22
#define TMP0           24
#define TMP1           25
#define TMP2           26
#define TMP3           27
#define TMP4           28
#define TMP5           29
#define ZERO           30
#define MASK_7f        30

#define TMP6           32
#define TMP8           34
#define TMP10          36
#define TMP12          38
#define TMP14          40
#define TMP16          42
#define TMP18          44
#define TMP20          46
#define TMP22          48
#define TMP24          50
#define TMP26          52
#define TMP28          54
#define TMP30          56
#define TMP32          58

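/* Half-pel "put" with no interpolation ("o"), 16 pixels wide: copy an
 * arbitrarily aligned 16-byte-wide block, realigning each row with
 * faligndata and issuing two aligned 8-byte stores per row. */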
static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * ref,
                             const ptrdiff_t stride, int height)
{
    ref = vis_alignaddr(ref);
    do {    /* 5 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);

        vis_faligndata(TMP2, TMP4, REF_2);
        vis_st64_2(REF_2, dest, 8);
        dest += stride;
    } while (--height);
}

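/* As above for 8-pixel-wide blocks: one aligned 8-byte store per row. */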
static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * ref,
                            const ptrdiff_t stride, int height)
{
    ref = vis_alignaddr(ref);
    do {    /* 4 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);
        ref += stride;

        /* stall */

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);
        dest += stride;
    } while (--height);
}

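/* "avg" with no interpolation, 16 pixels wide: rounded byte-wise
 * average of the realigned reference with the bytes already in dest,
 * using the (x|y) - (((x^y) & 0xfe) >> 1) trick described above.  The
 * loop is software-pipelined two rows deep; the copy of the loop body
 * after it drains the last two rows. */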
static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * ref,
                             const ptrdiff_t stride, int height)
{
    int stride_8 = stride + 8;

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(dest[0], DST_0);

    vis_ld64(dest[8], DST_2);

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP2, TMP4, REF_2);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 24 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP6, MASK_fe, TMP6);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_xor(DST_2, REF_2, TMP8);

        vis_and(TMP8, MASK_fe, TMP8);

        vis_or(DST_0, REF_0, TMP10);
        vis_ld64_2(dest, stride, DST_0);
        vis_mul8x16(CONST_128, TMP8, TMP8);

        vis_or(DST_2, REF_2, TMP12);
        vis_ld64_2(dest, stride_8, DST_2);

        vis_ld64(ref[0], TMP14);
        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);

        dest += stride;
        vis_ld64_2(ref, 8, TMP16);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 16, TMP18);
        vis_faligndata(TMP2, TMP4, REF_2);
        ref += stride;

        vis_xor(DST_0, REF_0, TMP20);

        vis_and(TMP20, MASK_fe, TMP20);

        vis_xor(DST_2, REF_2, TMP22);
        vis_mul8x16(CONST_128, TMP20, TMP20);

        vis_and(TMP22, MASK_fe, TMP22);

        vis_or(DST_0, REF_0, TMP24);
        vis_mul8x16(CONST_128, TMP22, TMP22);

        vis_or(DST_2, REF_2, TMP26);

        vis_ld64_2(dest, stride, DST_0);
        vis_faligndata(TMP14, TMP16, REF_0);

        vis_ld64_2(dest, stride_8, DST_2);
        vis_faligndata(TMP16, TMP18, REF_2);

        vis_and(TMP20, MASK_7f, TMP20);

        vis_and(TMP22, MASK_7f, TMP22);

        vis_psub16(TMP24, TMP20, TMP20);
        vis_st64(TMP20, dest[0]);

        vis_psub16(TMP26, TMP22, TMP22);
        vis_st64_2(TMP22, dest, 8);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_ld64_2(ref, 16, TMP4);
    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_xor(DST_2, REF_2, TMP8);

    vis_and(TMP8, MASK_fe, TMP8);

    vis_or(DST_0, REF_0, TMP10);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP8, TMP8);

    vis_or(DST_2, REF_2, TMP12);
    vis_ld64_2(dest, stride_8, DST_2);

    vis_ld64(ref[0], TMP14);
    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_psub16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_psub16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);

    dest += stride;
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_2);

    vis_xor(DST_0, REF_0, TMP20);

    vis_and(TMP20, MASK_fe, TMP20);

    vis_xor(DST_2, REF_2, TMP22);
    vis_mul8x16(CONST_128, TMP20, TMP20);

    vis_and(TMP22, MASK_fe, TMP22);

    vis_or(DST_0, REF_0, TMP24);
    vis_mul8x16(CONST_128, TMP22, TMP22);

    vis_or(DST_2, REF_2, TMP26);

    vis_and(TMP20, MASK_7f, TMP20);

    vis_and(TMP22, MASK_7f, TMP22);

    vis_psub16(TMP24, TMP20, TMP20);
    vis_st64(TMP20, dest[0]);

    vis_psub16(TMP26, TMP22, TMP22);
    vis_st64_2(TMP22, dest, 8);
}

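/* 8-pixel-wide version of the averaging copy above: same masking
 * trick, two rows per loop iteration. */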
static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * ref,
                            const ptrdiff_t stride, int height)
{
    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(dest[0], DST_0);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 12 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP4);

        vis_ld64(ref[8], TMP2);
        vis_and(TMP4, MASK_fe, TMP4);

        vis_or(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);
        ref += stride;
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_ld64(ref[0], TMP12);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(ref[8], TMP2);
        vis_xor(DST_0, REF_0, TMP0);
        ref += stride;

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_psub16(TMP6, TMP4, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_or(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);

        vis_faligndata(TMP12, TMP2, REF_0);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_psub16(TMP6, TMP0, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP4);

    vis_ld64(ref[8], TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_or(DST_0, REF_0, TMP6);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(DST_0, REF_0, TMP0);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_psub16(TMP6, TMP4, TMP4);
    vis_st64(TMP4, dest[0]);
    dest += stride;
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_or(DST_0, REF_0, TMP6);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_psub16(TMP6, TMP0, TMP4);
    vis_st64(TMP4, dest[0]);
}

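/* "put" with horizontal half-pel interpolation ("x"), 16 pixels wide:
 * rounded average of each byte with its right-hand neighbour.
 * REF_2/REF_6 hold the source shifted by one byte; when that shift
 * would land exactly on the next 8-byte word (off == 0x7), faligndata
 * cannot express it and vis_src1() supplies the word directly. */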
static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * ref,
                             const ptrdiff_t stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64_2(ref, 16, TMP4);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 34 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_xor(REF_4, REF_6, TMP8);

        vis_ld64_2(ref, 16, TMP4);
        vis_and(TMP6, MASK_fe, TMP6);
        ref += stride;

        vis_ld64(ref[0], TMP14);
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_ld64_2(ref, 8, TMP16);
        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_or(REF_0, REF_2, TMP10);

        vis_ld64_2(ref, 16, TMP18);
        ref += stride;
        vis_or(REF_4, REF_6, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;

        vis_xor(REF_0, REF_2, TMP6);

        vis_xor(REF_4, REF_6, TMP8);

        vis_and(TMP6, MASK_fe, TMP6);

        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_or(REF_0, REF_2, TMP10);

        vis_or(REF_4, REF_6, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP14, TMP16, REF_0);

        vis_faligndata(TMP16, TMP18, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP14, TMP16, REF_2);
            vis_faligndata(TMP16, TMP18, REF_6);
        } else {
            vis_src1(TMP16, REF_2);
            vis_src1(TMP18, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_xor(REF_4, REF_6, TMP8);

    vis_ld64_2(ref, 16, TMP4);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_or(REF_0, REF_2, TMP10);

    vis_or(REF_4, REF_6, TMP12);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_psub16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_psub16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
}

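/* 8-pixel-wide version of the horizontal half-pel put: same
 * neighbour-averaging scheme, two rows per iteration. */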
static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * ref,
                            const ptrdiff_t stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
    } else {
        vis_src1(TMP2, REF_2);
    }

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 20 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP4);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP4, MASK_fe, TMP4);
        ref += stride;

        vis_ld64(ref[0], TMP8);
        vis_or(REF_0, REF_2, TMP6);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, 8, TMP10);
        ref += stride;
        vis_faligndata(TMP0, TMP2, REF_0);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
        } else {
            vis_src1(TMP2, REF_2);
        }

        vis_and(TMP4, MASK_7f, TMP4);

        vis_psub16(TMP6, TMP4, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        vis_xor(REF_0, REF_2, TMP12);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_or(REF_0, REF_2, TMP14);
        vis_mul8x16(CONST_128, TMP12, TMP12);

        vis_alignaddr_g0((void *)off);
        vis_faligndata(TMP8, TMP10, REF_0);
        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP8, TMP10, REF_2);
        } else {
            vis_src1(TMP10, REF_2);
        }

        vis_and(TMP12, MASK_7f, TMP12);

        vis_psub16(TMP14, TMP12, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP4);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_or(REF_0, REF_2, TMP6);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
    } else {
        vis_src1(TMP2, REF_2);
    }

    vis_and(TMP4, MASK_7f, TMP4);

    vis_psub16(TMP6, TMP4, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP12);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_or(REF_0, REF_2, TMP14);
    vis_mul8x16(CONST_128, TMP12, TMP12);

    vis_and(TMP12, MASK_7f, TMP12);

    vis_psub16(TMP14, TMP12, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;
}

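/* "avg" with horizontal half-pel interpolation, 16 pixels wide.  The
 * bytes are widened to 16 bits (mul8x16au with 256), the two
 * horizontal neighbours and the rounding constant are summed, dest is
 * folded in doubled (mul8x16al with 512), and fpack16 with a GSR scale
 * factor of 5 packs what works out to (l + r + 2*dst + 3) >> 2 back
 * down to bytes. */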
static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * ref,
                             const ptrdiff_t stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    vis_ld64(constants3[0], CONST_3);
    vis_fzero(ZERO);
    vis_ld64(constants256_512[0], CONST_256);

    ref = vis_alignaddr(ref);
    do {    /* 26 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);

        vis_alignaddr_g0((void *)off);

        vis_ld64(ref[16], TMP4);

        vis_ld64(dest[0], DST_0);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(dest[8], DST_2);
        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
        }

        vis_mul8x16au(REF_0, CONST_256, TMP0);

        vis_pmerge(ZERO, REF_2, TMP4);
        vis_mul8x16au(REF_0_1, CONST_256, TMP2);

        vis_pmerge(ZERO, REF_2_1, TMP6);

        vis_padd16(TMP0, TMP4, TMP0);

        vis_mul8x16al(DST_0, CONST_512, TMP4);
        vis_padd16(TMP2, TMP6, TMP2);

        vis_mul8x16al(DST_1, CONST_512, TMP6);

        vis_mul8x16au(REF_6, CONST_256, TMP12);

        vis_padd16(TMP0, TMP4, TMP0);
        vis_mul8x16au(REF_6_1, CONST_256, TMP14);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_4, CONST_256, TMP16);

        vis_padd16(TMP0, CONST_3, TMP8);
        vis_mul8x16au(REF_4_1, CONST_256, TMP18);

        vis_padd16(TMP2, CONST_3, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_padd16(TMP16, TMP12, TMP0);

        vis_st64(DST_0, dest[0]);
        vis_mul8x16al(DST_2, CONST_512, TMP4);
        vis_padd16(TMP18, TMP14, TMP2);

        vis_mul8x16al(DST_3, CONST_512, TMP6);
        vis_padd16(TMP0, CONST_3, TMP0);

        vis_padd16(TMP2, CONST_3, TMP2);

        vis_padd16(TMP0, TMP4, TMP0);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_pack16(TMP0, DST_2);

        vis_pack16(TMP2, DST_3);
        vis_st64(DST_2, dest[8]);

        ref += stride;
        dest += stride;
    } while (--height);
}

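/* 8-pixel-wide version of the horizontal half-pel average, processing
 * four rows per iteration (hence height >>= 2). */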
static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * ref,
                            const ptrdiff_t stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_times_2 = stride << 1;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    vis_ld64(constants3[0], CONST_3);
    vis_fzero(ZERO);
    vis_ld64(constants256_512[0], CONST_256);

    ref = vis_alignaddr(ref);
    height >>= 2;
    do {    /* 47 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);
        ref += stride;

        vis_alignaddr_g0((void *)off);

        vis_ld64(ref[0], TMP4);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 8, TMP6);
        ref += stride;

        vis_ld64(ref[0], TMP8);

        vis_ld64_2(ref, 8, TMP10);
        ref += stride;
        vis_faligndata(TMP4, TMP6, REF_4);

        vis_ld64(ref[0], TMP12);

        vis_ld64_2(ref, 8, TMP14);
        ref += stride;
        vis_faligndata(TMP8, TMP10, REF_S0);

        vis_faligndata(TMP12, TMP14, REF_S4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);

            vis_ld64(dest[0], DST_0);
            vis_faligndata(TMP0, TMP2, REF_2);

            vis_ld64_2(dest, stride, DST_2);
            vis_faligndata(TMP4, TMP6, REF_6);

            vis_faligndata(TMP8, TMP10, REF_S2);

            vis_faligndata(TMP12, TMP14, REF_S6);
        } else {
            vis_ld64(dest[0], DST_0);
            vis_src1(TMP2, REF_2);

            vis_ld64_2(dest, stride, DST_2);
            vis_src1(TMP6, REF_6);

            vis_src1(TMP10, REF_S2);

            vis_src1(TMP14, REF_S6);
        }

        vis_pmerge(ZERO, REF_0, TMP0);
        vis_mul8x16au(REF_0_1, CONST_256, TMP2);

        vis_pmerge(ZERO, REF_2, TMP4);
        vis_mul8x16au(REF_2_1, CONST_256, TMP6);

        vis_padd16(TMP0, CONST_3, TMP0);
        vis_mul8x16al(DST_0, CONST_512, TMP16);

        vis_padd16(TMP2, CONST_3, TMP2);
        vis_mul8x16al(DST_1, CONST_512, TMP18);

        vis_padd16(TMP0, TMP4, TMP0);
        vis_mul8x16au(REF_4, CONST_256, TMP8);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_4_1, CONST_256, TMP10);

        vis_padd16(TMP0, TMP16, TMP0);
        vis_mul8x16au(REF_6, CONST_256, TMP12);

        vis_padd16(TMP2, TMP18, TMP2);
        vis_mul8x16au(REF_6_1, CONST_256, TMP14);

        vis_padd16(TMP8, CONST_3, TMP8);
        vis_mul8x16al(DST_2, CONST_512, TMP16);

        vis_padd16(TMP8, TMP12, TMP8);
        vis_mul8x16al(DST_3, CONST_512, TMP18);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_pack16(TMP0, DST_0);

        vis_pack16(TMP2, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;
        vis_padd16(TMP10, CONST_3, TMP10);

        vis_ld64_2(dest, stride, DST_0);
        vis_padd16(TMP8, TMP16, TMP8);

        vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
        vis_padd16(TMP10, TMP18, TMP10);
        vis_pack16(TMP8, DST_2);

        vis_pack16(TMP10, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;

        vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
        vis_pmerge(ZERO, REF_S0, TMP0);

        vis_pmerge(ZERO, REF_S2, TMP24);
        vis_mul8x16au(REF_S2_1, CONST_256, TMP6);

        vis_padd16(TMP0, CONST_3, TMP0);
        vis_mul8x16au(REF_S4, CONST_256, TMP8);

        vis_padd16(TMP2, CONST_3, TMP2);
        vis_mul8x16au(REF_S4_1, CONST_256, TMP10);

        vis_padd16(TMP0, TMP24, TMP0);
        vis_mul8x16au(REF_S6, CONST_256, TMP12);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_S6_1, CONST_256, TMP14);

        vis_padd16(TMP8, CONST_3, TMP8);
        vis_mul8x16al(DST_0, CONST_512, TMP16);

        vis_padd16(TMP10, CONST_3, TMP10);
        vis_mul8x16al(DST_1, CONST_512, TMP18);

        vis_padd16(TMP8, TMP12, TMP8);
        vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);

        vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
        vis_padd16(TMP0, TMP16, TMP0);

        vis_padd16(TMP2, TMP18, TMP2);
        vis_pack16(TMP0, DST_0);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_pack16(TMP2, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        vis_padd16(TMP8, TMP20, TMP8);

        vis_padd16(TMP10, TMP22, TMP10);
        vis_pack16(TMP8, DST_2);

        vis_pack16(TMP10, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}

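/* "put" with vertical half-pel interpolation ("y"), 16 pixels wide:
 * rounded average of each row with the row below it, again via the
 * XOR/OR masking trick; the previous row is kept in REF_* so every
 * source row is loaded only once. */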
static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * ref,
                             const ptrdiff_t stride, int height)
{
    ref = vis_alignaddr(ref);
    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64_2(ref, 16, TMP4);
    ref += stride;

    vis_ld64(ref[0], TMP6);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64_2(ref, 8, TMP8);
    vis_faligndata(TMP2, TMP4, REF_4);

    vis_ld64_2(ref, 16, TMP10);
    ref += stride;

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP6, TMP8, REF_2);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP8, TMP10, REF_6);

    vis_ld64(constants128[0], CONST_128);
    height = (height >> 1) - 1;
    do {    /* 24 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP12);

        vis_ld64_2(ref, 8, TMP2);
        vis_xor(REF_4, REF_6, TMP16);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;
        vis_or(REF_0, REF_2, TMP14);

        vis_ld64(ref[0], TMP6);
        vis_or(REF_4, REF_6, TMP18);

        vis_ld64_2(ref, 8, TMP8);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 16, TMP10);
        ref += stride;
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_and(TMP16, MASK_fe, TMP16);
        vis_mul8x16(CONST_128, TMP12, TMP12);

        vis_mul8x16(CONST_128, TMP16, TMP16);
        vis_xor(REF_0, REF_2, TMP0);

        vis_xor(REF_4, REF_6, TMP2);

        vis_or(REF_0, REF_2, TMP20);

        vis_and(TMP12, MASK_7f, TMP12);

        vis_and(TMP16, MASK_7f, TMP16);

        vis_psub16(TMP14, TMP12, TMP12);
        vis_st64(TMP12, dest[0]);

        vis_psub16(TMP18, TMP16, TMP16);
        vis_st64_2(TMP16, dest, 8);
        dest += stride;

        vis_or(REF_4, REF_6, TMP18);

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP2, MASK_fe, TMP2);
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_faligndata(TMP6, TMP8, REF_2);
        vis_mul8x16(CONST_128, TMP2, TMP2);

        vis_faligndata(TMP8, TMP10, REF_6);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_and(TMP2, MASK_7f, TMP2);

        vis_psub16(TMP20, TMP0, TMP0);
        vis_st64(TMP0, dest[0]);

        vis_psub16(TMP18, TMP2, TMP2);
        vis_st64_2(TMP2, dest, 8);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP12);

    vis_ld64_2(ref, 8, TMP2);
    vis_xor(REF_4, REF_6, TMP16);

    vis_ld64_2(ref, 16, TMP4);
    vis_or(REF_0, REF_2, TMP14);

    vis_or(REF_4, REF_6, TMP18);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_4);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_and(TMP16, MASK_fe, TMP16);
    vis_mul8x16(CONST_128, TMP12, TMP12);

    vis_mul8x16(CONST_128, TMP16, TMP16);
    vis_xor(REF_0, REF_2, TMP0);

    vis_xor(REF_4, REF_6, TMP2);

    vis_or(REF_0, REF_2, TMP20);

    vis_and(TMP12, MASK_7f, TMP12);

    vis_and(TMP16, MASK_7f, TMP16);

    vis_psub16(TMP14, TMP12, TMP12);
    vis_st64(TMP12, dest[0]);

    vis_psub16(TMP18, TMP16, TMP16);
    vis_st64_2(TMP16, dest, 8);
    dest += stride;

    vis_or(REF_4, REF_6, TMP18);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP2, MASK_fe, TMP2);
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_mul8x16(CONST_128, TMP2, TMP2);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_and(TMP2, MASK_7f, TMP2);

    vis_psub16(TMP20, TMP0, TMP0);
    vis_st64(TMP0, dest[0]);

    vis_psub16(TMP18, TMP2, TMP2);
    vis_st64_2(TMP2, dest, 8);
}

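/* 8-pixel-wide version of the vertical half-pel put. */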
static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * ref,
                            const ptrdiff_t stride, int height)
{
    ref = vis_alignaddr(ref);
    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);
    ref += stride;

    vis_ld64(ref[0], TMP4);

    vis_ld64_2(ref, 8, TMP6);
    ref += stride;

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP4, TMP6, REF_2);

    vis_ld64(constants128[0], CONST_128);
    height = (height >> 1) - 1;
    do {    /* 12 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP4);

        vis_ld64_2(ref, 8, TMP2);
        ref += stride;
        vis_and(TMP4, MASK_fe, TMP4);

        vis_or(REF_0, REF_2, TMP6);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);
        ref += stride;
        vis_xor(REF_0, REF_2, TMP12);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_mul8x16(CONST_128, TMP12, TMP12);
        vis_or(REF_0, REF_2, TMP14);

        vis_psub16(TMP6, TMP4, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        vis_faligndata(TMP0, TMP2, REF_2);

        vis_and(TMP12, MASK_7f, TMP12);

        vis_psub16(TMP14, TMP12, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP4);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_or(REF_0, REF_2, TMP6);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(REF_0, REF_2, TMP12);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_mul8x16(CONST_128, TMP12, TMP12);
    vis_or(REF_0, REF_2, TMP14);

    vis_psub16(TMP6, TMP4, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;

    vis_and(TMP12, MASK_7f, TMP12);

    vis_psub16(TMP14, TMP12, DST_0);
    vis_st64(DST_0, dest[0]);
}

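/* "avg" with vertical half-pel interpolation, 16 pixels wide: like
 * the horizontal averaging variant, but the pair of vertically
 * adjacent source rows is summed in the 16-bit domain before dest is
 * folded in and the result packed with fpack16. */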
static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * ref,
                             const ptrdiff_t stride, int height)
{
    int stride_8 = stride + 8;
    int stride_16 = stride + 16;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(constants3[0], CONST_3);
    vis_faligndata(TMP0, TMP2, REF_2);

    vis_ld64(constants256_512[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_6);
    height >>= 1;

    do {    /* 31 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_pmerge(ZERO, REF_2, TMP12);
        vis_mul8x16au(REF_2_1, CONST_256, TMP14);

        vis_ld64_2(ref, stride_8, TMP2);
        vis_pmerge(ZERO, REF_6, TMP16);
        vis_mul8x16au(REF_6_1, CONST_256, TMP18);

        vis_ld64_2(ref, stride_16, TMP4);
        ref += stride;

        vis_ld64(dest[0], DST_0);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(dest, 8, DST_2);
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_ld64_2(ref, stride, TMP6);
        vis_pmerge(ZERO, REF_0, TMP0);
        vis_mul8x16au(REF_0_1, CONST_256, TMP2);

        vis_ld64_2(ref, stride_8, TMP8);
        vis_pmerge(ZERO, REF_4, TMP4);

        vis_ld64_2(ref, stride_16, TMP10);
        ref += stride;

        vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
        vis_faligndata(TMP6, TMP8, REF_2);
        vis_mul8x16au(REF_4_1, CONST_256, TMP6);

        vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
        vis_faligndata(TMP8, TMP10, REF_6);
        vis_mul8x16al(DST_0, CONST_512, TMP20);

        vis_padd16(TMP0, CONST_3, TMP0);
        vis_mul8x16al(DST_1, CONST_512, TMP22);

        vis_padd16(TMP2, CONST_3, TMP2);
        vis_mul8x16al(DST_2, CONST_512, TMP24);

        vis_padd16(TMP4, CONST_3, TMP4);
        vis_mul8x16al(DST_3, CONST_512, TMP26);

        vis_padd16(TMP6, CONST_3, TMP6);

        vis_padd16(TMP12, TMP20, TMP12);
        vis_mul8x16al(REF_S0, CONST_512, TMP20);

        vis_padd16(TMP14, TMP22, TMP14);
        vis_mul8x16al(REF_S0_1, CONST_512, TMP22);

        vis_padd16(TMP16, TMP24, TMP16);
        vis_mul8x16al(REF_S2, CONST_512, TMP24);

        vis_padd16(TMP18, TMP26, TMP18);
        vis_mul8x16al(REF_S2_1, CONST_512, TMP26);

        vis_padd16(TMP12, TMP0, TMP12);
        vis_mul8x16au(REF_2, CONST_256, TMP28);

        vis_padd16(TMP14, TMP2, TMP14);
        vis_mul8x16au(REF_2_1, CONST_256, TMP30);

        vis_padd16(TMP16, TMP4, TMP16);
        vis_mul8x16au(REF_6, CONST_256, REF_S4);

        vis_padd16(TMP18, TMP6, TMP18);
        vis_mul8x16au(REF_6_1, CONST_256, REF_S6);

        vis_pack16(TMP12, DST_0);
        vis_padd16(TMP28, TMP0, TMP12);

        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);
        vis_padd16(TMP30, TMP2, TMP14);

        vis_pack16(TMP16, DST_2);
        vis_padd16(REF_S4, TMP4, TMP16);

        vis_pack16(TMP18, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
        vis_padd16(REF_S6, TMP6, TMP18);

        vis_padd16(TMP12, TMP20, TMP12);

        vis_padd16(TMP14, TMP22, TMP14);
        vis_pack16(TMP12, DST_0);

        vis_padd16(TMP16, TMP24, TMP16);
        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);

        vis_padd16(TMP18, TMP26, TMP18);
        vis_pack16(TMP16, DST_2);

        vis_pack16(TMP18, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
    } while (--height);
}

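/* 8-pixel-wide version of the vertical half-pel average. */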
static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * ref,
                            const ptrdiff_t stride, int height)
{
    int stride_8 = stride + 8;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(constants3[0], CONST_3);
    vis_faligndata(TMP0, TMP2, REF_2);

    vis_ld64(constants256_512[0], CONST_256);

    height >>= 1;
    do {    /* 20 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_pmerge(ZERO, REF_2, TMP8);
        vis_mul8x16au(REF_2_1, CONST_256, TMP10);

        vis_ld64_2(ref, stride_8, TMP2);
        ref += stride;

        vis_ld64(dest[0], DST_0);

        vis_ld64_2(dest, stride, DST_2);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, stride, TMP4);
        vis_mul8x16al(DST_0, CONST_512, TMP16);
        vis_pmerge(ZERO, REF_0, TMP12);

        vis_ld64_2(ref, stride_8, TMP6);
        ref += stride;
        vis_mul8x16al(DST_1, CONST_512, TMP18);
        vis_pmerge(ZERO, REF_0_1, TMP14);

        vis_padd16(TMP12, CONST_3, TMP12);
        vis_mul8x16al(DST_2, CONST_512, TMP24);

        vis_padd16(TMP14, CONST_3, TMP14);
        vis_mul8x16al(DST_3, CONST_512, TMP26);

        vis_faligndata(TMP4, TMP6, REF_2);

        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_mul8x16au(REF_2, CONST_256, TMP20);

        vis_padd16(TMP8, TMP16, TMP0);
        vis_mul8x16au(REF_2_1, CONST_256, TMP22);

        vis_padd16(TMP10, TMP18, TMP2);
        vis_pack16(TMP0, DST_0);

        vis_pack16(TMP2, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;
        vis_padd16(TMP12, TMP20, TMP12);

        vis_padd16(TMP14, TMP22, TMP14);

        vis_padd16(TMP12, TMP24, TMP0);

        vis_padd16(TMP14, TMP26, TMP2);
        vis_pack16(TMP0, DST_2);

        vis_pack16(TMP2, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}

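/* "put" with two-dimensional half-pel interpolation ("xy"), 16 pixels
 * wide: each output byte is the rounded mean of a 2x2 source
 * neighbourhood, accumulated as 16-bit sums with CONST_2 as the
 * rounding term before fpack16 narrows the result. */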
static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * ref,
                              const ptrdiff_t stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;
    int stride_16 = stride + 16;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(constants2[0], CONST_2);
    vis_faligndata(TMP0, TMP2, REF_S0);

    vis_ld64(constants256_512[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_S4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
        vis_faligndata(TMP2, TMP4, REF_S6);
    } else {
        vis_src1(TMP2, REF_S2);
        vis_src1(TMP4, REF_S6);
    }

    height >>= 1;
    do {
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP12);
        vis_pmerge(ZERO, REF_S0_1, TMP14);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride_8, TMP2);
        vis_mul8x16au(REF_S2, CONST_256, TMP16);
        vis_pmerge(ZERO, REF_S2_1, TMP18);

        vis_ld64_2(ref, stride_16, TMP4);
        ref += stride;
        vis_mul8x16au(REF_S4, CONST_256, TMP20);
        vis_pmerge(ZERO, REF_S4_1, TMP22);

        vis_ld64_2(ref, stride, TMP6);
        vis_mul8x16au(REF_S6, CONST_256, TMP24);
        vis_pmerge(ZERO, REF_S6_1, TMP26);

        vis_ld64_2(ref, stride_8, TMP8);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, stride_16, TMP10);
        ref += stride;
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_faligndata(TMP6, TMP8, REF_S0);

        vis_faligndata(TMP8, TMP10, REF_S4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
            vis_faligndata(TMP6, TMP8, REF_S2);
            vis_faligndata(TMP8, TMP10, REF_S6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
            vis_src1(TMP8, REF_S2);
            vis_src1(TMP10, REF_S6);
        }

        vis_mul8x16au(REF_0, CONST_256, TMP0);
        vis_pmerge(ZERO, REF_0_1, TMP2);

        vis_mul8x16au(REF_2, CONST_256, TMP4);
        vis_pmerge(ZERO, REF_2_1, TMP6);

        vis_padd16(TMP0, CONST_2, TMP8);
        vis_mul8x16au(REF_4, CONST_256, TMP0);

        vis_padd16(TMP2, CONST_2, TMP10);
        vis_mul8x16au(REF_4_1, CONST_256, TMP2);

        vis_padd16(TMP8, TMP4, TMP8);
        vis_mul8x16au(REF_6, CONST_256, TMP4);

        vis_padd16(TMP10, TMP6, TMP10);
        vis_mul8x16au(REF_6_1, CONST_256, TMP6);

        vis_padd16(TMP12, TMP8, TMP12);

        vis_padd16(TMP14, TMP10, TMP14);

        vis_padd16(TMP12, TMP16, TMP12);

        vis_padd16(TMP14, TMP18, TMP14);
        vis_pack16(TMP12, DST_0);

        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);
        vis_padd16(TMP0, CONST_2, TMP12);

        vis_mul8x16au(REF_S0, CONST_256, TMP0);
        vis_padd16(TMP2, CONST_2, TMP14);

        vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
        vis_padd16(TMP12, TMP4, TMP12);

        vis_mul8x16au(REF_S2, CONST_256, TMP4);
        vis_padd16(TMP14, TMP6, TMP14);

        vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
        vis_padd16(TMP20, TMP12, TMP20);

        vis_padd16(TMP22, TMP14, TMP22);

        vis_padd16(TMP20, TMP24, TMP20);

        vis_padd16(TMP22, TMP26, TMP22);
        vis_pack16(TMP20, DST_2);
|
1561
|
yading@10
|
1562 vis_pack16(TMP22, DST_3);
|
yading@10
|
1563 vis_st64_2(DST_2, dest, 8);
|
yading@10
|
1564 dest += stride;
|
yading@10
|
1565 vis_padd16(TMP0, TMP4, TMP24);
|
yading@10
|
1566
|
yading@10
|
1567 vis_mul8x16au(REF_S4, CONST_256, TMP0);
|
yading@10
|
1568 vis_padd16(TMP2, TMP6, TMP26);
|
yading@10
|
1569
|
yading@10
|
1570 vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
|
yading@10
|
1571 vis_padd16(TMP24, TMP8, TMP24);
|
yading@10
|
1572
|
yading@10
|
1573 vis_padd16(TMP26, TMP10, TMP26);
|
yading@10
|
1574 vis_pack16(TMP24, DST_0);
|
yading@10
|
1575
|
yading@10
|
1576 vis_pack16(TMP26, DST_1);
|
yading@10
|
1577 vis_st64(DST_0, dest[0]);
|
yading@10
|
1578 vis_pmerge(ZERO, REF_S6, TMP4);
|
yading@10
|
1579
|
yading@10
|
1580 vis_pmerge(ZERO, REF_S6_1, TMP6);
|
yading@10
|
1581
|
yading@10
|
1582 vis_padd16(TMP0, TMP4, TMP0);
|
yading@10
|
1583
|
yading@10
|
1584 vis_padd16(TMP2, TMP6, TMP2);
|
yading@10
|
1585
|
yading@10
|
1586 vis_padd16(TMP0, TMP12, TMP0);
|
yading@10
|
1587
|
yading@10
|
1588 vis_padd16(TMP2, TMP14, TMP2);
|
yading@10
|
1589 vis_pack16(TMP0, DST_2);
|
yading@10
|
1590
|
yading@10
|
1591 vis_pack16(TMP2, DST_3);
|
yading@10
|
1592 vis_st64_2(DST_2, dest, 8);
|
yading@10
|
1593 dest += stride;
|
yading@10
|
1594 } while (--height);
|
yading@10
|
1595 }
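
/* Not part of the original file: a scalar sketch (illustrative names) of
 * what MC_put_xy_16_vis above computes per output byte. The VIS version
 * widens bytes to 16 bits with mul8x16au/pmerge, sums the 2x2 source
 * neighbourhood plus the rounding constant 2, and fpack16 with a GSR scale
 * factor of 5 performs the final >>2 while clamping to unsigned 8 bits. */
static void mc_put_xy_16_ref(uint8_t *dest, const uint8_t *ref,
                             ptrdiff_t stride, int height)
{
    int i;
    do {
        for (i = 0; i < 16; i++)
            dest[i] = (ref[i] + ref[i + 1] +
                       ref[i + stride] + ref[i + stride + 1] + 2) >> 2;
        ref  += stride;
        dest += stride;
    } while (--height);
}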

static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * ref,
                             const ptrdiff_t stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(constants2[0], CONST_2);

    vis_ld64(constants256_512[0], CONST_256);
    vis_faligndata(TMP0, TMP2, REF_S0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
    } else {
        vis_src1(TMP2, REF_S2);
    }

    height >>= 1;
    do {    /* 26 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP8);
        vis_pmerge(ZERO, REF_S2, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride_8, TMP2);
        ref += stride;
        vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
        vis_pmerge(ZERO, REF_S2_1, TMP14);

        vis_ld64_2(ref, stride, TMP4);

        vis_ld64_2(ref, stride_8, TMP6);
        ref += stride;
        vis_faligndata(TMP0, TMP2, REF_S4);

        vis_pmerge(ZERO, REF_S4, TMP18);

        vis_pmerge(ZERO, REF_S4_1, TMP20);

        vis_faligndata(TMP4, TMP6, REF_S0);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_S6);
            vis_faligndata(TMP4, TMP6, REF_S2);
        } else {
            vis_src1(TMP2, REF_S6);
            vis_src1(TMP6, REF_S2);
        }

        vis_padd16(TMP18, CONST_2, TMP18);
        vis_mul8x16au(REF_S6, CONST_256, TMP22);

        vis_padd16(TMP20, CONST_2, TMP20);
        vis_mul8x16au(REF_S6_1, CONST_256, TMP24);

        vis_mul8x16au(REF_S0, CONST_256, TMP26);
        vis_pmerge(ZERO, REF_S0_1, TMP28);

        vis_mul8x16au(REF_S2, CONST_256, TMP30);
        vis_padd16(TMP18, TMP22, TMP18);

        vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
        vis_padd16(TMP20, TMP24, TMP20);

        vis_padd16(TMP8, TMP18, TMP8);

        vis_padd16(TMP10, TMP20, TMP10);

        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;
        vis_padd16(TMP18, TMP26, TMP18);

        vis_padd16(TMP20, TMP28, TMP20);

        vis_padd16(TMP18, TMP30, TMP18);

        vis_padd16(TMP20, TMP32, TMP20);
        vis_pack16(TMP18, DST_2);

        vis_pack16(TMP20, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}

static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * ref,
                              const ptrdiff_t stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;
    int stride_16 = stride + 16;

    vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(constants6[0], CONST_6);
    vis_faligndata(TMP0, TMP2, REF_S0);

    vis_ld64(constants256_1024[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_S4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
        vis_faligndata(TMP2, TMP4, REF_S6);
    } else {
        vis_src1(TMP2, REF_S2);
        vis_src1(TMP4, REF_S6);
    }

    height >>= 1;
    do {    /* 55 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP12);
        vis_pmerge(ZERO, REF_S0_1, TMP14);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride_8, TMP2);
        vis_mul8x16au(REF_S2, CONST_256, TMP16);
        vis_pmerge(ZERO, REF_S2_1, TMP18);

        vis_ld64_2(ref, stride_16, TMP4);
        ref += stride;
        vis_mul8x16au(REF_S4, CONST_256, TMP20);
        vis_pmerge(ZERO, REF_S4_1, TMP22);

        vis_ld64_2(ref, stride, TMP6);
        vis_mul8x16au(REF_S6, CONST_256, TMP24);
        vis_pmerge(ZERO, REF_S6_1, TMP26);

        vis_ld64_2(ref, stride_8, TMP8);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, stride_16, TMP10);
        ref += stride;
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_ld64(dest[0], DST_0);
        vis_faligndata(TMP6, TMP8, REF_S0);

        vis_ld64_2(dest, 8, DST_2);
        vis_faligndata(TMP8, TMP10, REF_S4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
            vis_faligndata(TMP6, TMP8, REF_S2);
            vis_faligndata(TMP8, TMP10, REF_S6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
            vis_src1(TMP8, REF_S2);
            vis_src1(TMP10, REF_S6);
        }

        vis_mul8x16al(DST_0, CONST_1024, TMP30);
        vis_pmerge(ZERO, REF_0, TMP0);

        vis_mul8x16al(DST_1, CONST_1024, TMP32);
        vis_pmerge(ZERO, REF_0_1, TMP2);

        vis_mul8x16au(REF_2, CONST_256, TMP4);
        vis_pmerge(ZERO, REF_2_1, TMP6);

        vis_mul8x16al(DST_2, CONST_1024, REF_0);
        vis_padd16(TMP0, CONST_6, TMP0);

        vis_mul8x16al(DST_3, CONST_1024, REF_2);
        vis_padd16(TMP2, CONST_6, TMP2);

        vis_padd16(TMP0, TMP4, TMP0);
        vis_mul8x16au(REF_4, CONST_256, TMP4);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_4_1, CONST_256, TMP6);

        vis_padd16(TMP12, TMP0, TMP12);
        vis_mul8x16au(REF_6, CONST_256, TMP8);

        vis_padd16(TMP14, TMP2, TMP14);
        vis_mul8x16au(REF_6_1, CONST_256, TMP10);

        vis_padd16(TMP12, TMP16, TMP12);
        vis_mul8x16au(REF_S0, CONST_256, REF_4);

        vis_padd16(TMP14, TMP18, TMP14);
        vis_mul8x16au(REF_S0_1, CONST_256, REF_6);

        vis_padd16(TMP12, TMP30, TMP12);

        vis_padd16(TMP14, TMP32, TMP14);
        vis_pack16(TMP12, DST_0);

        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);
        vis_padd16(TMP4, CONST_6, TMP4);

        vis_ld64_2(dest, stride, DST_0);
        vis_padd16(TMP6, CONST_6, TMP6);
        vis_mul8x16au(REF_S2, CONST_256, TMP12);

        vis_padd16(TMP4, TMP8, TMP4);
        vis_mul8x16au(REF_S2_1, CONST_256, TMP14);

        vis_padd16(TMP6, TMP10, TMP6);

        vis_padd16(TMP20, TMP4, TMP20);

        vis_padd16(TMP22, TMP6, TMP22);

        vis_padd16(TMP20, TMP24, TMP20);

        vis_padd16(TMP22, TMP26, TMP22);

        vis_padd16(TMP20, REF_0, TMP20);
        vis_mul8x16au(REF_S4, CONST_256, REF_0);

        vis_padd16(TMP22, REF_2, TMP22);
        vis_pack16(TMP20, DST_2);

        vis_pack16(TMP22, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;

        vis_ld64_2(dest, 8, DST_2);
        vis_mul8x16al(DST_0, CONST_1024, TMP30);
        vis_pmerge(ZERO, REF_S4_1, REF_2);

        vis_mul8x16al(DST_1, CONST_1024, TMP32);
        vis_padd16(REF_4, TMP0, TMP8);

        vis_mul8x16au(REF_S6, CONST_256, REF_4);
        vis_padd16(REF_6, TMP2, TMP10);

        vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);

        vis_padd16(TMP8, TMP30, TMP8);

        vis_padd16(TMP10, TMP32, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_st64(DST_0, dest[0]);

        vis_padd16(REF_0, TMP4, REF_0);

        vis_mul8x16al(DST_2, CONST_1024, TMP30);
        vis_padd16(REF_2, TMP6, REF_2);

        vis_mul8x16al(DST_3, CONST_1024, TMP32);
        vis_padd16(REF_0, REF_4, REF_0);

        vis_padd16(REF_2, REF_6, REF_2);

        vis_padd16(REF_0, TMP30, REF_0);

        /* stall */

        vis_padd16(REF_2, TMP32, REF_2);
        vis_pack16(REF_0, DST_2);

        vis_pack16(REF_2, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
    } while (--height);
}

static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * ref,
                             const ptrdiff_t stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;

    vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);
    vis_fzero(ZERO);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64(constants6[0], CONST_6);

    vis_ld64(constants256_1024[0], CONST_256);
    vis_faligndata(TMP0, TMP2, REF_S0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
    } else {
        vis_src1(TMP2, REF_S2);
    }

    height >>= 1;
    do {    /* 31 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP8);
        vis_pmerge(ZERO, REF_S0_1, TMP10);

        vis_ld64_2(ref, stride_8, TMP2);
        ref += stride;
        vis_mul8x16au(REF_S2, CONST_256, TMP12);
        vis_pmerge(ZERO, REF_S2_1, TMP14);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride, TMP4);
        vis_faligndata(TMP0, TMP2, REF_S4);

        vis_ld64_2(ref, stride_8, TMP6);
        ref += stride;

        vis_ld64(dest[0], DST_0);
        vis_faligndata(TMP4, TMP6, REF_S0);

        vis_ld64_2(dest, stride, DST_2);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_S6);
            vis_faligndata(TMP4, TMP6, REF_S2);
        } else {
            vis_src1(TMP2, REF_S6);
            vis_src1(TMP6, REF_S2);
        }

        vis_mul8x16al(DST_0, CONST_1024, TMP30);
        vis_pmerge(ZERO, REF_S4, TMP22);

        vis_mul8x16al(DST_1, CONST_1024, TMP32);
        vis_pmerge(ZERO, REF_S4_1, TMP24);

        vis_mul8x16au(REF_S6, CONST_256, TMP26);
        vis_pmerge(ZERO, REF_S6_1, TMP28);

        vis_mul8x16au(REF_S0, CONST_256, REF_S4);
        vis_padd16(TMP22, CONST_6, TMP22);

        vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
        vis_padd16(TMP24, CONST_6, TMP24);

        vis_mul8x16al(DST_2, CONST_1024, REF_0);
        vis_padd16(TMP22, TMP26, TMP22);

        vis_mul8x16al(DST_3, CONST_1024, REF_2);
        vis_padd16(TMP24, TMP28, TMP24);

        vis_mul8x16au(REF_S2, CONST_256, TMP26);
        vis_padd16(TMP8, TMP22, TMP8);

        vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
        vis_padd16(TMP10, TMP24, TMP10);

        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);

        vis_padd16(TMP8, TMP30, TMP8);

        vis_padd16(TMP10, TMP32, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        vis_padd16(REF_S4, TMP22, TMP12);

        vis_padd16(REF_S6, TMP24, TMP14);

        vis_padd16(TMP12, TMP26, TMP12);

        vis_padd16(TMP14, TMP28, TMP14);

        vis_padd16(TMP12, REF_0, TMP12);

        vis_padd16(TMP14, REF_2, TMP14);
        vis_pack16(TMP12, DST_2);

        vis_pack16(TMP14, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}
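
/* Not part of the original file: a scalar sketch of the rounded-average
 * variants above, as I read the constants. CONST_1024 weights the existing
 * destination by 4 (fmul8x16 keeps the high 16 bits of the 24-bit product,
 * i.e. a >>8), each of the four source taps gets weight 1 via CONST_256,
 * 6 is added for rounding, and fpack16 with a GSR scale factor of 4
 * performs the final >>3 with clamping. */
static void mc_avg_xy_8_ref(uint8_t *dest, const uint8_t *ref,
                            ptrdiff_t stride, int height)
{
    int i;
    do {
        for (i = 0; i < 8; i++)
            dest[i] = (4 * dest[i] + ref[i] + ref[i + 1] +
                       ref[i + stride] + ref[i + stride + 1] + 6) >> 3;
        ref  += stride;
        dest += stride;
    } while (--height);
}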

/* End of rounding code */

/* Start of no rounding code */
/* The trick used in some of this file is the formula from the MMX
 * motion comp code, which is:
 *
 * (x+y)>>1 == (x&y)+((x^y)>>1)
 *
 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
 * We avoid overflows by masking before we do the shift, and we
 * implement the shift by multiplying by 1/2 using mul8x16. So in
 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
 * the value 0x80808080 is in f8):
 *
 * fxor f0, f2, f10
 * fand f10, f4, f10
 * fmul8x16 f8, f10, f10
 * fand f10, f6, f10
 * fand f0, f2, f12
 * fpadd16 f12, f10, f10
 */
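
/* Not part of the original file: the same no-rounding byte-parallel average
 * written in portable C for illustration. Masking the xor with 0xfe in
 * every byte lane before the shift keeps a bit from one lane from leaking
 * into the lane below, exactly as the fand/fmul8x16 pair above does
 * (multiplying by 0x80 and keeping the high 16 bits of the 24-bit product
 * halves each lane). The MC_*_no_round_* functions below apply this to
 * 8 pixels at a time. */
static inline uint64_t no_rnd_avg_8bytes(uint64_t x, uint64_t y)
{
    return (x & y) + (((x ^ y) & 0xfefefefefefefefeULL) >> 1);
}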

static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * ref,
                                      const ptrdiff_t stride, int height)
{
    ref = vis_alignaddr(ref);
    do {    /* 5 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);

        vis_faligndata(TMP2, TMP4, REF_2);
        vis_st64_2(REF_2, dest, 8);
        dest += stride;
    } while (--height);
}

static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * ref,
                                     const ptrdiff_t stride, int height)
{
    ref = vis_alignaddr(ref);
    do {    /* 4 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);
        ref += stride;

        /* stall */

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);
        dest += stride;
    } while (--height);
}


static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * ref,
                                      const ptrdiff_t stride, int height)
{
    int stride_8 = stride + 8;

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(dest[0], DST_0);

    vis_ld64(dest[8], DST_2);

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP2, TMP4, REF_2);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 24 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP6, MASK_fe, TMP6);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_xor(DST_2, REF_2, TMP8);

        vis_and(TMP8, MASK_fe, TMP8);

        vis_and(DST_0, REF_0, TMP10);
        vis_ld64_2(dest, stride, DST_0);
        vis_mul8x16(CONST_128, TMP8, TMP8);

        vis_and(DST_2, REF_2, TMP12);
        vis_ld64_2(dest, stride_8, DST_2);

        vis_ld64(ref[0], TMP14);
        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);

        dest += stride;
        vis_ld64_2(ref, 8, TMP16);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 16, TMP18);
        vis_faligndata(TMP2, TMP4, REF_2);
        ref += stride;

        vis_xor(DST_0, REF_0, TMP20);

        vis_and(TMP20, MASK_fe, TMP20);

        vis_xor(DST_2, REF_2, TMP22);
        vis_mul8x16(CONST_128, TMP20, TMP20);

        vis_and(TMP22, MASK_fe, TMP22);

        vis_and(DST_0, REF_0, TMP24);
        vis_mul8x16(CONST_128, TMP22, TMP22);

        vis_and(DST_2, REF_2, TMP26);

        vis_ld64_2(dest, stride, DST_0);
        vis_faligndata(TMP14, TMP16, REF_0);

        vis_ld64_2(dest, stride_8, DST_2);
        vis_faligndata(TMP16, TMP18, REF_2);

        vis_and(TMP20, MASK_7f, TMP20);

        vis_and(TMP22, MASK_7f, TMP22);

        vis_padd16(TMP24, TMP20, TMP20);
        vis_st64(TMP20, dest[0]);

        vis_padd16(TMP26, TMP22, TMP22);
        vis_st64_2(TMP22, dest, 8);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_ld64_2(ref, 16, TMP4);
    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_xor(DST_2, REF_2, TMP8);

    vis_and(TMP8, MASK_fe, TMP8);

    vis_and(DST_0, REF_0, TMP10);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP8, TMP8);

    vis_and(DST_2, REF_2, TMP12);
    vis_ld64_2(dest, stride_8, DST_2);

    vis_ld64(ref[0], TMP14);
    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_padd16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_padd16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);

    dest += stride;
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_2);

    vis_xor(DST_0, REF_0, TMP20);

    vis_and(TMP20, MASK_fe, TMP20);

    vis_xor(DST_2, REF_2, TMP22);
    vis_mul8x16(CONST_128, TMP20, TMP20);

    vis_and(TMP22, MASK_fe, TMP22);

    vis_and(DST_0, REF_0, TMP24);
    vis_mul8x16(CONST_128, TMP22, TMP22);

    vis_and(DST_2, REF_2, TMP26);

    vis_and(TMP20, MASK_7f, TMP20);

    vis_and(TMP22, MASK_7f, TMP22);

    vis_padd16(TMP24, TMP20, TMP20);
    vis_st64(TMP20, dest[0]);

    vis_padd16(TMP26, TMP22, TMP22);
    vis_st64_2(TMP22, dest, 8);
}

static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * ref,
                                      const ptrdiff_t stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64_2(ref, 16, TMP4);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 34 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_xor(REF_4, REF_6, TMP8);

        vis_ld64_2(ref, 16, TMP4);
        vis_and(TMP6, MASK_fe, TMP6);
        ref += stride;

        vis_ld64(ref[0], TMP14);
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_ld64_2(ref, 8, TMP16);
        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_and(REF_0, REF_2, TMP10);

        vis_ld64_2(ref, 16, TMP18);
        ref += stride;
        vis_and(REF_4, REF_6, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;

        vis_xor(REF_0, REF_2, TMP6);

        vis_xor(REF_4, REF_6, TMP8);

        vis_and(TMP6, MASK_fe, TMP6);

        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_and(REF_0, REF_2, TMP10);

        vis_and(REF_4, REF_6, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP14, TMP16, REF_0);

        vis_faligndata(TMP16, TMP18, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP14, TMP16, REF_2);
            vis_faligndata(TMP16, TMP18, REF_6);
        } else {
            vis_src1(TMP16, REF_2);
            vis_src1(TMP18, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_xor(REF_4, REF_6, TMP8);

    vis_ld64_2(ref, 16, TMP4);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_and(REF_0, REF_2, TMP10);

    vis_and(REF_4, REF_6, TMP12);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_padd16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_padd16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP6);

    vis_xor(REF_4, REF_6, TMP8);

    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_and(REF_0, REF_2, TMP10);

    vis_and(REF_4, REF_6, TMP12);

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_padd16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_padd16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
}

static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * ref,
                                     const ptrdiff_t stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
    } else {
        vis_src1(TMP2, REF_2);
    }

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 20 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP4);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP4, MASK_fe, TMP4);
        ref += stride;

        vis_ld64(ref[0], TMP8);
        vis_and(REF_0, REF_2, TMP6);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, 8, TMP10);
        ref += stride;
        vis_faligndata(TMP0, TMP2, REF_0);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
        } else {
            vis_src1(TMP2, REF_2);
        }

        vis_and(TMP4, MASK_7f, TMP4);

        vis_padd16(TMP6, TMP4, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        vis_xor(REF_0, REF_2, TMP12);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_and(REF_0, REF_2, TMP14);
        vis_mul8x16(CONST_128, TMP12, TMP12);

        vis_alignaddr_g0((void *)off);
        vis_faligndata(TMP8, TMP10, REF_0);
        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP8, TMP10, REF_2);
        } else {
            vis_src1(TMP10, REF_2);
        }

        vis_and(TMP12, MASK_7f, TMP12);

        vis_padd16(TMP14, TMP12, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP4);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_and(REF_0, REF_2, TMP6);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
    } else {
        vis_src1(TMP2, REF_2);
    }

    vis_and(TMP4, MASK_7f, TMP4);

    vis_padd16(TMP6, TMP4, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP12);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_and(REF_0, REF_2, TMP14);
    vis_mul8x16(CONST_128, TMP12, TMP12);

    vis_and(TMP12, MASK_7f, TMP12);

    vis_padd16(TMP14, TMP12, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;
}

static void MC_avg_no_round_x_16_vis (uint8_t * dest, const uint8_t * ref,
                                      const ptrdiff_t stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    vis_ld64(constants3[0], CONST_3);
    vis_fzero(ZERO);
    vis_ld64(constants256_512[0], CONST_256);

    ref = vis_alignaddr(ref);
    do {    /* 26 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);

        vis_alignaddr_g0((void *)off);

        vis_ld64(ref[16], TMP4);

        vis_ld64(dest[0], DST_0);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(dest[8], DST_2);
        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
        }

        vis_mul8x16au(REF_0, CONST_256, TMP0);

        vis_pmerge(ZERO, REF_2, TMP4);
        vis_mul8x16au(REF_0_1, CONST_256, TMP2);

        vis_pmerge(ZERO, REF_2_1, TMP6);

        vis_padd16(TMP0, TMP4, TMP0);

        vis_mul8x16al(DST_0, CONST_512, TMP4);
        vis_padd16(TMP2, TMP6, TMP2);

        vis_mul8x16al(DST_1, CONST_512, TMP6);

        vis_mul8x16au(REF_6, CONST_256, TMP12);

        vis_padd16(TMP0, TMP4, TMP0);
        vis_mul8x16au(REF_6_1, CONST_256, TMP14);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_4, CONST_256, TMP16);

        vis_padd16(TMP0, CONST_3, TMP8);
        vis_mul8x16au(REF_4_1, CONST_256, TMP18);

        vis_padd16(TMP2, CONST_3, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_padd16(TMP16, TMP12, TMP0);

        vis_st64(DST_0, dest[0]);
        vis_mul8x16al(DST_2, CONST_512, TMP4);
        vis_padd16(TMP18, TMP14, TMP2);

        vis_mul8x16al(DST_3, CONST_512, TMP6);
        vis_padd16(TMP0, CONST_3, TMP0);

        vis_padd16(TMP2, CONST_3, TMP2);

        vis_padd16(TMP0, TMP4, TMP0);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_pack16(TMP0, DST_2);

        vis_pack16(TMP2, DST_3);
        vis_st64(DST_2, dest[8]);

        ref += stride;
        dest += stride;
    } while (--height);
}

static void MC_put_no_round_y_16_vis (uint8_t * dest, const uint8_t * ref,
                                      const ptrdiff_t stride, int height)
{
    ref = vis_alignaddr(ref);
    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64_2(ref, 16, TMP4);
    ref += stride;

    vis_ld64(ref[0], TMP6);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64_2(ref, 8, TMP8);
    vis_faligndata(TMP2, TMP4, REF_4);

    vis_ld64_2(ref, 16, TMP10);
    ref += stride;

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP6, TMP8, REF_2);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP8, TMP10, REF_6);

    vis_ld64(constants128[0], CONST_128);
    height = (height >> 1) - 1;
    do {    /* 24 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP12);

        vis_ld64_2(ref, 8, TMP2);
        vis_xor(REF_4, REF_6, TMP16);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;
        vis_and(REF_0, REF_2, TMP14);

        vis_ld64(ref[0], TMP6);
        vis_and(REF_4, REF_6, TMP18);

        vis_ld64_2(ref, 8, TMP8);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 16, TMP10);
        ref += stride;
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_and(TMP16, MASK_fe, TMP16);
        vis_mul8x16(CONST_128, TMP12, TMP12);

        vis_mul8x16(CONST_128, TMP16, TMP16);
        vis_xor(REF_0, REF_2, TMP0);

        vis_xor(REF_4, REF_6, TMP2);

        vis_and(REF_0, REF_2, TMP20);

        vis_and(TMP12, MASK_7f, TMP12);

        vis_and(TMP16, MASK_7f, TMP16);

        vis_padd16(TMP14, TMP12, TMP12);
        vis_st64(TMP12, dest[0]);

        vis_padd16(TMP18, TMP16, TMP16);
        vis_st64_2(TMP16, dest, 8);
        dest += stride;

        vis_and(REF_4, REF_6, TMP18);

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP2, MASK_fe, TMP2);
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_faligndata(TMP6, TMP8, REF_2);
        vis_mul8x16(CONST_128, TMP2, TMP2);

        vis_faligndata(TMP8, TMP10, REF_6);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_and(TMP2, MASK_7f, TMP2);

        vis_padd16(TMP20, TMP0, TMP0);
        vis_st64(TMP0, dest[0]);

        vis_padd16(TMP18, TMP2, TMP2);
        vis_st64_2(TMP2, dest, 8);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP12);

    vis_ld64_2(ref, 8, TMP2);
    vis_xor(REF_4, REF_6, TMP16);

    vis_ld64_2(ref, 16, TMP4);
    vis_and(REF_0, REF_2, TMP14);

    vis_and(REF_4, REF_6, TMP18);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_4);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_and(TMP16, MASK_fe, TMP16);
    vis_mul8x16(CONST_128, TMP12, TMP12);

    vis_mul8x16(CONST_128, TMP16, TMP16);
    vis_xor(REF_0, REF_2, TMP0);

    vis_xor(REF_4, REF_6, TMP2);

    vis_and(REF_0, REF_2, TMP20);

    vis_and(TMP12, MASK_7f, TMP12);

    vis_and(TMP16, MASK_7f, TMP16);

    vis_padd16(TMP14, TMP12, TMP12);
    vis_st64(TMP12, dest[0]);

    vis_padd16(TMP18, TMP16, TMP16);
    vis_st64_2(TMP16, dest, 8);
    dest += stride;

    vis_and(REF_4, REF_6, TMP18);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP2, MASK_fe, TMP2);
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_mul8x16(CONST_128, TMP2, TMP2);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_and(TMP2, MASK_7f, TMP2);

    vis_padd16(TMP20, TMP0, TMP0);
    vis_st64(TMP0, dest[0]);

    vis_padd16(TMP18, TMP2, TMP2);
    vis_st64_2(TMP2, dest, 8);
}

static void MC_put_no_round_y_8_vis (uint8_t * dest, const uint8_t * ref,
                                     const ptrdiff_t stride, int height)
{
    ref = vis_alignaddr(ref);
    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);
    ref += stride;

    vis_ld64(ref[0], TMP4);

    vis_ld64_2(ref, 8, TMP6);
    ref += stride;

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP4, TMP6, REF_2);

    vis_ld64(constants128[0], CONST_128);
    height = (height >> 1) - 1;
    do {    /* 12 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP4);

        vis_ld64_2(ref, 8, TMP2);
        ref += stride;
        vis_and(TMP4, MASK_fe, TMP4);

        vis_and(REF_0, REF_2, TMP6);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);
        ref += stride;
        vis_xor(REF_0, REF_2, TMP12);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_mul8x16(CONST_128, TMP12, TMP12);
        vis_and(REF_0, REF_2, TMP14);

        vis_padd16(TMP6, TMP4, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        vis_faligndata(TMP0, TMP2, REF_2);

        vis_and(TMP12, MASK_7f, TMP12);

        vis_padd16(TMP14, TMP12, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP4);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_and(REF_0, REF_2, TMP6);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(REF_0, REF_2, TMP12);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_mul8x16(CONST_128, TMP12, TMP12);
    vis_and(REF_0, REF_2, TMP14);

    vis_padd16(TMP6, TMP4, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;

    vis_and(TMP12, MASK_7f, TMP12);

    vis_padd16(TMP14, TMP12, DST_0);
    vis_st64(DST_0, dest[0]);
}
2888
static void MC_avg_no_round_y_16_vis (uint8_t * dest, const uint8_t * ref,
                                      const ptrdiff_t stride, int height)
{
        int stride_8 = stride + 8;
        int stride_16 = stride + 16;

        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        vis_ld64(ref[ 0], TMP0);
        vis_fzero(ZERO);

        vis_ld64(ref[ 8], TMP2);

        vis_ld64(ref[16], TMP4);

        vis_ld64(constants3[0], CONST_3);
        vis_faligndata(TMP0, TMP2, REF_2);

        vis_ld64(constants256_512[0], CONST_256);
        vis_faligndata(TMP2, TMP4, REF_6);
        height >>= 1;

        do {    /* 31 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_pmerge(ZERO, REF_2, TMP12);
                vis_mul8x16au(REF_2_1, CONST_256, TMP14);

                vis_ld64_2(ref, stride_8, TMP2);
                vis_pmerge(ZERO, REF_6, TMP16);
                vis_mul8x16au(REF_6_1, CONST_256, TMP18);

                vis_ld64_2(ref, stride_16, TMP4);
                ref += stride;

                vis_ld64(dest[0], DST_0);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(dest, 8, DST_2);
                vis_faligndata(TMP2, TMP4, REF_4);

                vis_ld64_2(ref, stride, TMP6);
                vis_pmerge(ZERO, REF_0, TMP0);
                vis_mul8x16au(REF_0_1, CONST_256, TMP2);

                vis_ld64_2(ref, stride_8, TMP8);
                vis_pmerge(ZERO, REF_4, TMP4);

                vis_ld64_2(ref, stride_16, TMP10);
                ref += stride;

                vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
                vis_faligndata(TMP6, TMP8, REF_2);
                vis_mul8x16au(REF_4_1, CONST_256, TMP6);

                vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
                vis_faligndata(TMP8, TMP10, REF_6);
                vis_mul8x16al(DST_0, CONST_512, TMP20);

                vis_padd16(TMP0, CONST_3, TMP0);
                vis_mul8x16al(DST_1, CONST_512, TMP22);

                vis_padd16(TMP2, CONST_3, TMP2);
                vis_mul8x16al(DST_2, CONST_512, TMP24);

                vis_padd16(TMP4, CONST_3, TMP4);
                vis_mul8x16al(DST_3, CONST_512, TMP26);

                vis_padd16(TMP6, CONST_3, TMP6);

                vis_padd16(TMP12, TMP20, TMP12);
                vis_mul8x16al(REF_S0, CONST_512, TMP20);

                vis_padd16(TMP14, TMP22, TMP14);
                vis_mul8x16al(REF_S0_1, CONST_512, TMP22);

                vis_padd16(TMP16, TMP24, TMP16);
                vis_mul8x16al(REF_S2, CONST_512, TMP24);

                vis_padd16(TMP18, TMP26, TMP18);
                vis_mul8x16al(REF_S2_1, CONST_512, TMP26);

                vis_padd16(TMP12, TMP0, TMP12);
                vis_mul8x16au(REF_2, CONST_256, TMP28);

                vis_padd16(TMP14, TMP2, TMP14);
                vis_mul8x16au(REF_2_1, CONST_256, TMP30);

                vis_padd16(TMP16, TMP4, TMP16);
                vis_mul8x16au(REF_6, CONST_256, REF_S4);

                vis_padd16(TMP18, TMP6, TMP18);
                vis_mul8x16au(REF_6_1, CONST_256, REF_S6);

                vis_pack16(TMP12, DST_0);
                vis_padd16(TMP28, TMP0, TMP12);

                vis_pack16(TMP14, DST_1);
                vis_st64(DST_0, dest[0]);
                vis_padd16(TMP30, TMP2, TMP14);

                vis_pack16(TMP16, DST_2);
                vis_padd16(REF_S4, TMP4, TMP16);

                vis_pack16(TMP18, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;
                vis_padd16(REF_S6, TMP6, TMP18);

                vis_padd16(TMP12, TMP20, TMP12);

                vis_padd16(TMP14, TMP22, TMP14);
                vis_pack16(TMP12, DST_0);

                vis_padd16(TMP16, TMP24, TMP16);
                vis_pack16(TMP14, DST_1);
                vis_st64(DST_0, dest[0]);

                vis_padd16(TMP18, TMP26, TMP18);
                vis_pack16(TMP16, DST_2);

                vis_pack16(TMP18, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;
        } while (--height);
}
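/* 16-pixel-wide put with diagonal (xy) half-pel interpolation and no
 * rounding: each output pixel averages its four neighbouring reference
 * pixels with a bias of 1 instead of the rounding bias of 2, i.e.
 *
 *     dest[x] = (ref[x] + ref[x+1] + ref[x+stride] + ref[x+stride+1] + 1) >> 2;
 *
 * (scalar restatement ours; the bias is the constants1 value and the >> 2
 * falls out of the GSR scale factor of 5 used by vis_pack16). */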
static void MC_put_no_round_xy_16_vis (uint8_t * dest, const uint8_t * ref,
                                       const ptrdiff_t stride, int height)
{
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;
        int stride_8 = stride + 8;
        int stride_16 = stride + 16;

        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        vis_ld64(ref[ 0], TMP0);
        vis_fzero(ZERO);

        vis_ld64(ref[ 8], TMP2);

        vis_ld64(ref[16], TMP4);

        vis_ld64(constants1[0], CONST_1);
        vis_faligndata(TMP0, TMP2, REF_S0);

        vis_ld64(constants256_512[0], CONST_256);
        vis_faligndata(TMP2, TMP4, REF_S4);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_S2);
                vis_faligndata(TMP2, TMP4, REF_S6);
        } else {
                vis_src1(TMP2, REF_S2);
                vis_src1(TMP4, REF_S6);
        }
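        /* Comment ours: when ref is misaligned by 7, off_plus_1 is 8, which
         * the 3-bit align offset set by vis_alignaddr_g0() cannot represent;
         * in that case the one-byte-shifted row is simply the second source
         * register, taken with vis_src1() above. */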

        height >>= 1;
        do {
                vis_ld64_2(ref, stride, TMP0);
                vis_mul8x16au(REF_S0, CONST_256, TMP12);
                vis_pmerge(ZERO, REF_S0_1, TMP14);

                vis_alignaddr_g0((void *)off);

                vis_ld64_2(ref, stride_8, TMP2);
                vis_mul8x16au(REF_S2, CONST_256, TMP16);
                vis_pmerge(ZERO, REF_S2_1, TMP18);

                vis_ld64_2(ref, stride_16, TMP4);
                ref += stride;
                vis_mul8x16au(REF_S4, CONST_256, TMP20);
                vis_pmerge(ZERO, REF_S4_1, TMP22);

                vis_ld64_2(ref, stride, TMP6);
                vis_mul8x16au(REF_S6, CONST_256, TMP24);
                vis_pmerge(ZERO, REF_S6_1, TMP26);

                vis_ld64_2(ref, stride_8, TMP8);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(ref, stride_16, TMP10);
                ref += stride;
                vis_faligndata(TMP2, TMP4, REF_4);

                vis_faligndata(TMP6, TMP8, REF_S0);

                vis_faligndata(TMP8, TMP10, REF_S4);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_2);
                        vis_faligndata(TMP2, TMP4, REF_6);
                        vis_faligndata(TMP6, TMP8, REF_S2);
                        vis_faligndata(TMP8, TMP10, REF_S6);
                } else {
                        vis_src1(TMP2, REF_2);
                        vis_src1(TMP4, REF_6);
                        vis_src1(TMP8, REF_S2);
                        vis_src1(TMP10, REF_S6);
                }

                vis_mul8x16au(REF_0, CONST_256, TMP0);
                vis_pmerge(ZERO, REF_0_1, TMP2);

                vis_mul8x16au(REF_2, CONST_256, TMP4);
                vis_pmerge(ZERO, REF_2_1, TMP6);

                vis_padd16(TMP0, CONST_2, TMP8);
                vis_mul8x16au(REF_4, CONST_256, TMP0);

                vis_padd16(TMP2, CONST_1, TMP10);
                vis_mul8x16au(REF_4_1, CONST_256, TMP2);

                vis_padd16(TMP8, TMP4, TMP8);
                vis_mul8x16au(REF_6, CONST_256, TMP4);

                vis_padd16(TMP10, TMP6, TMP10);
                vis_mul8x16au(REF_6_1, CONST_256, TMP6);

                vis_padd16(TMP12, TMP8, TMP12);

                vis_padd16(TMP14, TMP10, TMP14);

                vis_padd16(TMP12, TMP16, TMP12);

                vis_padd16(TMP14, TMP18, TMP14);
                vis_pack16(TMP12, DST_0);

                vis_pack16(TMP14, DST_1);
                vis_st64(DST_0, dest[0]);
                vis_padd16(TMP0, CONST_1, TMP12);

                vis_mul8x16au(REF_S0, CONST_256, TMP0);
                vis_padd16(TMP2, CONST_1, TMP14);

                vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
                vis_padd16(TMP12, TMP4, TMP12);

                vis_mul8x16au(REF_S2, CONST_256, TMP4);
                vis_padd16(TMP14, TMP6, TMP14);

                vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
                vis_padd16(TMP20, TMP12, TMP20);

                vis_padd16(TMP22, TMP14, TMP22);

                vis_padd16(TMP20, TMP24, TMP20);

                vis_padd16(TMP22, TMP26, TMP22);
                vis_pack16(TMP20, DST_2);

                vis_pack16(TMP22, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;
                vis_padd16(TMP0, TMP4, TMP24);

                vis_mul8x16au(REF_S4, CONST_256, TMP0);
                vis_padd16(TMP2, TMP6, TMP26);

                vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
                vis_padd16(TMP24, TMP8, TMP24);

                vis_padd16(TMP26, TMP10, TMP26);
                vis_pack16(TMP24, DST_0);

                vis_pack16(TMP26, DST_1);
                vis_st64(DST_0, dest[0]);
                vis_pmerge(ZERO, REF_S6, TMP4);

                vis_pmerge(ZERO, REF_S6_1, TMP6);

                vis_padd16(TMP0, TMP4, TMP0);

                vis_padd16(TMP2, TMP6, TMP2);

                vis_padd16(TMP0, TMP12, TMP0);

                vis_padd16(TMP2, TMP14, TMP2);
                vis_pack16(TMP0, DST_2);

                vis_pack16(TMP2, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;
        } while (--height);
}
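/* 8-pixel-wide variant of the diagonal no-rounding put above: same
 * per-pixel arithmetic, two output rows per loop iteration. */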
static void MC_put_no_round_xy_8_vis (uint8_t * dest, const uint8_t * ref,
                                      const ptrdiff_t stride, int height)
{
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;
        int stride_8 = stride + 8;

        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        vis_ld64(ref[ 0], TMP0);
        vis_fzero(ZERO);

        vis_ld64(ref[ 8], TMP2);

        vis_ld64(constants1[0], CONST_1);

        vis_ld64(constants256_512[0], CONST_256);
        vis_faligndata(TMP0, TMP2, REF_S0);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_S2);
        } else {
                vis_src1(TMP2, REF_S2);
        }

        height >>= 1;
        do {    /* 26 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_mul8x16au(REF_S0, CONST_256, TMP8);
                vis_pmerge(ZERO, REF_S2, TMP12);

                vis_alignaddr_g0((void *)off);

                vis_ld64_2(ref, stride_8, TMP2);
                ref += stride;
                vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
                vis_pmerge(ZERO, REF_S2_1, TMP14);

                vis_ld64_2(ref, stride, TMP4);

                vis_ld64_2(ref, stride_8, TMP6);
                ref += stride;
                vis_faligndata(TMP0, TMP2, REF_S4);

                vis_pmerge(ZERO, REF_S4, TMP18);

                vis_pmerge(ZERO, REF_S4_1, TMP20);

                vis_faligndata(TMP4, TMP6, REF_S0);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_S6);
                        vis_faligndata(TMP4, TMP6, REF_S2);
                } else {
                        vis_src1(TMP2, REF_S6);
                        vis_src1(TMP6, REF_S2);
                }

                vis_padd16(TMP18, CONST_1, TMP18);
                vis_mul8x16au(REF_S6, CONST_256, TMP22);

                vis_padd16(TMP20, CONST_1, TMP20);
                vis_mul8x16au(REF_S6_1, CONST_256, TMP24);

                vis_mul8x16au(REF_S0, CONST_256, TMP26);
                vis_pmerge(ZERO, REF_S0_1, TMP28);

                vis_mul8x16au(REF_S2, CONST_256, TMP30);
                vis_padd16(TMP18, TMP22, TMP18);

                vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
                vis_padd16(TMP20, TMP24, TMP20);

                vis_padd16(TMP8, TMP18, TMP8);

                vis_padd16(TMP10, TMP20, TMP10);

                vis_padd16(TMP8, TMP12, TMP8);

                vis_padd16(TMP10, TMP14, TMP10);
                vis_pack16(TMP8, DST_0);

                vis_pack16(TMP10, DST_1);
                vis_st64(DST_0, dest[0]);
                dest += stride;
                vis_padd16(TMP18, TMP26, TMP18);

                vis_padd16(TMP20, TMP28, TMP20);

                vis_padd16(TMP18, TMP30, TMP18);

                vis_padd16(TMP20, TMP32, TMP20);
                vis_pack16(TMP18, DST_2);

                vis_pack16(TMP20, DST_3);
                vis_st64(DST_2, dest[0]);
                dest += stride;
        } while (--height);
}
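/* 16-pixel-wide diagonal (xy) half-pel interpolation without rounding,
 * averaged into dest.  Note the GSR scale factor of 4 (one extra shift
 * compared to the put version) and the CONST_6 bias, which fold the final
 * average with the dest values (scaled by CONST_1024) into the same
 * vis_pack16 as the four-tap interpolation (reading of the constants ours). */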
static void MC_avg_no_round_xy_16_vis (uint8_t * dest, const uint8_t * ref,
                                       const ptrdiff_t stride, int height)
{
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;
        int stride_8 = stride + 8;
        int stride_16 = stride + 16;

        vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        vis_ld64(ref[ 0], TMP0);
        vis_fzero(ZERO);

        vis_ld64(ref[ 8], TMP2);

        vis_ld64(ref[16], TMP4);

        vis_ld64(constants6[0], CONST_6);
        vis_faligndata(TMP0, TMP2, REF_S0);

        vis_ld64(constants256_1024[0], CONST_256);
        vis_faligndata(TMP2, TMP4, REF_S4);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_S2);
                vis_faligndata(TMP2, TMP4, REF_S6);
        } else {
                vis_src1(TMP2, REF_S2);
                vis_src1(TMP4, REF_S6);
        }

        height >>= 1;
        do {    /* 55 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_mul8x16au(REF_S0, CONST_256, TMP12);
                vis_pmerge(ZERO, REF_S0_1, TMP14);

                vis_alignaddr_g0((void *)off);

                vis_ld64_2(ref, stride_8, TMP2);
                vis_mul8x16au(REF_S2, CONST_256, TMP16);
                vis_pmerge(ZERO, REF_S2_1, TMP18);

                vis_ld64_2(ref, stride_16, TMP4);
                ref += stride;
                vis_mul8x16au(REF_S4, CONST_256, TMP20);
                vis_pmerge(ZERO, REF_S4_1, TMP22);

                vis_ld64_2(ref, stride, TMP6);
                vis_mul8x16au(REF_S6, CONST_256, TMP24);
                vis_pmerge(ZERO, REF_S6_1, TMP26);

                vis_ld64_2(ref, stride_8, TMP8);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(ref, stride_16, TMP10);
                ref += stride;
                vis_faligndata(TMP2, TMP4, REF_4);

                vis_ld64(dest[0], DST_0);
                vis_faligndata(TMP6, TMP8, REF_S0);

                vis_ld64_2(dest, 8, DST_2);
                vis_faligndata(TMP8, TMP10, REF_S4);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_2);
                        vis_faligndata(TMP2, TMP4, REF_6);
                        vis_faligndata(TMP6, TMP8, REF_S2);
                        vis_faligndata(TMP8, TMP10, REF_S6);
                } else {
                        vis_src1(TMP2, REF_2);
                        vis_src1(TMP4, REF_6);
                        vis_src1(TMP8, REF_S2);
                        vis_src1(TMP10, REF_S6);
                }

                vis_mul8x16al(DST_0, CONST_1024, TMP30);
                vis_pmerge(ZERO, REF_0, TMP0);

                vis_mul8x16al(DST_1, CONST_1024, TMP32);
                vis_pmerge(ZERO, REF_0_1, TMP2);

                vis_mul8x16au(REF_2, CONST_256, TMP4);
                vis_pmerge(ZERO, REF_2_1, TMP6);

                vis_mul8x16al(DST_2, CONST_1024, REF_0);
                vis_padd16(TMP0, CONST_6, TMP0);

                vis_mul8x16al(DST_3, CONST_1024, REF_2);
                vis_padd16(TMP2, CONST_6, TMP2);

                vis_padd16(TMP0, TMP4, TMP0);
                vis_mul8x16au(REF_4, CONST_256, TMP4);

                vis_padd16(TMP2, TMP6, TMP2);
                vis_mul8x16au(REF_4_1, CONST_256, TMP6);

                vis_padd16(TMP12, TMP0, TMP12);
                vis_mul8x16au(REF_6, CONST_256, TMP8);

                vis_padd16(TMP14, TMP2, TMP14);
                vis_mul8x16au(REF_6_1, CONST_256, TMP10);

                vis_padd16(TMP12, TMP16, TMP12);
                vis_mul8x16au(REF_S0, CONST_256, REF_4);

                vis_padd16(TMP14, TMP18, TMP14);
                vis_mul8x16au(REF_S0_1, CONST_256, REF_6);

                vis_padd16(TMP12, TMP30, TMP12);

                vis_padd16(TMP14, TMP32, TMP14);
                vis_pack16(TMP12, DST_0);

                vis_pack16(TMP14, DST_1);
                vis_st64(DST_0, dest[0]);
                vis_padd16(TMP4, CONST_6, TMP4);

                vis_ld64_2(dest, stride, DST_0);
                vis_padd16(TMP6, CONST_6, TMP6);
                vis_mul8x16au(REF_S2, CONST_256, TMP12);

                vis_padd16(TMP4, TMP8, TMP4);
                vis_mul8x16au(REF_S2_1, CONST_256, TMP14);

                vis_padd16(TMP6, TMP10, TMP6);

                vis_padd16(TMP20, TMP4, TMP20);

                vis_padd16(TMP22, TMP6, TMP22);

                vis_padd16(TMP20, TMP24, TMP20);

                vis_padd16(TMP22, TMP26, TMP22);

                vis_padd16(TMP20, REF_0, TMP20);
                vis_mul8x16au(REF_S4, CONST_256, REF_0);

                vis_padd16(TMP22, REF_2, TMP22);
                vis_pack16(TMP20, DST_2);

                vis_pack16(TMP22, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;

                vis_ld64_2(dest, 8, DST_2);
                vis_mul8x16al(DST_0, CONST_1024, TMP30);
                vis_pmerge(ZERO, REF_S4_1, REF_2);

                vis_mul8x16al(DST_1, CONST_1024, TMP32);
                vis_padd16(REF_4, TMP0, TMP8);

                vis_mul8x16au(REF_S6, CONST_256, REF_4);
                vis_padd16(REF_6, TMP2, TMP10);

                vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
                vis_padd16(TMP8, TMP12, TMP8);

                vis_padd16(TMP10, TMP14, TMP10);

                vis_padd16(TMP8, TMP30, TMP8);

                vis_padd16(TMP10, TMP32, TMP10);
                vis_pack16(TMP8, DST_0);

                vis_pack16(TMP10, DST_1);
                vis_st64(DST_0, dest[0]);

                vis_padd16(REF_0, TMP4, REF_0);

                vis_mul8x16al(DST_2, CONST_1024, TMP30);
                vis_padd16(REF_2, TMP6, REF_2);

                vis_mul8x16al(DST_3, CONST_1024, TMP32);
                vis_padd16(REF_0, REF_4, REF_0);

                vis_padd16(REF_2, REF_6, REF_2);

                vis_padd16(REF_0, TMP30, REF_0);

                /* stall */

                vis_padd16(REF_2, TMP32, REF_2);
                vis_pack16(REF_0, DST_2);

                vis_pack16(REF_2, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;
        } while (--height);
}

/* End of no rounding code */

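/* Runtime dispatch: vis_level() probes the CPU once at init time, and the
 * VIS code paths are installed only when ACCEL_SPARC_VIS is reported, so
 * the same binary still runs on SPARC CPUs without VIS. */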
av_cold void ff_hpeldsp_init_vis(HpelDSPContext *c, int flags)
{
    /* VIS-specific optimizations */
    int accel = vis_level();

    if (accel & ACCEL_SPARC_VIS) {
        c->put_pixels_tab[0][0] = MC_put_o_16_vis;
        c->put_pixels_tab[0][1] = MC_put_x_16_vis;
        c->put_pixels_tab[0][2] = MC_put_y_16_vis;
        c->put_pixels_tab[0][3] = MC_put_xy_16_vis;

        c->put_pixels_tab[1][0] = MC_put_o_8_vis;
        c->put_pixels_tab[1][1] = MC_put_x_8_vis;
        c->put_pixels_tab[1][2] = MC_put_y_8_vis;
        c->put_pixels_tab[1][3] = MC_put_xy_8_vis;

        c->avg_pixels_tab[0][0] = MC_avg_o_16_vis;
        c->avg_pixels_tab[0][1] = MC_avg_x_16_vis;
        c->avg_pixels_tab[0][2] = MC_avg_y_16_vis;
        c->avg_pixels_tab[0][3] = MC_avg_xy_16_vis;

        c->avg_pixels_tab[1][0] = MC_avg_o_8_vis;
        c->avg_pixels_tab[1][1] = MC_avg_x_8_vis;
        c->avg_pixels_tab[1][2] = MC_avg_y_8_vis;
        c->avg_pixels_tab[1][3] = MC_avg_xy_8_vis;

        c->put_no_rnd_pixels_tab[0][0] = MC_put_no_round_o_16_vis;
        c->put_no_rnd_pixels_tab[0][1] = MC_put_no_round_x_16_vis;
        c->put_no_rnd_pixels_tab[0][2] = MC_put_no_round_y_16_vis;
        c->put_no_rnd_pixels_tab[0][3] = MC_put_no_round_xy_16_vis;

        c->put_no_rnd_pixels_tab[1][0] = MC_put_no_round_o_8_vis;
        c->put_no_rnd_pixels_tab[1][1] = MC_put_no_round_x_8_vis;
        c->put_no_rnd_pixels_tab[1][2] = MC_put_no_round_y_8_vis;
        c->put_no_rnd_pixels_tab[1][3] = MC_put_no_round_xy_8_vis;

        c->avg_no_rnd_pixels_tab[0] = MC_avg_no_round_o_16_vis;
        c->avg_no_rnd_pixels_tab[1] = MC_avg_no_round_x_16_vis;
        c->avg_no_rnd_pixels_tab[2] = MC_avg_no_round_y_16_vis;
        c->avg_no_rnd_pixels_tab[3] = MC_avg_no_round_xy_16_vis;
    }
}