annotate ffmpeg/libavcodec/sparc/hpeldsp_vis.c @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 /*
yading@10 2 * Copyright (C) 2003 David S. Miller <davem@redhat.com>
yading@10 3 *
yading@10 4 * This file is part of FFmpeg.
yading@10 5 *
yading@10 6 * FFmpeg is free software; you can redistribute it and/or
yading@10 7 * modify it under the terms of the GNU Lesser General Public
yading@10 8 * License as published by the Free Software Foundation; either
yading@10 9 * version 2.1 of the License, or (at your option) any later version.
yading@10 10 *
yading@10 11 * FFmpeg is distributed in the hope that it will be useful,
yading@10 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 14 * Lesser General Public License for more details.
yading@10 15 *
yading@10 16 * You should have received a copy of the GNU Lesser General Public
yading@10 17 * License along with FFmpeg; if not, write to the Free Software
yading@10 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 19 */
yading@10 20
yading@10 21 /* The *no_round* functions have been added by James A. Morrison, 2003,2004.
yading@10 22 The vis code from libmpeg2 was adapted for libavcodec by James A. Morrison.
yading@10 23 */
yading@10 24
yading@10 25 #include <stddef.h>
yading@10 26 #include <stdint.h>
yading@10 27
yading@10 28 #include "libavutil/attributes.h"
yading@10 29 #include "libavutil/mem.h"
yading@10 30 #include "libavcodec/hpeldsp.h"
yading@10 31 #include "vis.h"
yading@10 32
yading@10 33 /* The trick used in some of this file is the formula from the MMX
yading@10 34 * motion comp code, which is:
yading@10 35 *
yading@10 36 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
yading@10 37 *
yading@10 38 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
yading@10 39 * We avoid overflows by masking before we do the shift, and we
yading@10 40 * implement the shift by multiplying by 1/2 using mul8x16. So in
yading@10 41 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
yading@10 42 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
yading@10 43 * the value 0x80808080 is in f8):
yading@10 44 *
yading@10 45 * fxor f0, f2, f10
yading@10 46 * fand f10, f4, f10
yading@10 47 * fmul8x16 f8, f10, f10
yading@10 48 * fand f10, f6, f10
yading@10 49 * for f0, f2, f12
yading@10 50 * fpsub16 f12, f10, f10
yading@10 51 */
yading@10 52
yading@10 53 #define DUP4(x) {x, x, x, x}
yading@10 54 #define DUP8(x) {x, x, x, x, x, x, x, x}
yading@10 55 DECLARE_ALIGNED(8, static const int16_t, constants1)[] = DUP4 (1);
yading@10 56 DECLARE_ALIGNED(8, static const int16_t, constants2)[] = DUP4 (2);
yading@10 57 DECLARE_ALIGNED(8, static const int16_t, constants3)[] = DUP4 (3);
yading@10 58 DECLARE_ALIGNED(8, static const int16_t, constants6)[] = DUP4 (6);
yading@10 59 DECLARE_ALIGNED(8, static const int8_t, constants_fe)[] = DUP8 (0xfe);
yading@10 60 DECLARE_ALIGNED(8, static const int8_t, constants_7f)[] = DUP8 (0x7f);
yading@10 61 DECLARE_ALIGNED(8, static const int8_t, constants128)[] = DUP8 (128);
yading@10 62 DECLARE_ALIGNED(8, static const int16_t, constants256_512)[] =
yading@10 63 {256, 512, 256, 512};
yading@10 64 DECLARE_ALIGNED(8, static const int16_t, constants256_1024)[] =
yading@10 65 {256, 1024, 256, 1024};
yading@10 66
yading@10 67 #define REF_0 0
yading@10 68 #define REF_0_1 1
yading@10 69 #define REF_2 2
yading@10 70 #define REF_2_1 3
yading@10 71 #define REF_4 4
yading@10 72 #define REF_4_1 5
yading@10 73 #define REF_6 6
yading@10 74 #define REF_6_1 7
yading@10 75 #define REF_S0 8
yading@10 76 #define REF_S0_1 9
yading@10 77 #define REF_S2 10
yading@10 78 #define REF_S2_1 11
yading@10 79 #define REF_S4 12
yading@10 80 #define REF_S4_1 13
yading@10 81 #define REF_S6 14
yading@10 82 #define REF_S6_1 15
yading@10 83 #define DST_0 16
yading@10 84 #define DST_1 17
yading@10 85 #define DST_2 18
yading@10 86 #define DST_3 19
yading@10 87 #define CONST_1 20
yading@10 88 #define CONST_2 20
yading@10 89 #define CONST_3 20
yading@10 90 #define CONST_6 20
yading@10 91 #define MASK_fe 20
yading@10 92 #define CONST_128 22
yading@10 93 #define CONST_256 22
yading@10 94 #define CONST_512 22
yading@10 95 #define CONST_1024 22
yading@10 96 #define TMP0 24
yading@10 97 #define TMP1 25
yading@10 98 #define TMP2 26
yading@10 99 #define TMP3 27
yading@10 100 #define TMP4 28
yading@10 101 #define TMP5 29
yading@10 102 #define ZERO 30
yading@10 103 #define MASK_7f 30
yading@10 104
yading@10 105 #define TMP6 32
yading@10 106 #define TMP8 34
yading@10 107 #define TMP10 36
yading@10 108 #define TMP12 38
yading@10 109 #define TMP14 40
yading@10 110 #define TMP16 42
yading@10 111 #define TMP18 44
yading@10 112 #define TMP20 46
yading@10 113 #define TMP22 48
yading@10 114 #define TMP24 50
yading@10 115 #define TMP26 52
yading@10 116 #define TMP28 54
yading@10 117 #define TMP30 56
yading@10 118 #define TMP32 58
yading@10 119
yading@10 120 static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 121 const ptrdiff_t stride, int height)
yading@10 122 {
yading@10 123 ref = vis_alignaddr(ref);
yading@10 124 do { /* 5 cycles */
yading@10 125 vis_ld64(ref[0], TMP0);
yading@10 126
yading@10 127 vis_ld64_2(ref, 8, TMP2);
yading@10 128
yading@10 129 vis_ld64_2(ref, 16, TMP4);
yading@10 130 ref += stride;
yading@10 131
yading@10 132 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 133 vis_st64(REF_0, dest[0]);
yading@10 134
yading@10 135 vis_faligndata(TMP2, TMP4, REF_2);
yading@10 136 vis_st64_2(REF_2, dest, 8);
yading@10 137 dest += stride;
yading@10 138 } while (--height);
yading@10 139 }
yading@10 140
yading@10 141 static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * ref,
yading@10 142 const ptrdiff_t stride, int height)
yading@10 143 {
yading@10 144 ref = vis_alignaddr(ref);
yading@10 145 do { /* 4 cycles */
yading@10 146 vis_ld64(ref[0], TMP0);
yading@10 147
yading@10 148 vis_ld64(ref[8], TMP2);
yading@10 149 ref += stride;
yading@10 150
yading@10 151 /* stall */
yading@10 152
yading@10 153 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 154 vis_st64(REF_0, dest[0]);
yading@10 155 dest += stride;
yading@10 156 } while (--height);
yading@10 157 }
yading@10 158
yading@10 159
yading@10 160 static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 161 const ptrdiff_t stride, int height)
yading@10 162 {
yading@10 163 int stride_8 = stride + 8;
yading@10 164
yading@10 165 ref = vis_alignaddr(ref);
yading@10 166
yading@10 167 vis_ld64(ref[0], TMP0);
yading@10 168
yading@10 169 vis_ld64(ref[8], TMP2);
yading@10 170
yading@10 171 vis_ld64(ref[16], TMP4);
yading@10 172
yading@10 173 vis_ld64(dest[0], DST_0);
yading@10 174
yading@10 175 vis_ld64(dest[8], DST_2);
yading@10 176
yading@10 177 vis_ld64(constants_fe[0], MASK_fe);
yading@10 178 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 179
yading@10 180 vis_ld64(constants_7f[0], MASK_7f);
yading@10 181 vis_faligndata(TMP2, TMP4, REF_2);
yading@10 182
yading@10 183 vis_ld64(constants128[0], CONST_128);
yading@10 184
yading@10 185 ref += stride;
yading@10 186 height = (height >> 1) - 1;
yading@10 187
yading@10 188 do { /* 24 cycles */
yading@10 189 vis_ld64(ref[0], TMP0);
yading@10 190 vis_xor(DST_0, REF_0, TMP6);
yading@10 191
yading@10 192 vis_ld64_2(ref, 8, TMP2);
yading@10 193 vis_and(TMP6, MASK_fe, TMP6);
yading@10 194
yading@10 195 vis_ld64_2(ref, 16, TMP4);
yading@10 196 ref += stride;
yading@10 197 vis_mul8x16(CONST_128, TMP6, TMP6);
yading@10 198 vis_xor(DST_2, REF_2, TMP8);
yading@10 199
yading@10 200 vis_and(TMP8, MASK_fe, TMP8);
yading@10 201
yading@10 202 vis_or(DST_0, REF_0, TMP10);
yading@10 203 vis_ld64_2(dest, stride, DST_0);
yading@10 204 vis_mul8x16(CONST_128, TMP8, TMP8);
yading@10 205
yading@10 206 vis_or(DST_2, REF_2, TMP12);
yading@10 207 vis_ld64_2(dest, stride_8, DST_2);
yading@10 208
yading@10 209 vis_ld64(ref[0], TMP14);
yading@10 210 vis_and(TMP6, MASK_7f, TMP6);
yading@10 211
yading@10 212 vis_and(TMP8, MASK_7f, TMP8);
yading@10 213
yading@10 214 vis_psub16(TMP10, TMP6, TMP6);
yading@10 215 vis_st64(TMP6, dest[0]);
yading@10 216
yading@10 217 vis_psub16(TMP12, TMP8, TMP8);
yading@10 218 vis_st64_2(TMP8, dest, 8);
yading@10 219
yading@10 220 dest += stride;
yading@10 221 vis_ld64_2(ref, 8, TMP16);
yading@10 222 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 223
yading@10 224 vis_ld64_2(ref, 16, TMP18);
yading@10 225 vis_faligndata(TMP2, TMP4, REF_2);
yading@10 226 ref += stride;
yading@10 227
yading@10 228 vis_xor(DST_0, REF_0, TMP20);
yading@10 229
yading@10 230 vis_and(TMP20, MASK_fe, TMP20);
yading@10 231
yading@10 232 vis_xor(DST_2, REF_2, TMP22);
yading@10 233 vis_mul8x16(CONST_128, TMP20, TMP20);
yading@10 234
yading@10 235 vis_and(TMP22, MASK_fe, TMP22);
yading@10 236
yading@10 237 vis_or(DST_0, REF_0, TMP24);
yading@10 238 vis_mul8x16(CONST_128, TMP22, TMP22);
yading@10 239
yading@10 240 vis_or(DST_2, REF_2, TMP26);
yading@10 241
yading@10 242 vis_ld64_2(dest, stride, DST_0);
yading@10 243 vis_faligndata(TMP14, TMP16, REF_0);
yading@10 244
yading@10 245 vis_ld64_2(dest, stride_8, DST_2);
yading@10 246 vis_faligndata(TMP16, TMP18, REF_2);
yading@10 247
yading@10 248 vis_and(TMP20, MASK_7f, TMP20);
yading@10 249
yading@10 250 vis_and(TMP22, MASK_7f, TMP22);
yading@10 251
yading@10 252 vis_psub16(TMP24, TMP20, TMP20);
yading@10 253 vis_st64(TMP20, dest[0]);
yading@10 254
yading@10 255 vis_psub16(TMP26, TMP22, TMP22);
yading@10 256 vis_st64_2(TMP22, dest, 8);
yading@10 257 dest += stride;
yading@10 258 } while (--height);
yading@10 259
yading@10 260 vis_ld64(ref[0], TMP0);
yading@10 261 vis_xor(DST_0, REF_0, TMP6);
yading@10 262
yading@10 263 vis_ld64_2(ref, 8, TMP2);
yading@10 264 vis_and(TMP6, MASK_fe, TMP6);
yading@10 265
yading@10 266 vis_ld64_2(ref, 16, TMP4);
yading@10 267 vis_mul8x16(CONST_128, TMP6, TMP6);
yading@10 268 vis_xor(DST_2, REF_2, TMP8);
yading@10 269
yading@10 270 vis_and(TMP8, MASK_fe, TMP8);
yading@10 271
yading@10 272 vis_or(DST_0, REF_0, TMP10);
yading@10 273 vis_ld64_2(dest, stride, DST_0);
yading@10 274 vis_mul8x16(CONST_128, TMP8, TMP8);
yading@10 275
yading@10 276 vis_or(DST_2, REF_2, TMP12);
yading@10 277 vis_ld64_2(dest, stride_8, DST_2);
yading@10 278
yading@10 279 vis_ld64(ref[0], TMP14);
yading@10 280 vis_and(TMP6, MASK_7f, TMP6);
yading@10 281
yading@10 282 vis_and(TMP8, MASK_7f, TMP8);
yading@10 283
yading@10 284 vis_psub16(TMP10, TMP6, TMP6);
yading@10 285 vis_st64(TMP6, dest[0]);
yading@10 286
yading@10 287 vis_psub16(TMP12, TMP8, TMP8);
yading@10 288 vis_st64_2(TMP8, dest, 8);
yading@10 289
yading@10 290 dest += stride;
yading@10 291 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 292
yading@10 293 vis_faligndata(TMP2, TMP4, REF_2);
yading@10 294
yading@10 295 vis_xor(DST_0, REF_0, TMP20);
yading@10 296
yading@10 297 vis_and(TMP20, MASK_fe, TMP20);
yading@10 298
yading@10 299 vis_xor(DST_2, REF_2, TMP22);
yading@10 300 vis_mul8x16(CONST_128, TMP20, TMP20);
yading@10 301
yading@10 302 vis_and(TMP22, MASK_fe, TMP22);
yading@10 303
yading@10 304 vis_or(DST_0, REF_0, TMP24);
yading@10 305 vis_mul8x16(CONST_128, TMP22, TMP22);
yading@10 306
yading@10 307 vis_or(DST_2, REF_2, TMP26);
yading@10 308
yading@10 309 vis_and(TMP20, MASK_7f, TMP20);
yading@10 310
yading@10 311 vis_and(TMP22, MASK_7f, TMP22);
yading@10 312
yading@10 313 vis_psub16(TMP24, TMP20, TMP20);
yading@10 314 vis_st64(TMP20, dest[0]);
yading@10 315
yading@10 316 vis_psub16(TMP26, TMP22, TMP22);
yading@10 317 vis_st64_2(TMP22, dest, 8);
yading@10 318 }
yading@10 319
yading@10 320 static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * ref,
yading@10 321 const ptrdiff_t stride, int height)
yading@10 322 {
yading@10 323 ref = vis_alignaddr(ref);
yading@10 324
yading@10 325 vis_ld64(ref[0], TMP0);
yading@10 326
yading@10 327 vis_ld64(ref[8], TMP2);
yading@10 328
yading@10 329 vis_ld64(dest[0], DST_0);
yading@10 330
yading@10 331 vis_ld64(constants_fe[0], MASK_fe);
yading@10 332
yading@10 333 vis_ld64(constants_7f[0], MASK_7f);
yading@10 334 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 335
yading@10 336 vis_ld64(constants128[0], CONST_128);
yading@10 337
yading@10 338 ref += stride;
yading@10 339 height = (height >> 1) - 1;
yading@10 340
yading@10 341 do { /* 12 cycles */
yading@10 342 vis_ld64(ref[0], TMP0);
yading@10 343 vis_xor(DST_0, REF_0, TMP4);
yading@10 344
yading@10 345 vis_ld64(ref[8], TMP2);
yading@10 346 vis_and(TMP4, MASK_fe, TMP4);
yading@10 347
yading@10 348 vis_or(DST_0, REF_0, TMP6);
yading@10 349 vis_ld64_2(dest, stride, DST_0);
yading@10 350 ref += stride;
yading@10 351 vis_mul8x16(CONST_128, TMP4, TMP4);
yading@10 352
yading@10 353 vis_ld64(ref[0], TMP12);
yading@10 354 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 355
yading@10 356 vis_ld64(ref[8], TMP2);
yading@10 357 vis_xor(DST_0, REF_0, TMP0);
yading@10 358 ref += stride;
yading@10 359
yading@10 360 vis_and(TMP0, MASK_fe, TMP0);
yading@10 361
yading@10 362 vis_and(TMP4, MASK_7f, TMP4);
yading@10 363
yading@10 364 vis_psub16(TMP6, TMP4, TMP4);
yading@10 365 vis_st64(TMP4, dest[0]);
yading@10 366 dest += stride;
yading@10 367 vis_mul8x16(CONST_128, TMP0, TMP0);
yading@10 368
yading@10 369 vis_or(DST_0, REF_0, TMP6);
yading@10 370 vis_ld64_2(dest, stride, DST_0);
yading@10 371
yading@10 372 vis_faligndata(TMP12, TMP2, REF_0);
yading@10 373
yading@10 374 vis_and(TMP0, MASK_7f, TMP0);
yading@10 375
yading@10 376 vis_psub16(TMP6, TMP0, TMP4);
yading@10 377 vis_st64(TMP4, dest[0]);
yading@10 378 dest += stride;
yading@10 379 } while (--height);
yading@10 380
yading@10 381 vis_ld64(ref[0], TMP0);
yading@10 382 vis_xor(DST_0, REF_0, TMP4);
yading@10 383
yading@10 384 vis_ld64(ref[8], TMP2);
yading@10 385 vis_and(TMP4, MASK_fe, TMP4);
yading@10 386
yading@10 387 vis_or(DST_0, REF_0, TMP6);
yading@10 388 vis_ld64_2(dest, stride, DST_0);
yading@10 389 vis_mul8x16(CONST_128, TMP4, TMP4);
yading@10 390
yading@10 391 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 392
yading@10 393 vis_xor(DST_0, REF_0, TMP0);
yading@10 394
yading@10 395 vis_and(TMP0, MASK_fe, TMP0);
yading@10 396
yading@10 397 vis_and(TMP4, MASK_7f, TMP4);
yading@10 398
yading@10 399 vis_psub16(TMP6, TMP4, TMP4);
yading@10 400 vis_st64(TMP4, dest[0]);
yading@10 401 dest += stride;
yading@10 402 vis_mul8x16(CONST_128, TMP0, TMP0);
yading@10 403
yading@10 404 vis_or(DST_0, REF_0, TMP6);
yading@10 405
yading@10 406 vis_and(TMP0, MASK_7f, TMP0);
yading@10 407
yading@10 408 vis_psub16(TMP6, TMP0, TMP4);
yading@10 409 vis_st64(TMP4, dest[0]);
yading@10 410 }
yading@10 411
yading@10 412 static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 413 const ptrdiff_t stride, int height)
yading@10 414 {
yading@10 415 unsigned long off = (unsigned long) ref & 0x7;
yading@10 416 unsigned long off_plus_1 = off + 1;
yading@10 417
yading@10 418 ref = vis_alignaddr(ref);
yading@10 419
yading@10 420 vis_ld64(ref[0], TMP0);
yading@10 421
yading@10 422 vis_ld64_2(ref, 8, TMP2);
yading@10 423
yading@10 424 vis_ld64_2(ref, 16, TMP4);
yading@10 425
yading@10 426 vis_ld64(constants_fe[0], MASK_fe);
yading@10 427
yading@10 428 vis_ld64(constants_7f[0], MASK_7f);
yading@10 429 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 430
yading@10 431 vis_ld64(constants128[0], CONST_128);
yading@10 432 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 433
yading@10 434 if (off != 0x7) {
yading@10 435 vis_alignaddr_g0((void *)off_plus_1);
yading@10 436 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 437 vis_faligndata(TMP2, TMP4, REF_6);
yading@10 438 } else {
yading@10 439 vis_src1(TMP2, REF_2);
yading@10 440 vis_src1(TMP4, REF_6);
yading@10 441 }
yading@10 442
yading@10 443 ref += stride;
yading@10 444 height = (height >> 1) - 1;
yading@10 445
yading@10 446 do { /* 34 cycles */
yading@10 447 vis_ld64(ref[0], TMP0);
yading@10 448 vis_xor(REF_0, REF_2, TMP6);
yading@10 449
yading@10 450 vis_ld64_2(ref, 8, TMP2);
yading@10 451 vis_xor(REF_4, REF_6, TMP8);
yading@10 452
yading@10 453 vis_ld64_2(ref, 16, TMP4);
yading@10 454 vis_and(TMP6, MASK_fe, TMP6);
yading@10 455 ref += stride;
yading@10 456
yading@10 457 vis_ld64(ref[0], TMP14);
yading@10 458 vis_mul8x16(CONST_128, TMP6, TMP6);
yading@10 459 vis_and(TMP8, MASK_fe, TMP8);
yading@10 460
yading@10 461 vis_ld64_2(ref, 8, TMP16);
yading@10 462 vis_mul8x16(CONST_128, TMP8, TMP8);
yading@10 463 vis_or(REF_0, REF_2, TMP10);
yading@10 464
yading@10 465 vis_ld64_2(ref, 16, TMP18);
yading@10 466 ref += stride;
yading@10 467 vis_or(REF_4, REF_6, TMP12);
yading@10 468
yading@10 469 vis_alignaddr_g0((void *)off);
yading@10 470
yading@10 471 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 472
yading@10 473 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 474
yading@10 475 if (off != 0x7) {
yading@10 476 vis_alignaddr_g0((void *)off_plus_1);
yading@10 477 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 478 vis_faligndata(TMP2, TMP4, REF_6);
yading@10 479 } else {
yading@10 480 vis_src1(TMP2, REF_2);
yading@10 481 vis_src1(TMP4, REF_6);
yading@10 482 }
yading@10 483
yading@10 484 vis_and(TMP6, MASK_7f, TMP6);
yading@10 485
yading@10 486 vis_and(TMP8, MASK_7f, TMP8);
yading@10 487
yading@10 488 vis_psub16(TMP10, TMP6, TMP6);
yading@10 489 vis_st64(TMP6, dest[0]);
yading@10 490
yading@10 491 vis_psub16(TMP12, TMP8, TMP8);
yading@10 492 vis_st64_2(TMP8, dest, 8);
yading@10 493 dest += stride;
yading@10 494
yading@10 495 vis_xor(REF_0, REF_2, TMP6);
yading@10 496
yading@10 497 vis_xor(REF_4, REF_6, TMP8);
yading@10 498
yading@10 499 vis_and(TMP6, MASK_fe, TMP6);
yading@10 500
yading@10 501 vis_mul8x16(CONST_128, TMP6, TMP6);
yading@10 502 vis_and(TMP8, MASK_fe, TMP8);
yading@10 503
yading@10 504 vis_mul8x16(CONST_128, TMP8, TMP8);
yading@10 505 vis_or(REF_0, REF_2, TMP10);
yading@10 506
yading@10 507 vis_or(REF_4, REF_6, TMP12);
yading@10 508
yading@10 509 vis_alignaddr_g0((void *)off);
yading@10 510
yading@10 511 vis_faligndata(TMP14, TMP16, REF_0);
yading@10 512
yading@10 513 vis_faligndata(TMP16, TMP18, REF_4);
yading@10 514
yading@10 515 if (off != 0x7) {
yading@10 516 vis_alignaddr_g0((void *)off_plus_1);
yading@10 517 vis_faligndata(TMP14, TMP16, REF_2);
yading@10 518 vis_faligndata(TMP16, TMP18, REF_6);
yading@10 519 } else {
yading@10 520 vis_src1(TMP16, REF_2);
yading@10 521 vis_src1(TMP18, REF_6);
yading@10 522 }
yading@10 523
yading@10 524 vis_and(TMP6, MASK_7f, TMP6);
yading@10 525
yading@10 526 vis_and(TMP8, MASK_7f, TMP8);
yading@10 527
yading@10 528 vis_psub16(TMP10, TMP6, TMP6);
yading@10 529 vis_st64(TMP6, dest[0]);
yading@10 530
yading@10 531 vis_psub16(TMP12, TMP8, TMP8);
yading@10 532 vis_st64_2(TMP8, dest, 8);
yading@10 533 dest += stride;
yading@10 534 } while (--height);
yading@10 535
yading@10 536 vis_ld64(ref[0], TMP0);
yading@10 537 vis_xor(REF_0, REF_2, TMP6);
yading@10 538
yading@10 539 vis_ld64_2(ref, 8, TMP2);
yading@10 540 vis_xor(REF_4, REF_6, TMP8);
yading@10 541
yading@10 542 vis_ld64_2(ref, 16, TMP4);
yading@10 543 vis_and(TMP6, MASK_fe, TMP6);
yading@10 544
yading@10 545 vis_mul8x16(CONST_128, TMP6, TMP6);
yading@10 546 vis_and(TMP8, MASK_fe, TMP8);
yading@10 547
yading@10 548 vis_mul8x16(CONST_128, TMP8, TMP8);
yading@10 549 vis_or(REF_0, REF_2, TMP10);
yading@10 550
yading@10 551 vis_or(REF_4, REF_6, TMP12);
yading@10 552
yading@10 553 vis_alignaddr_g0((void *)off);
yading@10 554
yading@10 555 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 556
yading@10 557 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 558
yading@10 559 if (off != 0x7) {
yading@10 560 vis_alignaddr_g0((void *)off_plus_1);
yading@10 561 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 562 vis_faligndata(TMP2, TMP4, REF_6);
yading@10 563 } else {
yading@10 564 vis_src1(TMP2, REF_2);
yading@10 565 vis_src1(TMP4, REF_6);
yading@10 566 }
yading@10 567
yading@10 568 vis_and(TMP6, MASK_7f, TMP6);
yading@10 569
yading@10 570 vis_and(TMP8, MASK_7f, TMP8);
yading@10 571
yading@10 572 vis_psub16(TMP10, TMP6, TMP6);
yading@10 573 vis_st64(TMP6, dest[0]);
yading@10 574
yading@10 575 vis_psub16(TMP12, TMP8, TMP8);
yading@10 576 vis_st64_2(TMP8, dest, 8);
yading@10 577 dest += stride;
yading@10 578
yading@10 579 vis_xor(REF_0, REF_2, TMP6);
yading@10 580
yading@10 581 vis_xor(REF_4, REF_6, TMP8);
yading@10 582
yading@10 583 vis_and(TMP6, MASK_fe, TMP6);
yading@10 584
yading@10 585 vis_mul8x16(CONST_128, TMP6, TMP6);
yading@10 586 vis_and(TMP8, MASK_fe, TMP8);
yading@10 587
yading@10 588 vis_mul8x16(CONST_128, TMP8, TMP8);
yading@10 589 vis_or(REF_0, REF_2, TMP10);
yading@10 590
yading@10 591 vis_or(REF_4, REF_6, TMP12);
yading@10 592
yading@10 593 vis_and(TMP6, MASK_7f, TMP6);
yading@10 594
yading@10 595 vis_and(TMP8, MASK_7f, TMP8);
yading@10 596
yading@10 597 vis_psub16(TMP10, TMP6, TMP6);
yading@10 598 vis_st64(TMP6, dest[0]);
yading@10 599
yading@10 600 vis_psub16(TMP12, TMP8, TMP8);
yading@10 601 vis_st64_2(TMP8, dest, 8);
yading@10 602 }
yading@10 603
yading@10 604 static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * ref,
yading@10 605 const ptrdiff_t stride, int height)
yading@10 606 {
yading@10 607 unsigned long off = (unsigned long) ref & 0x7;
yading@10 608 unsigned long off_plus_1 = off + 1;
yading@10 609
yading@10 610 ref = vis_alignaddr(ref);
yading@10 611
yading@10 612 vis_ld64(ref[0], TMP0);
yading@10 613
yading@10 614 vis_ld64(ref[8], TMP2);
yading@10 615
yading@10 616 vis_ld64(constants_fe[0], MASK_fe);
yading@10 617
yading@10 618 vis_ld64(constants_7f[0], MASK_7f);
yading@10 619
yading@10 620 vis_ld64(constants128[0], CONST_128);
yading@10 621 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 622
yading@10 623 if (off != 0x7) {
yading@10 624 vis_alignaddr_g0((void *)off_plus_1);
yading@10 625 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 626 } else {
yading@10 627 vis_src1(TMP2, REF_2);
yading@10 628 }
yading@10 629
yading@10 630 ref += stride;
yading@10 631 height = (height >> 1) - 1;
yading@10 632
yading@10 633 do { /* 20 cycles */
yading@10 634 vis_ld64(ref[0], TMP0);
yading@10 635 vis_xor(REF_0, REF_2, TMP4);
yading@10 636
yading@10 637 vis_ld64_2(ref, 8, TMP2);
yading@10 638 vis_and(TMP4, MASK_fe, TMP4);
yading@10 639 ref += stride;
yading@10 640
yading@10 641 vis_ld64(ref[0], TMP8);
yading@10 642 vis_or(REF_0, REF_2, TMP6);
yading@10 643 vis_mul8x16(CONST_128, TMP4, TMP4);
yading@10 644
yading@10 645 vis_alignaddr_g0((void *)off);
yading@10 646
yading@10 647 vis_ld64_2(ref, 8, TMP10);
yading@10 648 ref += stride;
yading@10 649 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 650
yading@10 651 if (off != 0x7) {
yading@10 652 vis_alignaddr_g0((void *)off_plus_1);
yading@10 653 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 654 } else {
yading@10 655 vis_src1(TMP2, REF_2);
yading@10 656 }
yading@10 657
yading@10 658 vis_and(TMP4, MASK_7f, TMP4);
yading@10 659
yading@10 660 vis_psub16(TMP6, TMP4, DST_0);
yading@10 661 vis_st64(DST_0, dest[0]);
yading@10 662 dest += stride;
yading@10 663
yading@10 664 vis_xor(REF_0, REF_2, TMP12);
yading@10 665
yading@10 666 vis_and(TMP12, MASK_fe, TMP12);
yading@10 667
yading@10 668 vis_or(REF_0, REF_2, TMP14);
yading@10 669 vis_mul8x16(CONST_128, TMP12, TMP12);
yading@10 670
yading@10 671 vis_alignaddr_g0((void *)off);
yading@10 672 vis_faligndata(TMP8, TMP10, REF_0);
yading@10 673 if (off != 0x7) {
yading@10 674 vis_alignaddr_g0((void *)off_plus_1);
yading@10 675 vis_faligndata(TMP8, TMP10, REF_2);
yading@10 676 } else {
yading@10 677 vis_src1(TMP10, REF_2);
yading@10 678 }
yading@10 679
yading@10 680 vis_and(TMP12, MASK_7f, TMP12);
yading@10 681
yading@10 682 vis_psub16(TMP14, TMP12, DST_0);
yading@10 683 vis_st64(DST_0, dest[0]);
yading@10 684 dest += stride;
yading@10 685 } while (--height);
yading@10 686
yading@10 687 vis_ld64(ref[0], TMP0);
yading@10 688 vis_xor(REF_0, REF_2, TMP4);
yading@10 689
yading@10 690 vis_ld64_2(ref, 8, TMP2);
yading@10 691 vis_and(TMP4, MASK_fe, TMP4);
yading@10 692
yading@10 693 vis_or(REF_0, REF_2, TMP6);
yading@10 694 vis_mul8x16(CONST_128, TMP4, TMP4);
yading@10 695
yading@10 696 vis_alignaddr_g0((void *)off);
yading@10 697
yading@10 698 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 699
yading@10 700 if (off != 0x7) {
yading@10 701 vis_alignaddr_g0((void *)off_plus_1);
yading@10 702 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 703 } else {
yading@10 704 vis_src1(TMP2, REF_2);
yading@10 705 }
yading@10 706
yading@10 707 vis_and(TMP4, MASK_7f, TMP4);
yading@10 708
yading@10 709 vis_psub16(TMP6, TMP4, DST_0);
yading@10 710 vis_st64(DST_0, dest[0]);
yading@10 711 dest += stride;
yading@10 712
yading@10 713 vis_xor(REF_0, REF_2, TMP12);
yading@10 714
yading@10 715 vis_and(TMP12, MASK_fe, TMP12);
yading@10 716
yading@10 717 vis_or(REF_0, REF_2, TMP14);
yading@10 718 vis_mul8x16(CONST_128, TMP12, TMP12);
yading@10 719
yading@10 720 vis_and(TMP12, MASK_7f, TMP12);
yading@10 721
yading@10 722 vis_psub16(TMP14, TMP12, DST_0);
yading@10 723 vis_st64(DST_0, dest[0]);
yading@10 724 dest += stride;
yading@10 725 }
yading@10 726
yading@10 727 static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 728 const ptrdiff_t stride, int height)
yading@10 729 {
yading@10 730 unsigned long off = (unsigned long) ref & 0x7;
yading@10 731 unsigned long off_plus_1 = off + 1;
yading@10 732
yading@10 733 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
yading@10 734
yading@10 735 vis_ld64(constants3[0], CONST_3);
yading@10 736 vis_fzero(ZERO);
yading@10 737 vis_ld64(constants256_512[0], CONST_256);
yading@10 738
yading@10 739 ref = vis_alignaddr(ref);
yading@10 740 do { /* 26 cycles */
yading@10 741 vis_ld64(ref[0], TMP0);
yading@10 742
yading@10 743 vis_ld64(ref[8], TMP2);
yading@10 744
yading@10 745 vis_alignaddr_g0((void *)off);
yading@10 746
yading@10 747 vis_ld64(ref[16], TMP4);
yading@10 748
yading@10 749 vis_ld64(dest[0], DST_0);
yading@10 750 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 751
yading@10 752 vis_ld64(dest[8], DST_2);
yading@10 753 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 754
yading@10 755 if (off != 0x7) {
yading@10 756 vis_alignaddr_g0((void *)off_plus_1);
yading@10 757 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 758 vis_faligndata(TMP2, TMP4, REF_6);
yading@10 759 } else {
yading@10 760 vis_src1(TMP2, REF_2);
yading@10 761 vis_src1(TMP4, REF_6);
yading@10 762 }
yading@10 763
yading@10 764 vis_mul8x16au(REF_0, CONST_256, TMP0);
yading@10 765
yading@10 766 vis_pmerge(ZERO, REF_2, TMP4);
yading@10 767 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
yading@10 768
yading@10 769 vis_pmerge(ZERO, REF_2_1, TMP6);
yading@10 770
yading@10 771 vis_padd16(TMP0, TMP4, TMP0);
yading@10 772
yading@10 773 vis_mul8x16al(DST_0, CONST_512, TMP4);
yading@10 774 vis_padd16(TMP2, TMP6, TMP2);
yading@10 775
yading@10 776 vis_mul8x16al(DST_1, CONST_512, TMP6);
yading@10 777
yading@10 778 vis_mul8x16au(REF_6, CONST_256, TMP12);
yading@10 779
yading@10 780 vis_padd16(TMP0, TMP4, TMP0);
yading@10 781 vis_mul8x16au(REF_6_1, CONST_256, TMP14);
yading@10 782
yading@10 783 vis_padd16(TMP2, TMP6, TMP2);
yading@10 784 vis_mul8x16au(REF_4, CONST_256, TMP16);
yading@10 785
yading@10 786 vis_padd16(TMP0, CONST_3, TMP8);
yading@10 787 vis_mul8x16au(REF_4_1, CONST_256, TMP18);
yading@10 788
yading@10 789 vis_padd16(TMP2, CONST_3, TMP10);
yading@10 790 vis_pack16(TMP8, DST_0);
yading@10 791
yading@10 792 vis_pack16(TMP10, DST_1);
yading@10 793 vis_padd16(TMP16, TMP12, TMP0);
yading@10 794
yading@10 795 vis_st64(DST_0, dest[0]);
yading@10 796 vis_mul8x16al(DST_2, CONST_512, TMP4);
yading@10 797 vis_padd16(TMP18, TMP14, TMP2);
yading@10 798
yading@10 799 vis_mul8x16al(DST_3, CONST_512, TMP6);
yading@10 800 vis_padd16(TMP0, CONST_3, TMP0);
yading@10 801
yading@10 802 vis_padd16(TMP2, CONST_3, TMP2);
yading@10 803
yading@10 804 vis_padd16(TMP0, TMP4, TMP0);
yading@10 805
yading@10 806 vis_padd16(TMP2, TMP6, TMP2);
yading@10 807 vis_pack16(TMP0, DST_2);
yading@10 808
yading@10 809 vis_pack16(TMP2, DST_3);
yading@10 810 vis_st64(DST_2, dest[8]);
yading@10 811
yading@10 812 ref += stride;
yading@10 813 dest += stride;
yading@10 814 } while (--height);
yading@10 815 }
yading@10 816
yading@10 817 static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * ref,
yading@10 818 const ptrdiff_t stride, int height)
yading@10 819 {
yading@10 820 unsigned long off = (unsigned long) ref & 0x7;
yading@10 821 unsigned long off_plus_1 = off + 1;
yading@10 822 int stride_times_2 = stride << 1;
yading@10 823
yading@10 824 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
yading@10 825
yading@10 826 vis_ld64(constants3[0], CONST_3);
yading@10 827 vis_fzero(ZERO);
yading@10 828 vis_ld64(constants256_512[0], CONST_256);
yading@10 829
yading@10 830 ref = vis_alignaddr(ref);
yading@10 831 height >>= 2;
yading@10 832 do { /* 47 cycles */
yading@10 833 vis_ld64(ref[0], TMP0);
yading@10 834
yading@10 835 vis_ld64_2(ref, 8, TMP2);
yading@10 836 ref += stride;
yading@10 837
yading@10 838 vis_alignaddr_g0((void *)off);
yading@10 839
yading@10 840 vis_ld64(ref[0], TMP4);
yading@10 841 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 842
yading@10 843 vis_ld64_2(ref, 8, TMP6);
yading@10 844 ref += stride;
yading@10 845
yading@10 846 vis_ld64(ref[0], TMP8);
yading@10 847
yading@10 848 vis_ld64_2(ref, 8, TMP10);
yading@10 849 ref += stride;
yading@10 850 vis_faligndata(TMP4, TMP6, REF_4);
yading@10 851
yading@10 852 vis_ld64(ref[0], TMP12);
yading@10 853
yading@10 854 vis_ld64_2(ref, 8, TMP14);
yading@10 855 ref += stride;
yading@10 856 vis_faligndata(TMP8, TMP10, REF_S0);
yading@10 857
yading@10 858 vis_faligndata(TMP12, TMP14, REF_S4);
yading@10 859
yading@10 860 if (off != 0x7) {
yading@10 861 vis_alignaddr_g0((void *)off_plus_1);
yading@10 862
yading@10 863 vis_ld64(dest[0], DST_0);
yading@10 864 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 865
yading@10 866 vis_ld64_2(dest, stride, DST_2);
yading@10 867 vis_faligndata(TMP4, TMP6, REF_6);
yading@10 868
yading@10 869 vis_faligndata(TMP8, TMP10, REF_S2);
yading@10 870
yading@10 871 vis_faligndata(TMP12, TMP14, REF_S6);
yading@10 872 } else {
yading@10 873 vis_ld64(dest[0], DST_0);
yading@10 874 vis_src1(TMP2, REF_2);
yading@10 875
yading@10 876 vis_ld64_2(dest, stride, DST_2);
yading@10 877 vis_src1(TMP6, REF_6);
yading@10 878
yading@10 879 vis_src1(TMP10, REF_S2);
yading@10 880
yading@10 881 vis_src1(TMP14, REF_S6);
yading@10 882 }
yading@10 883
yading@10 884 vis_pmerge(ZERO, REF_0, TMP0);
yading@10 885 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
yading@10 886
yading@10 887 vis_pmerge(ZERO, REF_2, TMP4);
yading@10 888 vis_mul8x16au(REF_2_1, CONST_256, TMP6);
yading@10 889
yading@10 890 vis_padd16(TMP0, CONST_3, TMP0);
yading@10 891 vis_mul8x16al(DST_0, CONST_512, TMP16);
yading@10 892
yading@10 893 vis_padd16(TMP2, CONST_3, TMP2);
yading@10 894 vis_mul8x16al(DST_1, CONST_512, TMP18);
yading@10 895
yading@10 896 vis_padd16(TMP0, TMP4, TMP0);
yading@10 897 vis_mul8x16au(REF_4, CONST_256, TMP8);
yading@10 898
yading@10 899 vis_padd16(TMP2, TMP6, TMP2);
yading@10 900 vis_mul8x16au(REF_4_1, CONST_256, TMP10);
yading@10 901
yading@10 902 vis_padd16(TMP0, TMP16, TMP0);
yading@10 903 vis_mul8x16au(REF_6, CONST_256, TMP12);
yading@10 904
yading@10 905 vis_padd16(TMP2, TMP18, TMP2);
yading@10 906 vis_mul8x16au(REF_6_1, CONST_256, TMP14);
yading@10 907
yading@10 908 vis_padd16(TMP8, CONST_3, TMP8);
yading@10 909 vis_mul8x16al(DST_2, CONST_512, TMP16);
yading@10 910
yading@10 911 vis_padd16(TMP8, TMP12, TMP8);
yading@10 912 vis_mul8x16al(DST_3, CONST_512, TMP18);
yading@10 913
yading@10 914 vis_padd16(TMP10, TMP14, TMP10);
yading@10 915 vis_pack16(TMP0, DST_0);
yading@10 916
yading@10 917 vis_pack16(TMP2, DST_1);
yading@10 918 vis_st64(DST_0, dest[0]);
yading@10 919 dest += stride;
yading@10 920 vis_padd16(TMP10, CONST_3, TMP10);
yading@10 921
yading@10 922 vis_ld64_2(dest, stride, DST_0);
yading@10 923 vis_padd16(TMP8, TMP16, TMP8);
yading@10 924
yading@10 925 vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
yading@10 926 vis_padd16(TMP10, TMP18, TMP10);
yading@10 927 vis_pack16(TMP8, DST_2);
yading@10 928
yading@10 929 vis_pack16(TMP10, DST_3);
yading@10 930 vis_st64(DST_2, dest[0]);
yading@10 931 dest += stride;
yading@10 932
yading@10 933 vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
yading@10 934 vis_pmerge(ZERO, REF_S0, TMP0);
yading@10 935
yading@10 936 vis_pmerge(ZERO, REF_S2, TMP24);
yading@10 937 vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
yading@10 938
yading@10 939 vis_padd16(TMP0, CONST_3, TMP0);
yading@10 940 vis_mul8x16au(REF_S4, CONST_256, TMP8);
yading@10 941
yading@10 942 vis_padd16(TMP2, CONST_3, TMP2);
yading@10 943 vis_mul8x16au(REF_S4_1, CONST_256, TMP10);
yading@10 944
yading@10 945 vis_padd16(TMP0, TMP24, TMP0);
yading@10 946 vis_mul8x16au(REF_S6, CONST_256, TMP12);
yading@10 947
yading@10 948 vis_padd16(TMP2, TMP6, TMP2);
yading@10 949 vis_mul8x16au(REF_S6_1, CONST_256, TMP14);
yading@10 950
yading@10 951 vis_padd16(TMP8, CONST_3, TMP8);
yading@10 952 vis_mul8x16al(DST_0, CONST_512, TMP16);
yading@10 953
yading@10 954 vis_padd16(TMP10, CONST_3, TMP10);
yading@10 955 vis_mul8x16al(DST_1, CONST_512, TMP18);
yading@10 956
yading@10 957 vis_padd16(TMP8, TMP12, TMP8);
yading@10 958 vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);
yading@10 959
yading@10 960 vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
yading@10 961 vis_padd16(TMP0, TMP16, TMP0);
yading@10 962
yading@10 963 vis_padd16(TMP2, TMP18, TMP2);
yading@10 964 vis_pack16(TMP0, DST_0);
yading@10 965
yading@10 966 vis_padd16(TMP10, TMP14, TMP10);
yading@10 967 vis_pack16(TMP2, DST_1);
yading@10 968 vis_st64(DST_0, dest[0]);
yading@10 969 dest += stride;
yading@10 970
yading@10 971 vis_padd16(TMP8, TMP20, TMP8);
yading@10 972
yading@10 973 vis_padd16(TMP10, TMP22, TMP10);
yading@10 974 vis_pack16(TMP8, DST_2);
yading@10 975
yading@10 976 vis_pack16(TMP10, DST_3);
yading@10 977 vis_st64(DST_2, dest[0]);
yading@10 978 dest += stride;
yading@10 979 } while (--height);
yading@10 980 }
yading@10 981
yading@10 982 static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 983 const ptrdiff_t stride, int height)
yading@10 984 {
yading@10 985 ref = vis_alignaddr(ref);
yading@10 986 vis_ld64(ref[0], TMP0);
yading@10 987
yading@10 988 vis_ld64_2(ref, 8, TMP2);
yading@10 989
yading@10 990 vis_ld64_2(ref, 16, TMP4);
yading@10 991 ref += stride;
yading@10 992
yading@10 993 vis_ld64(ref[0], TMP6);
yading@10 994 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 995
yading@10 996 vis_ld64_2(ref, 8, TMP8);
yading@10 997 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 998
yading@10 999 vis_ld64_2(ref, 16, TMP10);
yading@10 1000 ref += stride;
yading@10 1001
yading@10 1002 vis_ld64(constants_fe[0], MASK_fe);
yading@10 1003 vis_faligndata(TMP6, TMP8, REF_2);
yading@10 1004
yading@10 1005 vis_ld64(constants_7f[0], MASK_7f);
yading@10 1006 vis_faligndata(TMP8, TMP10, REF_6);
yading@10 1007
yading@10 1008 vis_ld64(constants128[0], CONST_128);
yading@10 1009 height = (height >> 1) - 1;
yading@10 1010 do { /* 24 cycles */
yading@10 1011 vis_ld64(ref[0], TMP0);
yading@10 1012 vis_xor(REF_0, REF_2, TMP12);
yading@10 1013
yading@10 1014 vis_ld64_2(ref, 8, TMP2);
yading@10 1015 vis_xor(REF_4, REF_6, TMP16);
yading@10 1016
yading@10 1017 vis_ld64_2(ref, 16, TMP4);
yading@10 1018 ref += stride;
yading@10 1019 vis_or(REF_0, REF_2, TMP14);
yading@10 1020
yading@10 1021 vis_ld64(ref[0], TMP6);
yading@10 1022 vis_or(REF_4, REF_6, TMP18);
yading@10 1023
yading@10 1024 vis_ld64_2(ref, 8, TMP8);
yading@10 1025 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 1026
yading@10 1027 vis_ld64_2(ref, 16, TMP10);
yading@10 1028 ref += stride;
yading@10 1029 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 1030
yading@10 1031 vis_and(TMP12, MASK_fe, TMP12);
yading@10 1032
yading@10 1033 vis_and(TMP16, MASK_fe, TMP16);
yading@10 1034 vis_mul8x16(CONST_128, TMP12, TMP12);
yading@10 1035
yading@10 1036 vis_mul8x16(CONST_128, TMP16, TMP16);
yading@10 1037 vis_xor(REF_0, REF_2, TMP0);
yading@10 1038
yading@10 1039 vis_xor(REF_4, REF_6, TMP2);
yading@10 1040
yading@10 1041 vis_or(REF_0, REF_2, TMP20);
yading@10 1042
yading@10 1043 vis_and(TMP12, MASK_7f, TMP12);
yading@10 1044
yading@10 1045 vis_and(TMP16, MASK_7f, TMP16);
yading@10 1046
yading@10 1047 vis_psub16(TMP14, TMP12, TMP12);
yading@10 1048 vis_st64(TMP12, dest[0]);
yading@10 1049
yading@10 1050 vis_psub16(TMP18, TMP16, TMP16);
yading@10 1051 vis_st64_2(TMP16, dest, 8);
yading@10 1052 dest += stride;
yading@10 1053
yading@10 1054 vis_or(REF_4, REF_6, TMP18);
yading@10 1055
yading@10 1056 vis_and(TMP0, MASK_fe, TMP0);
yading@10 1057
yading@10 1058 vis_and(TMP2, MASK_fe, TMP2);
yading@10 1059 vis_mul8x16(CONST_128, TMP0, TMP0);
yading@10 1060
yading@10 1061 vis_faligndata(TMP6, TMP8, REF_2);
yading@10 1062 vis_mul8x16(CONST_128, TMP2, TMP2);
yading@10 1063
yading@10 1064 vis_faligndata(TMP8, TMP10, REF_6);
yading@10 1065
yading@10 1066 vis_and(TMP0, MASK_7f, TMP0);
yading@10 1067
yading@10 1068 vis_and(TMP2, MASK_7f, TMP2);
yading@10 1069
yading@10 1070 vis_psub16(TMP20, TMP0, TMP0);
yading@10 1071 vis_st64(TMP0, dest[0]);
yading@10 1072
yading@10 1073 vis_psub16(TMP18, TMP2, TMP2);
yading@10 1074 vis_st64_2(TMP2, dest, 8);
yading@10 1075 dest += stride;
yading@10 1076 } while (--height);
yading@10 1077
yading@10 1078 vis_ld64(ref[0], TMP0);
yading@10 1079 vis_xor(REF_0, REF_2, TMP12);
yading@10 1080
yading@10 1081 vis_ld64_2(ref, 8, TMP2);
yading@10 1082 vis_xor(REF_4, REF_6, TMP16);
yading@10 1083
yading@10 1084 vis_ld64_2(ref, 16, TMP4);
yading@10 1085 vis_or(REF_0, REF_2, TMP14);
yading@10 1086
yading@10 1087 vis_or(REF_4, REF_6, TMP18);
yading@10 1088
yading@10 1089 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 1090
yading@10 1091 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 1092
yading@10 1093 vis_and(TMP12, MASK_fe, TMP12);
yading@10 1094
yading@10 1095 vis_and(TMP16, MASK_fe, TMP16);
yading@10 1096 vis_mul8x16(CONST_128, TMP12, TMP12);
yading@10 1097
yading@10 1098 vis_mul8x16(CONST_128, TMP16, TMP16);
yading@10 1099 vis_xor(REF_0, REF_2, TMP0);
yading@10 1100
yading@10 1101 vis_xor(REF_4, REF_6, TMP2);
yading@10 1102
yading@10 1103 vis_or(REF_0, REF_2, TMP20);
yading@10 1104
yading@10 1105 vis_and(TMP12, MASK_7f, TMP12);
yading@10 1106
yading@10 1107 vis_and(TMP16, MASK_7f, TMP16);
yading@10 1108
yading@10 1109 vis_psub16(TMP14, TMP12, TMP12);
yading@10 1110 vis_st64(TMP12, dest[0]);
yading@10 1111
yading@10 1112 vis_psub16(TMP18, TMP16, TMP16);
yading@10 1113 vis_st64_2(TMP16, dest, 8);
yading@10 1114 dest += stride;
yading@10 1115
yading@10 1116 vis_or(REF_4, REF_6, TMP18);
yading@10 1117
yading@10 1118 vis_and(TMP0, MASK_fe, TMP0);
yading@10 1119
yading@10 1120 vis_and(TMP2, MASK_fe, TMP2);
yading@10 1121 vis_mul8x16(CONST_128, TMP0, TMP0);
yading@10 1122
yading@10 1123 vis_mul8x16(CONST_128, TMP2, TMP2);
yading@10 1124
yading@10 1125 vis_and(TMP0, MASK_7f, TMP0);
yading@10 1126
yading@10 1127 vis_and(TMP2, MASK_7f, TMP2);
yading@10 1128
yading@10 1129 vis_psub16(TMP20, TMP0, TMP0);
yading@10 1130 vis_st64(TMP0, dest[0]);
yading@10 1131
yading@10 1132 vis_psub16(TMP18, TMP2, TMP2);
yading@10 1133 vis_st64_2(TMP2, dest, 8);
yading@10 1134 }
yading@10 1135
yading@10 1136 static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * ref,
yading@10 1137 const ptrdiff_t stride, int height)
yading@10 1138 {
yading@10 1139 ref = vis_alignaddr(ref);
yading@10 1140 vis_ld64(ref[0], TMP0);
yading@10 1141
yading@10 1142 vis_ld64_2(ref, 8, TMP2);
yading@10 1143 ref += stride;
yading@10 1144
yading@10 1145 vis_ld64(ref[0], TMP4);
yading@10 1146
yading@10 1147 vis_ld64_2(ref, 8, TMP6);
yading@10 1148 ref += stride;
yading@10 1149
yading@10 1150 vis_ld64(constants_fe[0], MASK_fe);
yading@10 1151 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 1152
yading@10 1153 vis_ld64(constants_7f[0], MASK_7f);
yading@10 1154 vis_faligndata(TMP4, TMP6, REF_2);
yading@10 1155
yading@10 1156 vis_ld64(constants128[0], CONST_128);
yading@10 1157 height = (height >> 1) - 1;
yading@10 1158 do { /* 12 cycles */
yading@10 1159 vis_ld64(ref[0], TMP0);
yading@10 1160 vis_xor(REF_0, REF_2, TMP4);
yading@10 1161
yading@10 1162 vis_ld64_2(ref, 8, TMP2);
yading@10 1163 ref += stride;
yading@10 1164 vis_and(TMP4, MASK_fe, TMP4);
yading@10 1165
yading@10 1166 vis_or(REF_0, REF_2, TMP6);
yading@10 1167 vis_mul8x16(CONST_128, TMP4, TMP4);
yading@10 1168
yading@10 1169 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 1170 vis_ld64(ref[0], TMP0);
yading@10 1171
yading@10 1172 vis_ld64_2(ref, 8, TMP2);
yading@10 1173 ref += stride;
yading@10 1174 vis_xor(REF_0, REF_2, TMP12);
yading@10 1175
yading@10 1176 vis_and(TMP4, MASK_7f, TMP4);
yading@10 1177
yading@10 1178 vis_and(TMP12, MASK_fe, TMP12);
yading@10 1179
yading@10 1180 vis_mul8x16(CONST_128, TMP12, TMP12);
yading@10 1181 vis_or(REF_0, REF_2, TMP14);
yading@10 1182
yading@10 1183 vis_psub16(TMP6, TMP4, DST_0);
yading@10 1184 vis_st64(DST_0, dest[0]);
yading@10 1185 dest += stride;
yading@10 1186
yading@10 1187 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 1188
yading@10 1189 vis_and(TMP12, MASK_7f, TMP12);
yading@10 1190
yading@10 1191 vis_psub16(TMP14, TMP12, DST_0);
yading@10 1192 vis_st64(DST_0, dest[0]);
yading@10 1193 dest += stride;
yading@10 1194 } while (--height);
yading@10 1195
yading@10 1196 vis_ld64(ref[0], TMP0);
yading@10 1197 vis_xor(REF_0, REF_2, TMP4);
yading@10 1198
yading@10 1199 vis_ld64_2(ref, 8, TMP2);
yading@10 1200 vis_and(TMP4, MASK_fe, TMP4);
yading@10 1201
yading@10 1202 vis_or(REF_0, REF_2, TMP6);
yading@10 1203 vis_mul8x16(CONST_128, TMP4, TMP4);
yading@10 1204
yading@10 1205 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 1206
yading@10 1207 vis_xor(REF_0, REF_2, TMP12);
yading@10 1208
yading@10 1209 vis_and(TMP4, MASK_7f, TMP4);
yading@10 1210
yading@10 1211 vis_and(TMP12, MASK_fe, TMP12);
yading@10 1212
yading@10 1213 vis_mul8x16(CONST_128, TMP12, TMP12);
yading@10 1214 vis_or(REF_0, REF_2, TMP14);
yading@10 1215
yading@10 1216 vis_psub16(TMP6, TMP4, DST_0);
yading@10 1217 vis_st64(DST_0, dest[0]);
yading@10 1218 dest += stride;
yading@10 1219
yading@10 1220 vis_and(TMP12, MASK_7f, TMP12);
yading@10 1221
yading@10 1222 vis_psub16(TMP14, TMP12, DST_0);
yading@10 1223 vis_st64(DST_0, dest[0]);
yading@10 1224 }
yading@10 1225
yading@10 1226 static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 1227 const ptrdiff_t stride, int height)
yading@10 1228 {
yading@10 1229 int stride_8 = stride + 8;
yading@10 1230 int stride_16 = stride + 16;
yading@10 1231
yading@10 1232 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
yading@10 1233
yading@10 1234 ref = vis_alignaddr(ref);
yading@10 1235
yading@10 1236 vis_ld64(ref[ 0], TMP0);
yading@10 1237 vis_fzero(ZERO);
yading@10 1238
yading@10 1239 vis_ld64(ref[ 8], TMP2);
yading@10 1240
yading@10 1241 vis_ld64(ref[16], TMP4);
yading@10 1242
yading@10 1243 vis_ld64(constants3[0], CONST_3);
yading@10 1244 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 1245
yading@10 1246 vis_ld64(constants256_512[0], CONST_256);
yading@10 1247 vis_faligndata(TMP2, TMP4, REF_6);
yading@10 1248 height >>= 1;
yading@10 1249
yading@10 1250 do { /* 31 cycles */
yading@10 1251 vis_ld64_2(ref, stride, TMP0);
yading@10 1252 vis_pmerge(ZERO, REF_2, TMP12);
yading@10 1253 vis_mul8x16au(REF_2_1, CONST_256, TMP14);
yading@10 1254
yading@10 1255 vis_ld64_2(ref, stride_8, TMP2);
yading@10 1256 vis_pmerge(ZERO, REF_6, TMP16);
yading@10 1257 vis_mul8x16au(REF_6_1, CONST_256, TMP18);
yading@10 1258
yading@10 1259 vis_ld64_2(ref, stride_16, TMP4);
yading@10 1260 ref += stride;
yading@10 1261
yading@10 1262 vis_ld64(dest[0], DST_0);
yading@10 1263 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 1264
yading@10 1265 vis_ld64_2(dest, 8, DST_2);
yading@10 1266 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 1267
yading@10 1268 vis_ld64_2(ref, stride, TMP6);
yading@10 1269 vis_pmerge(ZERO, REF_0, TMP0);
yading@10 1270 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
yading@10 1271
yading@10 1272 vis_ld64_2(ref, stride_8, TMP8);
yading@10 1273 vis_pmerge(ZERO, REF_4, TMP4);
yading@10 1274
yading@10 1275 vis_ld64_2(ref, stride_16, TMP10);
yading@10 1276 ref += stride;
yading@10 1277
yading@10 1278 vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
yading@10 1279 vis_faligndata(TMP6, TMP8, REF_2);
yading@10 1280 vis_mul8x16au(REF_4_1, CONST_256, TMP6);
yading@10 1281
yading@10 1282 vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
yading@10 1283 vis_faligndata(TMP8, TMP10, REF_6);
yading@10 1284 vis_mul8x16al(DST_0, CONST_512, TMP20);
yading@10 1285
yading@10 1286 vis_padd16(TMP0, CONST_3, TMP0);
yading@10 1287 vis_mul8x16al(DST_1, CONST_512, TMP22);
yading@10 1288
yading@10 1289 vis_padd16(TMP2, CONST_3, TMP2);
yading@10 1290 vis_mul8x16al(DST_2, CONST_512, TMP24);
yading@10 1291
yading@10 1292 vis_padd16(TMP4, CONST_3, TMP4);
yading@10 1293 vis_mul8x16al(DST_3, CONST_512, TMP26);
yading@10 1294
yading@10 1295 vis_padd16(TMP6, CONST_3, TMP6);
yading@10 1296
yading@10 1297 vis_padd16(TMP12, TMP20, TMP12);
yading@10 1298 vis_mul8x16al(REF_S0, CONST_512, TMP20);
yading@10 1299
yading@10 1300 vis_padd16(TMP14, TMP22, TMP14);
yading@10 1301 vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
yading@10 1302
yading@10 1303 vis_padd16(TMP16, TMP24, TMP16);
yading@10 1304 vis_mul8x16al(REF_S2, CONST_512, TMP24);
yading@10 1305
yading@10 1306 vis_padd16(TMP18, TMP26, TMP18);
yading@10 1307 vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
yading@10 1308
yading@10 1309 vis_padd16(TMP12, TMP0, TMP12);
yading@10 1310 vis_mul8x16au(REF_2, CONST_256, TMP28);
yading@10 1311
yading@10 1312 vis_padd16(TMP14, TMP2, TMP14);
yading@10 1313 vis_mul8x16au(REF_2_1, CONST_256, TMP30);
yading@10 1314
yading@10 1315 vis_padd16(TMP16, TMP4, TMP16);
yading@10 1316 vis_mul8x16au(REF_6, CONST_256, REF_S4);
yading@10 1317
yading@10 1318 vis_padd16(TMP18, TMP6, TMP18);
yading@10 1319 vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
yading@10 1320
yading@10 1321 vis_pack16(TMP12, DST_0);
yading@10 1322 vis_padd16(TMP28, TMP0, TMP12);
yading@10 1323
yading@10 1324 vis_pack16(TMP14, DST_1);
yading@10 1325 vis_st64(DST_0, dest[0]);
yading@10 1326 vis_padd16(TMP30, TMP2, TMP14);
yading@10 1327
yading@10 1328 vis_pack16(TMP16, DST_2);
yading@10 1329 vis_padd16(REF_S4, TMP4, TMP16);
yading@10 1330
yading@10 1331 vis_pack16(TMP18, DST_3);
yading@10 1332 vis_st64_2(DST_2, dest, 8);
yading@10 1333 dest += stride;
yading@10 1334 vis_padd16(REF_S6, TMP6, TMP18);
yading@10 1335
yading@10 1336 vis_padd16(TMP12, TMP20, TMP12);
yading@10 1337
yading@10 1338 vis_padd16(TMP14, TMP22, TMP14);
yading@10 1339 vis_pack16(TMP12, DST_0);
yading@10 1340
yading@10 1341 vis_padd16(TMP16, TMP24, TMP16);
yading@10 1342 vis_pack16(TMP14, DST_1);
yading@10 1343 vis_st64(DST_0, dest[0]);
yading@10 1344
yading@10 1345 vis_padd16(TMP18, TMP26, TMP18);
yading@10 1346 vis_pack16(TMP16, DST_2);
yading@10 1347
yading@10 1348 vis_pack16(TMP18, DST_3);
yading@10 1349 vis_st64_2(DST_2, dest, 8);
yading@10 1350 dest += stride;
yading@10 1351 } while (--height);
yading@10 1352 }
yading@10 1353
yading@10 1354 static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * ref,
yading@10 1355 const ptrdiff_t stride, int height)
yading@10 1356 {
yading@10 1357 int stride_8 = stride + 8;
yading@10 1358
yading@10 1359 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
yading@10 1360
yading@10 1361 ref = vis_alignaddr(ref);
yading@10 1362
yading@10 1363 vis_ld64(ref[ 0], TMP0);
yading@10 1364 vis_fzero(ZERO);
yading@10 1365
yading@10 1366 vis_ld64(ref[ 8], TMP2);
yading@10 1367
yading@10 1368 vis_ld64(constants3[0], CONST_3);
yading@10 1369 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 1370
yading@10 1371 vis_ld64(constants256_512[0], CONST_256);
yading@10 1372
yading@10 1373 height >>= 1;
yading@10 1374 do { /* 20 cycles */
yading@10 1375 vis_ld64_2(ref, stride, TMP0);
yading@10 1376 vis_pmerge(ZERO, REF_2, TMP8);
yading@10 1377 vis_mul8x16au(REF_2_1, CONST_256, TMP10);
yading@10 1378
yading@10 1379 vis_ld64_2(ref, stride_8, TMP2);
yading@10 1380 ref += stride;
yading@10 1381
yading@10 1382 vis_ld64(dest[0], DST_0);
yading@10 1383
yading@10 1384 vis_ld64_2(dest, stride, DST_2);
yading@10 1385 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 1386
yading@10 1387 vis_ld64_2(ref, stride, TMP4);
yading@10 1388 vis_mul8x16al(DST_0, CONST_512, TMP16);
yading@10 1389 vis_pmerge(ZERO, REF_0, TMP12);
yading@10 1390
yading@10 1391 vis_ld64_2(ref, stride_8, TMP6);
yading@10 1392 ref += stride;
yading@10 1393 vis_mul8x16al(DST_1, CONST_512, TMP18);
yading@10 1394 vis_pmerge(ZERO, REF_0_1, TMP14);
yading@10 1395
yading@10 1396 vis_padd16(TMP12, CONST_3, TMP12);
yading@10 1397 vis_mul8x16al(DST_2, CONST_512, TMP24);
yading@10 1398
yading@10 1399 vis_padd16(TMP14, CONST_3, TMP14);
yading@10 1400 vis_mul8x16al(DST_3, CONST_512, TMP26);
yading@10 1401
yading@10 1402 vis_faligndata(TMP4, TMP6, REF_2);
yading@10 1403
yading@10 1404 vis_padd16(TMP8, TMP12, TMP8);
yading@10 1405
yading@10 1406 vis_padd16(TMP10, TMP14, TMP10);
yading@10 1407 vis_mul8x16au(REF_2, CONST_256, TMP20);
yading@10 1408
yading@10 1409 vis_padd16(TMP8, TMP16, TMP0);
yading@10 1410 vis_mul8x16au(REF_2_1, CONST_256, TMP22);
yading@10 1411
yading@10 1412 vis_padd16(TMP10, TMP18, TMP2);
yading@10 1413 vis_pack16(TMP0, DST_0);
yading@10 1414
yading@10 1415 vis_pack16(TMP2, DST_1);
yading@10 1416 vis_st64(DST_0, dest[0]);
yading@10 1417 dest += stride;
yading@10 1418 vis_padd16(TMP12, TMP20, TMP12);
yading@10 1419
yading@10 1420 vis_padd16(TMP14, TMP22, TMP14);
yading@10 1421
yading@10 1422 vis_padd16(TMP12, TMP24, TMP0);
yading@10 1423
yading@10 1424 vis_padd16(TMP14, TMP26, TMP2);
yading@10 1425 vis_pack16(TMP0, DST_2);
yading@10 1426
yading@10 1427 vis_pack16(TMP2, DST_3);
yading@10 1428 vis_st64(DST_2, dest[0]);
yading@10 1429 dest += stride;
yading@10 1430 } while (--height);
yading@10 1431 }
yading@10 1432
yading@10 1433 static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 1434 const ptrdiff_t stride, int height)
yading@10 1435 {
yading@10 1436 unsigned long off = (unsigned long) ref & 0x7;
yading@10 1437 unsigned long off_plus_1 = off + 1;
yading@10 1438 int stride_8 = stride + 8;
yading@10 1439 int stride_16 = stride + 16;
yading@10 1440
yading@10 1441 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
yading@10 1442
yading@10 1443 ref = vis_alignaddr(ref);
yading@10 1444
yading@10 1445 vis_ld64(ref[ 0], TMP0);
yading@10 1446 vis_fzero(ZERO);
yading@10 1447
yading@10 1448 vis_ld64(ref[ 8], TMP2);
yading@10 1449
yading@10 1450 vis_ld64(ref[16], TMP4);
yading@10 1451
yading@10 1452 vis_ld64(constants2[0], CONST_2);
yading@10 1453 vis_faligndata(TMP0, TMP2, REF_S0);
yading@10 1454
yading@10 1455 vis_ld64(constants256_512[0], CONST_256);
yading@10 1456 vis_faligndata(TMP2, TMP4, REF_S4);
yading@10 1457
yading@10 1458 if (off != 0x7) {
yading@10 1459 vis_alignaddr_g0((void *)off_plus_1);
yading@10 1460 vis_faligndata(TMP0, TMP2, REF_S2);
yading@10 1461 vis_faligndata(TMP2, TMP4, REF_S6);
yading@10 1462 } else {
yading@10 1463 vis_src1(TMP2, REF_S2);
yading@10 1464 vis_src1(TMP4, REF_S6);
yading@10 1465 }
yading@10 1466
yading@10 1467 height >>= 1;
yading@10 1468 do {
yading@10 1469 vis_ld64_2(ref, stride, TMP0);
yading@10 1470 vis_mul8x16au(REF_S0, CONST_256, TMP12);
yading@10 1471 vis_pmerge(ZERO, REF_S0_1, TMP14);
yading@10 1472
yading@10 1473 vis_alignaddr_g0((void *)off);
yading@10 1474
yading@10 1475 vis_ld64_2(ref, stride_8, TMP2);
yading@10 1476 vis_mul8x16au(REF_S2, CONST_256, TMP16);
yading@10 1477 vis_pmerge(ZERO, REF_S2_1, TMP18);
yading@10 1478
yading@10 1479 vis_ld64_2(ref, stride_16, TMP4);
yading@10 1480 ref += stride;
yading@10 1481 vis_mul8x16au(REF_S4, CONST_256, TMP20);
yading@10 1482 vis_pmerge(ZERO, REF_S4_1, TMP22);
yading@10 1483
yading@10 1484 vis_ld64_2(ref, stride, TMP6);
yading@10 1485 vis_mul8x16au(REF_S6, CONST_256, TMP24);
yading@10 1486 vis_pmerge(ZERO, REF_S6_1, TMP26);
yading@10 1487
yading@10 1488 vis_ld64_2(ref, stride_8, TMP8);
yading@10 1489 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 1490
yading@10 1491 vis_ld64_2(ref, stride_16, TMP10);
yading@10 1492 ref += stride;
yading@10 1493 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 1494
yading@10 1495 vis_faligndata(TMP6, TMP8, REF_S0);
yading@10 1496
yading@10 1497 vis_faligndata(TMP8, TMP10, REF_S4);
yading@10 1498
yading@10 1499 if (off != 0x7) {
yading@10 1500 vis_alignaddr_g0((void *)off_plus_1);
yading@10 1501 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 1502 vis_faligndata(TMP2, TMP4, REF_6);
yading@10 1503 vis_faligndata(TMP6, TMP8, REF_S2);
yading@10 1504 vis_faligndata(TMP8, TMP10, REF_S6);
yading@10 1505 } else {
yading@10 1506 vis_src1(TMP2, REF_2);
yading@10 1507 vis_src1(TMP4, REF_6);
yading@10 1508 vis_src1(TMP8, REF_S2);
yading@10 1509 vis_src1(TMP10, REF_S6);
yading@10 1510 }
yading@10 1511
yading@10 1512 vis_mul8x16au(REF_0, CONST_256, TMP0);
yading@10 1513 vis_pmerge(ZERO, REF_0_1, TMP2);
yading@10 1514
yading@10 1515 vis_mul8x16au(REF_2, CONST_256, TMP4);
yading@10 1516 vis_pmerge(ZERO, REF_2_1, TMP6);
yading@10 1517
yading@10 1518 vis_padd16(TMP0, CONST_2, TMP8);
yading@10 1519 vis_mul8x16au(REF_4, CONST_256, TMP0);
yading@10 1520
yading@10 1521 vis_padd16(TMP2, CONST_2, TMP10);
yading@10 1522 vis_mul8x16au(REF_4_1, CONST_256, TMP2);
yading@10 1523
yading@10 1524 vis_padd16(TMP8, TMP4, TMP8);
yading@10 1525 vis_mul8x16au(REF_6, CONST_256, TMP4);
yading@10 1526
yading@10 1527 vis_padd16(TMP10, TMP6, TMP10);
yading@10 1528 vis_mul8x16au(REF_6_1, CONST_256, TMP6);
yading@10 1529
yading@10 1530 vis_padd16(TMP12, TMP8, TMP12);
yading@10 1531
yading@10 1532 vis_padd16(TMP14, TMP10, TMP14);
yading@10 1533
yading@10 1534 vis_padd16(TMP12, TMP16, TMP12);
yading@10 1535
yading@10 1536 vis_padd16(TMP14, TMP18, TMP14);
yading@10 1537 vis_pack16(TMP12, DST_0);
yading@10 1538
yading@10 1539 vis_pack16(TMP14, DST_1);
yading@10 1540 vis_st64(DST_0, dest[0]);
yading@10 1541 vis_padd16(TMP0, CONST_2, TMP12);
yading@10 1542
yading@10 1543 vis_mul8x16au(REF_S0, CONST_256, TMP0);
yading@10 1544 vis_padd16(TMP2, CONST_2, TMP14);
yading@10 1545
yading@10 1546 vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
yading@10 1547 vis_padd16(TMP12, TMP4, TMP12);
yading@10 1548
yading@10 1549 vis_mul8x16au(REF_S2, CONST_256, TMP4);
yading@10 1550 vis_padd16(TMP14, TMP6, TMP14);
yading@10 1551
yading@10 1552 vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
yading@10 1553 vis_padd16(TMP20, TMP12, TMP20);
yading@10 1554
yading@10 1555 vis_padd16(TMP22, TMP14, TMP22);
yading@10 1556
yading@10 1557 vis_padd16(TMP20, TMP24, TMP20);
yading@10 1558
yading@10 1559 vis_padd16(TMP22, TMP26, TMP22);
yading@10 1560 vis_pack16(TMP20, DST_2);
yading@10 1561
yading@10 1562 vis_pack16(TMP22, DST_3);
yading@10 1563 vis_st64_2(DST_2, dest, 8);
yading@10 1564 dest += stride;
yading@10 1565 vis_padd16(TMP0, TMP4, TMP24);
yading@10 1566
yading@10 1567 vis_mul8x16au(REF_S4, CONST_256, TMP0);
yading@10 1568 vis_padd16(TMP2, TMP6, TMP26);
yading@10 1569
yading@10 1570 vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
yading@10 1571 vis_padd16(TMP24, TMP8, TMP24);
yading@10 1572
yading@10 1573 vis_padd16(TMP26, TMP10, TMP26);
yading@10 1574 vis_pack16(TMP24, DST_0);
yading@10 1575
yading@10 1576 vis_pack16(TMP26, DST_1);
yading@10 1577 vis_st64(DST_0, dest[0]);
yading@10 1578 vis_pmerge(ZERO, REF_S6, TMP4);
yading@10 1579
yading@10 1580 vis_pmerge(ZERO, REF_S6_1, TMP6);
yading@10 1581
yading@10 1582 vis_padd16(TMP0, TMP4, TMP0);
yading@10 1583
yading@10 1584 vis_padd16(TMP2, TMP6, TMP2);
yading@10 1585
yading@10 1586 vis_padd16(TMP0, TMP12, TMP0);
yading@10 1587
yading@10 1588 vis_padd16(TMP2, TMP14, TMP2);
yading@10 1589 vis_pack16(TMP0, DST_2);
yading@10 1590
yading@10 1591 vis_pack16(TMP2, DST_3);
yading@10 1592 vis_st64_2(DST_2, dest, 8);
yading@10 1593 dest += stride;
yading@10 1594 } while (--height);
yading@10 1595 }
yading@10 1596
yading@10 1597 static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * ref,
yading@10 1598 const ptrdiff_t stride, int height)
yading@10 1599 {
yading@10 1600 unsigned long off = (unsigned long) ref & 0x7;
yading@10 1601 unsigned long off_plus_1 = off + 1;
yading@10 1602 int stride_8 = stride + 8;
yading@10 1603
yading@10 1604 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
yading@10 1605
yading@10 1606 ref = vis_alignaddr(ref);
yading@10 1607
yading@10 1608 vis_ld64(ref[ 0], TMP0);
yading@10 1609 vis_fzero(ZERO);
yading@10 1610
yading@10 1611 vis_ld64(ref[ 8], TMP2);
yading@10 1612
yading@10 1613 vis_ld64(constants2[0], CONST_2);
yading@10 1614
yading@10 1615 vis_ld64(constants256_512[0], CONST_256);
yading@10 1616 vis_faligndata(TMP0, TMP2, REF_S0);
yading@10 1617
yading@10 1618 if (off != 0x7) {
yading@10 1619 vis_alignaddr_g0((void *)off_plus_1);
yading@10 1620 vis_faligndata(TMP0, TMP2, REF_S2);
yading@10 1621 } else {
yading@10 1622 vis_src1(TMP2, REF_S2);
yading@10 1623 }
yading@10 1624
yading@10 1625 height >>= 1;
yading@10 1626 do { /* 26 cycles */
yading@10 1627 vis_ld64_2(ref, stride, TMP0);
yading@10 1628 vis_mul8x16au(REF_S0, CONST_256, TMP8);
yading@10 1629 vis_pmerge(ZERO, REF_S2, TMP12);
yading@10 1630
yading@10 1631 vis_alignaddr_g0((void *)off);
yading@10 1632
yading@10 1633 vis_ld64_2(ref, stride_8, TMP2);
yading@10 1634 ref += stride;
yading@10 1635 vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
yading@10 1636 vis_pmerge(ZERO, REF_S2_1, TMP14);
yading@10 1637
yading@10 1638 vis_ld64_2(ref, stride, TMP4);
yading@10 1639
yading@10 1640 vis_ld64_2(ref, stride_8, TMP6);
yading@10 1641 ref += stride;
yading@10 1642 vis_faligndata(TMP0, TMP2, REF_S4);
yading@10 1643
yading@10 1644 vis_pmerge(ZERO, REF_S4, TMP18);
yading@10 1645
yading@10 1646 vis_pmerge(ZERO, REF_S4_1, TMP20);
yading@10 1647
yading@10 1648 vis_faligndata(TMP4, TMP6, REF_S0);
yading@10 1649
yading@10 1650 if (off != 0x7) {
yading@10 1651 vis_alignaddr_g0((void *)off_plus_1);
yading@10 1652 vis_faligndata(TMP0, TMP2, REF_S6);
yading@10 1653 vis_faligndata(TMP4, TMP6, REF_S2);
yading@10 1654 } else {
yading@10 1655 vis_src1(TMP2, REF_S6);
yading@10 1656 vis_src1(TMP6, REF_S2);
yading@10 1657 }
yading@10 1658
yading@10 1659 vis_padd16(TMP18, CONST_2, TMP18);
yading@10 1660 vis_mul8x16au(REF_S6, CONST_256, TMP22);
yading@10 1661
yading@10 1662 vis_padd16(TMP20, CONST_2, TMP20);
yading@10 1663 vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
yading@10 1664
yading@10 1665 vis_mul8x16au(REF_S0, CONST_256, TMP26);
yading@10 1666 vis_pmerge(ZERO, REF_S0_1, TMP28);
yading@10 1667
yading@10 1668 vis_mul8x16au(REF_S2, CONST_256, TMP30);
yading@10 1669 vis_padd16(TMP18, TMP22, TMP18);
yading@10 1670
yading@10 1671 vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
yading@10 1672 vis_padd16(TMP20, TMP24, TMP20);
yading@10 1673
yading@10 1674 vis_padd16(TMP8, TMP18, TMP8);
yading@10 1675
yading@10 1676 vis_padd16(TMP10, TMP20, TMP10);
yading@10 1677
yading@10 1678 vis_padd16(TMP8, TMP12, TMP8);
yading@10 1679
yading@10 1680 vis_padd16(TMP10, TMP14, TMP10);
yading@10 1681 vis_pack16(TMP8, DST_0);
yading@10 1682
yading@10 1683 vis_pack16(TMP10, DST_1);
yading@10 1684 vis_st64(DST_0, dest[0]);
yading@10 1685 dest += stride;
yading@10 1686 vis_padd16(TMP18, TMP26, TMP18);
yading@10 1687
yading@10 1688 vis_padd16(TMP20, TMP28, TMP20);
yading@10 1689
yading@10 1690 vis_padd16(TMP18, TMP30, TMP18);
yading@10 1691
yading@10 1692 vis_padd16(TMP20, TMP32, TMP20);
yading@10 1693 vis_pack16(TMP18, DST_2);
yading@10 1694
yading@10 1695 vis_pack16(TMP20, DST_3);
yading@10 1696 vis_st64(DST_2, dest[0]);
yading@10 1697 dest += stride;
yading@10 1698 } while (--height);
yading@10 1699 }
yading@10 1700
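/* avg: x+y half-pel interpolation averaged with the existing dest pixels,
 * 16-pixel-wide rows (rounding variant). */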
yading@10 1701 static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 1702 const ptrdiff_t stride, int height)
yading@10 1703 {
yading@10 1704 unsigned long off = (unsigned long) ref & 0x7;
yading@10 1705 unsigned long off_plus_1 = off + 1;
yading@10 1706 int stride_8 = stride + 8;
yading@10 1707 int stride_16 = stride + 16;
yading@10 1708
yading@10 1709 vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
yading@10 1710
yading@10 1711 ref = vis_alignaddr(ref);
yading@10 1712
yading@10 1713 vis_ld64(ref[ 0], TMP0);
yading@10 1714 vis_fzero(ZERO);
yading@10 1715
yading@10 1716 vis_ld64(ref[ 8], TMP2);
yading@10 1717
yading@10 1718 vis_ld64(ref[16], TMP4);
yading@10 1719
yading@10 1720 vis_ld64(constants6[0], CONST_6);
yading@10 1721 vis_faligndata(TMP0, TMP2, REF_S0);
yading@10 1722
yading@10 1723 vis_ld64(constants256_1024[0], CONST_256);
yading@10 1724 vis_faligndata(TMP2, TMP4, REF_S4);
yading@10 1725
yading@10 1726 if (off != 0x7) {
yading@10 1727 vis_alignaddr_g0((void *)off_plus_1);
yading@10 1728 vis_faligndata(TMP0, TMP2, REF_S2);
yading@10 1729 vis_faligndata(TMP2, TMP4, REF_S6);
yading@10 1730 } else {
yading@10 1731 vis_src1(TMP2, REF_S2);
yading@10 1732 vis_src1(TMP4, REF_S6);
yading@10 1733 }
yading@10 1734
yading@10 1735 height >>= 1;
yading@10 1736 do { /* 55 cycles */
yading@10 1737 vis_ld64_2(ref, stride, TMP0);
yading@10 1738 vis_mul8x16au(REF_S0, CONST_256, TMP12);
yading@10 1739 vis_pmerge(ZERO, REF_S0_1, TMP14);
yading@10 1740
yading@10 1741 vis_alignaddr_g0((void *)off);
yading@10 1742
yading@10 1743 vis_ld64_2(ref, stride_8, TMP2);
yading@10 1744 vis_mul8x16au(REF_S2, CONST_256, TMP16);
yading@10 1745 vis_pmerge(ZERO, REF_S2_1, TMP18);
yading@10 1746
yading@10 1747 vis_ld64_2(ref, stride_16, TMP4);
yading@10 1748 ref += stride;
yading@10 1749 vis_mul8x16au(REF_S4, CONST_256, TMP20);
yading@10 1750 vis_pmerge(ZERO, REF_S4_1, TMP22);
yading@10 1751
yading@10 1752 vis_ld64_2(ref, stride, TMP6);
yading@10 1753 vis_mul8x16au(REF_S6, CONST_256, TMP24);
yading@10 1754 vis_pmerge(ZERO, REF_S6_1, TMP26);
yading@10 1755
yading@10 1756 vis_ld64_2(ref, stride_8, TMP8);
yading@10 1757 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 1758
yading@10 1759 vis_ld64_2(ref, stride_16, TMP10);
yading@10 1760 ref += stride;
yading@10 1761 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 1762
yading@10 1763 vis_ld64(dest[0], DST_0);
yading@10 1764 vis_faligndata(TMP6, TMP8, REF_S0);
yading@10 1765
yading@10 1766 vis_ld64_2(dest, 8, DST_2);
yading@10 1767 vis_faligndata(TMP8, TMP10, REF_S4);
yading@10 1768
yading@10 1769 if (off != 0x7) {
yading@10 1770 vis_alignaddr_g0((void *)off_plus_1);
yading@10 1771 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 1772 vis_faligndata(TMP2, TMP4, REF_6);
yading@10 1773 vis_faligndata(TMP6, TMP8, REF_S2);
yading@10 1774 vis_faligndata(TMP8, TMP10, REF_S6);
yading@10 1775 } else {
yading@10 1776 vis_src1(TMP2, REF_2);
yading@10 1777 vis_src1(TMP4, REF_6);
yading@10 1778 vis_src1(TMP8, REF_S2);
yading@10 1779 vis_src1(TMP10, REF_S6);
yading@10 1780 }
yading@10 1781
yading@10 1782 vis_mul8x16al(DST_0, CONST_1024, TMP30);
yading@10 1783 vis_pmerge(ZERO, REF_0, TMP0);
yading@10 1784
yading@10 1785 vis_mul8x16al(DST_1, CONST_1024, TMP32);
yading@10 1786 vis_pmerge(ZERO, REF_0_1, TMP2);
yading@10 1787
yading@10 1788 vis_mul8x16au(REF_2, CONST_256, TMP4);
yading@10 1789 vis_pmerge(ZERO, REF_2_1, TMP6);
yading@10 1790
yading@10 1791 vis_mul8x16al(DST_2, CONST_1024, REF_0);
yading@10 1792 vis_padd16(TMP0, CONST_6, TMP0);
yading@10 1793
yading@10 1794 vis_mul8x16al(DST_3, CONST_1024, REF_2);
yading@10 1795 vis_padd16(TMP2, CONST_6, TMP2);
yading@10 1796
yading@10 1797 vis_padd16(TMP0, TMP4, TMP0);
yading@10 1798 vis_mul8x16au(REF_4, CONST_256, TMP4);
yading@10 1799
yading@10 1800 vis_padd16(TMP2, TMP6, TMP2);
yading@10 1801 vis_mul8x16au(REF_4_1, CONST_256, TMP6);
yading@10 1802
yading@10 1803 vis_padd16(TMP12, TMP0, TMP12);
yading@10 1804 vis_mul8x16au(REF_6, CONST_256, TMP8);
yading@10 1805
yading@10 1806 vis_padd16(TMP14, TMP2, TMP14);
yading@10 1807 vis_mul8x16au(REF_6_1, CONST_256, TMP10);
yading@10 1808
yading@10 1809 vis_padd16(TMP12, TMP16, TMP12);
yading@10 1810 vis_mul8x16au(REF_S0, CONST_256, REF_4);
yading@10 1811
yading@10 1812 vis_padd16(TMP14, TMP18, TMP14);
yading@10 1813 vis_mul8x16au(REF_S0_1, CONST_256, REF_6);
yading@10 1814
yading@10 1815 vis_padd16(TMP12, TMP30, TMP12);
yading@10 1816
yading@10 1817 vis_padd16(TMP14, TMP32, TMP14);
yading@10 1818 vis_pack16(TMP12, DST_0);
yading@10 1819
yading@10 1820 vis_pack16(TMP14, DST_1);
yading@10 1821 vis_st64(DST_0, dest[0]);
yading@10 1822 vis_padd16(TMP4, CONST_6, TMP4);
yading@10 1823
yading@10 1824 vis_ld64_2(dest, stride, DST_0);
yading@10 1825 vis_padd16(TMP6, CONST_6, TMP6);
yading@10 1826 vis_mul8x16au(REF_S2, CONST_256, TMP12);
yading@10 1827
yading@10 1828 vis_padd16(TMP4, TMP8, TMP4);
yading@10 1829 vis_mul8x16au(REF_S2_1, CONST_256, TMP14);
yading@10 1830
yading@10 1831 vis_padd16(TMP6, TMP10, TMP6);
yading@10 1832
yading@10 1833 vis_padd16(TMP20, TMP4, TMP20);
yading@10 1834
yading@10 1835 vis_padd16(TMP22, TMP6, TMP22);
yading@10 1836
yading@10 1837 vis_padd16(TMP20, TMP24, TMP20);
yading@10 1838
yading@10 1839 vis_padd16(TMP22, TMP26, TMP22);
yading@10 1840
yading@10 1841 vis_padd16(TMP20, REF_0, TMP20);
yading@10 1842 vis_mul8x16au(REF_S4, CONST_256, REF_0);
yading@10 1843
yading@10 1844 vis_padd16(TMP22, REF_2, TMP22);
yading@10 1845 vis_pack16(TMP20, DST_2);
yading@10 1846
yading@10 1847 vis_pack16(TMP22, DST_3);
yading@10 1848 vis_st64_2(DST_2, dest, 8);
yading@10 1849 dest += stride;
yading@10 1850
yading@10 1851 vis_ld64_2(dest, 8, DST_2);
yading@10 1852 vis_mul8x16al(DST_0, CONST_1024, TMP30);
yading@10 1853 vis_pmerge(ZERO, REF_S4_1, REF_2);
yading@10 1854
yading@10 1855 vis_mul8x16al(DST_1, CONST_1024, TMP32);
yading@10 1856 vis_padd16(REF_4, TMP0, TMP8);
yading@10 1857
yading@10 1858 vis_mul8x16au(REF_S6, CONST_256, REF_4);
yading@10 1859 vis_padd16(REF_6, TMP2, TMP10);
yading@10 1860
yading@10 1861 vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
yading@10 1862 vis_padd16(TMP8, TMP12, TMP8);
yading@10 1863
yading@10 1864 vis_padd16(TMP10, TMP14, TMP10);
yading@10 1865
yading@10 1866 vis_padd16(TMP8, TMP30, TMP8);
yading@10 1867
yading@10 1868 vis_padd16(TMP10, TMP32, TMP10);
yading@10 1869 vis_pack16(TMP8, DST_0);
yading@10 1870
yading@10 1871 vis_pack16(TMP10, DST_1);
yading@10 1872 vis_st64(DST_0, dest[0]);
yading@10 1873
yading@10 1874 vis_padd16(REF_0, TMP4, REF_0);
yading@10 1875
yading@10 1876 vis_mul8x16al(DST_2, CONST_1024, TMP30);
yading@10 1877 vis_padd16(REF_2, TMP6, REF_2);
yading@10 1878
yading@10 1879 vis_mul8x16al(DST_3, CONST_1024, TMP32);
yading@10 1880 vis_padd16(REF_0, REF_4, REF_0);
yading@10 1881
yading@10 1882 vis_padd16(REF_2, REF_6, REF_2);
yading@10 1883
yading@10 1884 vis_padd16(REF_0, TMP30, REF_0);
yading@10 1885
yading@10 1886 /* stall */
yading@10 1887
yading@10 1888 vis_padd16(REF_2, TMP32, REF_2);
yading@10 1889 vis_pack16(REF_0, DST_2);
yading@10 1890
yading@10 1891 vis_pack16(REF_2, DST_3);
yading@10 1892 vis_st64_2(DST_2, dest, 8);
yading@10 1893 dest += stride;
yading@10 1894 } while (--height);
yading@10 1895 }
yading@10 1896
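/* avg: x+y half-pel interpolation averaged with the existing dest pixels,
 * 8-pixel-wide rows (rounding variant). */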
yading@10 1897 static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * ref,
yading@10 1898 const ptrdiff_t stride, int height)
yading@10 1899 {
yading@10 1900 unsigned long off = (unsigned long) ref & 0x7;
yading@10 1901 unsigned long off_plus_1 = off + 1;
yading@10 1902 int stride_8 = stride + 8;
yading@10 1903
yading@10 1904 vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
yading@10 1905
yading@10 1906 ref = vis_alignaddr(ref);
yading@10 1907
yading@10 1908 vis_ld64(ref[0], TMP0);
yading@10 1909 vis_fzero(ZERO);
yading@10 1910
yading@10 1911 vis_ld64_2(ref, 8, TMP2);
yading@10 1912
yading@10 1913 vis_ld64(constants6[0], CONST_6);
yading@10 1914
yading@10 1915 vis_ld64(constants256_1024[0], CONST_256);
yading@10 1916 vis_faligndata(TMP0, TMP2, REF_S0);
yading@10 1917
yading@10 1918 if (off != 0x7) {
yading@10 1919 vis_alignaddr_g0((void *)off_plus_1);
yading@10 1920 vis_faligndata(TMP0, TMP2, REF_S2);
yading@10 1921 } else {
yading@10 1922 vis_src1(TMP2, REF_S2);
yading@10 1923 }
yading@10 1924
yading@10 1925 height >>= 1;
yading@10 1926 do { /* 31 cycles */
yading@10 1927 vis_ld64_2(ref, stride, TMP0);
yading@10 1928 vis_mul8x16au(REF_S0, CONST_256, TMP8);
yading@10 1929 vis_pmerge(ZERO, REF_S0_1, TMP10);
yading@10 1930
yading@10 1931 vis_ld64_2(ref, stride_8, TMP2);
yading@10 1932 ref += stride;
yading@10 1933 vis_mul8x16au(REF_S2, CONST_256, TMP12);
yading@10 1934 vis_pmerge(ZERO, REF_S2_1, TMP14);
yading@10 1935
yading@10 1936 vis_alignaddr_g0((void *)off);
yading@10 1937
yading@10 1938 vis_ld64_2(ref, stride, TMP4);
yading@10 1939 vis_faligndata(TMP0, TMP2, REF_S4);
yading@10 1940
yading@10 1941 vis_ld64_2(ref, stride_8, TMP6);
yading@10 1942 ref += stride;
yading@10 1943
yading@10 1944 vis_ld64(dest[0], DST_0);
yading@10 1945 vis_faligndata(TMP4, TMP6, REF_S0);
yading@10 1946
yading@10 1947 vis_ld64_2(dest, stride, DST_2);
yading@10 1948
yading@10 1949 if (off != 0x7) {
yading@10 1950 vis_alignaddr_g0((void *)off_plus_1);
yading@10 1951 vis_faligndata(TMP0, TMP2, REF_S6);
yading@10 1952 vis_faligndata(TMP4, TMP6, REF_S2);
yading@10 1953 } else {
yading@10 1954 vis_src1(TMP2, REF_S6);
yading@10 1955 vis_src1(TMP6, REF_S2);
yading@10 1956 }
yading@10 1957
yading@10 1958 vis_mul8x16al(DST_0, CONST_1024, TMP30);
yading@10 1959 vis_pmerge(ZERO, REF_S4, TMP22);
yading@10 1960
yading@10 1961 vis_mul8x16al(DST_1, CONST_1024, TMP32);
yading@10 1962 vis_pmerge(ZERO, REF_S4_1, TMP24);
yading@10 1963
yading@10 1964 vis_mul8x16au(REF_S6, CONST_256, TMP26);
yading@10 1965 vis_pmerge(ZERO, REF_S6_1, TMP28);
yading@10 1966
yading@10 1967 vis_mul8x16au(REF_S0, CONST_256, REF_S4);
yading@10 1968 vis_padd16(TMP22, CONST_6, TMP22);
yading@10 1969
yading@10 1970 vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
yading@10 1971 vis_padd16(TMP24, CONST_6, TMP24);
yading@10 1972
yading@10 1973 vis_mul8x16al(DST_2, CONST_1024, REF_0);
yading@10 1974 vis_padd16(TMP22, TMP26, TMP22);
yading@10 1975
yading@10 1976 vis_mul8x16al(DST_3, CONST_1024, REF_2);
yading@10 1977 vis_padd16(TMP24, TMP28, TMP24);
yading@10 1978
yading@10 1979 vis_mul8x16au(REF_S2, CONST_256, TMP26);
yading@10 1980 vis_padd16(TMP8, TMP22, TMP8);
yading@10 1981
yading@10 1982 vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
yading@10 1983 vis_padd16(TMP10, TMP24, TMP10);
yading@10 1984
yading@10 1985 vis_padd16(TMP8, TMP12, TMP8);
yading@10 1986
yading@10 1987 vis_padd16(TMP10, TMP14, TMP10);
yading@10 1988
yading@10 1989 vis_padd16(TMP8, TMP30, TMP8);
yading@10 1990
yading@10 1991 vis_padd16(TMP10, TMP32, TMP10);
yading@10 1992 vis_pack16(TMP8, DST_0);
yading@10 1993
yading@10 1994 vis_pack16(TMP10, DST_1);
yading@10 1995 vis_st64(DST_0, dest[0]);
yading@10 1996 dest += stride;
yading@10 1997
yading@10 1998 vis_padd16(REF_S4, TMP22, TMP12);
yading@10 1999
yading@10 2000 vis_padd16(REF_S6, TMP24, TMP14);
yading@10 2001
yading@10 2002 vis_padd16(TMP12, TMP26, TMP12);
yading@10 2003
yading@10 2004 vis_padd16(TMP14, TMP28, TMP14);
yading@10 2005
yading@10 2006 vis_padd16(TMP12, REF_0, TMP12);
yading@10 2007
yading@10 2008 vis_padd16(TMP14, REF_2, TMP14);
yading@10 2009 vis_pack16(TMP12, DST_2);
yading@10 2010
yading@10 2011 vis_pack16(TMP14, DST_3);
yading@10 2012 vis_st64(DST_2, dest[0]);
yading@10 2013 dest += stride;
yading@10 2014 } while (--height);
yading@10 2015 }
yading@10 2016
yading@10 2017 /* End of rounding code */
yading@10 2018
yading@10 2019 /* Start of the no-rounding code */
yading@10 2020 /* The trick used in some of this file is the formula from the MMX
yading@10 2021 * motion comp code, which is:
yading@10 2022 *
yading@10 2023 * (x+y)>>1 == (x&y)+((x^y)>>1)
yading@10 2024 *
yading@10 2025 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
yading@10 2026 * We avoid overflows by masking before we do the shift, and we
yading@10 2027 * implement the shift by multiplying by 1/2 using mul8x16. So in
yading@10 2028 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
yading@10 2029 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
yading@10 2030 * the value 0x80808080 is in f8):
yading@10 2031 *
yading@10 2032 * fxor f0, f2, f10
yading@10 2033 * fand f10, f4, f10
yading@10 2034 * fmul8x16 f8, f10, f10
yading@10 2035 * fand f10, f6, f10
yading@10 2036 * fand f0, f2, f12
yading@10 2037 * fpadd16 f12, f10, f10
yading@10 2038 */
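/* A minimal scalar sketch of the same trick (an editorial illustration only;
 * the helper name below is made up for this note and is not used by the VIS
 * routines, which do the work in 64-bit FPU registers).  The 0xfe mask plays
 * the same role as the fand with f4 above, and the shift corresponds to the
 * fmul8x16 by 0x80808080: clearing the low bit of each byte of x^y first
 * keeps one pixel's bits from leaking into its neighbour. */
static inline uint64_t avg8_no_round_scalar(uint64_t x, uint64_t y)
{
    /* per byte: (x + y) >> 1 == (x & y) + (((x ^ y) & 0xfe) >> 1) */
    return (x & y) + (((x ^ y) & 0xfefefefefefefefeULL) >> 1);
}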
yading@10 2039
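/* put, no half-pel offset, 16-byte-wide rows: a straight aligned copy, so
 * the no-rounding variant is the same operation as the rounding one. */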
yading@10 2040 static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 2041 const ptrdiff_t stride, int height)
yading@10 2042 {
yading@10 2043 ref = vis_alignaddr(ref);
yading@10 2044 do { /* 5 cycles */
yading@10 2045 vis_ld64(ref[0], TMP0);
yading@10 2046
yading@10 2047 vis_ld64_2(ref, 8, TMP2);
yading@10 2048
yading@10 2049 vis_ld64_2(ref, 16, TMP4);
yading@10 2050 ref += stride;
yading@10 2051
yading@10 2052 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2053 vis_st64(REF_0, dest[0]);
yading@10 2054
yading@10 2055 vis_faligndata(TMP2, TMP4, REF_2);
yading@10 2056 vis_st64_2(REF_2, dest, 8);
yading@10 2057 dest += stride;
yading@10 2058 } while (--height);
yading@10 2059 }
yading@10 2060
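/* put, no half-pel offset, 8-byte-wide rows: a straight aligned copy. */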
yading@10 2061 static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * ref,
yading@10 2062 const ptrdiff_t stride, int height)
yading@10 2063 {
yading@10 2064 ref = vis_alignaddr(ref);
yading@10 2065 do { /* 4 cycles */
yading@10 2066 vis_ld64(ref[0], TMP0);
yading@10 2067
yading@10 2068 vis_ld64(ref[8], TMP2);
yading@10 2069 ref += stride;
yading@10 2070
yading@10 2071 /* stall */
yading@10 2072
yading@10 2073 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2074 vis_st64(REF_0, dest[0]);
yading@10 2075 dest += stride;
yading@10 2076 } while (--height);
yading@10 2077 }
yading@10 2078
yading@10 2079
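/* avg, no half-pel offset, 16-byte-wide rows: blends source and dest with
 * the truncating (x&y)+((x^y)>>1) average described above. */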
yading@10 2080 static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 2081 const ptrdiff_t stride, int height)
yading@10 2082 {
yading@10 2083 int stride_8 = stride + 8;
yading@10 2084
yading@10 2085 ref = vis_alignaddr(ref);
yading@10 2086
yading@10 2087 vis_ld64(ref[0], TMP0);
yading@10 2088
yading@10 2089 vis_ld64(ref[8], TMP2);
yading@10 2090
yading@10 2091 vis_ld64(ref[16], TMP4);
yading@10 2092
yading@10 2093 vis_ld64(dest[0], DST_0);
yading@10 2094
yading@10 2095 vis_ld64(dest[8], DST_2);
yading@10 2096
yading@10 2097 vis_ld64(constants_fe[0], MASK_fe);
yading@10 2098 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2099
yading@10 2100 vis_ld64(constants_7f[0], MASK_7f);
yading@10 2101 vis_faligndata(TMP2, TMP4, REF_2);
yading@10 2102
yading@10 2103 vis_ld64(constants128[0], CONST_128);
yading@10 2104
yading@10 2105 ref += stride;
yading@10 2106 height = (height >> 1) - 1;
yading@10 2107
yading@10 2108 do { /* 24 cycles */
yading@10 2109 vis_ld64(ref[0], TMP0);
yading@10 2110 vis_xor(DST_0, REF_0, TMP6);
yading@10 2111
yading@10 2112 vis_ld64_2(ref, 8, TMP2);
yading@10 2113 vis_and(TMP6, MASK_fe, TMP6);
yading@10 2114
yading@10 2115 vis_ld64_2(ref, 16, TMP4);
yading@10 2116 ref += stride;
yading@10 2117 vis_mul8x16(CONST_128, TMP6, TMP6);
yading@10 2118 vis_xor(DST_2, REF_2, TMP8);
yading@10 2119
yading@10 2120 vis_and(TMP8, MASK_fe, TMP8);
yading@10 2121
yading@10 2122 vis_and(DST_0, REF_0, TMP10);
yading@10 2123 vis_ld64_2(dest, stride, DST_0);
yading@10 2124 vis_mul8x16(CONST_128, TMP8, TMP8);
yading@10 2125
yading@10 2126 vis_and(DST_2, REF_2, TMP12);
yading@10 2127 vis_ld64_2(dest, stride_8, DST_2);
yading@10 2128
yading@10 2129 vis_ld64(ref[0], TMP14);
yading@10 2130 vis_and(TMP6, MASK_7f, TMP6);
yading@10 2131
yading@10 2132 vis_and(TMP8, MASK_7f, TMP8);
yading@10 2133
yading@10 2134 vis_padd16(TMP10, TMP6, TMP6);
yading@10 2135 vis_st64(TMP6, dest[0]);
yading@10 2136
yading@10 2137 vis_padd16(TMP12, TMP8, TMP8);
yading@10 2138 vis_st64_2(TMP8, dest, 8);
yading@10 2139
yading@10 2140 dest += stride;
yading@10 2141 vis_ld64_2(ref, 8, TMP16);
yading@10 2142 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2143
yading@10 2144 vis_ld64_2(ref, 16, TMP18);
yading@10 2145 vis_faligndata(TMP2, TMP4, REF_2);
yading@10 2146 ref += stride;
yading@10 2147
yading@10 2148 vis_xor(DST_0, REF_0, TMP20);
yading@10 2149
yading@10 2150 vis_and(TMP20, MASK_fe, TMP20);
yading@10 2151
yading@10 2152 vis_xor(DST_2, REF_2, TMP22);
yading@10 2153 vis_mul8x16(CONST_128, TMP20, TMP20);
yading@10 2154
yading@10 2155 vis_and(TMP22, MASK_fe, TMP22);
yading@10 2156
yading@10 2157 vis_and(DST_0, REF_0, TMP24);
yading@10 2158 vis_mul8x16(CONST_128, TMP22, TMP22);
yading@10 2159
yading@10 2160 vis_and(DST_2, REF_2, TMP26);
yading@10 2161
yading@10 2162 vis_ld64_2(dest, stride, DST_0);
yading@10 2163 vis_faligndata(TMP14, TMP16, REF_0);
yading@10 2164
yading@10 2165 vis_ld64_2(dest, stride_8, DST_2);
yading@10 2166 vis_faligndata(TMP16, TMP18, REF_2);
yading@10 2167
yading@10 2168 vis_and(TMP20, MASK_7f, TMP20);
yading@10 2169
yading@10 2170 vis_and(TMP22, MASK_7f, TMP22);
yading@10 2171
yading@10 2172 vis_padd16(TMP24, TMP20, TMP20);
yading@10 2173 vis_st64(TMP20, dest[0]);
yading@10 2174
yading@10 2175 vis_padd16(TMP26, TMP22, TMP22);
yading@10 2176 vis_st64_2(TMP22, dest, 8);
yading@10 2177 dest += stride;
yading@10 2178 } while (--height);
yading@10 2179
yading@10 2180 vis_ld64(ref[0], TMP0);
yading@10 2181 vis_xor(DST_0, REF_0, TMP6);
yading@10 2182
yading@10 2183 vis_ld64_2(ref, 8, TMP2);
yading@10 2184 vis_and(TMP6, MASK_fe, TMP6);
yading@10 2185
yading@10 2186 vis_ld64_2(ref, 16, TMP4);
yading@10 2187 vis_mul8x16(CONST_128, TMP6, TMP6);
yading@10 2188 vis_xor(DST_2, REF_2, TMP8);
yading@10 2189
yading@10 2190 vis_and(TMP8, MASK_fe, TMP8);
yading@10 2191
yading@10 2192 vis_and(DST_0, REF_0, TMP10);
yading@10 2193 vis_ld64_2(dest, stride, DST_0);
yading@10 2194 vis_mul8x16(CONST_128, TMP8, TMP8);
yading@10 2195
yading@10 2196 vis_and(DST_2, REF_2, TMP12);
yading@10 2197 vis_ld64_2(dest, stride_8, DST_2);
yading@10 2198
yading@10 2199 vis_ld64(ref[0], TMP14);
yading@10 2200 vis_and(TMP6, MASK_7f, TMP6);
yading@10 2201
yading@10 2202 vis_and(TMP8, MASK_7f, TMP8);
yading@10 2203
yading@10 2204 vis_padd16(TMP10, TMP6, TMP6);
yading@10 2205 vis_st64(TMP6, dest[0]);
yading@10 2206
yading@10 2207 vis_padd16(TMP12, TMP8, TMP8);
yading@10 2208 vis_st64_2(TMP8, dest, 8);
yading@10 2209
yading@10 2210 dest += stride;
yading@10 2211 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2212
yading@10 2213 vis_faligndata(TMP2, TMP4, REF_2);
yading@10 2214
yading@10 2215 vis_xor(DST_0, REF_0, TMP20);
yading@10 2216
yading@10 2217 vis_and(TMP20, MASK_fe, TMP20);
yading@10 2218
yading@10 2219 vis_xor(DST_2, REF_2, TMP22);
yading@10 2220 vis_mul8x16(CONST_128, TMP20, TMP20);
yading@10 2221
yading@10 2222 vis_and(TMP22, MASK_fe, TMP22);
yading@10 2223
yading@10 2224 vis_and(DST_0, REF_0, TMP24);
yading@10 2225 vis_mul8x16(CONST_128, TMP22, TMP22);
yading@10 2226
yading@10 2227 vis_and(DST_2, REF_2, TMP26);
yading@10 2228
yading@10 2229 vis_and(TMP20, MASK_7f, TMP20);
yading@10 2230
yading@10 2231 vis_and(TMP22, MASK_7f, TMP22);
yading@10 2232
yading@10 2233 vis_padd16(TMP24, TMP20, TMP20);
yading@10 2234 vis_st64(TMP20, dest[0]);
yading@10 2235
yading@10 2236 vis_padd16(TMP26, TMP22, TMP22);
yading@10 2237 vis_st64_2(TMP22, dest, 8);
yading@10 2238 }
yading@10 2239
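/* put, horizontal ("x") half-pel offset, 16-pixel-wide rows: truncating
 * average of each source byte with its right-hand neighbour. */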
yading@10 2240 static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 2241 const ptrdiff_t stride, int height)
yading@10 2242 {
yading@10 2243 unsigned long off = (unsigned long) ref & 0x7;
yading@10 2244 unsigned long off_plus_1 = off + 1;
yading@10 2245
yading@10 2246 ref = vis_alignaddr(ref);
yading@10 2247
yading@10 2248 vis_ld64(ref[0], TMP0);
yading@10 2249
yading@10 2250 vis_ld64_2(ref, 8, TMP2);
yading@10 2251
yading@10 2252 vis_ld64_2(ref, 16, TMP4);
yading@10 2253
yading@10 2254 vis_ld64(constants_fe[0], MASK_fe);
yading@10 2255
yading@10 2256 vis_ld64(constants_7f[0], MASK_7f);
yading@10 2257 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2258
yading@10 2259 vis_ld64(constants128[0], CONST_128);
yading@10 2260 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 2261
yading@10 2262 if (off != 0x7) {
yading@10 2263 vis_alignaddr_g0((void *)off_plus_1);
yading@10 2264 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 2265 vis_faligndata(TMP2, TMP4, REF_6);
yading@10 2266 } else {
yading@10 2267 vis_src1(TMP2, REF_2);
yading@10 2268 vis_src1(TMP4, REF_6);
yading@10 2269 }
yading@10 2270
yading@10 2271 ref += stride;
yading@10 2272 height = (height >> 1) - 1;
yading@10 2273
yading@10 2274 do { /* 34 cycles */
yading@10 2275 vis_ld64(ref[0], TMP0);
yading@10 2276 vis_xor(REF_0, REF_2, TMP6);
yading@10 2277
yading@10 2278 vis_ld64_2(ref, 8, TMP2);
yading@10 2279 vis_xor(REF_4, REF_6, TMP8);
yading@10 2280
yading@10 2281 vis_ld64_2(ref, 16, TMP4);
yading@10 2282 vis_and(TMP6, MASK_fe, TMP6);
yading@10 2283 ref += stride;
yading@10 2284
yading@10 2285 vis_ld64(ref[0], TMP14);
yading@10 2286 vis_mul8x16(CONST_128, TMP6, TMP6);
yading@10 2287 vis_and(TMP8, MASK_fe, TMP8);
yading@10 2288
yading@10 2289 vis_ld64_2(ref, 8, TMP16);
yading@10 2290 vis_mul8x16(CONST_128, TMP8, TMP8);
yading@10 2291 vis_and(REF_0, REF_2, TMP10);
yading@10 2292
yading@10 2293 vis_ld64_2(ref, 16, TMP18);
yading@10 2294 ref += stride;
yading@10 2295 vis_and(REF_4, REF_6, TMP12);
yading@10 2296
yading@10 2297 vis_alignaddr_g0((void *)off);
yading@10 2298
yading@10 2299 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2300
yading@10 2301 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 2302
yading@10 2303 if (off != 0x7) {
yading@10 2304 vis_alignaddr_g0((void *)off_plus_1);
yading@10 2305 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 2306 vis_faligndata(TMP2, TMP4, REF_6);
yading@10 2307 } else {
yading@10 2308 vis_src1(TMP2, REF_2);
yading@10 2309 vis_src1(TMP4, REF_6);
yading@10 2310 }
yading@10 2311
yading@10 2312 vis_and(TMP6, MASK_7f, TMP6);
yading@10 2313
yading@10 2314 vis_and(TMP8, MASK_7f, TMP8);
yading@10 2315
yading@10 2316 vis_padd16(TMP10, TMP6, TMP6);
yading@10 2317 vis_st64(TMP6, dest[0]);
yading@10 2318
yading@10 2319 vis_padd16(TMP12, TMP8, TMP8);
yading@10 2320 vis_st64_2(TMP8, dest, 8);
yading@10 2321 dest += stride;
yading@10 2322
yading@10 2323 vis_xor(REF_0, REF_2, TMP6);
yading@10 2324
yading@10 2325 vis_xor(REF_4, REF_6, TMP8);
yading@10 2326
yading@10 2327 vis_and(TMP6, MASK_fe, TMP6);
yading@10 2328
yading@10 2329 vis_mul8x16(CONST_128, TMP6, TMP6);
yading@10 2330 vis_and(TMP8, MASK_fe, TMP8);
yading@10 2331
yading@10 2332 vis_mul8x16(CONST_128, TMP8, TMP8);
yading@10 2333 vis_and(REF_0, REF_2, TMP10);
yading@10 2334
yading@10 2335 vis_and(REF_4, REF_6, TMP12);
yading@10 2336
yading@10 2337 vis_alignaddr_g0((void *)off);
yading@10 2338
yading@10 2339 vis_faligndata(TMP14, TMP16, REF_0);
yading@10 2340
yading@10 2341 vis_faligndata(TMP16, TMP18, REF_4);
yading@10 2342
yading@10 2343 if (off != 0x7) {
yading@10 2344 vis_alignaddr_g0((void *)off_plus_1);
yading@10 2345 vis_faligndata(TMP14, TMP16, REF_2);
yading@10 2346 vis_faligndata(TMP16, TMP18, REF_6);
yading@10 2347 } else {
yading@10 2348 vis_src1(TMP16, REF_2);
yading@10 2349 vis_src1(TMP18, REF_6);
yading@10 2350 }
yading@10 2351
yading@10 2352 vis_and(TMP6, MASK_7f, TMP6);
yading@10 2353
yading@10 2354 vis_and(TMP8, MASK_7f, TMP8);
yading@10 2355
yading@10 2356 vis_padd16(TMP10, TMP6, TMP6);
yading@10 2357 vis_st64(TMP6, dest[0]);
yading@10 2358
yading@10 2359 vis_padd16(TMP12, TMP8, TMP8);
yading@10 2360 vis_st64_2(TMP8, dest, 8);
yading@10 2361 dest += stride;
yading@10 2362 } while (--height);
yading@10 2363
yading@10 2364 vis_ld64(ref[0], TMP0);
yading@10 2365 vis_xor(REF_0, REF_2, TMP6);
yading@10 2366
yading@10 2367 vis_ld64_2(ref, 8, TMP2);
yading@10 2368 vis_xor(REF_4, REF_6, TMP8);
yading@10 2369
yading@10 2370 vis_ld64_2(ref, 16, TMP4);
yading@10 2371 vis_and(TMP6, MASK_fe, TMP6);
yading@10 2372
yading@10 2373 vis_mul8x16(CONST_128, TMP6, TMP6);
yading@10 2374 vis_and(TMP8, MASK_fe, TMP8);
yading@10 2375
yading@10 2376 vis_mul8x16(CONST_128, TMP8, TMP8);
yading@10 2377 vis_and(REF_0, REF_2, TMP10);
yading@10 2378
yading@10 2379 vis_and(REF_4, REF_6, TMP12);
yading@10 2380
yading@10 2381 vis_alignaddr_g0((void *)off);
yading@10 2382
yading@10 2383 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2384
yading@10 2385 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 2386
yading@10 2387 if (off != 0x7) {
yading@10 2388 vis_alignaddr_g0((void *)off_plus_1);
yading@10 2389 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 2390 vis_faligndata(TMP2, TMP4, REF_6);
yading@10 2391 } else {
yading@10 2392 vis_src1(TMP2, REF_2);
yading@10 2393 vis_src1(TMP4, REF_6);
yading@10 2394 }
yading@10 2395
yading@10 2396 vis_and(TMP6, MASK_7f, TMP6);
yading@10 2397
yading@10 2398 vis_and(TMP8, MASK_7f, TMP8);
yading@10 2399
yading@10 2400 vis_padd16(TMP10, TMP6, TMP6);
yading@10 2401 vis_st64(TMP6, dest[0]);
yading@10 2402
yading@10 2403 vis_padd16(TMP12, TMP8, TMP8);
yading@10 2404 vis_st64_2(TMP8, dest, 8);
yading@10 2405 dest += stride;
yading@10 2406
yading@10 2407 vis_xor(REF_0, REF_2, TMP6);
yading@10 2408
yading@10 2409 vis_xor(REF_4, REF_6, TMP8);
yading@10 2410
yading@10 2411 vis_and(TMP6, MASK_fe, TMP6);
yading@10 2412
yading@10 2413 vis_mul8x16(CONST_128, TMP6, TMP6);
yading@10 2414 vis_and(TMP8, MASK_fe, TMP8);
yading@10 2415
yading@10 2416 vis_mul8x16(CONST_128, TMP8, TMP8);
yading@10 2417 vis_and(REF_0, REF_2, TMP10);
yading@10 2418
yading@10 2419 vis_and(REF_4, REF_6, TMP12);
yading@10 2420
yading@10 2421 vis_and(TMP6, MASK_7f, TMP6);
yading@10 2422
yading@10 2423 vis_and(TMP8, MASK_7f, TMP8);
yading@10 2424
yading@10 2425 vis_padd16(TMP10, TMP6, TMP6);
yading@10 2426 vis_st64(TMP6, dest[0]);
yading@10 2427
yading@10 2428 vis_padd16(TMP12, TMP8, TMP8);
yading@10 2429 vis_st64_2(TMP8, dest, 8);
yading@10 2430 }
yading@10 2431
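/* put, horizontal half-pel offset, 8-pixel-wide rows: truncating average of
 * each source byte with its right-hand neighbour. */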
yading@10 2432 static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * ref,
yading@10 2433 const ptrdiff_t stride, int height)
yading@10 2434 {
yading@10 2435 unsigned long off = (unsigned long) ref & 0x7;
yading@10 2436 unsigned long off_plus_1 = off + 1;
yading@10 2437
yading@10 2438 ref = vis_alignaddr(ref);
yading@10 2439
yading@10 2440 vis_ld64(ref[0], TMP0);
yading@10 2441
yading@10 2442 vis_ld64(ref[8], TMP2);
yading@10 2443
yading@10 2444 vis_ld64(constants_fe[0], MASK_fe);
yading@10 2445
yading@10 2446 vis_ld64(constants_7f[0], MASK_7f);
yading@10 2447
yading@10 2448 vis_ld64(constants128[0], CONST_128);
yading@10 2449 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2450
yading@10 2451 if (off != 0x7) {
yading@10 2452 vis_alignaddr_g0((void *)off_plus_1);
yading@10 2453 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 2454 } else {
yading@10 2455 vis_src1(TMP2, REF_2);
yading@10 2456 }
yading@10 2457
yading@10 2458 ref += stride;
yading@10 2459 height = (height >> 1) - 1;
yading@10 2460
yading@10 2461 do { /* 20 cycles */
yading@10 2462 vis_ld64(ref[0], TMP0);
yading@10 2463 vis_xor(REF_0, REF_2, TMP4);
yading@10 2464
yading@10 2465 vis_ld64_2(ref, 8, TMP2);
yading@10 2466 vis_and(TMP4, MASK_fe, TMP4);
yading@10 2467 ref += stride;
yading@10 2468
yading@10 2469 vis_ld64(ref[0], TMP8);
yading@10 2470 vis_and(REF_0, REF_2, TMP6);
yading@10 2471 vis_mul8x16(CONST_128, TMP4, TMP4);
yading@10 2472
yading@10 2473 vis_alignaddr_g0((void *)off);
yading@10 2474
yading@10 2475 vis_ld64_2(ref, 8, TMP10);
yading@10 2476 ref += stride;
yading@10 2477 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2478
yading@10 2479 if (off != 0x7) {
yading@10 2480 vis_alignaddr_g0((void *)off_plus_1);
yading@10 2481 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 2482 } else {
yading@10 2483 vis_src1(TMP2, REF_2);
yading@10 2484 }
yading@10 2485
yading@10 2486 vis_and(TMP4, MASK_7f, TMP4);
yading@10 2487
yading@10 2488 vis_padd16(TMP6, TMP4, DST_0);
yading@10 2489 vis_st64(DST_0, dest[0]);
yading@10 2490 dest += stride;
yading@10 2491
yading@10 2492 vis_xor(REF_0, REF_2, TMP12);
yading@10 2493
yading@10 2494 vis_and(TMP12, MASK_fe, TMP12);
yading@10 2495
yading@10 2496 vis_and(REF_0, REF_2, TMP14);
yading@10 2497 vis_mul8x16(CONST_128, TMP12, TMP12);
yading@10 2498
yading@10 2499 vis_alignaddr_g0((void *)off);
yading@10 2500 vis_faligndata(TMP8, TMP10, REF_0);
yading@10 2501 if (off != 0x7) {
yading@10 2502 vis_alignaddr_g0((void *)off_plus_1);
yading@10 2503 vis_faligndata(TMP8, TMP10, REF_2);
yading@10 2504 } else {
yading@10 2505 vis_src1(TMP10, REF_2);
yading@10 2506 }
yading@10 2507
yading@10 2508 vis_and(TMP12, MASK_7f, TMP12);
yading@10 2509
yading@10 2510 vis_padd16(TMP14, TMP12, DST_0);
yading@10 2511 vis_st64(DST_0, dest[0]);
yading@10 2512 dest += stride;
yading@10 2513 } while (--height);
yading@10 2514
yading@10 2515 vis_ld64(ref[0], TMP0);
yading@10 2516 vis_xor(REF_0, REF_2, TMP4);
yading@10 2517
yading@10 2518 vis_ld64_2(ref, 8, TMP2);
yading@10 2519 vis_and(TMP4, MASK_fe, TMP4);
yading@10 2520
yading@10 2521 vis_and(REF_0, REF_2, TMP6);
yading@10 2522 vis_mul8x16(CONST_128, TMP4, TMP4);
yading@10 2523
yading@10 2524 vis_alignaddr_g0((void *)off);
yading@10 2525
yading@10 2526 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2527
yading@10 2528 if (off != 0x7) {
yading@10 2529 vis_alignaddr_g0((void *)off_plus_1);
yading@10 2530 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 2531 } else {
yading@10 2532 vis_src1(TMP2, REF_2);
yading@10 2533 }
yading@10 2534
yading@10 2535 vis_and(TMP4, MASK_7f, TMP4);
yading@10 2536
yading@10 2537 vis_padd16(TMP6, TMP4, DST_0);
yading@10 2538 vis_st64(DST_0, dest[0]);
yading@10 2539 dest += stride;
yading@10 2540
yading@10 2541 vis_xor(REF_0, REF_2, TMP12);
yading@10 2542
yading@10 2543 vis_and(TMP12, MASK_fe, TMP12);
yading@10 2544
yading@10 2545 vis_and(REF_0, REF_2, TMP14);
yading@10 2546 vis_mul8x16(CONST_128, TMP12, TMP12);
yading@10 2547
yading@10 2548 vis_and(TMP12, MASK_7f, TMP12);
yading@10 2549
yading@10 2550 vis_padd16(TMP14, TMP12, DST_0);
yading@10 2551 vis_st64(DST_0, dest[0]);
yading@10 2552 dest += stride;
yading@10 2553 }
yading@10 2554
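/* avg, horizontal half-pel offset, 16-pixel-wide rows: the interpolated
 * result is combined with the existing dest pixels. */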
yading@10 2555 static void MC_avg_no_round_x_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 2556 const ptrdiff_t stride, int height)
yading@10 2557 {
yading@10 2558 unsigned long off = (unsigned long) ref & 0x7;
yading@10 2559 unsigned long off_plus_1 = off + 1;
yading@10 2560
yading@10 2561 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
yading@10 2562
yading@10 2563 vis_ld64(constants3[0], CONST_3);
yading@10 2564 vis_fzero(ZERO);
yading@10 2565 vis_ld64(constants256_512[0], CONST_256);
yading@10 2566
yading@10 2567 ref = vis_alignaddr(ref);
yading@10 2568 do { /* 26 cycles */
yading@10 2569 vis_ld64(ref[0], TMP0);
yading@10 2570
yading@10 2571 vis_ld64(ref[8], TMP2);
yading@10 2572
yading@10 2573 vis_alignaddr_g0((void *)off);
yading@10 2574
yading@10 2575 vis_ld64(ref[16], TMP4);
yading@10 2576
yading@10 2577 vis_ld64(dest[0], DST_0);
yading@10 2578 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2579
yading@10 2580 vis_ld64(dest[8], DST_2);
yading@10 2581 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 2582
yading@10 2583 if (off != 0x7) {
yading@10 2584 vis_alignaddr_g0((void *)off_plus_1);
yading@10 2585 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 2586 vis_faligndata(TMP2, TMP4, REF_6);
yading@10 2587 } else {
yading@10 2588 vis_src1(TMP2, REF_2);
yading@10 2589 vis_src1(TMP4, REF_6);
yading@10 2590 }
yading@10 2591
yading@10 2592 vis_mul8x16au(REF_0, CONST_256, TMP0);
yading@10 2593
yading@10 2594 vis_pmerge(ZERO, REF_2, TMP4);
yading@10 2595 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
yading@10 2596
yading@10 2597 vis_pmerge(ZERO, REF_2_1, TMP6);
yading@10 2598
yading@10 2599 vis_padd16(TMP0, TMP4, TMP0);
yading@10 2600
yading@10 2601 vis_mul8x16al(DST_0, CONST_512, TMP4);
yading@10 2602 vis_padd16(TMP2, TMP6, TMP2);
yading@10 2603
yading@10 2604 vis_mul8x16al(DST_1, CONST_512, TMP6);
yading@10 2605
yading@10 2606 vis_mul8x16au(REF_6, CONST_256, TMP12);
yading@10 2607
yading@10 2608 vis_padd16(TMP0, TMP4, TMP0);
yading@10 2609 vis_mul8x16au(REF_6_1, CONST_256, TMP14);
yading@10 2610
yading@10 2611 vis_padd16(TMP2, TMP6, TMP2);
yading@10 2612 vis_mul8x16au(REF_4, CONST_256, TMP16);
yading@10 2613
yading@10 2614 vis_padd16(TMP0, CONST_3, TMP8);
yading@10 2615 vis_mul8x16au(REF_4_1, CONST_256, TMP18);
yading@10 2616
yading@10 2617 vis_padd16(TMP2, CONST_3, TMP10);
yading@10 2618 vis_pack16(TMP8, DST_0);
yading@10 2619
yading@10 2620 vis_pack16(TMP10, DST_1);
yading@10 2621 vis_padd16(TMP16, TMP12, TMP0);
yading@10 2622
yading@10 2623 vis_st64(DST_0, dest[0]);
yading@10 2624 vis_mul8x16al(DST_2, CONST_512, TMP4);
yading@10 2625 vis_padd16(TMP18, TMP14, TMP2);
yading@10 2626
yading@10 2627 vis_mul8x16al(DST_3, CONST_512, TMP6);
yading@10 2628 vis_padd16(TMP0, CONST_3, TMP0);
yading@10 2629
yading@10 2630 vis_padd16(TMP2, CONST_3, TMP2);
yading@10 2631
yading@10 2632 vis_padd16(TMP0, TMP4, TMP0);
yading@10 2633
yading@10 2634 vis_padd16(TMP2, TMP6, TMP2);
yading@10 2635 vis_pack16(TMP0, DST_2);
yading@10 2636
yading@10 2637 vis_pack16(TMP2, DST_3);
yading@10 2638 vis_st64(DST_2, dest[8]);
yading@10 2639
yading@10 2640 ref += stride;
yading@10 2641 dest += stride;
yading@10 2642 } while (--height);
yading@10 2643 }
yading@10 2644
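/* put, vertical ("y") half-pel offset, 16-pixel-wide rows: truncating
 * average of vertically adjacent source bytes. */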
yading@10 2645 static void MC_put_no_round_y_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 2646 const ptrdiff_t stride, int height)
yading@10 2647 {
yading@10 2648 ref = vis_alignaddr(ref);
yading@10 2649 vis_ld64(ref[0], TMP0);
yading@10 2650
yading@10 2651 vis_ld64_2(ref, 8, TMP2);
yading@10 2652
yading@10 2653 vis_ld64_2(ref, 16, TMP4);
yading@10 2654 ref += stride;
yading@10 2655
yading@10 2656 vis_ld64(ref[0], TMP6);
yading@10 2657 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2658
yading@10 2659 vis_ld64_2(ref, 8, TMP8);
yading@10 2660 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 2661
yading@10 2662 vis_ld64_2(ref, 16, TMP10);
yading@10 2663 ref += stride;
yading@10 2664
yading@10 2665 vis_ld64(constants_fe[0], MASK_fe);
yading@10 2666 vis_faligndata(TMP6, TMP8, REF_2);
yading@10 2667
yading@10 2668 vis_ld64(constants_7f[0], MASK_7f);
yading@10 2669 vis_faligndata(TMP8, TMP10, REF_6);
yading@10 2670
yading@10 2671 vis_ld64(constants128[0], CONST_128);
yading@10 2672 height = (height >> 1) - 1;
yading@10 2673 do { /* 24 cycles */
yading@10 2674 vis_ld64(ref[0], TMP0);
yading@10 2675 vis_xor(REF_0, REF_2, TMP12);
yading@10 2676
yading@10 2677 vis_ld64_2(ref, 8, TMP2);
yading@10 2678 vis_xor(REF_4, REF_6, TMP16);
yading@10 2679
yading@10 2680 vis_ld64_2(ref, 16, TMP4);
yading@10 2681 ref += stride;
yading@10 2682 vis_and(REF_0, REF_2, TMP14);
yading@10 2683
yading@10 2684 vis_ld64(ref[0], TMP6);
yading@10 2685 vis_and(REF_4, REF_6, TMP18);
yading@10 2686
yading@10 2687 vis_ld64_2(ref, 8, TMP8);
yading@10 2688 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2689
yading@10 2690 vis_ld64_2(ref, 16, TMP10);
yading@10 2691 ref += stride;
yading@10 2692 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 2693
yading@10 2694 vis_and(TMP12, MASK_fe, TMP12);
yading@10 2695
yading@10 2696 vis_and(TMP16, MASK_fe, TMP16);
yading@10 2697 vis_mul8x16(CONST_128, TMP12, TMP12);
yading@10 2698
yading@10 2699 vis_mul8x16(CONST_128, TMP16, TMP16);
yading@10 2700 vis_xor(REF_0, REF_2, TMP0);
yading@10 2701
yading@10 2702 vis_xor(REF_4, REF_6, TMP2);
yading@10 2703
yading@10 2704 vis_and(REF_0, REF_2, TMP20);
yading@10 2705
yading@10 2706 vis_and(TMP12, MASK_7f, TMP12);
yading@10 2707
yading@10 2708 vis_and(TMP16, MASK_7f, TMP16);
yading@10 2709
yading@10 2710 vis_padd16(TMP14, TMP12, TMP12);
yading@10 2711 vis_st64(TMP12, dest[0]);
yading@10 2712
yading@10 2713 vis_padd16(TMP18, TMP16, TMP16);
yading@10 2714 vis_st64_2(TMP16, dest, 8);
yading@10 2715 dest += stride;
yading@10 2716
yading@10 2717 vis_and(REF_4, REF_6, TMP18);
yading@10 2718
yading@10 2719 vis_and(TMP0, MASK_fe, TMP0);
yading@10 2720
yading@10 2721 vis_and(TMP2, MASK_fe, TMP2);
yading@10 2722 vis_mul8x16(CONST_128, TMP0, TMP0);
yading@10 2723
yading@10 2724 vis_faligndata(TMP6, TMP8, REF_2);
yading@10 2725 vis_mul8x16(CONST_128, TMP2, TMP2);
yading@10 2726
yading@10 2727 vis_faligndata(TMP8, TMP10, REF_6);
yading@10 2728
yading@10 2729 vis_and(TMP0, MASK_7f, TMP0);
yading@10 2730
yading@10 2731 vis_and(TMP2, MASK_7f, TMP2);
yading@10 2732
yading@10 2733 vis_padd16(TMP20, TMP0, TMP0);
yading@10 2734 vis_st64(TMP0, dest[0]);
yading@10 2735
yading@10 2736 vis_padd16(TMP18, TMP2, TMP2);
yading@10 2737 vis_st64_2(TMP2, dest, 8);
yading@10 2738 dest += stride;
yading@10 2739 } while (--height);
yading@10 2740
yading@10 2741 vis_ld64(ref[0], TMP0);
yading@10 2742 vis_xor(REF_0, REF_2, TMP12);
yading@10 2743
yading@10 2744 vis_ld64_2(ref, 8, TMP2);
yading@10 2745 vis_xor(REF_4, REF_6, TMP16);
yading@10 2746
yading@10 2747 vis_ld64_2(ref, 16, TMP4);
yading@10 2748 vis_and(REF_0, REF_2, TMP14);
yading@10 2749
yading@10 2750 vis_and(REF_4, REF_6, TMP18);
yading@10 2751
yading@10 2752 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2753
yading@10 2754 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 2755
yading@10 2756 vis_and(TMP12, MASK_fe, TMP12);
yading@10 2757
yading@10 2758 vis_and(TMP16, MASK_fe, TMP16);
yading@10 2759 vis_mul8x16(CONST_128, TMP12, TMP12);
yading@10 2760
yading@10 2761 vis_mul8x16(CONST_128, TMP16, TMP16);
yading@10 2762 vis_xor(REF_0, REF_2, TMP0);
yading@10 2763
yading@10 2764 vis_xor(REF_4, REF_6, TMP2);
yading@10 2765
yading@10 2766 vis_and(REF_0, REF_2, TMP20);
yading@10 2767
yading@10 2768 vis_and(TMP12, MASK_7f, TMP12);
yading@10 2769
yading@10 2770 vis_and(TMP16, MASK_7f, TMP16);
yading@10 2771
yading@10 2772 vis_padd16(TMP14, TMP12, TMP12);
yading@10 2773 vis_st64(TMP12, dest[0]);
yading@10 2774
yading@10 2775 vis_padd16(TMP18, TMP16, TMP16);
yading@10 2776 vis_st64_2(TMP16, dest, 8);
yading@10 2777 dest += stride;
yading@10 2778
yading@10 2779 vis_and(REF_4, REF_6, TMP18);
yading@10 2780
yading@10 2781 vis_and(TMP0, MASK_fe, TMP0);
yading@10 2782
yading@10 2783 vis_and(TMP2, MASK_fe, TMP2);
yading@10 2784 vis_mul8x16(CONST_128, TMP0, TMP0);
yading@10 2785
yading@10 2786 vis_mul8x16(CONST_128, TMP2, TMP2);
yading@10 2787
yading@10 2788 vis_and(TMP0, MASK_7f, TMP0);
yading@10 2789
yading@10 2790 vis_and(TMP2, MASK_7f, TMP2);
yading@10 2791
yading@10 2792 vis_padd16(TMP20, TMP0, TMP0);
yading@10 2793 vis_st64(TMP0, dest[0]);
yading@10 2794
yading@10 2795 vis_padd16(TMP18, TMP2, TMP2);
yading@10 2796 vis_st64_2(TMP2, dest, 8);
yading@10 2797 }
yading@10 2798
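/* put, vertical half-pel offset, 8-pixel-wide rows: truncating average of
 * vertically adjacent source bytes. */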
yading@10 2799 static void MC_put_no_round_y_8_vis (uint8_t * dest, const uint8_t * ref,
yading@10 2800 const ptrdiff_t stride, int height)
yading@10 2801 {
yading@10 2802 ref = vis_alignaddr(ref);
yading@10 2803 vis_ld64(ref[0], TMP0);
yading@10 2804
yading@10 2805 vis_ld64_2(ref, 8, TMP2);
yading@10 2806 ref += stride;
yading@10 2807
yading@10 2808 vis_ld64(ref[0], TMP4);
yading@10 2809
yading@10 2810 vis_ld64_2(ref, 8, TMP6);
yading@10 2811 ref += stride;
yading@10 2812
yading@10 2813 vis_ld64(constants_fe[0], MASK_fe);
yading@10 2814 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2815
yading@10 2816 vis_ld64(constants_7f[0], MASK_7f);
yading@10 2817 vis_faligndata(TMP4, TMP6, REF_2);
yading@10 2818
yading@10 2819 vis_ld64(constants128[0], CONST_128);
yading@10 2820 height = (height >> 1) - 1;
yading@10 2821 do { /* 12 cycles */
yading@10 2822 vis_ld64(ref[0], TMP0);
yading@10 2823 vis_xor(REF_0, REF_2, TMP4);
yading@10 2824
yading@10 2825 vis_ld64_2(ref, 8, TMP2);
yading@10 2826 ref += stride;
yading@10 2827 vis_and(TMP4, MASK_fe, TMP4);
yading@10 2828
yading@10 2829 vis_and(REF_0, REF_2, TMP6);
yading@10 2830 vis_mul8x16(CONST_128, TMP4, TMP4);
yading@10 2831
yading@10 2832 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2833 vis_ld64(ref[0], TMP0);
yading@10 2834
yading@10 2835 vis_ld64_2(ref, 8, TMP2);
yading@10 2836 ref += stride;
yading@10 2837 vis_xor(REF_0, REF_2, TMP12);
yading@10 2838
yading@10 2839 vis_and(TMP4, MASK_7f, TMP4);
yading@10 2840
yading@10 2841 vis_and(TMP12, MASK_fe, TMP12);
yading@10 2842
yading@10 2843 vis_mul8x16(CONST_128, TMP12, TMP12);
yading@10 2844 vis_and(REF_0, REF_2, TMP14);
yading@10 2845
yading@10 2846 vis_padd16(TMP6, TMP4, DST_0);
yading@10 2847 vis_st64(DST_0, dest[0]);
yading@10 2848 dest += stride;
yading@10 2849
yading@10 2850 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 2851
yading@10 2852 vis_and(TMP12, MASK_7f, TMP12);
yading@10 2853
yading@10 2854 vis_padd16(TMP14, TMP12, DST_0);
yading@10 2855 vis_st64(DST_0, dest[0]);
yading@10 2856 dest += stride;
yading@10 2857 } while (--height);
yading@10 2858
yading@10 2859 vis_ld64(ref[0], TMP0);
yading@10 2860 vis_xor(REF_0, REF_2, TMP4);
yading@10 2861
yading@10 2862 vis_ld64_2(ref, 8, TMP2);
yading@10 2863 vis_and(TMP4, MASK_fe, TMP4);
yading@10 2864
yading@10 2865 vis_and(REF_0, REF_2, TMP6);
yading@10 2866 vis_mul8x16(CONST_128, TMP4, TMP4);
yading@10 2867
yading@10 2868 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2869
yading@10 2870 vis_xor(REF_0, REF_2, TMP12);
yading@10 2871
yading@10 2872 vis_and(TMP4, MASK_7f, TMP4);
yading@10 2873
yading@10 2874 vis_and(TMP12, MASK_fe, TMP12);
yading@10 2875
yading@10 2876 vis_mul8x16(CONST_128, TMP12, TMP12);
yading@10 2877 vis_and(REF_0, REF_2, TMP14);
yading@10 2878
yading@10 2879 vis_padd16(TMP6, TMP4, DST_0);
yading@10 2880 vis_st64(DST_0, dest[0]);
yading@10 2881 dest += stride;
yading@10 2882
yading@10 2883 vis_and(TMP12, MASK_7f, TMP12);
yading@10 2884
yading@10 2885 vis_padd16(TMP14, TMP12, DST_0);
yading@10 2886 vis_st64(DST_0, dest[0]);
yading@10 2887 }
yading@10 2888
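/* avg, vertical half-pel offset, 16-pixel-wide rows: the interpolated
 * result is combined with the existing dest pixels. */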
yading@10 2889 static void MC_avg_no_round_y_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 2890 const ptrdiff_t stride, int height)
yading@10 2891 {
yading@10 2892 int stride_8 = stride + 8;
yading@10 2893 int stride_16 = stride + 16;
yading@10 2894
yading@10 2895 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
yading@10 2896
yading@10 2897 ref = vis_alignaddr(ref);
yading@10 2898
yading@10 2899 vis_ld64(ref[ 0], TMP0);
yading@10 2900 vis_fzero(ZERO);
yading@10 2901
yading@10 2902 vis_ld64(ref[ 8], TMP2);
yading@10 2903
yading@10 2904 vis_ld64(ref[16], TMP4);
yading@10 2905
yading@10 2906 vis_ld64(constants3[0], CONST_3);
yading@10 2907 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 2908
yading@10 2909 vis_ld64(constants256_512[0], CONST_256);
yading@10 2910 vis_faligndata(TMP2, TMP4, REF_6);
yading@10 2911 height >>= 1;
yading@10 2912
yading@10 2913 do { /* 31 cycles */
yading@10 2914 vis_ld64_2(ref, stride, TMP0);
yading@10 2915 vis_pmerge(ZERO, REF_2, TMP12);
yading@10 2916 vis_mul8x16au(REF_2_1, CONST_256, TMP14);
yading@10 2917
yading@10 2918 vis_ld64_2(ref, stride_8, TMP2);
yading@10 2919 vis_pmerge(ZERO, REF_6, TMP16);
yading@10 2920 vis_mul8x16au(REF_6_1, CONST_256, TMP18);
yading@10 2921
yading@10 2922 vis_ld64_2(ref, stride_16, TMP4);
yading@10 2923 ref += stride;
yading@10 2924
yading@10 2925 vis_ld64(dest[0], DST_0);
yading@10 2926 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 2927
yading@10 2928 vis_ld64_2(dest, 8, DST_2);
yading@10 2929 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 2930
yading@10 2931 vis_ld64_2(ref, stride, TMP6);
yading@10 2932 vis_pmerge(ZERO, REF_0, TMP0);
yading@10 2933 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
yading@10 2934
yading@10 2935 vis_ld64_2(ref, stride_8, TMP8);
yading@10 2936 vis_pmerge(ZERO, REF_4, TMP4);
yading@10 2937
yading@10 2938 vis_ld64_2(ref, stride_16, TMP10);
yading@10 2939 ref += stride;
yading@10 2940
yading@10 2941 vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
yading@10 2942 vis_faligndata(TMP6, TMP8, REF_2);
yading@10 2943 vis_mul8x16au(REF_4_1, CONST_256, TMP6);
yading@10 2944
yading@10 2945 vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
yading@10 2946 vis_faligndata(TMP8, TMP10, REF_6);
yading@10 2947 vis_mul8x16al(DST_0, CONST_512, TMP20);
yading@10 2948
yading@10 2949 vis_padd16(TMP0, CONST_3, TMP0);
yading@10 2950 vis_mul8x16al(DST_1, CONST_512, TMP22);
yading@10 2951
yading@10 2952 vis_padd16(TMP2, CONST_3, TMP2);
yading@10 2953 vis_mul8x16al(DST_2, CONST_512, TMP24);
yading@10 2954
yading@10 2955 vis_padd16(TMP4, CONST_3, TMP4);
yading@10 2956 vis_mul8x16al(DST_3, CONST_512, TMP26);
yading@10 2957
yading@10 2958 vis_padd16(TMP6, CONST_3, TMP6);
yading@10 2959
yading@10 2960 vis_padd16(TMP12, TMP20, TMP12);
yading@10 2961 vis_mul8x16al(REF_S0, CONST_512, TMP20);
yading@10 2962
yading@10 2963 vis_padd16(TMP14, TMP22, TMP14);
yading@10 2964 vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
yading@10 2965
yading@10 2966 vis_padd16(TMP16, TMP24, TMP16);
yading@10 2967 vis_mul8x16al(REF_S2, CONST_512, TMP24);
yading@10 2968
yading@10 2969 vis_padd16(TMP18, TMP26, TMP18);
yading@10 2970 vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
yading@10 2971
yading@10 2972 vis_padd16(TMP12, TMP0, TMP12);
yading@10 2973 vis_mul8x16au(REF_2, CONST_256, TMP28);
yading@10 2974
yading@10 2975 vis_padd16(TMP14, TMP2, TMP14);
yading@10 2976 vis_mul8x16au(REF_2_1, CONST_256, TMP30);
yading@10 2977
yading@10 2978 vis_padd16(TMP16, TMP4, TMP16);
yading@10 2979 vis_mul8x16au(REF_6, CONST_256, REF_S4);
yading@10 2980
yading@10 2981 vis_padd16(TMP18, TMP6, TMP18);
yading@10 2982 vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
yading@10 2983
yading@10 2984 vis_pack16(TMP12, DST_0);
yading@10 2985 vis_padd16(TMP28, TMP0, TMP12);
yading@10 2986
yading@10 2987 vis_pack16(TMP14, DST_1);
yading@10 2988 vis_st64(DST_0, dest[0]);
yading@10 2989 vis_padd16(TMP30, TMP2, TMP14);
yading@10 2990
yading@10 2991 vis_pack16(TMP16, DST_2);
yading@10 2992 vis_padd16(REF_S4, TMP4, TMP16);
yading@10 2993
yading@10 2994 vis_pack16(TMP18, DST_3);
yading@10 2995 vis_st64_2(DST_2, dest, 8);
yading@10 2996 dest += stride;
yading@10 2997 vis_padd16(REF_S6, TMP6, TMP18);
yading@10 2998
yading@10 2999 vis_padd16(TMP12, TMP20, TMP12);
yading@10 3000
yading@10 3001 vis_padd16(TMP14, TMP22, TMP14);
yading@10 3002 vis_pack16(TMP12, DST_0);
yading@10 3003
yading@10 3004 vis_padd16(TMP16, TMP24, TMP16);
yading@10 3005 vis_pack16(TMP14, DST_1);
yading@10 3006 vis_st64(DST_0, dest[0]);
yading@10 3007
yading@10 3008 vis_padd16(TMP18, TMP26, TMP18);
yading@10 3009 vis_pack16(TMP16, DST_2);
yading@10 3010
yading@10 3011 vis_pack16(TMP18, DST_3);
yading@10 3012 vis_st64_2(DST_2, dest, 8);
yading@10 3013 dest += stride;
yading@10 3014 } while (--height);
yading@10 3015 }
yading@10 3016
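/* put, x+y half-pel offset, 16-pixel-wide rows: four-pixel average with a
 * +1 bias (constants1) rather than the +2 used by the rounding versions,
 * i.e. the no-rounding variant. */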
yading@10 3017 static void MC_put_no_round_xy_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 3018 const ptrdiff_t stride, int height)
yading@10 3019 {
yading@10 3020 unsigned long off = (unsigned long) ref & 0x7;
yading@10 3021 unsigned long off_plus_1 = off + 1;
yading@10 3022 int stride_8 = stride + 8;
yading@10 3023 int stride_16 = stride + 16;
yading@10 3024
yading@10 3025 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
yading@10 3026
yading@10 3027 ref = vis_alignaddr(ref);
yading@10 3028
yading@10 3029 vis_ld64(ref[ 0], TMP0);
yading@10 3030 vis_fzero(ZERO);
yading@10 3031
yading@10 3032 vis_ld64(ref[ 8], TMP2);
yading@10 3033
yading@10 3034 vis_ld64(ref[16], TMP4);
yading@10 3035
yading@10 3036 vis_ld64(constants1[0], CONST_1);
yading@10 3037 vis_faligndata(TMP0, TMP2, REF_S0);
yading@10 3038
yading@10 3039 vis_ld64(constants256_512[0], CONST_256);
yading@10 3040 vis_faligndata(TMP2, TMP4, REF_S4);
yading@10 3041
yading@10 3042 if (off != 0x7) {
yading@10 3043 vis_alignaddr_g0((void *)off_plus_1);
yading@10 3044 vis_faligndata(TMP0, TMP2, REF_S2);
yading@10 3045 vis_faligndata(TMP2, TMP4, REF_S6);
yading@10 3046 } else {
yading@10 3047 vis_src1(TMP2, REF_S2);
yading@10 3048 vis_src1(TMP4, REF_S6);
yading@10 3049 }
yading@10 3050
yading@10 3051 height >>= 1;
yading@10 3052 do {
yading@10 3053 vis_ld64_2(ref, stride, TMP0);
yading@10 3054 vis_mul8x16au(REF_S0, CONST_256, TMP12);
yading@10 3055 vis_pmerge(ZERO, REF_S0_1, TMP14);
yading@10 3056
yading@10 3057 vis_alignaddr_g0((void *)off);
yading@10 3058
yading@10 3059 vis_ld64_2(ref, stride_8, TMP2);
yading@10 3060 vis_mul8x16au(REF_S2, CONST_256, TMP16);
yading@10 3061 vis_pmerge(ZERO, REF_S2_1, TMP18);
yading@10 3062
yading@10 3063 vis_ld64_2(ref, stride_16, TMP4);
yading@10 3064 ref += stride;
yading@10 3065 vis_mul8x16au(REF_S4, CONST_256, TMP20);
yading@10 3066 vis_pmerge(ZERO, REF_S4_1, TMP22);
yading@10 3067
yading@10 3068 vis_ld64_2(ref, stride, TMP6);
yading@10 3069 vis_mul8x16au(REF_S6, CONST_256, TMP24);
yading@10 3070 vis_pmerge(ZERO, REF_S6_1, TMP26);
yading@10 3071
yading@10 3072 vis_ld64_2(ref, stride_8, TMP8);
yading@10 3073 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 3074
yading@10 3075 vis_ld64_2(ref, stride_16, TMP10);
yading@10 3076 ref += stride;
yading@10 3077 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 3078
yading@10 3079 vis_faligndata(TMP6, TMP8, REF_S0);
yading@10 3080
yading@10 3081 vis_faligndata(TMP8, TMP10, REF_S4);
yading@10 3082
yading@10 3083 if (off != 0x7) {
yading@10 3084 vis_alignaddr_g0((void *)off_plus_1);
yading@10 3085 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 3086 vis_faligndata(TMP2, TMP4, REF_6);
yading@10 3087 vis_faligndata(TMP6, TMP8, REF_S2);
yading@10 3088 vis_faligndata(TMP8, TMP10, REF_S6);
yading@10 3089 } else {
yading@10 3090 vis_src1(TMP2, REF_2);
yading@10 3091 vis_src1(TMP4, REF_6);
yading@10 3092 vis_src1(TMP8, REF_S2);
yading@10 3093 vis_src1(TMP10, REF_S6);
yading@10 3094 }
yading@10 3095
yading@10 3096 vis_mul8x16au(REF_0, CONST_256, TMP0);
yading@10 3097 vis_pmerge(ZERO, REF_0_1, TMP2);
yading@10 3098
yading@10 3099 vis_mul8x16au(REF_2, CONST_256, TMP4);
yading@10 3100 vis_pmerge(ZERO, REF_2_1, TMP6);
yading@10 3101
yading@10 3102 vis_padd16(TMP0, CONST_2, TMP8);
yading@10 3103 vis_mul8x16au(REF_4, CONST_256, TMP0);
yading@10 3104
yading@10 3105 vis_padd16(TMP2, CONST_1, TMP10);
yading@10 3106 vis_mul8x16au(REF_4_1, CONST_256, TMP2);
yading@10 3107
yading@10 3108 vis_padd16(TMP8, TMP4, TMP8);
yading@10 3109 vis_mul8x16au(REF_6, CONST_256, TMP4);
yading@10 3110
yading@10 3111 vis_padd16(TMP10, TMP6, TMP10);
yading@10 3112 vis_mul8x16au(REF_6_1, CONST_256, TMP6);
yading@10 3113
yading@10 3114 vis_padd16(TMP12, TMP8, TMP12);
yading@10 3115
yading@10 3116 vis_padd16(TMP14, TMP10, TMP14);
yading@10 3117
yading@10 3118 vis_padd16(TMP12, TMP16, TMP12);
yading@10 3119
yading@10 3120 vis_padd16(TMP14, TMP18, TMP14);
yading@10 3121 vis_pack16(TMP12, DST_0);
yading@10 3122
yading@10 3123 vis_pack16(TMP14, DST_1);
yading@10 3124 vis_st64(DST_0, dest[0]);
yading@10 3125 vis_padd16(TMP0, CONST_1, TMP12);
yading@10 3126
yading@10 3127 vis_mul8x16au(REF_S0, CONST_256, TMP0);
yading@10 3128 vis_padd16(TMP2, CONST_1, TMP14);
yading@10 3129
yading@10 3130 vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
yading@10 3131 vis_padd16(TMP12, TMP4, TMP12);
yading@10 3132
yading@10 3133 vis_mul8x16au(REF_S2, CONST_256, TMP4);
yading@10 3134 vis_padd16(TMP14, TMP6, TMP14);
yading@10 3135
yading@10 3136 vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
yading@10 3137 vis_padd16(TMP20, TMP12, TMP20);
yading@10 3138
yading@10 3139 vis_padd16(TMP22, TMP14, TMP22);
yading@10 3140
yading@10 3141 vis_padd16(TMP20, TMP24, TMP20);
yading@10 3142
yading@10 3143 vis_padd16(TMP22, TMP26, TMP22);
yading@10 3144 vis_pack16(TMP20, DST_2);
yading@10 3145
yading@10 3146 vis_pack16(TMP22, DST_3);
yading@10 3147 vis_st64_2(DST_2, dest, 8);
yading@10 3148 dest += stride;
yading@10 3149 vis_padd16(TMP0, TMP4, TMP24);
yading@10 3150
yading@10 3151 vis_mul8x16au(REF_S4, CONST_256, TMP0);
yading@10 3152 vis_padd16(TMP2, TMP6, TMP26);
yading@10 3153
yading@10 3154 vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
yading@10 3155 vis_padd16(TMP24, TMP8, TMP24);
yading@10 3156
yading@10 3157 vis_padd16(TMP26, TMP10, TMP26);
yading@10 3158 vis_pack16(TMP24, DST_0);
yading@10 3159
yading@10 3160 vis_pack16(TMP26, DST_1);
yading@10 3161 vis_st64(DST_0, dest[0]);
yading@10 3162 vis_pmerge(ZERO, REF_S6, TMP4);
yading@10 3163
yading@10 3164 vis_pmerge(ZERO, REF_S6_1, TMP6);
yading@10 3165
yading@10 3166 vis_padd16(TMP0, TMP4, TMP0);
yading@10 3167
yading@10 3168 vis_padd16(TMP2, TMP6, TMP2);
yading@10 3169
yading@10 3170 vis_padd16(TMP0, TMP12, TMP0);
yading@10 3171
yading@10 3172 vis_padd16(TMP2, TMP14, TMP2);
yading@10 3173 vis_pack16(TMP0, DST_2);
yading@10 3174
yading@10 3175 vis_pack16(TMP2, DST_3);
yading@10 3176 vis_st64_2(DST_2, dest, 8);
yading@10 3177 dest += stride;
yading@10 3178 } while (--height);
yading@10 3179 }
yading@10 3180
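/* put, x+y half-pel offset, 8-pixel-wide rows: no-rounding four-pixel
 * average (+1 bias from constants1). */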
yading@10 3181 static void MC_put_no_round_xy_8_vis (uint8_t * dest, const uint8_t * ref,
yading@10 3182 const ptrdiff_t stride, int height)
yading@10 3183 {
yading@10 3184 unsigned long off = (unsigned long) ref & 0x7;
yading@10 3185 unsigned long off_plus_1 = off + 1;
yading@10 3186 int stride_8 = stride + 8;
yading@10 3187
yading@10 3188 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
yading@10 3189
yading@10 3190 ref = vis_alignaddr(ref);
yading@10 3191
yading@10 3192 vis_ld64(ref[ 0], TMP0);
yading@10 3193 vis_fzero(ZERO);
yading@10 3194
yading@10 3195 vis_ld64(ref[ 8], TMP2);
yading@10 3196
yading@10 3197 vis_ld64(constants1[0], CONST_1);
yading@10 3198
yading@10 3199 vis_ld64(constants256_512[0], CONST_256);
yading@10 3200 vis_faligndata(TMP0, TMP2, REF_S0);
yading@10 3201
yading@10 3202 if (off != 0x7) {
yading@10 3203 vis_alignaddr_g0((void *)off_plus_1);
yading@10 3204 vis_faligndata(TMP0, TMP2, REF_S2);
yading@10 3205 } else {
yading@10 3206 vis_src1(TMP2, REF_S2);
yading@10 3207 }
yading@10 3208
yading@10 3209 height >>= 1;
yading@10 3210 do { /* 26 cycles */
yading@10 3211 vis_ld64_2(ref, stride, TMP0);
yading@10 3212 vis_mul8x16au(REF_S0, CONST_256, TMP8);
yading@10 3213 vis_pmerge(ZERO, REF_S2, TMP12);
yading@10 3214
yading@10 3215 vis_alignaddr_g0((void *)off);
yading@10 3216
yading@10 3217 vis_ld64_2(ref, stride_8, TMP2);
yading@10 3218 ref += stride;
yading@10 3219 vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
yading@10 3220 vis_pmerge(ZERO, REF_S2_1, TMP14);
yading@10 3221
yading@10 3222 vis_ld64_2(ref, stride, TMP4);
yading@10 3223
yading@10 3224 vis_ld64_2(ref, stride_8, TMP6);
yading@10 3225 ref += stride;
yading@10 3226 vis_faligndata(TMP0, TMP2, REF_S4);
yading@10 3227
yading@10 3228 vis_pmerge(ZERO, REF_S4, TMP18);
yading@10 3229
yading@10 3230 vis_pmerge(ZERO, REF_S4_1, TMP20);
yading@10 3231
yading@10 3232 vis_faligndata(TMP4, TMP6, REF_S0);
yading@10 3233
yading@10 3234 if (off != 0x7) {
yading@10 3235 vis_alignaddr_g0((void *)off_plus_1);
yading@10 3236 vis_faligndata(TMP0, TMP2, REF_S6);
yading@10 3237 vis_faligndata(TMP4, TMP6, REF_S2);
yading@10 3238 } else {
yading@10 3239 vis_src1(TMP2, REF_S6);
yading@10 3240 vis_src1(TMP6, REF_S2);
yading@10 3241 }
yading@10 3242
yading@10 3243 vis_padd16(TMP18, CONST_1, TMP18);
yading@10 3244 vis_mul8x16au(REF_S6, CONST_256, TMP22);
yading@10 3245
yading@10 3246 vis_padd16(TMP20, CONST_1, TMP20);
yading@10 3247 vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
yading@10 3248
yading@10 3249 vis_mul8x16au(REF_S0, CONST_256, TMP26);
yading@10 3250 vis_pmerge(ZERO, REF_S0_1, TMP28);
yading@10 3251
yading@10 3252 vis_mul8x16au(REF_S2, CONST_256, TMP30);
yading@10 3253 vis_padd16(TMP18, TMP22, TMP18);
yading@10 3254
yading@10 3255 vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
yading@10 3256 vis_padd16(TMP20, TMP24, TMP20);
yading@10 3257
yading@10 3258 vis_padd16(TMP8, TMP18, TMP8);
yading@10 3259
yading@10 3260 vis_padd16(TMP10, TMP20, TMP10);
yading@10 3261
yading@10 3262 vis_padd16(TMP8, TMP12, TMP8);
yading@10 3263
yading@10 3264 vis_padd16(TMP10, TMP14, TMP10);
yading@10 3265 vis_pack16(TMP8, DST_0);
yading@10 3266
yading@10 3267 vis_pack16(TMP10, DST_1);
yading@10 3268 vis_st64(DST_0, dest[0]);
yading@10 3269 dest += stride;
yading@10 3270 vis_padd16(TMP18, TMP26, TMP18);
yading@10 3271
yading@10 3272 vis_padd16(TMP20, TMP28, TMP20);
yading@10 3273
yading@10 3274 vis_padd16(TMP18, TMP30, TMP18);
yading@10 3275
yading@10 3276 vis_padd16(TMP20, TMP32, TMP20);
yading@10 3277 vis_pack16(TMP18, DST_2);
yading@10 3278
yading@10 3279 vis_pack16(TMP20, DST_3);
yading@10 3280 vis_st64(DST_2, dest[0]);
yading@10 3281 dest += stride;
yading@10 3282 } while (--height);
yading@10 3283 }
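
For reference, the per-pixel arithmetic behind MC_put_no_round_xy_8_vis is the truncating two-dimensional half-pel average: each output byte is (a + b + c + d + 1) >> 2, where a and b are horizontally adjacent pixels from one source row and c and d the pair directly below them. The "no round" in the name is the "+1" (the rounded variant adds 2); CONST_1 supplies it, and the GSR scale factor of 5 set at the top of the function makes the final vis_pack16 perform the ">>2". A minimal scalar sketch of the same operation (the function name is illustrative, not part of this file):

static void put_no_round_xy_8_ref(uint8_t *dest, const uint8_t *ref,
                                  ptrdiff_t stride, int height)
{
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < 8; x++) {
            /* four neighbours around the half-pel position */
            int a = ref[x],          b = ref[x + 1];
            int c = ref[x + stride], d = ref[x + stride + 1];
            dest[x] = (a + b + c + d + 1) >> 2;   /* +1, not +2: no rounding */
        }
        ref  += stride;
        dest += stride;
    }
}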
yading@10 3284
yading@10 3285 static void MC_avg_no_round_xy_16_vis (uint8_t * dest, const uint8_t * ref,
yading@10 3286 const ptrdiff_t stride, int height)
yading@10 3287 {
yading@10 3288 unsigned long off = (unsigned long) ref & 0x7;
yading@10 3289 unsigned long off_plus_1 = off + 1;
yading@10 3290 int stride_8 = stride + 8;
yading@10 3291 int stride_16 = stride + 16;
yading@10 3292
yading@10 3293 vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
yading@10 3294
yading@10 3295 ref = vis_alignaddr(ref);
yading@10 3296
yading@10 3297 vis_ld64(ref[ 0], TMP0);
yading@10 3298 vis_fzero(ZERO);
yading@10 3299
yading@10 3300 vis_ld64(ref[ 8], TMP2);
yading@10 3301
yading@10 3302 vis_ld64(ref[16], TMP4);
yading@10 3303
yading@10 3304 vis_ld64(constants6[0], CONST_6);
yading@10 3305 vis_faligndata(TMP0, TMP2, REF_S0);
yading@10 3306
yading@10 3307 vis_ld64(constants256_1024[0], CONST_256);
yading@10 3308 vis_faligndata(TMP2, TMP4, REF_S4);
yading@10 3309
yading@10 3310 if (off != 0x7) {
yading@10 3311 vis_alignaddr_g0((void *)off_plus_1);
yading@10 3312 vis_faligndata(TMP0, TMP2, REF_S2);
yading@10 3313 vis_faligndata(TMP2, TMP4, REF_S6);
yading@10 3314 } else {
yading@10 3315 vis_src1(TMP2, REF_S2);
yading@10 3316 vis_src1(TMP4, REF_S6);
yading@10 3317 }
yading@10 3318
yading@10 3319 height >>= 1;
yading@10 3320 do { /* 55 cycles */
yading@10 3321 vis_ld64_2(ref, stride, TMP0);
yading@10 3322 vis_mul8x16au(REF_S0, CONST_256, TMP12);
yading@10 3323 vis_pmerge(ZERO, REF_S0_1, TMP14);
yading@10 3324
yading@10 3325 vis_alignaddr_g0((void *)off);
yading@10 3326
yading@10 3327 vis_ld64_2(ref, stride_8, TMP2);
yading@10 3328 vis_mul8x16au(REF_S2, CONST_256, TMP16);
yading@10 3329 vis_pmerge(ZERO, REF_S2_1, TMP18);
yading@10 3330
yading@10 3331 vis_ld64_2(ref, stride_16, TMP4);
yading@10 3332 ref += stride;
yading@10 3333 vis_mul8x16au(REF_S4, CONST_256, TMP20);
yading@10 3334 vis_pmerge(ZERO, REF_S4_1, TMP22);
yading@10 3335
yading@10 3336 vis_ld64_2(ref, stride, TMP6);
yading@10 3337 vis_mul8x16au(REF_S6, CONST_256, TMP24);
yading@10 3338 vis_pmerge(ZERO, REF_S6_1, TMP26);
yading@10 3339
yading@10 3340 vis_ld64_2(ref, stride_8, TMP8);
yading@10 3341 vis_faligndata(TMP0, TMP2, REF_0);
yading@10 3342
yading@10 3343 vis_ld64_2(ref, stride_16, TMP10);
yading@10 3344 ref += stride;
yading@10 3345 vis_faligndata(TMP2, TMP4, REF_4);
yading@10 3346
yading@10 3347 vis_ld64(dest[0], DST_0);
yading@10 3348 vis_faligndata(TMP6, TMP8, REF_S0);
yading@10 3349
yading@10 3350 vis_ld64_2(dest, 8, DST_2);
yading@10 3351 vis_faligndata(TMP8, TMP10, REF_S4);
yading@10 3352
yading@10 3353 if (off != 0x7) {
yading@10 3354 vis_alignaddr_g0((void *)off_plus_1);
yading@10 3355 vis_faligndata(TMP0, TMP2, REF_2);
yading@10 3356 vis_faligndata(TMP2, TMP4, REF_6);
yading@10 3357 vis_faligndata(TMP6, TMP8, REF_S2);
yading@10 3358 vis_faligndata(TMP8, TMP10, REF_S6);
yading@10 3359 } else {
yading@10 3360 vis_src1(TMP2, REF_2);
yading@10 3361 vis_src1(TMP4, REF_6);
yading@10 3362 vis_src1(TMP8, REF_S2);
yading@10 3363 vis_src1(TMP10, REF_S6);
yading@10 3364 }
yading@10 3365
yading@10 3366 vis_mul8x16al(DST_0, CONST_1024, TMP30);
yading@10 3367 vis_pmerge(ZERO, REF_0, TMP0);
yading@10 3368
yading@10 3369 vis_mul8x16al(DST_1, CONST_1024, TMP32);
yading@10 3370 vis_pmerge(ZERO, REF_0_1, TMP2);
yading@10 3371
yading@10 3372 vis_mul8x16au(REF_2, CONST_256, TMP4);
yading@10 3373 vis_pmerge(ZERO, REF_2_1, TMP6);
yading@10 3374
yading@10 3375 vis_mul8x16al(DST_2, CONST_1024, REF_0);
yading@10 3376 vis_padd16(TMP0, CONST_6, TMP0);
yading@10 3377
yading@10 3378 vis_mul8x16al(DST_3, CONST_1024, REF_2);
yading@10 3379 vis_padd16(TMP2, CONST_6, TMP2);
yading@10 3380
yading@10 3381 vis_padd16(TMP0, TMP4, TMP0);
yading@10 3382 vis_mul8x16au(REF_4, CONST_256, TMP4);
yading@10 3383
yading@10 3384 vis_padd16(TMP2, TMP6, TMP2);
yading@10 3385 vis_mul8x16au(REF_4_1, CONST_256, TMP6);
yading@10 3386
yading@10 3387 vis_padd16(TMP12, TMP0, TMP12);
yading@10 3388 vis_mul8x16au(REF_6, CONST_256, TMP8);
yading@10 3389
yading@10 3390 vis_padd16(TMP14, TMP2, TMP14);
yading@10 3391 vis_mul8x16au(REF_6_1, CONST_256, TMP10);
yading@10 3392
yading@10 3393 vis_padd16(TMP12, TMP16, TMP12);
yading@10 3394 vis_mul8x16au(REF_S0, CONST_256, REF_4);
yading@10 3395
yading@10 3396 vis_padd16(TMP14, TMP18, TMP14);
yading@10 3397 vis_mul8x16au(REF_S0_1, CONST_256, REF_6);
yading@10 3398
yading@10 3399 vis_padd16(TMP12, TMP30, TMP12);
yading@10 3400
yading@10 3401 vis_padd16(TMP14, TMP32, TMP14);
yading@10 3402 vis_pack16(TMP12, DST_0);
yading@10 3403
yading@10 3404 vis_pack16(TMP14, DST_1);
yading@10 3405 vis_st64(DST_0, dest[0]);
yading@10 3406 vis_padd16(TMP4, CONST_6, TMP4);
yading@10 3407
yading@10 3408 vis_ld64_2(dest, stride, DST_0);
yading@10 3409 vis_padd16(TMP6, CONST_6, TMP6);
yading@10 3410 vis_mul8x16au(REF_S2, CONST_256, TMP12);
yading@10 3411
yading@10 3412 vis_padd16(TMP4, TMP8, TMP4);
yading@10 3413 vis_mul8x16au(REF_S2_1, CONST_256, TMP14);
yading@10 3414
yading@10 3415 vis_padd16(TMP6, TMP10, TMP6);
yading@10 3416
yading@10 3417 vis_padd16(TMP20, TMP4, TMP20);
yading@10 3418
yading@10 3419 vis_padd16(TMP22, TMP6, TMP22);
yading@10 3420
yading@10 3421 vis_padd16(TMP20, TMP24, TMP20);
yading@10 3422
yading@10 3423 vis_padd16(TMP22, TMP26, TMP22);
yading@10 3424
yading@10 3425 vis_padd16(TMP20, REF_0, TMP20);
yading@10 3426 vis_mul8x16au(REF_S4, CONST_256, REF_0);
yading@10 3427
yading@10 3428 vis_padd16(TMP22, REF_2, TMP22);
yading@10 3429 vis_pack16(TMP20, DST_2);
yading@10 3430
yading@10 3431 vis_pack16(TMP22, DST_3);
yading@10 3432 vis_st64_2(DST_2, dest, 8);
yading@10 3433 dest += stride;
yading@10 3434
yading@10 3435 vis_ld64_2(dest, 8, DST_2);
yading@10 3436 vis_mul8x16al(DST_0, CONST_1024, TMP30);
yading@10 3437 vis_pmerge(ZERO, REF_S4_1, REF_2);
yading@10 3438
yading@10 3439 vis_mul8x16al(DST_1, CONST_1024, TMP32);
yading@10 3440 vis_padd16(REF_4, TMP0, TMP8);
yading@10 3441
yading@10 3442 vis_mul8x16au(REF_S6, CONST_256, REF_4);
yading@10 3443 vis_padd16(REF_6, TMP2, TMP10);
yading@10 3444
yading@10 3445 vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
yading@10 3446 vis_padd16(TMP8, TMP12, TMP8);
yading@10 3447
yading@10 3448 vis_padd16(TMP10, TMP14, TMP10);
yading@10 3449
yading@10 3450 vis_padd16(TMP8, TMP30, TMP8);
yading@10 3451
yading@10 3452 vis_padd16(TMP10, TMP32, TMP10);
yading@10 3453 vis_pack16(TMP8, DST_0);
yading@10 3454
yading@10 3455 vis_pack16(TMP10, DST_1);
yading@10 3456 vis_st64(DST_0, dest[0]);
yading@10 3457
yading@10 3458 vis_padd16(REF_0, TMP4, REF_0);
yading@10 3459
yading@10 3460 vis_mul8x16al(DST_2, CONST_1024, TMP30);
yading@10 3461 vis_padd16(REF_2, TMP6, REF_2);
yading@10 3462
yading@10 3463 vis_mul8x16al(DST_3, CONST_1024, TMP32);
yading@10 3464 vis_padd16(REF_0, REF_4, REF_0);
yading@10 3465
yading@10 3466 vis_padd16(REF_2, REF_6, REF_2);
yading@10 3467
yading@10 3468 vis_padd16(REF_0, TMP30, REF_0);
yading@10 3469
yading@10 3470 /* stall */
yading@10 3471
yading@10 3472 vis_padd16(REF_2, TMP32, REF_2);
yading@10 3473 vis_pack16(REF_0, DST_2);
yading@10 3474
yading@10 3475 vis_pack16(REF_2, DST_3);
yading@10 3476 vis_st64_2(DST_2, dest, 8);
yading@10 3477 dest += stride;
yading@10 3478 } while (--height);
yading@10 3479 }
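
MC_avg_no_round_xy_16_vis fuses the same truncating half-pel interpolation with an average against the pixels already in dest. Tracing the constants through the loop (each destination byte enters the sum as 4*dest via the mul8x16al by CONST_1024, CONST_6 adds 6 to the four-tap sum, and the GSR scale factor of 4 makes the final vis_pack16 act as a ">>3"), each output byte appears to work out to (4*dest + a + b + c + d + 6) >> 3. A scalar sketch under that reading (again with an illustrative name; the saturation that vis_pack16 provides is omitted, since this expression cannot exceed 255 anyway):

static void avg_no_round_xy_16_ref(uint8_t *dest, const uint8_t *ref,
                                   ptrdiff_t stride, int height)
{
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < 16; x++) {
            int a = ref[x],          b = ref[x + 1];
            int c = ref[x + stride], d = ref[x + stride + 1];
            /* single-shift fusion of the four-tap average with the
             * rounded average against the existing destination pixel */
            dest[x] = (4 * dest[x] + a + b + c + d + 6) >> 3;
        }
        ref  += stride;
        dest += stride;
    }
}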
yading@10 3480
yading@10 3481 /* End of no rounding code */
yading@10 3482
yading@10 3483 av_cold void ff_hpeldsp_init_vis(HpelDSPContext *c, int flags)
yading@10 3484 {
yading@10 3485 /* VIS-specific optimizations */
yading@10 3486 int accel = vis_level ();
yading@10 3487
yading@10 3488 if (accel & ACCEL_SPARC_VIS) {
yading@10 3489 c->put_pixels_tab[0][0] = MC_put_o_16_vis;
yading@10 3490 c->put_pixels_tab[0][1] = MC_put_x_16_vis;
yading@10 3491 c->put_pixels_tab[0][2] = MC_put_y_16_vis;
yading@10 3492 c->put_pixels_tab[0][3] = MC_put_xy_16_vis;
yading@10 3493
yading@10 3494 c->put_pixels_tab[1][0] = MC_put_o_8_vis;
yading@10 3495 c->put_pixels_tab[1][1] = MC_put_x_8_vis;
yading@10 3496 c->put_pixels_tab[1][2] = MC_put_y_8_vis;
yading@10 3497 c->put_pixels_tab[1][3] = MC_put_xy_8_vis;
yading@10 3498
yading@10 3499 c->avg_pixels_tab[0][0] = MC_avg_o_16_vis;
yading@10 3500 c->avg_pixels_tab[0][1] = MC_avg_x_16_vis;
yading@10 3501 c->avg_pixels_tab[0][2] = MC_avg_y_16_vis;
yading@10 3502 c->avg_pixels_tab[0][3] = MC_avg_xy_16_vis;
yading@10 3503
yading@10 3504 c->avg_pixels_tab[1][0] = MC_avg_o_8_vis;
yading@10 3505 c->avg_pixels_tab[1][1] = MC_avg_x_8_vis;
yading@10 3506 c->avg_pixels_tab[1][2] = MC_avg_y_8_vis;
yading@10 3507 c->avg_pixels_tab[1][3] = MC_avg_xy_8_vis;
yading@10 3508
yading@10 3509 c->put_no_rnd_pixels_tab[0][0] = MC_put_no_round_o_16_vis;
yading@10 3510 c->put_no_rnd_pixels_tab[0][1] = MC_put_no_round_x_16_vis;
yading@10 3511 c->put_no_rnd_pixels_tab[0][2] = MC_put_no_round_y_16_vis;
yading@10 3512 c->put_no_rnd_pixels_tab[0][3] = MC_put_no_round_xy_16_vis;
yading@10 3513
yading@10 3514 c->put_no_rnd_pixels_tab[1][0] = MC_put_no_round_o_8_vis;
yading@10 3515 c->put_no_rnd_pixels_tab[1][1] = MC_put_no_round_x_8_vis;
yading@10 3516 c->put_no_rnd_pixels_tab[1][2] = MC_put_no_round_y_8_vis;
yading@10 3517 c->put_no_rnd_pixels_tab[1][3] = MC_put_no_round_xy_8_vis;
yading@10 3518
yading@10 3519 c->avg_no_rnd_pixels_tab[0] = MC_avg_no_round_o_16_vis;
yading@10 3520 c->avg_no_rnd_pixels_tab[1] = MC_avg_no_round_x_16_vis;
yading@10 3521 c->avg_no_rnd_pixels_tab[2] = MC_avg_no_round_y_16_vis;
yading@10 3522 c->avg_no_rnd_pixels_tab[3] = MC_avg_no_round_xy_16_vis;
yading@10 3523 }
yading@10 3524 }
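
A note on the dispatch tables filled in above: the first index of put_pixels_tab / avg_pixels_tab selects the block width ([0] = 16 pixels wide, [1] = 8), the second selects the half-pel case (0 = full-pel, 1 = horizontal half-pel, 2 = vertical, 3 = both), and avg_no_rnd_pixels_tab only carries the 16-wide variants. A hedged caller sketch, using the usual "dxy" packing of the motion-vector half-pel bits (the helper name and the mx/my parameters are illustrative, not from this file):

static void mc_block(HpelDSPContext *c, uint8_t *dest, const uint8_t *ref,
                     ptrdiff_t stride, int height, int mx, int my, int use_8_wide)
{
    /* bit 0: horizontal half-pel, bit 1: vertical half-pel */
    int dxy = (mx & 1) | ((my & 1) << 1);

    c->put_pixels_tab[use_8_wide ? 1 : 0][dxy](dest, ref, stride, height);
}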