/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem.h"

#ifdef DEBUG
#include <assert.h>
/* abort in debug builds if ptr is not 16-byte aligned */
#define ASSERT_ALIGNED(ptr) assert(!(((unsigned long)(ptr)) & 0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
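/* Horizontal 6-tap half-pel lowpass filter (H.264 coefficients 1, -5, 20, 20, -5, 1).
 * For each of the 16 pixels in a row this computes
 *     dst[x] = clip_u8(((src[x-2] + src[x+3]) + 20 * (src[x] + src[x+1])
 *                       - 5 * (src[x-1] + src[x+2]) + 16) >> 5)
 * with the source bytes widened to 16 bits (vec_mergeh/vec_mergel with zero)
 * so the intermediate sums cannot overflow. */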
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);
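    /* 'align' is the offset of (src - 2) within its 16-byte block.  vec_ld only
     * loads from 16-byte-aligned addresses, so each row is assembled from two
     * (or, for align >= 12, three) aligned loads combined with vec_perm using
     * the vec_lvsl permute vectors above.  Cases 11..15 are the alignments
     * where one of the six shifted vectors either is exactly srcR2 or needs
     * bytes from a third 16-byte block. */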

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

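        /* OP_U8_ALTIVEC is supplied by the file that includes this template:
         * a plain copy of the filtered vector for the put_* functions, an
         * average with the existing dst pixels for the avg_* functions. */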
        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
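/* Vertical 6-tap half-pel lowpass filter: the same coefficients as the
 * horizontal pass, applied down a column, i.e.
 *     dst = clip_u8(((row[-2] + row[3]) + 20 * (row[0] + row[1])
 *                    - 5 * (row[-1] + row[2]) + 16) >> 5).
 * Five input rows are loaded up front; the loop keeps a six-row sliding
 * window and shifts the registers down by one row per iteration. */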
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
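/* Combined horizontal + vertical (2D) lowpass filter.  Pass 1 runs the
 * horizontal 6-tap filter over 16 + 5 = 21 rows without rounding or clipping
 * and stores the 16-bit intermediates in tmp; pass 2 runs the vertical 6-tap
 * filter over tmp with 32-bit arithmetic, rounds with +512, shifts right by
 * 10 and saturates the result to 8 bits. */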
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

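    /* Pass 2: prime a six-row sliding window with the first five rows of
     * 16-bit intermediates from tmp; the sixth row is loaded inside the
     * loop below, and the window is shifted by one row per iteration. */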
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

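        /* The vertical sums exceed 16-bit range here, so the multiplies are
         * done as 32-bit even/odd products (vec_mule/vec_mulo).  For sum3
         * (weight 1) the even lanes are recovered by an arithmetic shift of
         * the reinterpreted 32-bit words by 16, the odd lanes by a multiply
         * by 1; after the final packs, mperm interleaves the even and odd
         * results back into pixel order. */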
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif