ffmpeg/libavcodec/alpha/motion_est_alpha.c
/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "dsputil_alpha.h"
#include "asm.h"
void get_pixels_mvi(int16_t *restrict block,
                    const uint8_t *restrict pixels, int line_size)
{
    int h = 8;

    do {
        uint64_t p;

        p = ldq(pixels);
        stq(unpkbw(p),       block);
        stq(unpkbw(p >> 32), block + 4);

        pixels += line_size;
        block  += 8;
    } while (--h);
}
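
/* For reference: a hedged scalar sketch of the routine above, kept
   un-compiled.  unpkbw() zero-extends the low four bytes of its argument
   into four 16-bit lanes, so the loop simply widens an 8x8 block of
   unsigned pixels into 16-bit coefficients (get_pixels_ref is
   illustrative, not part of the MVI API): */
#if 0
static void get_pixels_ref(int16_t *block, const uint8_t *pixels,
                           int line_size)
{
    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++)
            block[j] = pixels[j];   /* zero-extend each pixel to 16 bits */
        pixels += line_size;
        block  += 8;
    }
}
#endif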

void diff_pixels_mvi(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                     int stride)
{
    int h = 8;
    uint64_t mask = 0x4040;

    mask |= mask << 16;
    mask |= mask << 32;
    do {
        uint64_t x, y, c, d, a;
        uint64_t signs;

        x = ldq(s1);
        y = ldq(s2);
        c = cmpbge(x, y);
        d = x - y;
        a = zap(mask, c);       /* We use 0x4040404040404040 here...  */
        d += 4 * a;             /* ...so we can use s4addq here.      */
        signs = zap(-1, c);

        stq(unpkbw(d)       | (unpkbw(signs)       << 8), block);
        stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4);

        s1    += stride;
        s2    += stride;
        block += 8;
    } while (--h);
}
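
/* How the trick above works: cmpbge(x, y) sets a flag bit for each byte
   lane where x >= y, so zap(mask, c) keeps 0x40 only in the lanes that
   borrowed during d = x - y.  Adding 4 * a (a single s4addq) puts 0x100
   back at each borrowing lane, repaying the borrow that leaked into the
   next byte, so every byte of d becomes (s1[i] - s2[i]) mod 256.  The
   signs quadword holds 0xff in the negative lanes and supplies the high
   byte that sign-extends each result to 16 bits.  A hedged scalar sketch
   of the value computed (diff_pixels_ref is illustrative only): */
#if 0
static void diff_pixels_ref(int16_t *block, const uint8_t *s1,
                            const uint8_t *s2, int stride)
{
    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++)
            block[j] = s1[j] - s2[j];   /* signed difference, -255..255 */
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
#endif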

static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
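
/* avg2() relies on the carry-free identity
   (a | b) - (((a ^ b) & ~1) >> 1) == (a + b + 1) >> 1, applied to all
   eight byte lanes at once; masking with BYTE_VEC(0xfe) keeps the shift
   from pulling bits across lane boundaries.  A hedged one-byte sketch: */
#if 0
static inline unsigned avg2_byte_ref(unsigned a, unsigned b)
{
    return (a + b + 1) >> 1;    /* rounded-up average of two pixels */
}
#endif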

static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
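
/* avg4() computes (l1 + l2 + l3 + l4 + 2) >> 2 per byte without letting
   carries cross lanes: r1 sums the top six bits of each byte (each term
   at most 0x3f, so four of them still fit in a byte), while r2 rounds the
   four low two-bit parts (at most 4*3 + 2 = 14, again carry-free).
   A hedged one-byte sketch: */
#if 0
static inline unsigned avg4_byte_ref(unsigned a, unsigned b,
                                     unsigned c, unsigned d)
{
    return (a + b + c + d + 2) >> 2;    /* rounded average of four pixels */
}
#endif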

int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 8 pixels at a time */
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = uldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        do {
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = ldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return result;
}
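
/* perr() maps to the MVI pixel-error instruction: it accumulates the sum
   of absolute differences (SAD) over eight byte pairs per call.  A hedged
   scalar sketch of the whole routine (pix_abs8x8_ref is illustrative): */
#if 0
static int pix_abs8x8_ref(const uint8_t *pix1, const uint8_t *pix2,
                          int line_size, int h)
{
    int result = 0;
    do {
        for (int j = 0; j < 8; j++) {
            int d = pix1[j] - pix2[j];
            result += d < 0 ? -d : d;   /* absolute pixel difference */
        }
        pix1 += line_size;
        pix2 += line_size;
    } while (--h);
    return result;
}
#endif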

#if 0 /* now done in assembly */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 16 pixels at a time */
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t t;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            t    = ldq_u(pix2 + 8);
            p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    } else {
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            p2_l = ldq(pix2);
            p2_r = ldq(pix2 + 8);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    }

    return result;
}
#endif
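
/* The extql()/extqh() pairs used above (and in the unaligned paths
   below) assemble an unaligned 64-bit load from two aligned ldq_u()
   fetches: extql() shifts the low quadword right by the byte
   misalignment and extqh() shifts the high quadword left by the
   complement, so OR-ing the halves yields the bytes at pix2.  A hedged
   little-endian sketch of the value being reconstructed (uldq_ref is
   illustrative only): */
#if 0
static inline uint64_t uldq_ref(const uint8_t *p)
{
    uint64_t r = 0;
    for (int i = 0; i < 8; i++)
        r |= (uint64_t) p[i] << (8 * i);    /* little-endian byte gather */
    return r;
}
#endif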

int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;
    uint64_t disalign = (size_t) pix2 & 0x7;

    switch (disalign) {
    case 0:
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq(pix2);
            r    = ldq(pix2 + 8);
            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    case 7:
        /* |.......l|lllllllr|rrrrrrr*|
           This case is special because disalign1 would be 8, which
           gets treated as 0 by extqh.  At least it is a bit faster
           that way :) */
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq_u(pix2);
            m    = ldq_u(pix2 + 8);
            r    = ldq_u(pix2 + 16);
            p2_l = avg2(extql(l, disalign) | extqh(m, disalign), m);
            p2_r = avg2(extql(m, disalign) | extqh(r, disalign), r);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    default:
        do {
            uint64_t disalign1 = disalign + 1;
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq_u(pix2);
            m    = ldq_u(pix2 + 8);
            r    = ldq_u(pix2 + 16);
            p2_l = avg2(extql(l, disalign)  | extqh(m, disalign),
                        extql(l, disalign1) | extqh(m, disalign1));
            p2_r = avg2(extql(m, disalign)  | extqh(r, disalign),
                        extql(m, disalign1) | extqh(r, disalign1));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    }
    return result;
}
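
/* The (l >> 8) | (r << 56) shuffles above align each byte lane with its
   right-hand neighbour, so every avg2() call averages pix2[j] and
   pix2[j + 1]: a horizontal half-pixel interpolation.  A hedged scalar
   sketch of the quantity accumulated (pix_abs16x16_x2_ref is
   illustrative only): */
#if 0
static int pix_abs16x16_x2_ref(const uint8_t *pix1, const uint8_t *pix2,
                               int line_size, int h)
{
    int result = 0;
    do {
        for (int j = 0; j < 16; j++) {
            int p = (pix2[j] + pix2[j + 1] + 1) >> 1;   /* half-pel sample */
            int d = pix1[j] - p;
            result += d < 0 ? -d : d;
        }
        pix1 += line_size;
        pix2 += line_size;
    } while (--h);
    return result;
}
#endif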

int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        uint64_t t, p2_l, p2_r;
        t    = ldq_u(pix2 + 8);
        p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
        p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;
            uint64_t t;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            t     = ldq_u(pix2 + 8);
            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    } else {
        uint64_t p2_l, p2_r;
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    }
    return result;
}
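
/* The vertical analogue of the x2 case: each row of pix2 is averaged with
   the row below it, and the previous iteration's loads are carried over
   as p2_l/p2_r so every row is fetched only once.  A hedged scalar
   sketch (pix_abs16x16_y2_ref is illustrative only): */
#if 0
static int pix_abs16x16_y2_ref(const uint8_t *pix1, const uint8_t *pix2,
                               int line_size, int h)
{
    int result = 0;
    do {
        for (int j = 0; j < 16; j++) {
            int p = (pix2[j] + pix2[j + line_size] + 1) >> 1;
            int d = pix1[j] - p;
            result += d < 0 ? -d : d;
        }
        pix1 += line_size;
        pix2 += line_size;
    } while (--h);
    return result;
}
#endif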

int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    uint64_t p1_l, p1_r;
    uint64_t p2_l, p2_r, p2_x;

    p1_l = ldq(pix1);
    p1_r = ldq(pix1 + 8);

    if ((size_t) pix2 & 0x7) {  /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t np1_l, np1_r;
        uint64_t np2_l, np2_r, np2_x;

        pix1 += line_size;
        pix2 += line_size;

        np1_l = ldq(pix1);
        np1_r = ldq(pix1 + 8);

        if ((size_t) pix2 & 0x7) {  /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));

        p1_l = np1_l;
        p1_r = np1_r;
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}
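
/* The two-dimensional case: each reference sample is the rounded average
   of a 2x2 neighbourhood, which is what the avg4() calls above compute
   eight lanes at a time (p2_x carrying pixel 16 of the current row).
   A hedged scalar sketch (pix_abs16x16_xy2_ref is illustrative only): */
#if 0
static int pix_abs16x16_xy2_ref(const uint8_t *pix1, const uint8_t *pix2,
                                int line_size, int h)
{
    int result = 0;
    do {
        for (int j = 0; j < 16; j++) {
            int p = (pix2[j]             + pix2[j + 1] +
                     pix2[j + line_size] + pix2[j + line_size + 1] + 2) >> 2;
            int d = pix1[j] - p;
            result += d < 0 ? -d : d;
        }
        pix1 += line_size;
        pix2 += line_size;
    } while (--h);
    return result;
}
#endif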