yading@10
|
1 /*
|
yading@10
|
2 * Alpha optimized DSP utils
|
yading@10
|
3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
|
yading@10
|
4 *
|
yading@10
|
5 * This file is part of FFmpeg.
|
yading@10
|
6 *
|
yading@10
|
7 * FFmpeg is free software; you can redistribute it and/or
|
yading@10
|
8 * modify it under the terms of the GNU Lesser General Public
|
yading@10
|
9 * License as published by the Free Software Foundation; either
|
yading@10
|
10 * version 2.1 of the License, or (at your option) any later version.
|
yading@10
|
11 *
|
yading@10
|
12 * FFmpeg is distributed in the hope that it will be useful,
|
yading@10
|
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
yading@10
|
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
yading@10
|
15 * Lesser General Public License for more details.
|
yading@10
|
16 *
|
yading@10
|
17 * You should have received a copy of the GNU Lesser General Public
|
yading@10
|
18 * License along with FFmpeg; if not, write to the Free Software
|
yading@10
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
yading@10
|
20 */
|
yading@10
|
21
|
yading@10
|
22 #include "dsputil_alpha.h"
|
yading@10
|
23 #include "asm.h"
|
yading@10
|
24
|
yading@10
|
/* Expand an 8x8 block of 8-bit pixels to 16-bit values, using the MVI
 * unpkbw instruction to zero-extend four bytes to four words at a time. */
void get_pixels_mvi(int16_t *restrict block,
                    const uint8_t *restrict pixels, int line_size)
{
    int rows = 8;

    do {
        uint64_t row = ldq(pixels);

        /* Low four bytes -> block[0..3], high four bytes -> block[4..7]. */
        stq(unpkbw(row), block);
        stq(unpkbw(row >> 32), block + 4);

        pixels += line_size;
        block  += 8;
    } while (--rows);
}
|
yading@10
|
41
|
yading@10
|
/* Compute the byte-wise difference s1 - s2 over an 8x8 block and store the
 * results sign-extended to 16 bits.  A full-quadword subtraction is used;
 * the inter-byte borrows are then cancelled with a cmpbge/zap correction so
 * no carry can leak between byte lanes. */
void diff_pixels_mvi(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                     int stride) {
    uint64_t mask = 0x4040;
    int rows = 8;

    mask |= mask << 16;
    mask |= mask << 32;              /* mask = 0x4040404040404040 */
    do {
        uint64_t p, q, ge, diff, bias, sign_mask;

        p    = ldq(s1);
        q    = ldq(s2);
        ge   = cmpbge(p, q);         /* per-byte "p >= q" flag bits */
        diff = p - q;
        bias = zap(mask, ge);        /* 0x40 kept in each borrowing byte... */
        diff += 4 * bias;            /* ...so the fix-up is a single s4addq */
        sign_mask = zap(-1, ge);     /* 0xff in each byte where p < q */

        /* Interleave difference bytes with their sign bytes to form
         * eight 16-bit results per row. */
        stq(unpkbw(diff)       | (unpkbw(sign_mask)       << 8), block);
        stq(unpkbw(diff >> 32) | (unpkbw(sign_mask >> 32) << 8), block + 4);

        s1    += stride;
        s2    += stride;
        block += 8;
    } while (--rows);
}
|
yading@10
|
69
|
yading@10
|
/* Byte-wise rounded average of two quadwords, (a + b + 1) >> 1 per byte,
 * computed without any carry between byte lanes via the identity
 * avg = (a | b) - ((a ^ b) >> 1) restricted to the even bits. */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    uint64_t halved_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;
    return (a | b) - halved_diff;
}
|
yading@10
|
74
|
yading@10
|
/* Byte-wise rounded average of four quadwords, (l1+l2+l3+l4+2) >> 2 per
 * byte.  The high six bits and the low two bits of every byte are summed
 * separately so no carries can spill into neighbouring bytes. */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t high = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                  + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                  + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                  + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t low  = (( (l1 & BYTE_VEC(0x03))
                     + (l2 & BYTE_VEC(0x03))
                     + (l3 & BYTE_VEC(0x03))
                     + (l4 & BYTE_VEC(0x03))
                     + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return high + low;
}
|
yading@10
|
88
|
yading@10
|
/* Sum of absolute differences over an 8-pixel-wide block using the MVI
 * perr instruction.  pix1 is assumed 8-byte aligned; pix2 may not be. */
int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;

    if ((size_t) pix2 & 0x7) {
        /* uldq works only when pix2 is actually unaligned */
        do { /* one 8-pixel row per iteration */
            sad += perr(ldq(pix1), uldq(pix2));
            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        do {
            sad += perr(ldq(pix1), ldq(pix2));
            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return sad;
}
|
yading@10
|
120
|
yading@10
|
#if 0 /* now done in assembly */
/* Reference C implementation of the 16x16 SAD, superseded by the
 * hand-written assembly version and kept only for documentation. */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do { /* one 16-pixel row per iteration */
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t t;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            /* Stitch the unaligned 16-byte reference row together from
             * three aligned loads. */
            t    = ldq_u(pix2 + 8);
            p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l) + perr(p1_r, p2_r);
        } while (--h);
    } else {
        do {
            uint64_t p1_l = ldq(pix1);
            uint64_t p1_r = ldq(pix1 + 8);
            uint64_t p2_l = ldq(pix2);
            uint64_t p2_r = ldq(pix2 + 8);

            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l) + perr(p1_r, p2_r);
        } while (--h);
    }

    return result;
}
#endif
|
yading@10
|
163
|
yading@10
|
/* SAD of a 16-pixel-wide block against the horizontal half-pel
 * interpolation of the reference (average of pix2[x] and pix2[x+1]).
 * pix1 is assumed 8-byte aligned; pix2 may have any alignment. */
int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;
    uint64_t offset = (size_t) pix2 & 0x7;

    switch (offset) {
    case 0:
        /* Aligned reference: the x+1 row is each quadword shifted down a
         * byte with the first byte of the next quadword merged in on top. */
        do {
            uint64_t a_l = ldq(pix1);
            uint64_t a_r = ldq(pix1 + 8);
            uint64_t b_l = ldq(pix2);
            uint64_t b_r = ldq(pix2 + 8);
            uint64_t h_l = avg2(b_l, (b_l >> 8) | ((uint64_t) b_r << 56));
            uint64_t h_r = avg2(b_r, (b_r >> 8) | ((uint64_t) pix2[16] << 56));

            pix1 += line_size;
            pix2 += line_size;

            result += perr(a_l, h_l) + perr(a_r, h_r);
        } while (--h);
        break;
    case 7:
        /* |.......l|lllllllr|rrrrrrr*|
         * This case is special because offset + 1 would be 8, which gets
         * treated as 0 by extqh.  At least it is a bit faster this way :) */
        do {
            uint64_t a_l = ldq(pix1);
            uint64_t a_r = ldq(pix1 + 8);
            uint64_t q0  = ldq_u(pix2);
            uint64_t q1  = ldq_u(pix2 + 8);
            uint64_t q2  = ldq_u(pix2 + 16);
            uint64_t h_l = avg2(extql(q0, offset) | extqh(q1, offset), q1);
            uint64_t h_r = avg2(extql(q1, offset) | extqh(q2, offset), q2);

            pix1 += line_size;
            pix2 += line_size;

            result += perr(a_l, h_l) + perr(a_r, h_r);
        } while (--h);
        break;
    default: {
        /* General misalignment: extract the rows at offset and offset+1
         * from the same three aligned loads. */
        uint64_t offset1 = offset + 1;

        do {
            uint64_t a_l = ldq(pix1);
            uint64_t a_r = ldq(pix1 + 8);
            uint64_t q0  = ldq_u(pix2);
            uint64_t q1  = ldq_u(pix2 + 8);
            uint64_t q2  = ldq_u(pix2 + 16);
            uint64_t h_l = avg2(extql(q0, offset)  | extqh(q1, offset),
                                extql(q0, offset1) | extqh(q1, offset1));
            uint64_t h_r = avg2(extql(q1, offset)  | extqh(q2, offset),
                                extql(q1, offset1) | extqh(q2, offset1));

            pix1 += line_size;
            pix2 += line_size;

            result += perr(a_l, h_l) + perr(a_r, h_r);
        } while (--h);
        break;
    }
    }
    return result;
}
|
yading@10
|
236
|
yading@10
|
/* SAD of a 16-pixel-wide block against the vertical half-pel interpolation
 * of the reference (average of the current and the following row).  The
 * previous row's quadwords are carried across iterations so each reference
 * row is loaded only once. */
int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        /* Unaligned reference: stitch each 16-byte row together from
         * three aligned loads via extql/extqh. */
        uint64_t cur_l, cur_r, t;

        t     = ldq_u(pix2 + 8);
        cur_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
        cur_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            uint64_t p1_l = ldq(pix1);
            uint64_t p1_r = ldq(pix1 + 8);
            uint64_t next_l, next_r, u;

            pix2   += line_size;
            u       = ldq_u(pix2 + 8);
            next_l  = extql(ldq_u(pix2), pix2) | extqh(u, pix2);
            next_r  = extql(u, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            result += perr(p1_l, avg2(cur_l, next_l))
                    + perr(p1_r, avg2(cur_r, next_r));

            pix1  += line_size;
            cur_l  = next_l;
            cur_r  = next_r;
        } while (--h);
    } else {
        uint64_t cur_l = ldq(pix2);
        uint64_t cur_r = ldq(pix2 + 8);

        do {
            uint64_t p1_l = ldq(pix1);
            uint64_t p1_r = ldq(pix1 + 8);
            uint64_t next_l, next_r;

            pix2   += line_size;
            next_l  = ldq(pix2);
            next_r  = ldq(pix2 + 8);

            result += perr(p1_l, avg2(cur_l, next_l))
                    + perr(p1_r, avg2(cur_r, next_r));

            pix1  += line_size;
            cur_l  = next_l;
            cur_r  = next_r;
        } while (--h);
    }
    return result;
}
|
yading@10
|
289
|
yading@10
|
/* SAD of a 16-pixel-wide block against the 2D half-pel interpolation of the
 * reference (avg4 of the four neighbouring reference pixels).  The current
 * source and reference rows are carried across iterations; *_x holds
 * reference pixel 16 in the top byte so it can be OR-ed straight into the
 * shifted right half. */
int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    uint64_t cur1_l, cur1_r;
    uint64_t cur2_l, cur2_r, cur2_x;

    cur1_l = ldq(pix1);
    cur1_r = ldq(pix1 + 8);

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        cur2_l = uldq(pix2);
        cur2_r = uldq(pix2 + 8);
        cur2_x = (uint64_t) pix2[16] << 56;
    } else {
        cur2_l = ldq(pix2);
        cur2_r = ldq(pix2 + 8);
        cur2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t next1_l, next1_r;
        uint64_t next2_l, next2_r, next2_x;

        pix1 += line_size;
        pix2 += line_size;

        next1_l = ldq(pix1);
        next1_r = ldq(pix1 + 8);

        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            next2_l = uldq(pix2);
            next2_r = uldq(pix2 + 8);
            next2_x = (uint64_t) pix2[16] << 56;
        } else {
            next2_l = ldq(pix2);
            next2_r = ldq(pix2 + 8);
            next2_x = ldq(pix2 + 16) << 56;
        }

        result += perr(cur1_l,
                       avg4( cur2_l, ( cur2_l >> 8) | ( cur2_r << 56),
                            next2_l, (next2_l >> 8) | (next2_r << 56)))
                + perr(cur1_r,
                       avg4( cur2_r, ( cur2_r >> 8) |  cur2_x,
                            next2_r, (next2_r >> 8) | next2_x));

        cur1_l = next1_l;
        cur1_r = next1_r;
        cur2_l = next2_l;
        cur2_r = next2_r;
        cur2_x = next2_x;
    } while (--h);

    return result;
}
|