hpeldsp_rnd_template.c
/*
 * DSP utils mmx functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

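/*
 * Illustrative note (not part of the original template): this file is built
 * twice, once as the rounding ("rnd") variant and once as the "no_rnd"
 * variant, with DEF(), SET_RND, PAVGB/PAVGBP and OP_AVG supplied by the file
 * that includes it.  A minimal scalar sketch of the two byte-averaging rules
 * the two builds correspond to is kept below under #if 0; the helper names
 * are hypothetical and uint8_t comes from the including file's headers.
 */
#if 0
static inline uint8_t avg2_rnd(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b + 1) >> 1); /* rnd: rounds x.5 up */
}

static inline uint8_t avg2_no_rnd(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b) >> 1);     /* no_rnd: truncates */
}
#endif
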
// put_pixels
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

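/*
 * Illustrative scalar equivalent of the x2 kernel above (not part of the
 * original template): every output byte is the average of two horizontally
 * adjacent source bytes, with the rounding term chosen by the rnd/no_rnd
 * build.  The helper name and the explicit `rnd` parameter are hypothetical.
 */
#if 0
static void put_pixels8_x2_c_sketch(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h, int rnd)
{
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            block[j] = (pixels[j] + pixels[j + 1] + rnd) >> 1; /* rnd is 1 or 0 */
        pixels += line_size;
        block  += line_size;
    }
}
#endif
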
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "movq 8(%1), %%mm0 \n\t"
        "movq 9(%1), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm2 \n\t"
        "movq 9(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "movq 1(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "movq 8(%1), %%mm0 \n\t"
        "movq 9(%1), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm2 \n\t"
        "movq 9(%1, %3), %%mm3 \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"
        "movq (%1), %%mm0 \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm2 \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
        "movq %%mm4, (%2) \n\t"
        "movq %%mm5, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

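/*
 * Illustrative scalar equivalent of the y2 kernel above (not part of the
 * original template): the vertical counterpart of the x2 sketch, averaging
 * each byte with the byte one line below it.  Helper name and the `rnd`
 * parameter are hypothetical.
 */
#if 0
static void put_pixels8_y2_c_sketch(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h, int rnd)
{
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            block[j] = (pixels[j] + pixels[j + line_size] + rnd) >> 1;
        pixels += line_size;
        block  += line_size;
    }
}
#endif
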
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_ZERO(mm7);
    SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm4 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t"
        "add %3, %1 \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddusw %%mm2, %%mm0 \n\t"
        "paddusw %%mm3, %%mm1 \n\t"
        "paddusw %%mm6, %%mm4 \n\t"
        "paddusw %%mm6, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        "psrlw $2, %%mm4 \n\t"
        "psrlw $2, %%mm5 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "movq %%mm4, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"

        "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
        "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm2, %%mm4 \n\t"
        "paddusw %%mm3, %%mm5 \n\t"
        "paddusw %%mm6, %%mm0 \n\t"
        "paddusw %%mm6, %%mm1 \n\t"
        "paddusw %%mm4, %%mm0 \n\t"
        "paddusw %%mm5, %%mm1 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"

        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}

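/*
 * Illustrative scalar equivalent of the xy2 kernel above (not part of the
 * original template): each output byte averages a 2x2 source neighbourhood,
 * and the SET_RND value (2 for rnd, 1 for no_rnd, per the comment above) is
 * the additive rounder applied before the >> 2.  Helper name and the
 * `rounder` parameter are hypothetical.
 */
#if 0
static void put_pixels8_xy2_c_sketch(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h, int rounder)
{
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            block[j] = (pixels[j]             + pixels[j + 1] +
                        pixels[j + line_size] + pixels[j + line_size + 1] +
                        rounder) >> 2;
        pixels += line_size;
        block  += line_size;
    }
}
#endif
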
// avg_pixels
#ifndef NO_RND
// in case more speed is needed - unrolling would certainly help
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %0, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, %0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    } while (--h);
}
#endif // NO_RND

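/*
 * Illustrative scalar equivalent of the avg kernel above (not part of the
 * original template): OP_AVG merges the source into what is already stored
 * in block; in this rnd-only build (#ifndef NO_RND) the merge is shown as
 * the rounded byte average.  The helper name is hypothetical.
 */
#if 0
static void avg_pixels8_c_sketch(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
{
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            block[j] = (block[j] + pixels[j] + 1) >> 1; /* rounded average */
        pixels += line_size;
        block  += line_size;
    }
}
#endif
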
static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %0, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, %0 \n\t"
            "movq 8%0, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
            "movq %%mm2, 8%0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    } while (--h);
}

#ifndef NO_RND
static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0 \n\t"
            "movq 1%1, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    } while (--h);
}
#endif // NO_RND

static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    JUMPALIGN();
    do {
        __asm__ volatile(
            "movq %1, %%mm0 \n\t"
            "movq 1%1, %%mm1 \n\t"
            "movq %0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, %0 \n\t"
            "movq 8%1, %%mm0 \n\t"
            "movq 9%1, %%mm1 \n\t"
            "movq 8%0, %%mm3 \n\t"
            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
            "movq %%mm0, 8%0 \n\t"
            :"+m"(*block)
            :"m"(*pixels)
            :"memory");
        pixels += line_size;
        block += line_size;
    } while (--h);
}

static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"
        "movq (%1), %%mm0 \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm2 \n\t"
        PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
        "movq (%2), %%mm3 \n\t"
        OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6)
        "movq (%2, %3), %%mm3 \n\t"
        OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"

        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
        "movq (%2), %%mm3 \n\t"
        OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6)
        "movq (%2, %3), %%mm3 \n\t"
        OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq %%mm2, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"

        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r"((x86_reg)line_size)
        :REG_a, "memory");
}

// this routine is 'slightly' suboptimal but mostly unused
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_ZERO(mm7);
    SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        "movq (%1), %%mm0 \n\t"
        "movq 1(%1), %%mm4 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t"
        "add %3, %1 \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddusw %%mm2, %%mm0 \n\t"
        "paddusw %%mm3, %%mm1 \n\t"
        "paddusw %%mm6, %%mm4 \n\t"
        "paddusw %%mm6, %%mm5 \n\t"
        "paddusw %%mm0, %%mm4 \n\t"
        "paddusw %%mm1, %%mm5 \n\t"
        "psrlw $2, %%mm4 \n\t"
        "psrlw $2, %%mm5 \n\t"
        "movq (%2, %%"REG_a"), %%mm3 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "pcmpeqd %%mm2, %%mm2 \n\t"
        "paddb %%mm2, %%mm2 \n\t"
        OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2)
        "movq %%mm5, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"

        "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
        "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddusw %%mm2, %%mm4 \n\t"
        "paddusw %%mm3, %%mm5 \n\t"
        "paddusw %%mm6, %%mm0 \n\t"
        "paddusw %%mm6, %%mm1 \n\t"
        "paddusw %%mm4, %%mm0 \n\t"
        "paddusw %%mm5, %%mm1 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "movq (%2, %%"REG_a"), %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "pcmpeqd %%mm2, %%mm2 \n\t"
        "paddb %%mm2, %%mm2 \n\t"
        OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2)
        "movq %%mm1, (%2, %%"REG_a") \n\t"
        "add %3, %%"REG_a" \n\t"

        "subl $2, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels)
        :"D"(block), "r"((x86_reg)line_size)
        :REG_a, "memory");
}

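/*
 * Illustrative scalar equivalent of the avg xy2 kernel above (not part of
 * the original template): the 2x2 average with the SET_RND rounder is
 * computed first and then merged into block with a byte average.  The
 * pcmpeqd/paddb pair in the loop appears to rebuild the 0xFE.. byte mask in
 * mm2 (the mask MOVQ_BFE normally keeps in mm6), since mm6 holds the rounder
 * here.  Helper name and the `rounder` parameter are hypothetical, and the
 * merge is shown as a rounded average.
 */
#if 0
static void avg_pixels8_xy2_c_sketch(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h, int rounder)
{
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++) {
            int p = (pixels[j]             + pixels[j + 1] +
                     pixels[j + line_size] + pixels[j + line_size + 1] +
                     rounder) >> 2;
            block[j] = (block[j] + p + 1) >> 1; /* merge prediction into block */
        }
        pixels += line_size;
        block  += line_size;
    }
}
#endif
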
//FIXME optimize
static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
    DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
}

static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(put, pixels8_xy2)(block  , pixels  , line_size, h);
    DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
}

static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(avg, pixels8_y2)(block  , pixels  , line_size, h);
    DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
}

static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
    DEF(avg, pixels8_xy2)(block  , pixels  , line_size, h);
    DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
}