/*
 * DSP utils mmx functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

yading@10
|
27 // put_pixels
|
yading@10
|
28 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
yading@10
|
29 {
|
yading@10
|
30 MOVQ_ZERO(mm7);
|
yading@10
|
31 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
|
yading@10
|
32 __asm__ volatile(
|
yading@10
|
33 "movq (%1), %%mm0 \n\t"
|
yading@10
|
34 "movq 1(%1), %%mm4 \n\t"
|
yading@10
|
35 "movq %%mm0, %%mm1 \n\t"
|
yading@10
|
36 "movq %%mm4, %%mm5 \n\t"
|
yading@10
|
37 "punpcklbw %%mm7, %%mm0 \n\t"
|
yading@10
|
38 "punpcklbw %%mm7, %%mm4 \n\t"
|
yading@10
|
39 "punpckhbw %%mm7, %%mm1 \n\t"
|
yading@10
|
40 "punpckhbw %%mm7, %%mm5 \n\t"
|
yading@10
|
41 "paddusw %%mm0, %%mm4 \n\t"
|
yading@10
|
42 "paddusw %%mm1, %%mm5 \n\t"
|
yading@10
|
43 "xor %%"REG_a", %%"REG_a" \n\t"
|
yading@10
|
44 "add %3, %1 \n\t"
|
yading@10
|
45 ".p2align 3 \n\t"
|
yading@10
|
46 "1: \n\t"
|
yading@10
|
47 "movq (%1, %%"REG_a"), %%mm0 \n\t"
|
yading@10
|
48 "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
|
yading@10
|
49 "movq %%mm0, %%mm1 \n\t"
|
yading@10
|
50 "movq %%mm2, %%mm3 \n\t"
|
yading@10
|
51 "punpcklbw %%mm7, %%mm0 \n\t"
|
yading@10
|
52 "punpcklbw %%mm7, %%mm2 \n\t"
|
yading@10
|
53 "punpckhbw %%mm7, %%mm1 \n\t"
|
yading@10
|
54 "punpckhbw %%mm7, %%mm3 \n\t"
|
yading@10
|
55 "paddusw %%mm2, %%mm0 \n\t"
|
yading@10
|
56 "paddusw %%mm3, %%mm1 \n\t"
|
yading@10
|
57 "paddusw %%mm6, %%mm4 \n\t"
|
yading@10
|
58 "paddusw %%mm6, %%mm5 \n\t"
|
yading@10
|
59 "paddusw %%mm0, %%mm4 \n\t"
|
yading@10
|
60 "paddusw %%mm1, %%mm5 \n\t"
|
yading@10
|
61 "psrlw $2, %%mm4 \n\t"
|
yading@10
|
62 "psrlw $2, %%mm5 \n\t"
|
yading@10
|
63 "packuswb %%mm5, %%mm4 \n\t"
|
yading@10
|
64 "movq %%mm4, (%2, %%"REG_a") \n\t"
|
yading@10
|
65 "add %3, %%"REG_a" \n\t"
|
yading@10
|
66
|
yading@10
|
67 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
|
yading@10
|
68 "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
|
yading@10
|
69 "movq %%mm2, %%mm3 \n\t"
|
yading@10
|
70 "movq %%mm4, %%mm5 \n\t"
|
yading@10
|
71 "punpcklbw %%mm7, %%mm2 \n\t"
|
yading@10
|
72 "punpcklbw %%mm7, %%mm4 \n\t"
|
yading@10
|
73 "punpckhbw %%mm7, %%mm3 \n\t"
|
yading@10
|
74 "punpckhbw %%mm7, %%mm5 \n\t"
|
yading@10
|
75 "paddusw %%mm2, %%mm4 \n\t"
|
yading@10
|
76 "paddusw %%mm3, %%mm5 \n\t"
|
yading@10
|
77 "paddusw %%mm6, %%mm0 \n\t"
|
yading@10
|
78 "paddusw %%mm6, %%mm1 \n\t"
|
yading@10
|
79 "paddusw %%mm4, %%mm0 \n\t"
|
yading@10
|
80 "paddusw %%mm5, %%mm1 \n\t"
|
yading@10
|
81 "psrlw $2, %%mm0 \n\t"
|
yading@10
|
82 "psrlw $2, %%mm1 \n\t"
|
yading@10
|
83 "packuswb %%mm1, %%mm0 \n\t"
|
yading@10
|
84 "movq %%mm0, (%2, %%"REG_a") \n\t"
|
yading@10
|
85 "add %3, %%"REG_a" \n\t"
|
yading@10
|
86
|
yading@10
|
87 "subl $2, %0 \n\t"
|
yading@10
|
88 "jnz 1b \n\t"
|
yading@10
|
89 :"+g"(h), "+S"(pixels)
|
yading@10
|
90 :"D"(block), "r"((x86_reg)line_size)
|
yading@10
|
91 :REG_a, "memory");
|
yading@10
|
92 }
|
yading@10
|
93
|
yading@10
|
94 // in case more speed is needed - unroling would certainly help
|
yading@10
|
95 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
yading@10
|
96 {
|
yading@10
|
97 MOVQ_BFE(mm6);
|
yading@10
|
98 JUMPALIGN();
|
yading@10
|
99 do {
|
yading@10
|
100 __asm__ volatile(
|
yading@10
|
101 "movq %0, %%mm0 \n\t"
|
yading@10
|
102 "movq %1, %%mm1 \n\t"
|
yading@10
|
103 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
|
yading@10
|
104 "movq %%mm2, %0 \n\t"
|
yading@10
|
105 :"+m"(*block)
|
yading@10
|
106 :"m"(*pixels)
|
yading@10
|
107 :"memory");
|
yading@10
|
108 pixels += line_size;
|
yading@10
|
109 block += line_size;
|
yading@10
|
110 }
|
yading@10
|
111 while (--h);
|
yading@10
|
112 }
|
yading@10
|
113
|
yading@10
|
114 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
yading@10
|
115 {
|
yading@10
|
116 MOVQ_BFE(mm6);
|
yading@10
|
117 JUMPALIGN();
|
yading@10
|
118 do {
|
yading@10
|
119 __asm__ volatile(
|
yading@10
|
120 "movq %0, %%mm0 \n\t"
|
yading@10
|
121 "movq %1, %%mm1 \n\t"
|
yading@10
|
122 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
|
yading@10
|
123 "movq %%mm2, %0 \n\t"
|
yading@10
|
124 "movq 8%0, %%mm0 \n\t"
|
yading@10
|
125 "movq 8%1, %%mm1 \n\t"
|
yading@10
|
126 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
|
yading@10
|
127 "movq %%mm2, 8%0 \n\t"
|
yading@10
|
128 :"+m"(*block)
|
yading@10
|
129 :"m"(*pixels)
|
yading@10
|
130 :"memory");
|
yading@10
|
131 pixels += line_size;
|
yading@10
|
132 block += line_size;
|
yading@10
|
133 }
|
yading@10
|
134 while (--h);
|
yading@10
|
135 }
|
yading@10
|
136
|
yading@10
|
137 // this routine is 'slightly' suboptimal but mostly unused
|
yading@10
|
138 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
yading@10
|
139 {
|
yading@10
|
140 MOVQ_ZERO(mm7);
|
yading@10
|
141 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
|
yading@10
|
142 __asm__ volatile(
|
yading@10
|
143 "movq (%1), %%mm0 \n\t"
|
yading@10
|
144 "movq 1(%1), %%mm4 \n\t"
|
yading@10
|
145 "movq %%mm0, %%mm1 \n\t"
|
yading@10
|
146 "movq %%mm4, %%mm5 \n\t"
|
yading@10
|
147 "punpcklbw %%mm7, %%mm0 \n\t"
|
yading@10
|
148 "punpcklbw %%mm7, %%mm4 \n\t"
|
yading@10
|
149 "punpckhbw %%mm7, %%mm1 \n\t"
|
yading@10
|
150 "punpckhbw %%mm7, %%mm5 \n\t"
|
yading@10
|
151 "paddusw %%mm0, %%mm4 \n\t"
|
yading@10
|
152 "paddusw %%mm1, %%mm5 \n\t"
|
yading@10
|
153 "xor %%"REG_a", %%"REG_a" \n\t"
|
yading@10
|
154 "add %3, %1 \n\t"
|
yading@10
|
155 ".p2align 3 \n\t"
|
yading@10
|
156 "1: \n\t"
|
yading@10
|
157 "movq (%1, %%"REG_a"), %%mm0 \n\t"
|
yading@10
|
158 "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
|
yading@10
|
159 "movq %%mm0, %%mm1 \n\t"
|
yading@10
|
160 "movq %%mm2, %%mm3 \n\t"
|
yading@10
|
161 "punpcklbw %%mm7, %%mm0 \n\t"
|
yading@10
|
162 "punpcklbw %%mm7, %%mm2 \n\t"
|
yading@10
|
163 "punpckhbw %%mm7, %%mm1 \n\t"
|
yading@10
|
164 "punpckhbw %%mm7, %%mm3 \n\t"
|
yading@10
|
165 "paddusw %%mm2, %%mm0 \n\t"
|
yading@10
|
166 "paddusw %%mm3, %%mm1 \n\t"
|
yading@10
|
167 "paddusw %%mm6, %%mm4 \n\t"
|
yading@10
|
168 "paddusw %%mm6, %%mm5 \n\t"
|
yading@10
|
169 "paddusw %%mm0, %%mm4 \n\t"
|
yading@10
|
170 "paddusw %%mm1, %%mm5 \n\t"
|
yading@10
|
171 "psrlw $2, %%mm4 \n\t"
|
yading@10
|
172 "psrlw $2, %%mm5 \n\t"
|
yading@10
|
173 "movq (%2, %%"REG_a"), %%mm3 \n\t"
|
yading@10
|
174 "packuswb %%mm5, %%mm4 \n\t"
|
yading@10
|
175 "pcmpeqd %%mm2, %%mm2 \n\t"
|
yading@10
|
176 "paddb %%mm2, %%mm2 \n\t"
|
yading@10
|
177 OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2)
|
yading@10
|
178 "movq %%mm5, (%2, %%"REG_a") \n\t"
|
yading@10
|
179 "add %3, %%"REG_a" \n\t"
|
yading@10
|
180
|
yading@10
|
181 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
|
yading@10
|
182 "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
|
yading@10
|
183 "movq %%mm2, %%mm3 \n\t"
|
yading@10
|
184 "movq %%mm4, %%mm5 \n\t"
|
yading@10
|
185 "punpcklbw %%mm7, %%mm2 \n\t"
|
yading@10
|
186 "punpcklbw %%mm7, %%mm4 \n\t"
|
yading@10
|
187 "punpckhbw %%mm7, %%mm3 \n\t"
|
yading@10
|
188 "punpckhbw %%mm7, %%mm5 \n\t"
|
yading@10
|
189 "paddusw %%mm2, %%mm4 \n\t"
|
yading@10
|
190 "paddusw %%mm3, %%mm5 \n\t"
|
yading@10
|
191 "paddusw %%mm6, %%mm0 \n\t"
|
yading@10
|
192 "paddusw %%mm6, %%mm1 \n\t"
|
yading@10
|
193 "paddusw %%mm4, %%mm0 \n\t"
|
yading@10
|
194 "paddusw %%mm5, %%mm1 \n\t"
|
yading@10
|
195 "psrlw $2, %%mm0 \n\t"
|
yading@10
|
196 "psrlw $2, %%mm1 \n\t"
|
yading@10
|
197 "movq (%2, %%"REG_a"), %%mm3 \n\t"
|
yading@10
|
198 "packuswb %%mm1, %%mm0 \n\t"
|
yading@10
|
199 "pcmpeqd %%mm2, %%mm2 \n\t"
|
yading@10
|
200 "paddb %%mm2, %%mm2 \n\t"
|
yading@10
|
201 OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2)
|
yading@10
|
202 "movq %%mm1, (%2, %%"REG_a") \n\t"
|
yading@10
|
203 "add %3, %%"REG_a" \n\t"
|
yading@10
|
204
|
yading@10
|
205 "subl $2, %0 \n\t"
|
yading@10
|
206 "jnz 1b \n\t"
|
yading@10
|
207 :"+g"(h), "+S"(pixels)
|
yading@10
|
208 :"D"(block), "r"((x86_reg)line_size)
|
yading@10
|
209 :REG_a, "memory");
|
yading@10
|
210 }
|
yading@10
|
211
|
yading@10
|
212 //FIXME optimize
|
yading@10
|
213 static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
yading@10
|
214 DEF(put, pixels8_xy2)(block , pixels , line_size, h);
|
yading@10
|
215 DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
|
yading@10
|
216 }
|
yading@10
|
217
|
yading@10
|
218 static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
yading@10
|
219 DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
|
yading@10
|
220 DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
|
yading@10
|
221 }
|