/*
 * DSP utils mmx functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

yading@10
|
27 // put_pixels
|
yading@10
|
28 static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
yading@10
|
29 {
|
yading@10
|
30 MOVQ_BFE(mm6);
|
yading@10
|
31 __asm__ volatile(
|
yading@10
|
32 "lea (%3, %3), %%"REG_a" \n\t"
|
yading@10
|
33 ".p2align 3 \n\t"
|
yading@10
|
34 "1: \n\t"
|
yading@10
|
35 "movq (%1), %%mm0 \n\t"
|
yading@10
|
36 "movq 1(%1), %%mm1 \n\t"
|
yading@10
|
37 "movq (%1, %3), %%mm2 \n\t"
|
yading@10
|
38 "movq 1(%1, %3), %%mm3 \n\t"
|
yading@10
|
39 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
yading@10
|
40 "movq %%mm4, (%2) \n\t"
|
yading@10
|
41 "movq %%mm5, (%2, %3) \n\t"
|
yading@10
|
42 "add %%"REG_a", %1 \n\t"
|
yading@10
|
43 "add %%"REG_a", %2 \n\t"
|
yading@10
|
44 "movq (%1), %%mm0 \n\t"
|
yading@10
|
45 "movq 1(%1), %%mm1 \n\t"
|
yading@10
|
46 "movq (%1, %3), %%mm2 \n\t"
|
yading@10
|
47 "movq 1(%1, %3), %%mm3 \n\t"
|
yading@10
|
48 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
yading@10
|
49 "movq %%mm4, (%2) \n\t"
|
yading@10
|
50 "movq %%mm5, (%2, %3) \n\t"
|
yading@10
|
51 "add %%"REG_a", %1 \n\t"
|
yading@10
|
52 "add %%"REG_a", %2 \n\t"
|
yading@10
|
53 "subl $4, %0 \n\t"
|
yading@10
|
54 "jnz 1b \n\t"
|
yading@10
|
55 :"+g"(h), "+S"(pixels), "+D"(block)
|
yading@10
|
56 :"r"((x86_reg)line_size)
|
yading@10
|
57 :REG_a, "memory");
|
yading@10
|
58 }
|
yading@10
|
59
|
yading@10
|
60 static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
yading@10
|
61 {
|
yading@10
|
62 MOVQ_BFE(mm6);
|
yading@10
|
63 __asm__ volatile(
|
yading@10
|
64 "lea (%3, %3), %%"REG_a" \n\t"
|
yading@10
|
65 ".p2align 3 \n\t"
|
yading@10
|
66 "1: \n\t"
|
yading@10
|
67 "movq (%1), %%mm0 \n\t"
|
yading@10
|
68 "movq 1(%1), %%mm1 \n\t"
|
yading@10
|
69 "movq (%1, %3), %%mm2 \n\t"
|
yading@10
|
70 "movq 1(%1, %3), %%mm3 \n\t"
|
yading@10
|
71 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
yading@10
|
72 "movq %%mm4, (%2) \n\t"
|
yading@10
|
73 "movq %%mm5, (%2, %3) \n\t"
|
yading@10
|
74 "movq 8(%1), %%mm0 \n\t"
|
yading@10
|
75 "movq 9(%1), %%mm1 \n\t"
|
yading@10
|
76 "movq 8(%1, %3), %%mm2 \n\t"
|
yading@10
|
77 "movq 9(%1, %3), %%mm3 \n\t"
|
yading@10
|
78 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
yading@10
|
79 "movq %%mm4, 8(%2) \n\t"
|
yading@10
|
80 "movq %%mm5, 8(%2, %3) \n\t"
|
yading@10
|
81 "add %%"REG_a", %1 \n\t"
|
yading@10
|
82 "add %%"REG_a", %2 \n\t"
|
yading@10
|
83 "movq (%1), %%mm0 \n\t"
|
yading@10
|
84 "movq 1(%1), %%mm1 \n\t"
|
yading@10
|
85 "movq (%1, %3), %%mm2 \n\t"
|
yading@10
|
86 "movq 1(%1, %3), %%mm3 \n\t"
|
yading@10
|
87 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
yading@10
|
88 "movq %%mm4, (%2) \n\t"
|
yading@10
|
89 "movq %%mm5, (%2, %3) \n\t"
|
yading@10
|
90 "movq 8(%1), %%mm0 \n\t"
|
yading@10
|
91 "movq 9(%1), %%mm1 \n\t"
|
yading@10
|
92 "movq 8(%1, %3), %%mm2 \n\t"
|
yading@10
|
93 "movq 9(%1, %3), %%mm3 \n\t"
|
yading@10
|
94 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
yading@10
|
95 "movq %%mm4, 8(%2) \n\t"
|
yading@10
|
96 "movq %%mm5, 8(%2, %3) \n\t"
|
yading@10
|
97 "add %%"REG_a", %1 \n\t"
|
yading@10
|
98 "add %%"REG_a", %2 \n\t"
|
yading@10
|
99 "subl $4, %0 \n\t"
|
yading@10
|
100 "jnz 1b \n\t"
|
yading@10
|
101 :"+g"(h), "+S"(pixels), "+D"(block)
|
yading@10
|
102 :"r"((x86_reg)line_size)
|
yading@10
|
103 :REG_a, "memory");
|
yading@10
|
104 }
|
yading@10
|
105
|
yading@10
|
106 static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
yading@10
|
107 {
|
yading@10
|
108 MOVQ_BFE(mm6);
|
yading@10
|
109 __asm__ volatile(
|
yading@10
|
110 "lea (%3, %3), %%"REG_a" \n\t"
|
yading@10
|
111 "movq (%1), %%mm0 \n\t"
|
yading@10
|
112 ".p2align 3 \n\t"
|
yading@10
|
113 "1: \n\t"
|
yading@10
|
114 "movq (%1, %3), %%mm1 \n\t"
|
yading@10
|
115 "movq (%1, %%"REG_a"),%%mm2 \n\t"
|
yading@10
|
116 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
|
yading@10
|
117 "movq %%mm4, (%2) \n\t"
|
yading@10
|
118 "movq %%mm5, (%2, %3) \n\t"
|
yading@10
|
119 "add %%"REG_a", %1 \n\t"
|
yading@10
|
120 "add %%"REG_a", %2 \n\t"
|
yading@10
|
121 "movq (%1, %3), %%mm1 \n\t"
|
yading@10
|
122 "movq (%1, %%"REG_a"),%%mm0 \n\t"
|
yading@10
|
123 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
|
yading@10
|
124 "movq %%mm4, (%2) \n\t"
|
yading@10
|
125 "movq %%mm5, (%2, %3) \n\t"
|
yading@10
|
126 "add %%"REG_a", %1 \n\t"
|
yading@10
|
127 "add %%"REG_a", %2 \n\t"
|
yading@10
|
128 "subl $4, %0 \n\t"
|
yading@10
|
129 "jnz 1b \n\t"
|
yading@10
|
130 :"+g"(h), "+S"(pixels), "+D"(block)
|
yading@10
|
131 :"r"((x86_reg)line_size)
|
yading@10
|
132 :REG_a, "memory");
|
yading@10
|
133 }
|
yading@10
|
134
|
yading@10
|
135 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
yading@10
|
136 {
|
yading@10
|
137 MOVQ_ZERO(mm7);
|
yading@10
|
138 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
|
yading@10
|
139 __asm__ volatile(
|
yading@10
|
140 "movq (%1), %%mm0 \n\t"
|
yading@10
|
141 "movq 1(%1), %%mm4 \n\t"
|
yading@10
|
142 "movq %%mm0, %%mm1 \n\t"
|
yading@10
|
143 "movq %%mm4, %%mm5 \n\t"
|
yading@10
|
144 "punpcklbw %%mm7, %%mm0 \n\t"
|
yading@10
|
145 "punpcklbw %%mm7, %%mm4 \n\t"
|
yading@10
|
146 "punpckhbw %%mm7, %%mm1 \n\t"
|
yading@10
|
147 "punpckhbw %%mm7, %%mm5 \n\t"
|
yading@10
|
148 "paddusw %%mm0, %%mm4 \n\t"
|
yading@10
|
149 "paddusw %%mm1, %%mm5 \n\t"
|
yading@10
|
150 "xor %%"REG_a", %%"REG_a" \n\t"
|
yading@10
|
151 "add %3, %1 \n\t"
|
yading@10
|
152 ".p2align 3 \n\t"
|
yading@10
|
153 "1: \n\t"
|
yading@10
|
154 "movq (%1, %%"REG_a"), %%mm0 \n\t"
|
yading@10
|
155 "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
|
yading@10
|
156 "movq %%mm0, %%mm1 \n\t"
|
yading@10
|
157 "movq %%mm2, %%mm3 \n\t"
|
yading@10
|
158 "punpcklbw %%mm7, %%mm0 \n\t"
|
yading@10
|
159 "punpcklbw %%mm7, %%mm2 \n\t"
|
yading@10
|
160 "punpckhbw %%mm7, %%mm1 \n\t"
|
yading@10
|
161 "punpckhbw %%mm7, %%mm3 \n\t"
|
yading@10
|
162 "paddusw %%mm2, %%mm0 \n\t"
|
yading@10
|
163 "paddusw %%mm3, %%mm1 \n\t"
|
yading@10
|
164 "paddusw %%mm6, %%mm4 \n\t"
|
yading@10
|
165 "paddusw %%mm6, %%mm5 \n\t"
|
yading@10
|
166 "paddusw %%mm0, %%mm4 \n\t"
|
yading@10
|
167 "paddusw %%mm1, %%mm5 \n\t"
|
yading@10
|
168 "psrlw $2, %%mm4 \n\t"
|
yading@10
|
169 "psrlw $2, %%mm5 \n\t"
|
yading@10
|
170 "packuswb %%mm5, %%mm4 \n\t"
|
yading@10
|
171 "movq %%mm4, (%2, %%"REG_a") \n\t"
|
yading@10
|
172 "add %3, %%"REG_a" \n\t"
|
yading@10
|
173
|
yading@10
|
174 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
|
yading@10
|
175 "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
|
yading@10
|
176 "movq %%mm2, %%mm3 \n\t"
|
yading@10
|
177 "movq %%mm4, %%mm5 \n\t"
|
yading@10
|
178 "punpcklbw %%mm7, %%mm2 \n\t"
|
yading@10
|
179 "punpcklbw %%mm7, %%mm4 \n\t"
|
yading@10
|
180 "punpckhbw %%mm7, %%mm3 \n\t"
|
yading@10
|
181 "punpckhbw %%mm7, %%mm5 \n\t"
|
yading@10
|
182 "paddusw %%mm2, %%mm4 \n\t"
|
yading@10
|
183 "paddusw %%mm3, %%mm5 \n\t"
|
yading@10
|
184 "paddusw %%mm6, %%mm0 \n\t"
|
yading@10
|
185 "paddusw %%mm6, %%mm1 \n\t"
|
yading@10
|
186 "paddusw %%mm4, %%mm0 \n\t"
|
yading@10
|
187 "paddusw %%mm5, %%mm1 \n\t"
|
yading@10
|
188 "psrlw $2, %%mm0 \n\t"
|
yading@10
|
189 "psrlw $2, %%mm1 \n\t"
|
yading@10
|
190 "packuswb %%mm1, %%mm0 \n\t"
|
yading@10
|
191 "movq %%mm0, (%2, %%"REG_a") \n\t"
|
yading@10
|
192 "add %3, %%"REG_a" \n\t"
|
yading@10
|
193
|
yading@10
|
194 "subl $2, %0 \n\t"
|
yading@10
|
195 "jnz 1b \n\t"
|
yading@10
|
196 :"+g"(h), "+S"(pixels)
|
yading@10
|
197 :"D"(block), "r"((x86_reg)line_size)
|
yading@10
|
198 :REG_a, "memory");
|
yading@10
|
199 }
|
yading@10
|
200
|
yading@10
|
201 // avg_pixels
|
yading@10
|
202 #ifndef NO_RND
|
yading@10
|
203 // in case more speed is needed - unroling would certainly help
|
yading@10
|
204 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
yading@10
|
205 {
|
yading@10
|
206 MOVQ_BFE(mm6);
|
yading@10
|
207 JUMPALIGN();
|
yading@10
|
208 do {
|
yading@10
|
209 __asm__ volatile(
|
yading@10
|
210 "movq %0, %%mm0 \n\t"
|
yading@10
|
211 "movq %1, %%mm1 \n\t"
|
yading@10
|
212 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
|
yading@10
|
213 "movq %%mm2, %0 \n\t"
|
yading@10
|
214 :"+m"(*block)
|
yading@10
|
215 :"m"(*pixels)
|
yading@10
|
216 :"memory");
|
yading@10
|
217 pixels += line_size;
|
yading@10
|
218 block += line_size;
|
yading@10
|
219 }
|
yading@10
|
220 while (--h);
|
yading@10
|
221 }
|
yading@10
|
222 #endif // NO_RND
|
yading@10
|
223
|
yading@10
|
224 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
yading@10
|
225 {
|
yading@10
|
226 MOVQ_BFE(mm6);
|
yading@10
|
227 JUMPALIGN();
|
yading@10
|
228 do {
|
yading@10
|
229 __asm__ volatile(
|
yading@10
|
230 "movq %0, %%mm0 \n\t"
|
yading@10
|
231 "movq %1, %%mm1 \n\t"
|
yading@10
|
232 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
|
yading@10
|
233 "movq %%mm2, %0 \n\t"
|
yading@10
|
234 "movq 8%0, %%mm0 \n\t"
|
yading@10
|
235 "movq 8%1, %%mm1 \n\t"
|
yading@10
|
236 OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
|
yading@10
|
237 "movq %%mm2, 8%0 \n\t"
|
yading@10
|
238 :"+m"(*block)
|
yading@10
|
239 :"m"(*pixels)
|
yading@10
|
240 :"memory");
|
yading@10
|
241 pixels += line_size;
|
yading@10
|
242 block += line_size;
|
yading@10
|
243 }
|
yading@10
|
244 while (--h);
|
yading@10
|
245 }
|
yading@10
|
246
|
yading@10
|
247 #ifndef NO_RND
|
yading@10
|
248 static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
yading@10
|
249 {
|
yading@10
|
250 MOVQ_BFE(mm6);
|
yading@10
|
251 JUMPALIGN();
|
yading@10
|
252 do {
|
yading@10
|
253 __asm__ volatile(
|
yading@10
|
254 "movq %1, %%mm0 \n\t"
|
yading@10
|
255 "movq 1%1, %%mm1 \n\t"
|
yading@10
|
256 "movq %0, %%mm3 \n\t"
|
yading@10
|
257 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
yading@10
|
258 OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
yading@10
|
259 "movq %%mm0, %0 \n\t"
|
yading@10
|
260 :"+m"(*block)
|
yading@10
|
261 :"m"(*pixels)
|
yading@10
|
262 :"memory");
|
yading@10
|
263 pixels += line_size;
|
yading@10
|
264 block += line_size;
|
yading@10
|
265 } while (--h);
|
yading@10
|
266 }
|
yading@10
|
267 #endif // NO_RND
|
yading@10
|
268
|
yading@10
|
269 static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
yading@10
|
270 {
|
yading@10
|
271 MOVQ_BFE(mm6);
|
yading@10
|
272 JUMPALIGN();
|
yading@10
|
273 do {
|
yading@10
|
274 __asm__ volatile(
|
yading@10
|
275 "movq %1, %%mm0 \n\t"
|
yading@10
|
276 "movq 1%1, %%mm1 \n\t"
|
yading@10
|
277 "movq %0, %%mm3 \n\t"
|
yading@10
|
278 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
yading@10
|
279 OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
yading@10
|
280 "movq %%mm0, %0 \n\t"
|
yading@10
|
281 "movq 8%1, %%mm0 \n\t"
|
yading@10
|
282 "movq 9%1, %%mm1 \n\t"
|
yading@10
|
283 "movq 8%0, %%mm3 \n\t"
|
yading@10
|
284 PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
yading@10
|
285 OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
yading@10
|
286 "movq %%mm0, 8%0 \n\t"
|
yading@10
|
287 :"+m"(*block)
|
yading@10
|
288 :"m"(*pixels)
|
yading@10
|
289 :"memory");
|
yading@10
|
290 pixels += line_size;
|
yading@10
|
291 block += line_size;
|
yading@10
|
292 } while (--h);
|
yading@10
|
293 }
|
yading@10
|
294
|
yading@10
|
295 static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
yading@10
|
296 {
|
yading@10
|
297 MOVQ_BFE(mm6);
|
yading@10
|
298 __asm__ volatile(
|
yading@10
|
299 "lea (%3, %3), %%"REG_a" \n\t"
|
yading@10
|
300 "movq (%1), %%mm0 \n\t"
|
yading@10
|
301 ".p2align 3 \n\t"
|
yading@10
|
302 "1: \n\t"
|
yading@10
|
303 "movq (%1, %3), %%mm1 \n\t"
|
yading@10
|
304 "movq (%1, %%"REG_a"), %%mm2 \n\t"
|
yading@10
|
305 PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
|
yading@10
|
306 "movq (%2), %%mm3 \n\t"
|
yading@10
|
307 OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6)
|
yading@10
|
308 "movq (%2, %3), %%mm3 \n\t"
|
yading@10
|
309 OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
|
yading@10
|
310 "movq %%mm0, (%2) \n\t"
|
yading@10
|
311 "movq %%mm1, (%2, %3) \n\t"
|
yading@10
|
312 "add %%"REG_a", %1 \n\t"
|
yading@10
|
313 "add %%"REG_a", %2 \n\t"
|
yading@10
|
314
|
yading@10
|
315 "movq (%1, %3), %%mm1 \n\t"
|
yading@10
|
316 "movq (%1, %%"REG_a"), %%mm0 \n\t"
|
yading@10
|
317 PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
|
yading@10
|
318 "movq (%2), %%mm3 \n\t"
|
yading@10
|
319 OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6)
|
yading@10
|
320 "movq (%2, %3), %%mm3 \n\t"
|
yading@10
|
321 OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
|
yading@10
|
322 "movq %%mm2, (%2) \n\t"
|
yading@10
|
323 "movq %%mm1, (%2, %3) \n\t"
|
yading@10
|
324 "add %%"REG_a", %1 \n\t"
|
yading@10
|
325 "add %%"REG_a", %2 \n\t"
|
yading@10
|
326
|
yading@10
|
327 "subl $4, %0 \n\t"
|
yading@10
|
328 "jnz 1b \n\t"
|
yading@10
|
329 :"+g"(h), "+S"(pixels), "+D"(block)
|
yading@10
|
330 :"r"((x86_reg)line_size)
|
yading@10
|
331 :REG_a, "memory");
|
yading@10
|
332 }
|
yading@10
|
333
|
yading@10
|
334 // this routine is 'slightly' suboptimal but mostly unused
|
yading@10
|
335 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
|
yading@10
|
336 {
|
yading@10
|
337 MOVQ_ZERO(mm7);
|
yading@10
|
338 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
|
yading@10
|
339 __asm__ volatile(
|
yading@10
|
340 "movq (%1), %%mm0 \n\t"
|
yading@10
|
341 "movq 1(%1), %%mm4 \n\t"
|
yading@10
|
342 "movq %%mm0, %%mm1 \n\t"
|
yading@10
|
343 "movq %%mm4, %%mm5 \n\t"
|
yading@10
|
344 "punpcklbw %%mm7, %%mm0 \n\t"
|
yading@10
|
345 "punpcklbw %%mm7, %%mm4 \n\t"
|
yading@10
|
346 "punpckhbw %%mm7, %%mm1 \n\t"
|
yading@10
|
347 "punpckhbw %%mm7, %%mm5 \n\t"
|
yading@10
|
348 "paddusw %%mm0, %%mm4 \n\t"
|
yading@10
|
349 "paddusw %%mm1, %%mm5 \n\t"
|
yading@10
|
350 "xor %%"REG_a", %%"REG_a" \n\t"
|
yading@10
|
351 "add %3, %1 \n\t"
|
yading@10
|
352 ".p2align 3 \n\t"
|
yading@10
|
353 "1: \n\t"
|
yading@10
|
354 "movq (%1, %%"REG_a"), %%mm0 \n\t"
|
yading@10
|
355 "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
|
yading@10
|
356 "movq %%mm0, %%mm1 \n\t"
|
yading@10
|
357 "movq %%mm2, %%mm3 \n\t"
|
yading@10
|
358 "punpcklbw %%mm7, %%mm0 \n\t"
|
yading@10
|
359 "punpcklbw %%mm7, %%mm2 \n\t"
|
yading@10
|
360 "punpckhbw %%mm7, %%mm1 \n\t"
|
yading@10
|
361 "punpckhbw %%mm7, %%mm3 \n\t"
|
yading@10
|
362 "paddusw %%mm2, %%mm0 \n\t"
|
yading@10
|
363 "paddusw %%mm3, %%mm1 \n\t"
|
yading@10
|
364 "paddusw %%mm6, %%mm4 \n\t"
|
yading@10
|
365 "paddusw %%mm6, %%mm5 \n\t"
|
yading@10
|
366 "paddusw %%mm0, %%mm4 \n\t"
|
yading@10
|
367 "paddusw %%mm1, %%mm5 \n\t"
|
yading@10
|
368 "psrlw $2, %%mm4 \n\t"
|
yading@10
|
369 "psrlw $2, %%mm5 \n\t"
|
yading@10
|
370 "movq (%2, %%"REG_a"), %%mm3 \n\t"
|
yading@10
|
371 "packuswb %%mm5, %%mm4 \n\t"
|
yading@10
|
372 "pcmpeqd %%mm2, %%mm2 \n\t"
|
yading@10
|
373 "paddb %%mm2, %%mm2 \n\t"
|
yading@10
|
374 OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2)
|
yading@10
|
375 "movq %%mm5, (%2, %%"REG_a") \n\t"
|
yading@10
|
376 "add %3, %%"REG_a" \n\t"
|
yading@10
|
377
|
yading@10
|
378 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
|
yading@10
|
379 "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
|
yading@10
|
380 "movq %%mm2, %%mm3 \n\t"
|
yading@10
|
381 "movq %%mm4, %%mm5 \n\t"
|
yading@10
|
382 "punpcklbw %%mm7, %%mm2 \n\t"
|
yading@10
|
383 "punpcklbw %%mm7, %%mm4 \n\t"
|
yading@10
|
384 "punpckhbw %%mm7, %%mm3 \n\t"
|
yading@10
|
385 "punpckhbw %%mm7, %%mm5 \n\t"
|
yading@10
|
386 "paddusw %%mm2, %%mm4 \n\t"
|
yading@10
|
387 "paddusw %%mm3, %%mm5 \n\t"
|
yading@10
|
388 "paddusw %%mm6, %%mm0 \n\t"
|
yading@10
|
389 "paddusw %%mm6, %%mm1 \n\t"
|
yading@10
|
390 "paddusw %%mm4, %%mm0 \n\t"
|
yading@10
|
391 "paddusw %%mm5, %%mm1 \n\t"
|
yading@10
|
392 "psrlw $2, %%mm0 \n\t"
|
yading@10
|
393 "psrlw $2, %%mm1 \n\t"
|
yading@10
|
394 "movq (%2, %%"REG_a"), %%mm3 \n\t"
|
yading@10
|
395 "packuswb %%mm1, %%mm0 \n\t"
|
yading@10
|
396 "pcmpeqd %%mm2, %%mm2 \n\t"
|
yading@10
|
397 "paddb %%mm2, %%mm2 \n\t"
|
yading@10
|
398 OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2)
|
yading@10
|
399 "movq %%mm1, (%2, %%"REG_a") \n\t"
|
yading@10
|
400 "add %3, %%"REG_a" \n\t"
|
yading@10
|
401
|
yading@10
|
402 "subl $2, %0 \n\t"
|
yading@10
|
403 "jnz 1b \n\t"
|
yading@10
|
404 :"+g"(h), "+S"(pixels)
|
yading@10
|
405 :"D"(block), "r"((x86_reg)line_size)
|
yading@10
|
406 :REG_a, "memory");
|
yading@10
|
407 }
|
yading@10
|
408
|
yading@10
|
409 //FIXME optimize
|
yading@10
|
410 static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
yading@10
|
411 DEF(put, pixels8_y2)(block , pixels , line_size, h);
|
yading@10
|
412 DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
|
yading@10
|
413 }
|
yading@10
|
414
|
yading@10
|
415 static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
yading@10
|
416 DEF(put, pixels8_xy2)(block , pixels , line_size, h);
|
yading@10
|
417 DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
|
yading@10
|
418 }
|
yading@10
|
419
|
yading@10
|
420 static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
yading@10
|
421 DEF(avg, pixels8_y2)(block , pixels , line_size, h);
|
yading@10
|
422 DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
|
yading@10
|
423 }
|
yading@10
|
424
|
yading@10
|
425 static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
|
yading@10
|
426 DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
|
yading@10
|
427 DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
|
yading@10
|
428 }
|