;*****************************************************************************
;* MMX optimized DSP utils
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

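; DIFF_PIXELS_1: %1 = output (8 signed word differences), %2 = scratch,
; %3/%4 = memory operands holding 8 bytes of pix1/pix2.
; Duplicating the pix1 bytes (punpcklbw %1, %1) instead of unpacking against
; a zero register makes the high bytes identical in both operands, so they
; cancel in the psubw and each word ends up as pix1[i] - pix2[i].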
%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1
    punpcklbw       %1, %1
    psubw           %1, %2
%endmacro

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
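; DIFF_PIXELS_8 leaves one row of byte differences per register: m<i> holds
; row i of (pix1 - pix2) as 8 signed words. Roughly equivalent C, for
; readability only (names here are illustrative, not an FFmpeg prototype):
;
;   int16_t d[8][8];
;   for (int i = 0; i < 8; i++)
;       for (int j = 0; j < 8; j++)
;           d[i][j] = pix1[i * stride + off + j] - pix2[i * stride + off + j];
;
; %1 and %2 are advanced by 3*stride internally and restored before returning.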
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro

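; HADAMARD8: in-place 8-point Hadamard (butterfly) transform across m0-m7,
; applied independently to each word lane. The three SUMSUB_BADC passes pair
; registers at distance 1, 2 and 4; the coefficients come out in a permuted
; order, which does not matter here since only sums of their absolute values
; are used afterwards.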
%macro HADAMARD8 0
    SUMSUB_BADC w, 0, 1, 2, 3
    SUMSUB_BADC w, 4, 5, 6, 7
    SUMSUB_BADC w, 0, 2, 1, 3
    SUMSUB_BADC w, 4, 6, 5, 7
    SUMSUB_BADC w, 0, 4, 1, 5
    SUMSUB_BADC w, 2, 6, 3, 7
%endmacro

%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

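; ABS_SUM_8x8_64 / ABS_SUM_8x8_32: accumulate the per-lane absolute values of
; m0-m7 into m0 using saturating adds. The _64 variant uses m8/m9 as scratch
; (x86-64 / SSE only); the _32 variant has no spare register and spills m7 to
; the mmsize bytes of temporary storage at [%1] instead.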
%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova          [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1
%endmacro

; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up
; to about 100k on extreme inputs. But that is very unlikely to occur in
; natural video, and it is even less likely that no alternative mv/mode with
; a lower cost is available.
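; HSUM: horizontally add the word lanes of %1 (with unsigned saturation, see
; the FIXME above) and move the resulting low word into the GPR %3.
; %1 is clobbered; %2 is only used as a temporary.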
%macro HSUM 3
%if cpuflag(sse2)
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmxext)
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmx)
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endif
%endmacro

%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova %2, [%1+mmsize*0]
    mova %3, [%1+mmsize*1]
    mova %4, [%1+mmsize*2]
    mova %5, [%1+mmsize*3]
%endmacro

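; hadamard8_16_wrapper: emits the public entry points around the internal
; hadamard8x8_diff routine. %1 = number of xmm registers to declare in
; cglobal, %2 = stack space needed, in units of mmsize (only reserved when
; no m8 exists, i.e. on 32-bit or MMX builds). hadamard8_diff handles a
; single 8x8 block; hadamard8_diff16 handles an 8- or 16-pixel-high, 16-wide
; area by calling hadamard8x8_diff two or four times and summing the scores.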
%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
%assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB             rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD             rsp, pad
%endif
    RET

cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
%assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB             rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov             r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

    cmp             r4d, 16
    jne .done

    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

.done:
    mov             eax, r5d
%ifndef m8
    ADD             rsp, pad
%endif
    RET
%endmacro

%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea             r0, [r3*3]
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W    0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W    0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8     rsp+gprsize
    HSUM            m0, m1, eax
    and             eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
;                          int stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so the 16x16
; version can simply call it four times (2x2 blocks), and that is why we
; access rsp+gprsize everywhere: it is the rsp of the calling function.
hadamard8x8_diff %+ SUFFIX:
    lea             r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova            [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W    0, 1, 2, 3, 7
    STORE4          rsp+gprsize, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W    4, 5, 6, 7, 0
    STORE4          rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8   r1, r2, 4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova            [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W    0, 1, 2, 3, 7
    STORE4          rsp+gprsize+0x20, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W    4, 5, 6, 7, 0

    LOAD4           rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32  rsp+gprsize+0x60
    mova            [rsp+gprsize+0x60], m0

    LOAD4           rsp+gprsize     , m0, m1, m2, m3
    LOAD4           rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32  rsp+gprsize
    paddusw         m0, [rsp+gprsize+0x60]

    HSUM            m0, m1, eax
    and             rax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro

INIT_MMX mmx
HADAMARD8_DIFF

INIT_MMX mmxext
HADAMARD8_DIFF

INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9

INIT_XMM sse2
; sse16_sse2(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
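; sse16 returns the sum of squared differences over a 16-pixel-wide block of
; h rows. Roughly equivalent C, for readability only (names are illustrative,
; not an FFmpeg prototype):
;
;   int sse16(const uint8_t *pix1, const uint8_t *pix2, int line_size, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h; y++) {
;           for (int x = 0; x < 16; x++) {
;               int d = pix1[x] - pix2[x];
;               sum += d * d;
;           }
;           pix1 += line_size;
;           pix2 += line_size;
;       }
;       return sum;
;   }
;
; The loop below processes two rows per iteration, hence the shr r4d, 1.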
cglobal sse16, 5, 5, 8
    shr          r4d, 1
    pxor          m0, m0        ; mm0 = 0
    pxor          m7, m7        ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu          m1, [r1]      ; mm1 = pix1[0][0-15]
    movu          m2, [r2]      ; mm2 = pix2[0][0-15]
    movu          m3, [r1+r3]   ; mm3 = pix1[1][0-15]
    movu          m4, [r2+r3]   ; mm4 = pix2[1][0-15]

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the result to get the absolute difference
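    ; worked example for one byte lane: a = 3, b = 10 ->
    ;   (3 - 10) saturates to 0, (10 - 3) = 7, and 0 | 7 = 7 = |3 - 10|.
    ; One of the two saturated differences is always 0, so the OR yields
    ; the absolute difference without needing a wider intermediate.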
    mova          m5, m1
    mova          m6, m3
    psubusb       m1, m2
    psubusb       m3, m4
    psubusb       m2, m5
    psubusb       m4, m6

    por           m2, m1
    por           m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova          m1, m2
    mova          m3, m4

    punpckhbw     m2, m0
    punpckhbw     m4, m0
    punpcklbw     m1, m0        ; mm1 now spread over (mm1, mm2)
    punpcklbw     m3, m0        ; mm4 now spread over (mm3, mm4)

    pmaddwd       m2, m2
    pmaddwd       m4, m4
    pmaddwd       m1, m1
    pmaddwd       m3, m3

    lea           r1, [r1+r3*2] ; pix1 += 2*line_size
    lea           r2, [r2+r3*2] ; pix2 += 2*line_size

    paddd         m1, m2
    paddd         m3, m4
    paddd         m7, m1
    paddd         m7, m3

    dec           r4
    jnz .next2lines

    mova          m1, m7
    psrldq        m7, 8         ; shift hi qword to lo
    paddd         m7, m1
    mova          m1, m7
    psrldq        m7, 4         ; shift hi dword to lo
    paddd         m7, m1
    movd         eax, m7        ; return value
    RET

INIT_MMX mmx
; get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
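; get_pixels copies an 8x8 block of unsigned bytes into a 64-entry int16_t
; block, zero-extending each pixel. Roughly equivalent C, names illustrative:
;
;   for (int i = 0; i < 8; i++)
;       for (int j = 0; j < 8; j++)
;           block[i * 8 + j] = pixels[i * line_size + j];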
cglobal get_pixels, 3,4
    movsxdifnidn r2, r2d
    add          r0, 128
    mov          r3, -128
    pxor         m7, m7
.loop:
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    mova  [r0+r3+ 0], m0
    mova  [r0+r3+ 8], m1
    mova  [r0+r3+16], m2
    mova  [r0+r3+24], m3
    lea          r1, [r1+r2*2]
    add          r3, 32
    js .loop
    REP_RET

INIT_XMM sse2
cglobal get_pixels, 3, 4
    movsxdifnidn r2, r2d
    lea          r3, [r2*3]
    pxor         m4, m4
    movh         m0, [r1]
    movh         m1, [r1+r2]
    movh         m2, [r1+r2*2]
    movh         m3, [r1+r3]
    lea          r1, [r1+r2*4]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova       [r0], m0
    mova  [r0+0x10], m1
    mova  [r0+0x20], m2
    mova  [r0+0x30], m3
    movh         m0, [r1]
    movh         m1, [r1+r2*1]
    movh         m2, [r1+r2*2]
    movh         m3, [r1+r3]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova  [r0+0x40], m0
    mova  [r0+0x50], m1
    mova  [r0+0x60], m2
    mova  [r0+0x70], m3
    RET

INIT_MMX mmx
; diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride)
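; diff_pixels writes the element-wise difference of two 8x8 byte blocks into
; a 64-entry int16_t block. Roughly equivalent C, names illustrative:
;
;   for (int i = 0; i < 8; i++)
;       for (int j = 0; j < 8; j++)
;           block[i * 8 + j] = s1[i * stride + j] - s2[i * stride + j];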
cglobal diff_pixels, 4,5
    movsxdifnidn r3, r3d
    pxor         m7, m7
    add          r0, 128
    mov          r4, -128
.loop:
    mova         m0, [r1]
    mova         m2, [r2]
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    psubw        m0, m2
    psubw        m1, m3
    mova  [r0+r4+0], m0
    mova  [r0+r4+8], m1
    add          r1, r3
    add          r2, r3
    add          r4, 16
    jne .loop
    REP_RET

INIT_MMX mmx
; pix_sum16_mmx(uint8_t *pix, int line_size)
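; pix_sum16 returns the sum of all 256 pixels of a 16x16 block; the result
; fits in 16 bits (256 * 255 = 65280), so the final and eax, 0xffff is exact.
; Roughly equivalent C, names illustrative:
;
;   int sum = 0;
;   for (int i = 0; i < 16; i++)
;       for (int j = 0; j < 16; j++)
;           sum += pix[i * line_size + j];
;   return sum;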
cglobal pix_sum16, 2, 3
    movsxdifnidn r1, r1d
    mov          r2, r1
    neg          r2
    shl          r2, 4
    sub          r0, r2
    pxor         m7, m7
    pxor         m6, m6
.loop:
    mova         m0, [r0+r2+0]
    mova         m1, [r0+r2+0]
    mova         m2, [r0+r2+8]
    mova         m3, [r0+r2+8]
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    paddw        m1, m0
    paddw        m3, m2
    paddw        m3, m1
    paddw        m6, m3
    add          r2, r1
    js .loop
    mova         m5, m6
    psrlq        m6, 32
    paddw        m6, m5
    mova         m5, m6
    psrlq        m6, 16
    paddw        m6, m5
    movd        eax, m6
    and         eax, 0xffff
    RET

INIT_MMX mmx
; pix_norm1_mmx(uint8_t *pix, int line_size)
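; pix_norm1 returns the sum of squared pixel values over a 16x16 block.
; Roughly equivalent C, names illustrative:
;
;   int sum = 0;
;   for (int i = 0; i < 16; i++)
;       for (int j = 0; j < 16; j++)
;           sum += pix[i * line_size + j] * pix[i * line_size + j];
;   return sum;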
cglobal pix_norm1, 2, 4
    movsxdifnidn r1, r1d
    mov          r2, 16
    pxor         m0, m0
    pxor         m7, m7
.loop:
    mova         m2, [r0+0]
    mova         m3, [r0+8]
    mova         m1, m2
    punpckhbw    m1, m0
    punpcklbw    m2, m0
    mova         m4, m3
    punpckhbw    m3, m0
    punpcklbw    m4, m0
    pmaddwd      m1, m1
    pmaddwd      m2, m2
    pmaddwd      m3, m3
    pmaddwd      m4, m4
    paddd        m2, m1
    paddd        m4, m3
    paddd        m7, m2
    add          r0, r1
    paddd        m7, m4
    dec          r2
    jne .loop
    mova         m1, m7
    psrlq        m7, 32
    paddd        m1, m7
    movd        eax, m1
    RET