;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pb_f:                times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7:                times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords:         SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384:            times 4 dd 16384
pb_bswap32:          db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION_TEXT

%macro SCALARPRODUCT 0
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
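; A rough C model of the operation below (an illustrative sketch with a
; hypothetical helper name, not FFmpeg's C reference):
;
;     static int scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
;                                        int order)
;     {
;         int sum = 0;
;         for (int i = 0; i < order; i++)
;             sum += v1[i] * v2[i];
;         return sum;
;     }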
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    shl     orderq, 1
    add     v1q, orderq
    add     v2q, orderq
    neg     orderq
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
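    ; m2 now holds per-lane partial sums; fold them into the low dword
    ; (movhlps + pshuflw for xmm, a single pshufw for mmx) before returning.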
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    pshuflw m0, m2, 0x4e
%else
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd    eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
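; A rough C model of the combined dot product and multiply-accumulate below
; (illustrative sketch only, hypothetical helper name):
;
;     static int scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
;                                                 const int16_t *v3, int order, int mul)
;     {
;         int sum = 0;
;         for (int i = 0; i < order; i++) {
;             sum   += v1[i] * v2[i];
;             v1[i] += mul * v3[i]; /* wraps to 16 bits, like pmullw/paddw */
;         }
;         return sum;
;     }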
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd    eax, m6
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

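; SCALARPRODUCT_LOOP: %1 is the byte misalignment of v2 (and v3) from a
; 16-byte boundary. The ssse3 entry code below aligns v2q/v3q down, so every
; iteration can use aligned loads and reassemble the original unaligned
; vectors with palignr; %1 == 0 degenerates to plain aligned loads.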
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0 [v1q + orderq]
    %define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0 m8
    %define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov     r4d, v2d
    and     r4d, 15
    and     v2q, ~15
    and     v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp     r4d, 0
    je .loop0
    cmp     r4d, 2
    je .loop2
    cmp     r4d, 4
    je .loop4
    cmp     r4d, 6
    je .loop6
    cmp     r4d, 8
    je .loop8
    cmp     r4d, 10
    je .loop10
    cmp     r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd    eax, m6
    RET


;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
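; Rough scalar model of what each loop iteration computes, assuming the
; mirrored-window convention visible below (an illustrative sketch, not
; FFmpeg's C reference; the non-bit-exact variants omit the +16384 rounding):
;
;     for (i = 0; i < len / 2; i++) {
;         output[i]           = (input[i]           * window[i] + (1 << 14)) >> 15;
;         output[len - 1 - i] = (input[len - 1 - i] * window[i] + (1 << 14)) >> 15;
;     }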

%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
    pshufb  %1, %2
%elif cpuflag(sse2)
    pshuflw %1, %1, 0x1B
    pshufhw %1, %1, 0x1B
    pshufd  %1, %1, 0x4E
%elif cpuflag(mmxext)
    pshufw  %1, %1, 0x1B
%endif
%endmacro

%macro MUL16FIXED 3
%if cpuflag(ssse3) ; dst, src, unused
    ; dst = ((dst * src) + (1<<14)) >> 15
    pmulhrsw %1, %2
%elif cpuflag(mmxext) ; dst, src, temp
    ; dst = (dst * src) >> 15
    ; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
    ; in from the pmullw result.
    mova    %3, %1
    pmulhw  %1, %2
    pmullw  %3, %2
    psrlw   %3, 15
    psllw   %1, 1
    por     %1, %3
%endif
%endmacro

%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
%if %1
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
%else
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
%endif
    lea     offset2q, [offsetq-mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
    mova    m5, [pb_revwords]
    ALIGN 16
%elif %1
    mova    m5, [pd_16384]
%endif
.loop:
%if cpuflag(ssse3)
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The ssse3 version is bit-identical.
    mova          m0, [windowq+offset2q]
    mova          m1, [ inputq+offset2q]
    pmulhrsw      m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw      m0, [ inputq+offsetq ]
    mova  [outputq+offset2q], m1
    mova  [outputq+offsetq ], m0
%elif %1
    ; This version expands 16-bit to 32-bit, multiplies by the window,
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
    ; save to the output. The window is reversed for the second half.
    mova          m3, [windowq+offset2q]
    mova          m4, [ inputq+offset2q]
    pxor          m0, m0
    punpcklwd     m0, m3
    punpcklwd     m1, m4
    pmaddwd       m0, m1
    paddd         m0, m5
    psrad         m0, 15
    pxor          m2, m2
    punpckhwd     m2, m3
    punpckhwd     m1, m4
    pmaddwd       m2, m1
    paddd         m2, m5
    psrad         m2, 15
    packssdw      m0, m2
    mova  [outputq+offset2q], m0
    REVERSE_WORDS m3
    mova          m4, [ inputq+offsetq]
    pxor          m0, m0
    punpcklwd     m0, m3
    punpcklwd     m1, m4
    pmaddwd       m0, m1
    paddd         m0, m5
    psrad         m0, 15
    pxor          m2, m2
    punpckhwd     m2, m3
    punpckhwd     m1, m4
    pmaddwd       m2, m1
    paddd         m2, m5
    psrad         m2, 15
    packssdw      m0, m2
    mova  [outputq+offsetq], m0
%else
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
    ; therefore are not bit-identical to the C version.
    mova          m0, [windowq+offset2q]
    mova          m1, [ inputq+offset2q]
    mova          m2, [ inputq+offsetq ]
    MUL16FIXED    m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED    m2, m0, m3
    mova  [outputq+offset2q], m1
    mova  [outputq+offsetq ], m2
%endif
    add     offsetd, mmsize
    sub     offset2d, mmsize
    jae .loop
    REP_RET
%endmacro

INIT_MMX mmxext
APPLY_WINDOW_INT16 0
INIT_XMM sse2
APPLY_WINDOW_INT16 0

INIT_MMX mmxext
APPLY_WINDOW_INT16 1
INIT_XMM sse2
APPLY_WINDOW_INT16 1
INIT_XMM ssse3
APPLY_WINDOW_INT16 1
INIT_XMM ssse3, atom
APPLY_WINDOW_INT16 1


; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
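; For each byte, the predictor is the median of l (left), t (top) and
; l + t - tl (tl = top-left), and dst[i] = median + diff[i]. The last dst and
; top bytes are stored back through *left / *left_top for the next row.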
INIT_MMX mmxext
cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq    [dstq+wq], mm7
    add     wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET


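; The loop below builds a byte-wise running sum inside each register: psllw+paddb
; turns runs of 1 byte into runs of 2, the pshufb/paddb steps extend them to 4,
; 8 and (for xmm) 16 bytes, and the carried-in "left" byte held in m0 is
; broadcast over all lanes and added on top.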
%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
%if %2
    mova    m1, [srcq+wq]
%else
    movu    m1, [srcq+wq]
%endif
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
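; Scalar sketch of the semantics (illustrative only, hypothetical helper name):
;
;     static int add_hfyu_left_prediction_ref(uint8_t *dst, const uint8_t *src,
;                                             int w, int left)
;     {
;         for (int i = 0; i < w; i++) {
;             left   = (left + src[i]) & 0xff;
;             dst[i] = left;
;         }
;         return left; /* so the caller can chain rows */
;     }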
INIT_MMX ssse3
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1, 1

INIT_XMM sse4
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova    m5, [pb_f]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz .src_unaligned
    test    dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 0

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
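; Scalar sketch (illustrative only): every element is clamped to [min, max]:
;
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];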

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss m4, minm
    cvtsi2ss m5, maxm
%else
    movd    m4, minm
    movd    m5, maxm
%endif
    SPLATD  m4
    SPLATD  m5
.loop:
%assign %%i 1
%rep %2
    mova    m0,  [srcq+mmsize*0*%%i]
    mova    m1,  [srcq+mmsize*1*%%i]
    mova    m2,  [srcq+mmsize*2*%%i]
    mova    m3,  [srcq+mmsize*3*%%i]
%if %3
    mova    m7,  [srcq+mmsize*4*%%i]
    mova    m8,  [srcq+mmsize*5*%%i]
    mova    m9,  [srcq+mmsize*6*%%i]
    mova    m10, [srcq+mmsize*7*%%i]
%endif
    CLIPD   m0,  m4, m5, m6
    CLIPD   m1,  m4, m5, m6
    CLIPD   m2,  m4, m5, m6
    CLIPD   m3,  m4, m5, m6
%if %3
    CLIPD   m7,  m4, m5, m6
    CLIPD   m8,  m4, m5, m6
    CLIPD   m9,  m4, m5, m6
    CLIPD   m10, m4, m5, m6
%endif
    mova    [dstq+mmsize*0*%%i], m0
    mova    [dstq+mmsize*1*%%i], m1
    mova    [dstq+mmsize*2*%%i], m2
    mova    [dstq+mmsize*3*%%i], m3
%if %3
    mova    [dstq+mmsize*4*%%i], m7
    mova    [dstq+mmsize*5*%%i], m8
    mova    [dstq+mmsize*6*%%i], m9
    mova    [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

; %1 = aligned/unaligned
%macro BSWAP_LOOPS 1
    mov     r3, r2
    sar     r2, 3
    jz .left4_%1
.loop8_%1:
    mov%1   m0, [r1 +  0]
    mov%1   m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb  m0, m2
    pshufb  m1, m2
    mov%1   [r0 +  0], m0
    mov%1   [r0 + 16], m1
%else
    pshuflw m0, m0, 10110001b
    pshuflw m1, m1, 10110001b
    pshufhw m0, m0, 10110001b
    pshufhw m1, m1, 10110001b
    mova    m2, m0
    mova    m3, m1
    psllw   m0, 8
    psllw   m1, 8
    psrlw   m2, 8
    psrlw   m3, 8
    por     m2, m0
    por     m3, m1
    mov%1   [r0 +  0], m2
    mov%1   [r0 + 16], m3
%endif
    add     r0, 32
    add     r1, 32
    dec     r2
    jnz .loop8_%1
.left4_%1:
    mov     r2, r3
    and     r3, 4
    jz .left
    mov%1   m0, [r1]
%if cpuflag(ssse3)
    pshufb  m0, m2
    mov%1   [r0], m0
%else
    pshuflw m0, m0, 10110001b
    pshufhw m0, m0, 10110001b
    mova    m2, m0
    psllw   m0, 8
    psrlw   m2, 8
    por     m2, m0
    mov%1   [r0], m2
%endif
    add     r1, 16
    add     r0, 16
%endmacro

; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
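; Scalar sketch (illustrative only; av_bswap32 is libavutil's byte-swap helper):
;
;     for (i = 0; i < w; i++)
;         dst[i] = av_bswap32(src[i]);
;
; The SSE2 path has no byte shuffle, so it swaps the 16-bit halves with
; pshuflw/pshufhw and then the bytes within each word with shifts and por;
; SSSE3 does the full 32-bit byte swap with a single pshufb.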
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mov     r3, r1
    mova    m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov     r3, r1
%endif
    or      r3, r0
    and     r3, 15
    jz .start_align
    BSWAP_LOOPS u
    jmp .left
.start_align:
    BSWAP_LOOPS a
.left:
%if cpuflag(ssse3)
    mov     r3, r2
    and     r2, 2
    jz .left1
    movq    m0, [r1]
    pshufb  m0, m2
    movq    [r0], m0
    add     r1, 8
    add     r0, 8
.left1:
    and     r3, 1
    jz .end
    mov     r2d, [r1]
    bswap   r2d
    mov     [r0], r2d
%else
    and     r2, 3
    jz .end
.loop2:
    mov     r3d, [r1]
    bswap   r3d
    mov     [r0], r3d
    add     r1, 4
    add     r0, 4
    dec     r2
    jnz .loop2
%endif
.end:
    RET
%endmacro

INIT_XMM sse2
BSWAP32_BUF

INIT_XMM ssse3
BSWAP32_BUF