;******************************************************************************
;* MMX/SSE2-optimized functions for the RV40 decoder
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

align 16
pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024

sixtap_filter_hb_m: times 8 db   1,  -5
                    times 8 db  52,  20
                    ; multiplied by 2 to have the same shift
                    times 8 db   2, -10
                    times 8 db  40,  40
                    ; back to normal
                    times 8 db   1,  -5
                    times 8 db  20,  52

sixtap_filter_v_m:  times 8 dw   1
                    times 8 dw  -5
                    times 8 dw  52
                    times 8 dw  20
                    ; multiplied by 2 to have the same shift
                    times 8 dw   2
                    times 8 dw -10
                    times 8 dw  40
                    times 8 dw  40
                    ; back to normal
                    times 8 dw   1
                    times 8 dw  -5
                    times 8 dw  20
                    times 8 dw  52

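; The tables above hold the three RV40 6-tap subpel filters
; (1,-5,52,20,-5,1), (1,-5,20,20,-5,1) and (1,-5,20,52,-5,1). The first
; and last sum to 64 and normalize with ">> 6"; the middle one sums to 32
; (">> 5"), so it is stored doubled as (2,-10,40,40,-10,2), letting all
; three variants share the same "+32, >> 6" rounding step.
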
%ifdef PIC
%define sixtap_filter_hw   picregq
%define sixtap_filter_hb   picregq
%define sixtap_filter_v    picregq
%define npicregs 1
%else
%define sixtap_filter_hw sixtap_filter_hw_m
%define sixtap_filter_hb sixtap_filter_hb_m
%define sixtap_filter_v  sixtap_filter_v_m
%define npicregs 0
%endif

filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11

cextern pw_32
cextern pw_16
cextern pw_512

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
;                                      uint8_t *src, int srcstride,
;                                      int len, int m);
;-----------------------------------------------------------------------------
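
; For reference, each output pixel computes (scalar sketch, not part of
; the build):
;
;   dst[x] = av_clip_uint8((c05 * (s[-2] + s[3]) + c14 * (s[-1] + s[2])
;                           + c2 * s[0] + c3 * s[1] + 32) >> 6)
;
; where s[i] is the source pixel i steps along the filtered direction and
; (c05, c14, c2, c3) is one row set of sixtap_filter_v_m, i.e.
; (1,-5,52,20), (2,-10,40,40) or (1,-5,20,52), as selected by the m
; argument.
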
%macro LOAD 2
%if WIN64
    movsxd    %1q, %1d
%endif
%ifdef PIC
    add       %1q, picregq
%else
    add       %1q, %2
%endif
%endmacro

%macro STORE 3
%ifidn %3, avg
    movh      %2, [dstq]
%endif
    packuswb  %1, %1
%ifidn %3, avg
%if cpuflag(3dnow)
    pavgusb   %1, %2
%else
    pavgb     %1, %2
%endif
%endif
    movh      [dstq], %1
%endmacro

%macro FILTER_V 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea       picregq, [sixtap_filter_v_m]
%endif
    pxor      m7, m7
    LOAD      my, sixtap_filter_v

    ; read 5 lines
    sub       srcq, srcstrideq
    sub       srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea       srcq, [srcq+srcstrideq*2]
    add       srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

%ifdef m8
    mova      m8, [myq+ 0]
    mova      m9, [myq+16]
    mova     m10, [myq+32]
    mova     m11, [myq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [myq+ 0]
%define COEFF14  [myq+16]
%define COEFF2   [myq+32]
%define COEFF3   [myq+48]
%endif
.nextrow:
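    ; m0..m4 hold the five current source rows unpacked to words; each
    ; iteration loads one row ahead, applies the 6-tap filter and slides
    ; the window down by one row.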
    mova      m6, m1
    movh      m5, [srcq+2*srcstrideq]      ; read new row
    paddw     m6, m4
    punpcklbw m5, m7
    pmullw    m6, COEFF14
    paddw     m0, m5
    pmullw    m0, COEFF05
    paddw     m6, m0
    mova      m0, m1
    paddw     m6, [pw_32]
    mova      m1, m2
    pmullw    m2, COEFF2
    paddw     m6, m2
    mova      m2, m3
    pmullw    m3, COEFF3
    paddw     m6, m3

    ; round/clip/store
    mova      m3, m4
    psraw     m6, 6
    mova      m4, m5
    STORE     m6, m5, %1

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd                      ; next row
    jg        .nextrow
    REP_RET
%endmacro

%macro FILTER_H 1
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea       picregq, [sixtap_filter_v_m]
%endif
    pxor      m7, m7
    LOAD      mx, sixtap_filter_v
    mova      m6, [pw_32]
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [mxq+ 0]
%define COEFF14  [mxq+16]
%define COEFF2   [mxq+32]
%define COEFF3   [mxq+48]
%endif
.nextrow:
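    ; Taps at x-2..x+3 come from six unaligned loads; the symmetric outer
    ; pairs (x-2,x+3) and (x-1,x+2) are summed before multiplying, which
    ; halves the number of multiplies for those taps.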
    movq      m0, [srcq-2]
    movq      m5, [srcq+3]
    movq      m1, [srcq-1]
    movq      m4, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m5, m7
    punpcklbw m1, m7
    punpcklbw m4, m7
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    paddw     m0, m5
    paddw     m1, m4
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, COEFF05
    pmullw    m1, COEFF14
    pmullw    m2, COEFF2
    pmullw    m3, COEFF3
    paddw     m0, m6
    paddw     m1, m2
    paddw     m0, m3
    paddw     m0, m1
    psraw     m0, 6
    STORE     m0, m1, %1

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd            ; next row
    jg        .nextrow
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
FILTER_V  put
FILTER_H  put

INIT_MMX mmxext
FILTER_V  avg
FILTER_H  avg

INIT_MMX 3dnow
FILTER_V  avg
FILTER_H  avg
%endif

INIT_XMM sse2
FILTER_H  put
FILTER_H  avg
FILTER_V  put
FILTER_V  avg

%macro FILTER_SSSE3 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea       picregq, [sixtap_filter_hb_m]
%endif

    ; read 5 lines
    sub       srcq, srcstrideq
    LOAD      my, sixtap_filter_hb
    sub       srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea       srcq, [srcq+srcstrideq*2]
    add       srcq, srcstrideq
    mova      m5, [myq]
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    lea       srcq, [srcq+2*srcstrideq]

.nextrow:
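    ; Adjacent rows are interleaved bytewise so pmaddubsw applies two
    ; filter taps at once: unpacking rows i and i+1 against the byte pair
    ; (c_i, c_i+1) yields c_i*p_i + c_i+1*p_i+1 per output word. The final
    ; pmulhrsw against pw_512 implements the "+32, >> 6" rounding, since
    ; (x * 512 + 0x4000) >> 15 == (x + 32) >> 6.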
    mova      m6, m2
    punpcklbw m0, m1
    punpcklbw m6, m3
    pmaddubsw m0, m5
    pmaddubsw m6, [myq+16]
    movh      m7, [srcq]      ; read new row
    paddw     m6, m0
    mova      m0, m1
    mova      m1, m2
    mova      m2, m3
    mova      m3, m4
    mova      m4, m7
    punpcklbw m7, m3
    pmaddubsw m7, m5
    paddw     m6, m7
    pmulhrsw  m6, [pw_512]
    STORE     m6, m7, %1

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd          ; next row
    jg        .nextrow
    REP_RET

cglobal %1_rv40_qpel_h, 6, 6+npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea       picregq, [sixtap_filter_hb_m]
%endif
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
    LOAD      mx, sixtap_filter_hb
    mova      m5, [mxq]        ; set up 6tap filter in bytes
    mova      m6, [mxq+16]
    mova      m7, [filter_h6_shuf1]

.nextrow:
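    ; A single unaligned 16-byte load covers all taps for 8 output pixels;
    ; the filter_h6_shuf* masks rearrange it into the (x-2,x-1), (x,x+1)
    ; and (x+3,x+2) byte pairs that pmaddubsw expects. filter_h6_shuf3
    ; swaps the last pair so the same (1,-5) filter bytes in m5 serve both
    ; outer pairs.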
    movu      m0, [srcq-2]
    mova      m1, m0
    mova      m2, m0
    pshufb    m0, m7
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m5
    paddw     m0, m1
    paddw     m0, m2
    pmulhrsw  m0, [pw_512]
    STORE     m0, m1, %1

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd          ; next row
    jg        .nextrow
    REP_RET
%endmacro

INIT_XMM ssse3
FILTER_SSSE3 put
FILTER_SSSE3 avg

; %1=1 for 5-bit weights (0 for 14-bit), %2=dst, %3=src1, %4=src2, %5=stride if sse2
%macro RV40_WCORE 4-5
    movh       m4, [%3 + r6 + 0]
    movh       m5, [%4 + r6 + 0]
%if %0 == 4
%define OFFSET r6 + mmsize / 2
%else
    ; 8x8 block and sse2, stride was provided
%define OFFSET r6
    add        r6, r5
%endif
    movh       m6, [%3 + OFFSET]
    movh       m7, [%4 + OFFSET]

%if %1 == 0
    ; 14-bit weights
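    ; Shifting the pixels left by 7 and taking the signed high half of the
    ; multiply (pmulhw) computes (pix * w) >> 9 while keeping every
    ; intermediate product within signed 16-bit range.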
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    punpcklbw  m7, m0

    psllw      m4, 7
    psllw      m5, 7
    psllw      m6, 7
    psllw      m7, 7
    pmulhw     m4, m3
    pmulhw     m5, m2
    pmulhw     m6, m3
    pmulhw     m7, m2

    paddw      m4, m5
    paddw      m6, m7
%else
    ; 5-bit weights
%if cpuflag(ssse3)
    punpcklbw  m4, m5
    punpcklbw  m6, m7

    pmaddubsw  m4, m3
    pmaddubsw  m6, m3
%else
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    punpcklbw  m7, m0

    pmullw     m4, m3
    pmullw     m5, m2
    pmullw     m6, m3
    pmullw     m7, m2
    paddw      m4, m5
    paddw      m6, m7
%endif

%endif

    ; bias and shift down
%if cpuflag(ssse3)
    pmulhrsw   m4, m1
    pmulhrsw   m6, m1
%else
    paddw      m4, m1
    paddw      m6, m1
    psrlw      m4, 5
    psrlw      m6, 5
%endif

    packuswb   m4, m6
%if %0 == 5
    ; Only called for 8x8 blocks and sse2
    sub        r6, r5
    movh       [%2 + r6], m4
    add        r6, r5
    movhps     [%2 + r6], m4
%else
    mova       [%2 + r6], m4
%endif
%endmacro


%macro MAIN_LOOP 2
%if mmsize == 8
    RV40_WCORE %2, r0, r1, r2
%if %1 == 16
    RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
%endif

    ; Prepare for next loop
    add        r6, r5
%else
%ifidn %1, 8
    RV40_WCORE %2, r0, r1, r2, r5
    ; Prepare the next 2 lines
    add        r6, r5
%else
    RV40_WCORE %2, r0, r1, r2
    ; Prepare the next single line
    add        r6, r5
%endif
%endif

%endmacro

; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
;                        int w1, int w2, int stride)
; %1=rnd/nornd, %2=block size, %3=log2(block size)
; The weights are FP0.14 fractions that depend on the pts.
; For timebases without rounding error (e.g. PAL), the fractions
; can be simplified, and several operations can be avoided.
; Therefore, we check here whether they are multiples of 2^9 for
; those simplifications to occur.
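;
; Scalar sketch of one output pixel (illustrative only; assumes the C
; caller passes the weights pre-shifted by 9 to the "nornd" variants):
;   14-bit weights ("rnd"):
;     dst[x] = (((src1[x] * w1) >> 9) + ((src2[x] * w2) >> 9) + 16) >> 5;
;   5-bit weights ("nornd"):
;     dst[x] = (src1[x] * w1 + src2[x] * w2 + 16) >> 5;
; The SSSE3 versions do the "+16, >> 5" with pmulhrsw against pw_1024,
; since (x * 1024 + 0x4000) >> 15 == (x + 16) >> 5.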
%macro RV40_WEIGHT 3
cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3)
    mova       m1, [pw_1024]
%else
    mova       m1, [pw_16]
%endif
    pxor       m0, m0
    ; Set loop counter and increments
    mov        r6, r5
    shl        r6, %3
    add        r0, r6
    add        r1, r6
    add        r2, r6
    neg        r6

    movd       m2, r3d
    movd       m3, r4d
%ifidn %1,rnd
%define  RND   0
    SPLATW     m2, m2
%else
%define  RND   1
%if cpuflag(ssse3)
    punpcklbw  m3, m2
%else
    SPLATW     m2, m2
%endif
%endif
    SPLATW     m3, m3

.loop:
    MAIN_LOOP  %2, RND
    jnz        .loop
    REP_RET
%endmacro

INIT_MMX mmxext
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4

INIT_XMM sse2
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4

INIT_XMM ssse3
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4