;*****************************************************************************
;* SSE2-optimized weighted prediction code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; biweight pred:
;
; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
;                            int height, int log2_denom, int weightd,
;                            int weights, int offset);
; and
; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
;                          int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------

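; Per-pixel operation for the unidirectional case:
;   dst[x] = clip_uint8((dst[x] * weight + bias) >> log2_denom)
; where bias = ((offset << 1) + 1) << log2_denom >> 1 folds the offset and the
; rounding term into a single constant.
;
; WEIGHT_SETUP broadcasts weight (m3) and bias (m5) across the register, keeps
; log2_denom in m6 as the shift count, and zeroes m7 for byte->word unpacking.
; Argument registers follow the prototype above: r0 = dst, r1 = stride,
; r2 = height, r3 = log2_denom, r4 = weight, r5 = offset.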
%macro WEIGHT_SETUP 0
    add        r5, r5
    inc        r5
    movd       m3, r4d
    movd       m5, r5d
    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endmacro

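; WEIGHT_OP widens the pixels at [r0+%1] and [r0+%2] (4 per group with MMX,
; 8 with SSE2) to words, multiplies by the weight, adds the bias, shifts by
; log2_denom and packs back to bytes with unsigned saturation (the clip to
; [0,255]).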
%macro WEIGHT_OP 2
    movh       m0, [r0+%1]
    movh       m1, [r0+%2]
    punpcklbw  m0, m7
    punpcklbw  m1, m7
    pmullw     m0, m3
    pmullw     m1, m3
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro

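; 16-pixel-wide rows with 8-byte MMX registers: each row is processed as two
; WEIGHT_OP halves.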
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0, 4
    mova     [r0  ], m0
    WEIGHT_OP 8, 12
    mova     [r0+8], m0
    add        r0, r1
    dec        r2d
    jnz .nextrow
    REP_RET

%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0, mmsize/2
    mova     [r0], m0
    add        r0, r1
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_MM 8, 0
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8

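; Blocks narrower than the register (4 pixels for MMX, 8 for SSE2): two rows
; are packed into one register per iteration, so the row counter is halved and
; the dst pointer advances by two strides at a time.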
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
    sar       r2d, 1
    lea        r3, [r1*2]
.nextrow:
    WEIGHT_OP 0, r1
    movh     [r0], m0
%if mmsize == 16
    movhps   [r0+r1], m0
%else
    psrlq      m0, 32
    movh     [r0+r1], m0
%endif
    add        r0, r3
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8

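; Bidirectional weighted prediction:
;   dst[x] = clip_uint8((dst[x] * weightd + src[x] * weights + bias)
;                       >> (log2_denom + 1)),
;   bias = ((offset + 1) | 1) << log2_denom.
;
; BIWEIGHT_SETUP loads the two weights, the bias and the shift count.  When a
; weight equals 128 the weights, offset and denominator are halved instead,
; which keeps the weights within the signed-byte range required by the
; pmaddubsw-based SSSE3 version below.  Argument registers follow the
; prototype at the top of the file: r0 = dst, r1 = src, r2 = stride,
; r3 = height, r4 = log2_denom, r5 = weightd, r6 = weights; offset is the
; eighth argument, loaded via r7m.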
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
    mov  off_regd, r7m
    add  off_regd, 1
    or   off_regd, 1
    add        r4, 1
    cmp        r5, 128
    jne .normal
    sar        r5, 1
    sar        r6, 1
    sar  off_regd, 1
    sub        r4, 1
.normal
%if cpuflag(ssse3)
    movd       m4, r5d
    movd       m0, r6d
%else
    movd       m3, r5d
    movd       m4, r6d
%endif
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6
    psrld      m5, 1
%if cpuflag(ssse3)
    punpcklbw  m4, m0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4
    punpcklqdq m5, m5

%else
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endif
%endmacro

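; BIWEIGHT_STEPA accumulates dst * weightd + src * weights for one group of
; pixels at offset %3; BIWEIGHT_STEPB then adds the bias, shifts by
; log2_denom + 1 and packs the two accumulated groups back to bytes.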
%macro BIWEIGHT_STEPA 3
    movh       m%1, [r0+%3]
    movh       m%2, [r1+%3]
    punpcklbw  m%1, m7
    punpcklbw  m%2, m7
    pmullw     m%1, m3
    pmullw     m%2, m4
    paddsw     m%1, m%2
%endmacro

%macro BIWEIGHT_STEPB 0
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro

INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, 4
    BIWEIGHT_STEPB
    mova     [r0  ], m0
    BIWEIGHT_STEPA 0, 1, 8
    BIWEIGHT_STEPA 1, 2, 12
    BIWEIGHT_STEPB
    mova     [r0+8], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET

%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, mmsize/2
    BIWEIGHT_STEPB
    mova     [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_MM 8, 0
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8

%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
    sar        r3, 1
    lea        r4, [r2*2]
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, r2
    BIWEIGHT_STEPB
    movh     [r0], m0
%if mmsize == 16
    movhps   [r0+r2], m0
%else
    psrlq      m0, 32
    movh     [r0+r2], m0
%endif
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8

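; SSSE3 variant: the dst and src bytes are interleaved and multiplied against
; the interleaved weights with pmaddubsw, which performs both multiplications
; and the pairwise add in a single instruction.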
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m0, m4
    pmaddubsw  m2, m4
    paddsw     m0, m5
    paddsw     m2, m5
    psraw      m0, m6
    psraw      m2, m6
    packuswb   m0, m2
%endmacro

INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m

.nextrow:
    movh       m0, [r0]
    movh       m2, [r0+8]
    movh       m3, [r1+8]
    punpcklbw  m0, [r1]
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    mova     [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET

INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
    sar        r3, 1
    lea        r4, [r2*2]

.nextrow:
    movh       m0, [r0]
    movh       m1, [r1]
    movh       m2, [r0+r2]
    movh       m3, [r1+r2]
    punpcklbw  m0, m1
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    movh     [r0], m0
    movhps   [r0+r2], m0
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET