;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_4
cextern pw_8
cextern pw_32
cextern pw_64

SECTION .text


%macro MV0_PIXELS_MC8 0
    lea          r4, [r2*3   ]
    lea          r5, [r2*4   ]
.next4rows:
    movu         m0, [r1     ]
    movu         m1, [r1+r2  ]
    CHROMAMC_AVG m0, [r0     ]
    CHROMAMC_AVG m1, [r0+r2  ]
    mova         [r0     ], m0
    mova         [r0+r2  ], m1
    movu         m0, [r1+r2*2]
    movu         m1, [r1+r4  ]
    CHROMAMC_AVG m0, [r0+r2*2]
    CHROMAMC_AVG m1, [r0+r4  ]
    mova         [r0+r2*2], m0
    mova         [r0+r4  ], m1
    add          r1, r5
    add          r0, r5
    sub          r3d, 4
    jne .next4rows
%endmacro

;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC8 1
; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
;                           int stride, int h, int mx, int my)
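;
; The 2-D (.xy_interpolation) path computes the standard H.264 bilinear
; chroma filter; as a scalar C sketch (for reference only, not part of
; the assembly below):
;   A = (8-mx)*(8-my); B = mx*(8-my); C = (8-mx)*my; D = mx*my;
;   dst[i] = (A*src[i] + B*src[i+1]
;           + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6;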
cglobal %1_h264_chroma_mc8_10, 6,7,8
    movsxdifnidn r2, r2d
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    MV0_PIXELS_MC8
    REP_RET

.at_least_one_non_zero:
    mov          r6d, 2
    test         r5d, r5d
    je .x_interpolation
    mov          r6, r2        ; dxy = x ? 1 : stride
    test         r4d, r4d
    jne .xy_interpolation
.x_interpolation:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d      ; x + y
    movd         m5, r4d
    mova         m4, [pw_8]
    mova         m6, [pw_4]    ; mm6 = rnd >> 3
    SPLATW       m5, m5        ; mm5 = B = x
    psubw        m4, m5        ; mm4 = A = 8-x

.next1drow:
    movu         m0, [r1   ]   ; mm0 = src[0..7]
    movu         m2, [r1+r6]   ; mm2 = src[1..8]

    pmullw       m0, m4        ; mm0 = A * src[0..7]
    pmullw       m2, m5        ; mm2 = B * src[1..8]

    paddw        m0, m6
    paddw        m0, m2
    psrlw        m0, 3
    CHROMAMC_AVG m0, [r0]
    mova         [r0], m0      ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add          r0, r2
    add          r1, r2
    dec          r3d
    jne .next1drow
    REP_RET

.xy_interpolation: ; general case, bilinear
    movd         m4, r4m       ; x
    movd         m6, r5m       ; y

    SPLATW       m4, m4        ; mm4 = x words
    SPLATW       m6, m6        ; mm6 = y words
    psllw        m5, m4, 3     ; mm5 = 8x
    pmullw       m4, m6        ; mm4 = x * y
    psllw        m6, 3         ; mm6 = 8y
    paddw        m1, m5, m6    ; mm1 = 8x+8y
    mova         m7, m4        ; mm7 = D = x * y
    psubw        m5, m4        ; mm5 = B = 8x - xy
    psubw        m6, m4        ; mm6 = C = 8y - xy
    paddw        m4, [pw_64]
    psubw        m4, m1        ; mm4 = A = xy - (8x+8y) + 64

    movu         m0, [r1  ]    ; mm0 = src[0..7]
    movu         m1, [r1+2]    ; mm1 = src[1..8]
.next2drow:
    add          r1, r2

    pmullw       m2, m0, m4
    pmullw       m1, m5
    paddw        m2, m1        ; mm2 = A * src[0..7] + B * src[1..8]

    movu         m0, [r1  ]
    movu         m1, [r1+2]
    pmullw       m3, m0, m6
    paddw        m2, m3        ; mm2 += C * src[0..7+stride]
    pmullw       m3, m1, m7
    paddw        m2, m3        ; mm2 += D * src[1..8+stride]

    paddw        m2, [pw_32]
    psrlw        m2, 6
    CHROMAMC_AVG m2, [r0]
    mova         [r0], m2      ; dst[0..7] = (mm2 + 32) >> 6

    add          r0, r2
    dec          r3d
    jne .next2drow
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
;TODO: xmm mc4
%macro MC4_OP 2
    movq         %1, [r1  ]    ; %1 = src[0..3] of the next row
    movq         m1, [r1+2]    ; m1 = src[1..4] of the next row
    add          r1, r2
    pmullw       %1, m4        ; (8-x) * src[0..3]
    pmullw       m1, m2        ;    x  * src[1..4]
    paddw        m1, %1        ; m1 = horizontal filter of this row
    mova         %1, m1        ; keep it for the next MC4_OP

    pmullw       %2, m5        ; (8-y) * previous row
    pmullw       m1, m3        ;    y  * current row
    paddw        %2, [pw_32]
    paddw        m1, %2
    psrlw        m1, 6
    CHROMAMC_AVG m1, %2, [r0]
    movq         [r0], m1
    add          r0, r2
%endmacro

%macro CHROMA_MC4 1
cglobal %1_h264_chroma_mc4_10, 6,6,7
    movsxdifnidn r2, r2d
    movd         m2, r4m       ; x
    movd         m3, r5m       ; y
    mova         m4, [pw_8]
    mova         m5, m4
    SPLATW       m2, m2
    SPLATW       m3, m3
    psubw        m4, m2        ; m4 = 8-x
    psubw        m5, m3        ; m5 = 8-y

    movq         m0, [r1  ]
    movq         m6, [r1+2]
    add          r1, r2
    pmullw       m0, m4
    pmullw       m6, m2
    paddw        m6, m0        ; m6 = horizontal filter of row 0

.next2rows:
    MC4_OP m0, m6
    MC4_OP m6, m0
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC2 1
cglobal %1_h264_chroma_mc2_10, 6,7
    movsxdifnidn r2, r2d
    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d      ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d      ; x*(8-y)<<16 | (8-x)*(8-y)
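    ; i.e. r4d = {A,B} = {(8-x)*(8-y), x*(8-y)} and r5d = {C,D} =
    ; {(8-x)*y, x*y} as packed words, so one pmaddwd per register
    ; applies a weight pair to two pixels at once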

    movd         m5, r4d
    movd         m6, r5d
    punpckldq    m5, m5        ; mm5 = {A,B,A,B}
    punpckldq    m6, m6        ; mm6 = {C,D,C,D}
    pxor         m7, m7
    pshufw       m2, [r1], 0x94 ; mm2 = src[0,1,1,2]

.nextrow:
    add          r1, r2
    movq         m1, m2
    pmaddwd      m1, m5        ; mm1 = A * src[0,1] + B * src[1,2]
    pshufw       m0, [r1], 0x94 ; mm0 = src[0,1,1,2]
    movq         m2, m0
    pmaddwd      m0, m6
    paddw        m1, [pw_32]
    paddw        m1, m0        ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw        m1, 6
    packssdw     m1, m7
    CHROMAMC_AVG m1, m3, [r0]
    movd         [r0], m1
    add          r0, r2
    dec          r3d
    jnz .nextrow
    REP_RET
%endmacro

%macro NOTHING 2-3
%endmacro
%macro AVG 2-3
%if %0==3
    movq         %2, %3
%endif
    pavgw        %1, %2
%endmacro
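
; CHROMAMC_AVG expands to NOTHING for the put_* variants and to AVG for
; the avg_* variants, so each macro above is instantiated twice: once
; storing directly and once averaging with the existing destination.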

%define CHROMAMC_AVG NOTHING
INIT_XMM sse2
CHROMA_MC8 put
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 put
%endif
INIT_MMX mmxext
CHROMA_MC4 put
CHROMA_MC2 put

%define CHROMAMC_AVG AVG
INIT_XMM sse2
CHROMA_MC8 avg
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 avg
%endif
INIT_MMX mmxext
CHROMA_MC4 avg
CHROMA_MC2 avg