;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"


SECTION_RODATA
align 32
; Constants for the int16 mixing kernels:
; dw1 - eight dwords of 1; shifted left to build the dword rounding bias
;       (1 << (shift-1)) used by MIX2_INT16.
; w1  - sixteen words of 1; used both to build the word rounding bias in
;       MIX1_INT16 and as the pmaddwd partner lane for the samples.
dw1: times 8  dd 1
w1 : times 16 dw 1

SECTION .text
;------------------------------------------------------------------------------
; void mix_2_1_{u,a}_float(float *out, const float *in1, const float *in2,
;                          const float *coeffp, int index1, int index2,
;                          integer len)
;
; out[i] = in1[i]*coeffp[index1] + in2[i]*coeffp[index2]
;
; %1 is "a" (aligned) or "u" (unaligned).  The aligned entry point checks all
; three pointers and jumps to the unaligned body when any of them is not
; mmsize-aligned; the unaligned variant defines that fall-through label.
; Processes 2*mmsize bytes per iteration; len is assumed to be a multiple of
; that (guaranteed by the caller in libswresample) -- TODO confirm.
;------------------------------------------------------------------------------
%macro MIX2_FLT 1
cglobal mix_2_1_%1_float, 7, 7, 6, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
    test in1q, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
    test in2q, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
%else
mix_2_1_float_u_int %+ SUFFIX:                 ; target of the alignment checks
%endif
    VBROADCASTSS m4, [coeffpq + 4*index1q]     ; m4 = coeff for in1 in all lanes
    VBROADCASTSS m5, [coeffpq + 4*index2q]     ; m5 = coeff for in2 in all lanes
    shl          lend, 2                       ; len: floats -> bytes
    add          in1q, lenq                    ; point past the end of each
    add          in2q, lenq                    ; buffer, then walk a negative
    add          outq, lenq                    ; offset up to zero
    neg          lenq
.next:
%ifidn %1, a
    mulps        m0, m4, [in1q + lenq         ]
    mulps        m1, m5, [in2q + lenq         ]
    mulps        m2, m4, [in1q + lenq + mmsize]
    mulps        m3, m5, [in2q + lenq + mmsize]
%else
    movu         m0, [in1q + lenq         ]
    movu         m1, [in2q + lenq         ]
    movu         m2, [in1q + lenq + mmsize]
    movu         m3, [in2q + lenq + mmsize]
    mulps        m0, m0, m4
    mulps        m1, m1, m5
    mulps        m2, m2, m4
    mulps        m3, m3, m5
%endif
    addps        m0, m0, m1                    ; in1*c1 + in2*c2
    addps        m2, m2, m3
    mov%1  [outq + lenq         ], m0
    mov%1  [outq + lenq + mmsize], m2
    add          lenq, mmsize*2
        jl .next
    REP_RET
%endmacro
;------------------------------------------------------------------------------
; void mix_1_1_{u,a}_float(float *out, const float *in, const float *coeffp,
;                          int index, integer len)
;
; out[i] = in[i] * coeffp[index]
;
; %1 is "a" (aligned) or "u" (unaligned); the aligned entry falls through to
; the unaligned body when in/out are not mmsize-aligned.  Processes 2*mmsize
; bytes per iteration.
;------------------------------------------------------------------------------
%macro MIX1_FLT 1
cglobal mix_1_1_%1_float, 5, 5, 3, out, in, coeffp, index, len
%ifidn %1, a
    test inq, mmsize-1
        jne mix_1_1_float_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_1_1_float_u_int %+ SUFFIX
%else
mix_1_1_float_u_int %+ SUFFIX:                 ; target of the alignment checks
%endif
    VBROADCASTSS m2, [coeffpq + 4*indexq]      ; m2 = coefficient in all lanes
    shl          lenq, 2                       ; len: floats -> bytes
    add          inq , lenq                    ; end-relative negative indexing
    add          outq, lenq
    neg          lenq
.next:
%ifidn %1, a
    mulps        m0, m2, [inq + lenq         ]
    mulps        m1, m2, [inq + lenq + mmsize]
%else
    movu         m0, [inq + lenq         ]
    movu         m1, [inq + lenq + mmsize]
    mulps        m0, m0, m2
    mulps        m1, m1, m2
%endif
    mov%1  [outq + lenq         ], m0
    mov%1  [outq + lenq + mmsize], m1
    add          lenq, mmsize*2
        jl .next
    REP_RET
%endmacro
;------------------------------------------------------------------------------
; void mix_1_1_{u,a}_int16(int16_t *out, const int16_t *in,
;                          const int32_t *coeffp, int index, integer len)
;
; out[i] = (in[i]*coeff + round) >> shift, saturated to int16.
;
; coeffp[index] packs the 16-bit fixed-point coefficient in its low word and
; the right-shift amount in its high word (isolated below via psllq/psrlq).
; Samples are interleaved with the constant-1 words of [w1] so a single
; pmaddwd computes in*coeff + 1*round per dword lane.
;------------------------------------------------------------------------------
%macro MIX1_INT16 1
cglobal mix_1_1_%1_int16, 5, 5, 6, out, in, coeffp, index, len
%ifidn %1, a
    test inq, mmsize-1
        jne mix_1_1_int16_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_1_1_int16_u_int %+ SUFFIX
%else
mix_1_1_int16_u_int %+ SUFFIX:                 ; target of the alignment checks
%endif
    movd      m4, [coeffpq + 4*indexq]         ; m4 = shift:coeff (hi:lo words)
    SPLATW    m5, m4                           ; m5 = coeff in every word lane
    psllq     m4, 32
    psrlq     m4, 48                           ; m4 = shift amount only
    mova      m0, [w1]
    psllw     m0, m4
    psrlw     m0, 1                            ; m0 = 1 << (shift-1): round bias
    punpcklwd m5, m0                           ; m5 = (coeff, round) word pairs
    add       lenq, lenq                       ; len: int16 samples -> bytes
    add       inq , lenq                       ; end-relative negative indexing
    add       outq, lenq
    neg       lenq
.next:
    mov%1     m0, [inq + lenq         ]
    mov%1     m2, [inq + lenq + mmsize]
    mova      m1, m0
    mova      m3, m2
    punpcklwd m0, [w1]                         ; pair each sample with 1 so
    punpckhwd m1, [w1]                         ; pmaddwd yields
    punpcklwd m2, [w1]                         ; sample*coeff + 1*round
    punpckhwd m3, [w1]
    pmaddwd   m0, m5
    pmaddwd   m1, m5
    pmaddwd   m2, m5
    pmaddwd   m3, m5
    psrad     m0, m4                           ; arithmetic shift back down
    psrad     m1, m4
    psrad     m2, m4
    psrad     m3, m4
    packssdw  m0, m1                           ; saturate dwords back to int16
    packssdw  m2, m3
    mov%1 [outq + lenq         ], m0
    mov%1 [outq + lenq + mmsize], m2
    add       lenq, mmsize*2
        jl .next
%if mmsize == 8
    emms                                       ; MMX variant must clear FP state
    RET
%else
    REP_RET
%endif
%endmacro
;------------------------------------------------------------------------------
; void mix_2_1_{u,a}_int16(int16_t *out, const int16_t *in1, const int16_t *in2,
;                          const int32_t *coeffp, int index1, int index2,
;                          integer len)
;
; out[i] = (in1[i]*c1 + in2[i]*c2 + round) >> shift, saturated to int16.
;
; Each coeffp entry packs the 16-bit coefficient in its low word; the shift
; amount is taken from the high word of coeffp[index1].  The two input
; streams are interleaved word-wise so one pmaddwd computes in1*c1 + in2*c2
; per dword lane; the dword rounding bias 1 << (shift-1) is added separately.
;------------------------------------------------------------------------------
%macro MIX2_INT16 1
cglobal mix_2_1_%1_int16, 7, 7, 8, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
    test in1q, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
    test in2q, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
%else
mix_2_1_int16_u_int %+ SUFFIX:                 ; target of the alignment checks
%endif
    movd      m4, [coeffpq + 4*index1q]        ; m4 = shift:coeff1 (hi:lo words)
    movd      m6, [coeffpq + 4*index2q]        ; m6 = coeff2 in its low word
    SPLATW    m5, m4                           ; m5 = coeff1 in every word lane
    SPLATW    m6, m6                           ; m6 = coeff2 in every word lane
    psllq     m4, 32
    psrlq     m4, 48                           ; m4 = shift amount only
    mova      m7, [dw1]
    pslld     m7, m4
    psrld     m7, 1                            ; m7 = 1 << (shift-1) per dword
    punpcklwd m5, m6                           ; m5 = (coeff1, coeff2) pairs
    add       lend, lend                       ; len: int16 samples -> bytes
    add       in1q, lenq                       ; end-relative negative indexing
    add       in2q, lenq
    add       outq, lenq
    neg       lenq
.next:
    mov%1     m0, [in1q + lenq         ]
    mov%1     m2, [in2q + lenq         ]
    mova      m1, m0
    punpcklwd m0, m2                           ; pair in1/in2 samples so
    punpckhwd m1, m2                           ; pmaddwd gives in1*c1 + in2*c2

    mov%1     m2, [in1q + lenq + mmsize]
    mov%1     m6, [in2q + lenq + mmsize]       ; m6 reused as scratch here
    mova      m3, m2
    punpcklwd m2, m6
    punpckhwd m3, m6

    pmaddwd   m0, m5
    pmaddwd   m1, m5
    pmaddwd   m2, m5
    pmaddwd   m3, m5
    paddd     m0, m7                           ; add rounding bias
    paddd     m1, m7
    paddd     m2, m7
    paddd     m3, m7
    psrad     m0, m4                           ; arithmetic shift back down
    psrad     m1, m4
    psrad     m2, m4
    psrad     m3, m4
    packssdw  m0, m1                           ; saturate dwords back to int16
    packssdw  m2, m3
    mov%1 [outq + lenq         ], m0
    mov%1 [outq + lenq + mmsize], m2
    add       lenq, mmsize*2
        jl .next
%if mmsize == 8
    emms                                       ; MMX variant must clear FP state
    RET
%else
    REP_RET
%endif
%endmacro

; Instantiate each kernel for every supported instruction set; x86inc's
; INIT_* macros set mmsize/SUFFIX so each expansion emits a distinct symbol.
INIT_MMX mmx
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a

INIT_XMM sse
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a

INIT_XMM sse2
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a
%endif