;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;
; dst[i] = src0[i] * src1[i]
; Walks the buffers from the end down to offset 0, two vectors per
; iteration, with aligned loads/stores (mova); assumes len*4 is a
; positive multiple of 2*mmsize -- TODO confirm against the C caller.
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize] ; byte offset of the last vector pair
ALIGN 16
.loop:
    mova      m0, [src0q + lenq]
    mova      m1, [src0q + lenq + mmsize]
    mulps     m0, m0, [src1q + lenq]
    mulps     m1, m1, [src1q + lenq + mmsize]
    mova      [dstq + lenq], m0
    mova      [dstq + lenq + mmsize], m1

    sub       lenq, 2*mmsize
    jge       .loop                     ; continue while offset >= 0
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;
; dst[i] += src[i] * mul
;------------------------------------------------------------------------------

%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
    ; UNIX64 passes 'mul' in xmm0, so it does not take a GPR argument slot
cglobal vector_fmac_scalar, 3,3,3, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm               ; splat the scalar from its stack slot
%else
%if WIN64
    mova        xmm0, xmm2              ; WIN64 passes 'mul' in xmm2
%endif
    shufps      xmm0, xmm0, 0           ; broadcast low float to all 4 lanes
%if cpuflag(avx)
    vinsertf128 m0, m0, xmm0, 1         ; duplicate into the high 128-bit half
%endif
%endif
    lea      lenq, [lend*4-2*mmsize]    ; byte offset of the last vector pair
.loop:
    mulps    m1, m0, [srcq+lenq       ]
    mulps    m2, m0, [srcq+lenq+mmsize]
    addps    m1, m1, [dstq+lenq       ]
    addps    m2, m2, [dstq+lenq+mmsize]
    mova  [dstq+lenq       ], m1
    mova  [dstq+lenq+mmsize], m2
    sub      lenq, 2*mmsize
    jge      .loop                      ; continue while offset >= 0
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;
; dst[i] = src[i] * mul
;------------------------------------------------------------------------------

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
    ; UNIX64 passes 'mul' in xmm0, so it does not take a GPR argument slot
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm                   ; load the scalar from its stack slot
%elif WIN64
    SWAP 0, 2                           ; WIN64 passes 'mul' in xmm2
%endif
    shufps   m0, m0, 0                  ; broadcast to all 4 lanes
    lea      lenq, [lend*4-mmsize]      ; byte offset of the last vector
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova     [dstq+lenq], m1
    sub      lenq, mmsize
    jge      .loop                      ; continue while offset >= 0
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;
; dst[i] = src[i] * mul  (double precision)
;------------------------------------------------------------------------------

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
    ; UNIX64 passes 'mul' in xmm0, so it does not take a GPR argument slot
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD m0, mulm               ; splat the scalar from memory
%else
%if WIN64
    ; WIN64 passes 'mul' in xmm2: duplicate the low double, widen for AVX,
    ; then move it into m0
    movlhps      xmm2, xmm2
%if cpuflag(avx)
    vinsertf128  ymm2, ymm2, xmm2, 1
%endif
    SWAP 0, 2
%else
    movlhps      xmm0, xmm0             ; duplicate low double into both lanes
%if cpuflag(avx)
    vinsertf128  ymm0, ymm0, xmm0, 1    ; and into the high 128-bit half
%endif
%endif
%endif
    lea          lenq, [lend*8-2*mmsize] ; 8 bytes per double
.loop:
    mulpd        m1, m0, [srcq+lenq       ]
    mulpd        m2, m0, [srcq+lenq+mmsize]
    mova         [dstq+lenq       ], m1
    mova         [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge          .loop                  ; continue while offset >= 0
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;
; dst[i] = src0[i] * src1[i] + src2[i]
; Same backward-walking, two-vectors-per-iteration layout as vector_fmul.
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize] ; byte offset of the last vector pair
ALIGN 16
.loop:
    mova      m0, [src0q + lenq]
    mova      m1, [src0q + lenq + mmsize]
    mulps     m0, m0, [src1q + lenq]
    mulps     m1, m1, [src1q + lenq + mmsize]
    addps     m0, m0, [src2q + lenq]
    addps     m1, m1, [src2q + lenq + mmsize]
    mova      [dstq + lenq], m0
    mova      [dstq + lenq + mmsize], m1

    sub       lenq, 2*mmsize
    jge       .loop                     ; continue while offset >= 0
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;
; dst[i] = src0[i] * src1[len - 1 - i]
; src1 is walked forward while dst/src0 are walked backward via lenq.
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize] ; byte offset of the last vector pair
ALIGN 16
.loop:
%if cpuflag(avx)
    ; reverse a full ymm vector: load the two xmm halves swapped, then
    ; reverse the 4 floats inside each 128-bit half
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova        m0, [src1q]
    mova        m1, [src1q + mmsize]
    shufps      m0, m0, q0123           ; reverse the 4 floats in the vector
    shufps      m1, m1, q0123
%endif
    mulps       m0, m0, [src0q + lenq + mmsize]
    mulps       m1, m1, [src0q + lenq]
    mova        [dstq + lenq + mmsize], m0
    mova        [dstq + lenq], m1
    add         src1q, 2*mmsize         ; src1 moves forward...
    sub         lenq,  2*mmsize         ; ...while the dst/src0 offset drops
    jge         .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    ; turn 'len' into a negative byte offset so the loop counts up toward 0
    neg   offsetq
    shl   offsetq, 2
    sub       v1q, offsetq              ; point both bases past the end
    sub       v2q, offsetq
    xorps    xmm0, xmm0                 ; xmm0 = 4 running partial sums
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js    .loop                         ; loop while offset is still negative
    ; horizontal sum of the 4 partial sums into the low lane of xmm0
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    ; x86_32 returns floats on the x87 stack: bounce through memory
    movss    r0m,  xmm0
    fld      dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;
; In-place butterfly: (src0[i], src1[i]) = (src0[i]+src1[i], src0[i]-src1[i])
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
    movsxdifnidn lenq, lend
    test      lenq, lenq
    jz        .end                      ; nothing to do for len == 0
    shl       lenq, 2                   ; element count -> byte count
    add       src0q, lenq               ; point both bases past the end...
    add       src1q, lenq
    neg       lenq                      ; ...and index with a negative offset
.loop:
    mova      m0, [src0q + lenq]
    mova      m1, [src1q + lenq]
    subps     m2, m0, m1
    addps     m0, m0, m1
    mova      [src1q + lenq], m2        ; src1[i] = src0[i] - src1[i]
    mova      [src0q + lenq], m0        ; src0[i] = src0[i] + src1[i]
    add       lenq, mmsize
    jl        .loop                     ; loop while offset is still negative
.end:
    REP_RET