/*
 * ARM NEON optimised Float DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
yading@11
|
21
|
yading@11
|
22 #include "config.h"
|
yading@11
|
23 #include "asm.S"
|
yading@11
|
24
|
yading@11
|
/*
 * ff_vector_fmul_neon
 *
 * Element-wise single-precision product of two vectors.
 *   r0 = output pointer, r1 and r2 = input pointers, r3 = element count
 * All three pointers are accessed with 128-bit alignment hints and are
 * post-incremented.  Work is done in groups of 8 floats with a 16-float
 * unrolled main loop; the products of the most recent group are always
 * held in q8/q9 and are written out one step later.
 * NOTE(review): the code assumes the count is a non-zero multiple of 8 -
 * confirm against the callers.
 */
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8            @ account for the first 8 floats
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f                      @ count was exactly 8: just store
        bics            r12, r3,  #15           @ r12 = remaining count rounded down to 16
        beq             2f                      @ fewer than 16 left: single 8-float step
1:      subs            r12, r12, #16           @ flags survive the NEON ops until the bne
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q2},     [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q3},     [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.32         {q8-q9},  [r0,:128]!    @ flush the previously computed group
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q2},     [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q3},     [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.32         {q10-q11},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15           @ anything left after the 16-float loop?
        beq             3f
2:      vld1.32         {q0},     [r1,:128]!
        vld1.32         {q2},     [r2,:128]!
        vst1.32         {q8},     [r0,:128]!    @ store the old q8 before it is overwritten
        vmul.f32        q8,  q0,  q2
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q3},     [r2,:128]!
        vst1.32         {q9},     [r0,:128]!    @ likewise for q9
        vmul.f32        q9,  q1,  q3
3:      vst1.32         {q8-q9},  [r0,:128]!    @ final pending group
        bx              lr
endfunc
|
yading@11
|
63
|
yading@11
|
/*
 * ff_vector_fmac_scalar_neon
 *
 * Multiply-accumulate of a vector by a scalar, in place on the output:
 * each output float becomes output + input * scalar.
 *   r0 = output pointer (also the accumulator source), r1 = input pointer
 * The scalar and the element count arrive in different registers in the
 * two build variants selected by the VFP / NOVFP line prefixes (defined
 * in asm.S; presumably hard-float vs soft-float argument passing - TODO
 * confirm):
 *   VFP:   scalar in d0[0], count in r2, r3 used as the read pointer
 *   NOVFP: scalar in r2,    count in r3, r2 reused as the read pointer
 * The scalar is broadcast to q15.  A separate read pointer (acc) walks
 * the output buffer ahead of the store pointer r0 so that loads can be
 * issued before the matching stores.
 * NOTE(review): the tail loop always runs at least once, so a count of 0
 * still touches 4 floats - confirm callers never pass 0.
 */
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]
NOVFP   vdup.32         q15, r2
        bics            r12, len, #15           @ r12 = count rounded down to 16
        mov             acc, r0                 @ read pointer starts at the output base
        beq             3f                      @ fewer than 16: tail loop only
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},     [r1,:128]!    @ preload the next 16-float group
        vld1.32         {q8},     [acc,:128]!
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b
2:      vst1.32         {q10},    [r0,:128]!    @ drain the last unrolled group
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15           @ leftover count after the 16-float loop
        it              eq
        bxeq            lr
3:      vld1.32         {q0},     [r1,:128]!    @ tail: 4 floats per pass
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
        .unreq          acc                     @ release the alias so it does not leak file-wide
endfunc
|
yading@11
|
111
|
yading@11
|
/*
 * ff_vector_fmul_scalar_neon
 *
 * Scales a vector by a scalar: output float = input float * scalar.
 *   r0 = output pointer, r1 = input pointer
 * The scalar and the element count arrive in different registers in the
 * two build variants selected by the VFP / NOVFP line prefixes (defined
 * in asm.S; presumably hard-float vs soft-float argument passing - TODO
 * confirm):
 *   VFP:   scalar in d0[0], count in r2
 *   NOVFP: scalar in r2,    count in r3
 * The scalar is broadcast to q8.  A 16-float unrolled loop is followed
 * by a 4-float tail loop.
 * NOTE(review): the tail loop always runs at least once, so a count of 0
 * still touches 4 floats - confirm callers never pass 0.
 */
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            ip,  len, #15           @ ip = count rounded down to 16
        beq             3f                      @ fewer than 16: tail loop only
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d2-d3},  [r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {d4-d5},  [r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {d6-d7},  [r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {d0-d1},  [r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {d2-d3},  [r0,:128]!
        subs            ip,  ip,  #16
        beq             2f
        vld1.32         {d0-d1},  [r1,:128]!    @ preload the next 16-float group
        vst1.32         {d4-d5},  [r0,:128]!
        vld1.32         {d2-d3},  [r1,:128]!
        vst1.32         {d6-d7},  [r0,:128]!
        b               1b
2:      vst1.32         {d4-d5},  [r0,:128]!    @ drain the last unrolled group
        vst1.32         {d6-d7},  [r0,:128]!
        ands            len, len, #15           @ leftover count after the 16-float loop
        it              eq
        bxeq            lr
3:      vld1.32         {d0-d1},  [r1,:128]!    @ tail: 4 floats per pass
        vmul.f32        q0,  q0,  q8
        vst1.32         {d0-d1},  [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc
|
yading@11
|
149
|
yading@11
|
/*
 * ff_vector_fmul_window_neon
 *
 * Windowed combination of two inputs that fills the output from both
 * ends at once, 4 floats per end per iteration.
 *   r0 = output, written forwards; r12 is derived from it and written
 *        backwards from the last 4 floats of a 2*len-float buffer
 *   r1 = first input, read forwards
 *   r2 = second input, read backwards from its last 4 floats
 *   r3 = window, read forwards; r4 is derived from it and read backwards
 *        from the last 4 floats of a 2*len-float buffer
 *   first stack word = len
 * With a = forward input, b = backward input (element-reversed),
 * wf = forward window and wb = backward window (element-reversed):
 *   front output = a * wb - b * wf
 *   back output  = a * wf + b * wb   (reversed again before the store)
 * The loop is software pipelined: loads for the next pass are interleaved
 * with the multiply-accumulates of the current one.
 * NOTE(review): assumes len is a non-zero multiple of 4 - confirm.
 */
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]          @ len: first stack argument, above the 3 saved words
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5,  lsl #2   @ r2 = second input + 4*len - 16 bytes
        add             r4,  r3,  r5,  lsl #3   @ r4 = window + 8*len - 16 bytes
        add             r12, r0,  r5,  lsl #3   @ r12 = output + 8*len - 16 bytes
        mov             r5,  #-16               @ backward post-index step
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q1},     [r2,:128], r5
        vld1.32         {q2},     [r3,:128]!
        vld1.32         {q3},     [r4,:128], r5
1:      subs            lr,  lr,  #4            @ flags survive the NEON ops until the beq
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f                      @ last pass: finish without further loads
        vmla.f32        d22, d3,  d7
        vld1.32         {q0},     [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.32         {q9},     [r2,:128], r5 @ staged in q9: q1 is still being read
        vmls.f32        d20, d3,  d4
        vld1.32         {q12},    [r3,:128]!    @ staged in q12: q2 is still being read
        vmls.f32        d21, d2,  d5
        vld1.32         {q3},     [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11                @ reverse the back result for the
        vmov            q2,  q12
        vswp            d22, d23                @ backward store (vrev64 + vswp)
        vst1.32         {q10},    [r0,:128]!
        vst1.32         {q11},    [r12,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {q10},    [r0,:128]!
        vst1.32         {q11},    [r12,:128], r5
        pop             {r4,r5,pc}
endfunc
|
yading@11
|
196
|
yading@11
|
/*
 * ff_vector_fmul_add_neon
 *
 * Element-wise multiply-add: output = in0 * in1 + in2.
 *   r0 = output pointer, r1 = in0, r2 = in1, r3 = in2,
 *   first stack word = element count
 * Processes 8 floats per pass; the products for the next pass are
 * computed before the sums of the current pass are stored.
 * NOTE(review): the loop exits only when the count reaches exactly 0, so
 * it assumes a non-zero multiple of 8 - confirm against the callers.
 */
function ff_vector_fmul_add_neon, export=1
        ldr             ip,  [sp]               @ element count from the stack
        vld1.32         {d0-d3},  [r1,:128]!
        vld1.32         {d16-d19},[r2,:128]!
        vld1.32         {d4-d7},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            ip,  ip,  #8
        beq             2f                      @ nothing left to load: store and return
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d16-d17},[r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d18-d19},[r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {d4-d7},  [r3,:128]!
        vst1.32         {d24-d27},[r0,:128]!    @ sums of the pass just finished
        b               1b
2:      vst1.32         {d24-d27},[r0,:128]!
        bx              lr
endfunc
|
yading@11
|
223
|
yading@11
|
/*
 * ff_vector_fmul_reverse_neon
 *
 * Element-wise product of one vector with another read in reverse order.
 *   r0 = output pointer, r1 = forward input, r2 = reversed input,
 *   r3 = element count
 * r2 is first moved to the last 8 floats of its buffer and then stepped
 * backwards by 32 bytes per pass.  Each loaded group of 8 is reversed by
 * a vrev64 within each d register combined with crossed register
 * operands in the multiplies.
 * NOTE(review): the loop exits only when the count reaches exactly 0, so
 * it assumes a non-zero multiple of 8 - confirm against the callers.
 */
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2   @ one past the end of the reversed input
        sub             r2,  r2,  #32           @ back up to its last 8 floats
        mov             ip,  #-32               @ backward post-index step
        vld1.32         {d0-d3},  [r1,:128]!
        vld1.32         {d4-d7},  [r2,:128], ip
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7            @ first 2 floats pair with the last 2, reversed
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f                      @ nothing left to load: store and return
        vld1.32         {d0-d3},  [r1,:128]!
        vld1.32         {d4-d7},  [r2,:128], ip
        vst1.32         {d16-d19},[r0,:128]!
        b               1b
2:      vst1.32         {d16-d19},[r0,:128]!
        bx              lr
endfunc
|
yading@11
|
247
|
yading@11
|
/*
 * ff_butterflies_float_neon
 *
 * In-place butterfly on two vectors, 4 floats per pass:
 *   buffer at r0 receives (r0 value + r1 value)
 *   buffer at r1 receives (r0 value - r1 value)
 *   r2 = element count
 * The loads use no writeback; the pointers advance on the stores.
 * NOTE(review): the body runs before the count is tested, so a count of
 * 0 still processes 4 floats - confirm callers never pass 0.
 */
function ff_butterflies_float_neon, export=1
1:      vld1.32         {d0-d1},  [r0,:128]
        vld1.32         {d2-d3},  [r1,:128]
        vsub.f32        q2,  q0,  q1            @ difference goes to a fresh register
        vadd.f32        q1,  q0,  q1            @ sum may then overwrite q1
        vst1.32         {d4-d5},  [r1,:128]!
        vst1.32         {d2-d3},  [r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc
|
yading@11
|
259
|
yading@11
|
/*
 * ff_scalarproduct_float_neon
 *
 * Dot product of two float vectors.
 *   r0 and r1 = input pointers, r2 = element count
 * Four partial sums are accumulated in q2 and reduced to a single float
 * in d0[0] at the end.  In the NOVFP build variant (macro from asm.S;
 * presumably soft-float return convention - TODO confirm) the result is
 * additionally copied to r0.
 * NOTE(review): the body runs before the count is tested, so a count of
 * 0 still reads 4 floats from each input - confirm callers never pass 0.
 */
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0               @ clear the four partial sums
1:      vld1.32         {d0-d1},  [r0,:128]!
        vld1.32         {d2-d3},  [r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5            @ 4 partial sums down to 2
        vpadd.f32       d0,  d0,  d0            @ 2 down to 1, total in d0[0]
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc
|