annotate ffmpeg/libavutil/arm/float_dsp_neon.S @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents f445c3017523
children
rev   line source
yading@11 1 /*
yading@11 2 * ARM NEON optimised Float DSP functions
yading@11 3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
yading@11 4 *
yading@11 5 * This file is part of Libav.
yading@11 6 *
yading@11 7 * Libav is free software; you can redistribute it and/or
yading@11 8 * modify it under the terms of the GNU Lesser General Public
yading@11 9 * License as published by the Free Software Foundation; either
yading@11 10 * version 2.1 of the License, or (at your option) any later version.
yading@11 11 *
yading@11 12 * Libav is distributed in the hope that it will be useful,
yading@11 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@11 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@11 15 * Lesser General Public License for more details.
yading@11 16 *
yading@11 17 * You should have received a copy of the GNU Lesser General Public
yading@11 18 * License along with Libav; if not, write to the Free Software
yading@11 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@11 20 */
yading@11 21
yading@11 22 #include "config.h"
yading@11 23 #include "asm.S"
yading@11 24
/*
 * ff_vector_fmul_neon
 *
 * Multiplies two float vectors element by element and stores the products.
 *
 * Register usage, as read from the code:
 *   r0  output pointer (stores, post-incremented)
 *   r1  first input pointer (loads, post-incremented)
 *   r2  second input pointer (loads, post-incremented)
 *   r3  element count
 *   ip  number of elements handled by the 16-wide main loop
 * Every load/store carries the :128 qualifier, i.e. the addresses are
 * asserted to be 16-byte aligned.
 *
 * The first 8 elements are always processed and the count is only ever
 * consumed in steps of 8 or 16.
 * NOTE(review): presumably called as (float *dst, const float *src0,
 * const float *src1, int len) with len a non-zero multiple of 8; the
 * prototype is not visible in this file -- confirm against the C declaration.
 *
 * The code is software pipelined: products are held in q8/q9 (and q10/q11
 * inside the main loop) and are stored one step after they are computed.
 */
yading@11 25 function ff_vector_fmul_neon, export=1
/* r3 = count - 8; Z is set when exactly 8 elements were requested. */
yading@11 26 subs r3, r3, #8
/* First block of 8: products left pending in q8/q9. */
yading@11 27 vld1.32 {d0-d3}, [r1,:128]!
yading@11 28 vld1.32 {d4-d7}, [r2,:128]!
yading@11 29 vmul.f32 q8, q0, q2
yading@11 30 vmul.f32 q9, q1, q3
/* Flags still come from the subs above (the NEON ops do not touch them):
 * nothing left after the first block -> store it and return. */
yading@11 31 beq 3f
/* ip = remaining count rounded down to a multiple of 16. */
yading@11 32 bics ip, r3, #15
/* Fewer than 16 left: go straight to the 8-element tail. */
yading@11 33 beq 2f
/* Main loop, 16 elements per pass. The subs result is consumed by the
 * bne at the bottom of the loop. */
yading@11 34 1: subs ip, ip, #16
/* Next 8 products go to q10/q11 ... */
yading@11 35 vld1.32 {d0-d1}, [r1,:128]!
yading@11 36 vld1.32 {d4-d5}, [r2,:128]!
yading@11 37 vmul.f32 q10, q0, q2
yading@11 38 vld1.32 {d2-d3}, [r1,:128]!
yading@11 39 vld1.32 {d6-d7}, [r2,:128]!
yading@11 40 vmul.f32 q11, q1, q3
/* ... then the pending q8/q9 are written out ... */
yading@11 41 vst1.32 {d16-d19},[r0,:128]!
/* ... and q8/q9 are refilled with the following 8 products. */
yading@11 42 vld1.32 {d0-d1}, [r1,:128]!
yading@11 43 vld1.32 {d4-d5}, [r2,:128]!
yading@11 44 vmul.f32 q8, q0, q2
yading@11 45 vld1.32 {d2-d3}, [r1,:128]!
yading@11 46 vld1.32 {d6-d7}, [r2,:128]!
yading@11 47 vmul.f32 q9, q1, q3
yading@11 48 vst1.32 {d20-d23},[r0,:128]!
yading@11 49 bne 1b
/* Anything left below 16? If not, only the pending q8/q9 remain. */
yading@11 50 ands r3, r3, #15
yading@11 51 beq 3f
/* Tail of 8 elements: each pending register is stored just before it is
 * overwritten with a new product. */
yading@11 52 2: vld1.32 {d0-d1}, [r1,:128]!
yading@11 53 vld1.32 {d4-d5}, [r2,:128]!
yading@11 54 vst1.32 {d16-d17},[r0,:128]!
yading@11 55 vmul.f32 q8, q0, q2
yading@11 56 vld1.32 {d2-d3}, [r1,:128]!
yading@11 57 vld1.32 {d6-d7}, [r2,:128]!
yading@11 58 vst1.32 {d18-d19},[r0,:128]!
yading@11 59 vmul.f32 q9, q1, q3
/* Flush the last 8 pending products. */
yading@11 60 3: vst1.32 {d16-d19},[r0,:128]!
yading@11 61 bx lr
yading@11 62 endfunc
yading@11 63
/*
 * ff_vector_fmac_scalar_neon
 *
 * Multiply-accumulate with a scalar: each float read through r0 has
 * (float read through r1) * scalar added to it and is written back
 * through r0.
 *
 * Register usage, as read from the code:
 *   r0   destination pointer used for the stores (post-incremented)
 *   acc  second copy of the destination pointer, used for the loads
 *   r1   source pointer (loads, post-incremented)
 *   len  element count
 *   q15  the scalar replicated into all four lanes
 *   r12  number of elements handled by the 16-wide main loop
 * Every load/store carries the :128 qualifier (16-byte aligned addresses).
 *
 * The VFP / NOVFP prefixes are macros defined outside this file.
 * NOTE(review): presumably they select the line that matches the float
 * argument passing convention (scalar in d0[0] with len in r2, versus
 * scalar in r2 with len in r3) -- confirm in asm.S.
 *
 * The tail loop at 3: always runs at least once and steps by 4.
 * NOTE(review): this implies len is expected to be a non-zero multiple
 * of 4 -- confirm against the C-side contract.
 */
yading@11 64 function ff_vector_fmac_scalar_neon, export=1
yading@11 65 VFP len .req r2
yading@11 66 VFP acc .req r3
yading@11 67 NOVFP len .req r3
yading@11 68 NOVFP acc .req r2
/* Broadcast the scalar to q15. In the NOVFP variant this reads r2 before
 * r2 is reused as acc below. */
yading@11 69 VFP vdup.32 q15, d0[0]
yading@11 70 NOVFP vdup.32 q15, r2
/* r12 = len rounded down to a multiple of 16 (sets Z when len < 16). */
yading@11 71 bics r12, len, #15
/* acc reads the destination while r0 writes it; mov leaves the flags
 * from bics intact for the branch below. */
yading@11 72 mov acc, r0
yading@11 73 beq 3f
/* Prime the pipeline with the first 8 source and accumulator values. */
yading@11 74 vld1.32 {q0}, [r1,:128]!
yading@11 75 vld1.32 {q8}, [acc,:128]!
yading@11 76 vld1.32 {q1}, [r1,:128]!
yading@11 77 vld1.32 {q9}, [acc,:128]!
/* Main loop: 16 elements per pass, loads interleaved with the vmla. */
yading@11 78 1: vmla.f32 q8, q0, q15
yading@11 79 vld1.32 {q2}, [r1,:128]!
yading@11 80 vld1.32 {q10}, [acc,:128]!
yading@11 81 vmla.f32 q9, q1, q15
yading@11 82 vld1.32 {q3}, [r1,:128]!
yading@11 83 vld1.32 {q11}, [acc,:128]!
yading@11 84 vmla.f32 q10, q2, q15
yading@11 85 vst1.32 {q8}, [r0,:128]!
yading@11 86 vmla.f32 q11, q3, q15
yading@11 87 vst1.32 {q9}, [r0,:128]!
yading@11 88 subs r12, r12, #16
/* Last full pass: skip the preload of the next block. */
yading@11 89 beq 2f
/* Preload the next 8 inputs while writing out q10/q11. */
yading@11 90 vld1.32 {q0}, [r1,:128]!
yading@11 91 vld1.32 {q8}, [acc,:128]!
yading@11 92 vst1.32 {q10}, [r0,:128]!
yading@11 93 vld1.32 {q1}, [r1,:128]!
yading@11 94 vld1.32 {q9}, [acc,:128]!
yading@11 95 vst1.32 {q11}, [r0,:128]!
yading@11 96 b 1b
/* Flush the second half of the final 16-element block. */
yading@11 97 2: vst1.32 {q10}, [r0,:128]!
yading@11 98 vst1.32 {q11}, [r0,:128]!
/* Return if no elements remain below a multiple of 16. */
yading@11 99 ands len, len, #15
yading@11 100 it eq
yading@11 101 bxeq lr
/* Tail: 4 elements per pass until len drops to zero or below. */
yading@11 102 3: vld1.32 {q0}, [r1,:128]!
yading@11 103 vld1.32 {q8}, [acc,:128]!
yading@11 104 vmla.f32 q8, q0, q15
yading@11 105 vst1.32 {q8}, [r0,:128]!
yading@11 106 subs len, len, #4
yading@11 107 bgt 3b
yading@11 108 bx lr
yading@11 109 .unreq len
/* Release the acc alias as well, so it does not stay bound for the rest
 * of the file (a later .req of the same name would otherwise be ignored
 * with only a warning). */
.unreq acc
yading@11 110 endfunc
yading@11 111
/*
 * ff_vector_fmul_scalar_neon
 *
 * Multiplies every float read through r1 by a scalar and stores the
 * result through r0.
 *
 * Register usage, as read from the code:
 *   r0   output pointer (stores, post-incremented)
 *   r1   input pointer (loads, post-incremented)
 *   len  element count
 *   q8   the scalar replicated into all four lanes
 *   r12  number of elements handled by the 16-wide main loop
 * Every load/store carries the :128 qualifier (16-byte aligned addresses).
 *
 * The VFP / NOVFP prefixes are macros defined outside this file.
 * NOTE(review): presumably they select the line that matches the float
 * argument passing convention (scalar in d0[0] with len in r2, versus
 * scalar in r2 with len in r3) -- confirm in asm.S.
 *
 * The tail loop at 3: always runs at least once and steps by 4.
 * NOTE(review): this implies len is expected to be a non-zero multiple
 * of 4 -- confirm against the C-side contract.
 */
yading@11 112 function ff_vector_fmul_scalar_neon, export=1
yading@11 113 VFP len .req r2
yading@11 114 NOVFP len .req r3
/* Broadcast the scalar to q8. */
yading@11 115 VFP vdup.32 q8, d0[0]
yading@11 116 NOVFP vdup.32 q8, r2
/* r12 = len rounded down to a multiple of 16; less than 16 -> tail loop. */
yading@11 117 bics r12, len, #15
yading@11 118 beq 3f
/* Prime the pipeline with the first 8 inputs. */
yading@11 119 vld1.32 {q0},[r1,:128]!
yading@11 120 vld1.32 {q1},[r1,:128]!
/* Main loop: 16 elements per pass, multiplies interleaved with the
 * loads of the second half and the stores of the first half. */
yading@11 121 1: vmul.f32 q0, q0, q8
yading@11 122 vld1.32 {q2},[r1,:128]!
yading@11 123 vmul.f32 q1, q1, q8
yading@11 124 vld1.32 {q3},[r1,:128]!
yading@11 125 vmul.f32 q2, q2, q8
yading@11 126 vst1.32 {q0},[r0,:128]!
yading@11 127 vmul.f32 q3, q3, q8
yading@11 128 vst1.32 {q1},[r0,:128]!
yading@11 129 subs r12, r12, #16
/* Last full pass: skip the preload of the next block. */
yading@11 130 beq 2f
/* Preload the next 8 inputs while writing out q2/q3. */
yading@11 131 vld1.32 {q0},[r1,:128]!
yading@11 132 vst1.32 {q2},[r0,:128]!
yading@11 133 vld1.32 {q1},[r1,:128]!
yading@11 134 vst1.32 {q3},[r0,:128]!
yading@11 135 b 1b
/* Flush the second half of the final 16-element block. */
yading@11 136 2: vst1.32 {q2},[r0,:128]!
yading@11 137 vst1.32 {q3},[r0,:128]!
/* Return if no elements remain below a multiple of 16. */
yading@11 138 ands len, len, #15
yading@11 139 it eq
yading@11 140 bxeq lr
/* Tail: 4 elements per pass until len drops to zero or below. */
yading@11 141 3: vld1.32 {q0},[r1,:128]!
yading@11 142 vmul.f32 q0, q0, q8
yading@11 143 vst1.32 {q0},[r0,:128]!
yading@11 144 subs len, len, #4
yading@11 145 bgt 3b
yading@11 146 bx lr
yading@11 147 .unreq len
yading@11 148 endfunc
yading@11 149
/*
 * ff_vector_fmul_window_neon
 *
 * Windowed overlap of two inputs. Writing n for the count loaded from the
 * stack, the code below computes, 4 floats per pass:
 *   A  = block read forwards through r1
 *   B  = block read backwards through r2, element order reversed
 *   Wf = block read forwards through r3
 *   Wb = block read backwards through r4, element order reversed
 *   stored forwards through r0:            A * Wb - B * Wf
 *   stored backwards through ip (reversed): A * Wf + B * Wb
 *
 * Register usage, as read from the code:
 *   r0  forward output pointer
 *   ip  backward output pointer, starts at r0 + 8*n - 16
 *   r1  forward input pointer
 *   r2  backward input pointer, starts at r2 + 4*n - 16
 *   r3  forward window pointer
 *   r4  backward window pointer, starts at r3 + 8*n - 16
 *   r5  scratch, then the constant -16 used as the backward step
 *   lr  remaining count n, decremented by 4 per pass
 * The start addresses imply n floats behind r1 and r2, and 2*n floats
 * behind r3 and r0. Every load/store carries the :128 qualifier.
 * NOTE(review): the loop runs at least once and exits only when the count
 * reaches exactly zero, so n is presumably a non-zero multiple of 4 --
 * confirm against the C-side contract.
 */
yading@11 150 function ff_vector_fmul_window_neon, export=1
yading@11 151 push {r4,r5,lr}
/* The count is the first stack argument; it now sits above the three
 * words just pushed. */
yading@11 152 ldr lr, [sp, #12]
/* Point r2, r4 and ip at the last 4 floats of their respective arrays. */
yading@11 153 sub r2, r2, #8
yading@11 154 sub r5, lr, #2
yading@11 155 add r2, r2, r5, lsl #2
yading@11 156 add r4, r3, r5, lsl #3
yading@11 157 add ip, r0, r5, lsl #3
/* r5 = byte step for the pointers that walk backwards. */
yading@11 158 mov r5, #-16
/* Prime the pipeline: q0 = A, q1 = B (not yet reversed), q2 = Wf,
 * q3 = Wb (not yet reversed). */
yading@11 159 vld1.32 {d0,d1}, [r1,:128]!
yading@11 160 vld1.32 {d2,d3}, [r2,:128], r5
yading@11 161 vld1.32 {d4,d5}, [r3,:128]!
yading@11 162 vld1.32 {d6,d7}, [r4,:128], r5
/* Loop: the subs result is consumed by the beq below. */
yading@11 163 1: subs lr, lr, #4
/* vrev64 swaps the two floats inside each d register; pairing d0 with d7
 * and d1 with d6 (and d3/d2 below) completes the 4-element reversal. */
yading@11 164 vmul.f32 d22, d0, d4
yading@11 165 vrev64.32 q3, q3
yading@11 166 vmul.f32 d23, d1, d5
yading@11 167 vrev64.32 q1, q1
yading@11 168 vmul.f32 d20, d0, d7
yading@11 169 vmul.f32 d21, d1, d6
/* Final pass: finish without loading further data. */
yading@11 170 beq 2f
/* Finish q10/q11 while loading the next blocks. The new B and Wf go to
 * q9 and q12 first because q1 and q2 are still being read. */
yading@11 171 vmla.f32 d22, d3, d7
yading@11 172 vld1.32 {d0,d1}, [r1,:128]!
yading@11 173 vmla.f32 d23, d2, d6
yading@11 174 vld1.32 {d18,d19},[r2,:128], r5
yading@11 175 vmls.f32 d20, d3, d4
yading@11 176 vld1.32 {d24,d25},[r3,:128]!
yading@11 177 vmls.f32 d21, d2, d5
yading@11 178 vld1.32 {d6,d7}, [r4,:128], r5
yading@11 179 vmov q1, q9
/* Reverse q11 completely (vrev64 + vswp) before the backward store. */
yading@11 180 vrev64.32 q11, q11
yading@11 181 vmov q2, q12
yading@11 182 vswp d22, d23
yading@11 183 vst1.32 {d20,d21},[r0,:128]!
yading@11 184 vst1.32 {d22,d23},[ip,:128], r5
yading@11 185 b 1b
/* Epilogue for the last block: same arithmetic, no further loads. */
yading@11 186 2: vmla.f32 d22, d3, d7
yading@11 187 vmla.f32 d23, d2, d6
yading@11 188 vmls.f32 d20, d3, d4
yading@11 189 vmls.f32 d21, d2, d5
yading@11 190 vrev64.32 q11, q11
yading@11 191 vswp d22, d23
yading@11 192 vst1.32 {d20,d21},[r0,:128]!
yading@11 193 vst1.32 {d22,d23},[ip,:128], r5
/* Restore r4/r5 and return by popping the saved lr into pc. */
yading@11 194 pop {r4,r5,pc}
yading@11 195 endfunc
yading@11 196
/*
 * ff_vector_fmul_add_neon
 *
 * For each element: out = a * b + c, where a, b and c are read through
 * r1, r2 and r3 and the result is stored through r0.
 *
 * Register usage, as read from the code:
 *   r0   output pointer (stores, post-incremented)
 *   r1   first factor pointer (loads, post-incremented)
 *   r2   second factor pointer (loads, post-incremented)
 *   r3   addend pointer (loads, post-incremented)
 *   r12  element count, taken from the first stack argument
 * Every load/store carries the :128 qualifier (16-byte aligned addresses).
 *
 * 8 elements are handled per pass; the loop runs at least once and exits
 * only when the count reaches exactly zero.
 * NOTE(review): the count is therefore presumably a non-zero multiple
 * of 8 -- confirm against the C-side contract.
 */
yading@11 197 function ff_vector_fmul_add_neon, export=1
/* Nothing has been pushed, so the first stack argument is at [sp]. */
yading@11 198 ldr r12, [sp]
/* Prime the pipeline: first 8 products in q10/q11, first 8 addends in
 * q2/q3. */
yading@11 199 vld1.32 {q0-q1}, [r1,:128]!
yading@11 200 vld1.32 {q8-q9}, [r2,:128]!
yading@11 201 vld1.32 {q2-q3}, [r3,:128]!
yading@11 202 vmul.f32 q10, q0, q8
yading@11 203 vmul.f32 q11, q1, q9
/* Loop: add the addends to the pending products. */
yading@11 204 1: vadd.f32 q12, q2, q10
yading@11 205 vadd.f32 q13, q3, q11
/* Prefetch hints for the three input streams. */
yading@11 206 pld [r1, #16]
yading@11 207 pld [r2, #16]
yading@11 208 pld [r3, #16]
yading@11 209 subs r12, r12, #8
/* Last block: just store the result. */
yading@11 210 beq 2f
/* Compute the next 8 products and fetch the next addends before the
 * current result in q12/q13 is stored. */
yading@11 211 vld1.32 {q0}, [r1,:128]!
yading@11 212 vld1.32 {q8}, [r2,:128]!
yading@11 213 vmul.f32 q10, q0, q8
yading@11 214 vld1.32 {q1}, [r1,:128]!
yading@11 215 vld1.32 {q9}, [r2,:128]!
yading@11 216 vmul.f32 q11, q1, q9
yading@11 217 vld1.32 {q2-q3}, [r3,:128]!
yading@11 218 vst1.32 {q12-q13},[r0,:128]!
yading@11 219 b 1b
yading@11 220 2: vst1.32 {q12-q13},[r0,:128]!
yading@11 221 bx lr
yading@11 222 endfunc
yading@11 223
/*
 * ff_vector_fmul_reverse_neon
 *
 * Multiplies the vector read forwards through r1 by the vector behind r2
 * read in reverse element order, and stores the products through r0.
 *
 * Register usage, as read from the code:
 *   r0   output pointer (stores, post-incremented)
 *   r1   forward input pointer (loads, post-incremented)
 *   r2   backward input pointer, starts at r2 + 4*r3 - 32 (the last 8
 *        floats of an r3-element array) and steps back by 32 bytes
 *   r3   element count, decremented by 8 per pass
 *   r12  the constant -32 used as the backward step
 * Every load/store carries the :128 qualifier (16-byte aligned addresses).
 *
 * The loop runs at least once and exits only when the count reaches
 * exactly zero.
 * NOTE(review): the count is therefore presumably a non-zero multiple
 * of 8 -- confirm against the C-side contract.
 */
yading@11 224 function ff_vector_fmul_reverse_neon, export=1
/* Point r2 at the last 8 floats of the reversed input. */
yading@11 225 add r2, r2, r3, lsl #2
yading@11 226 sub r2, r2, #32
yading@11 227 mov r12, #-32
/* Prime the pipeline with the first 8 floats of each stream. */
yading@11 228 vld1.32 {q0-q1}, [r1,:128]!
yading@11 229 vld1.32 {q2-q3}, [r2,:128], r12
yading@11 230 1: pld [r1, #32]
/* vrev64 swaps the two floats inside each d register; pairing d0..d3
 * with d7..d4 completes the 8-element reversal. */
yading@11 231 vrev64.32 q3, q3
yading@11 232 vmul.f32 d16, d0, d7
yading@11 233 vmul.f32 d17, d1, d6
yading@11 234 pld [r2, #-32]
yading@11 235 vrev64.32 q2, q2
yading@11 236 vmul.f32 d18, d2, d5
yading@11 237 vmul.f32 d19, d3, d4
yading@11 238 subs r3, r3, #8
/* Last block: just store the result. */
yading@11 239 beq 2f
/* Fetch the next blocks before storing the current products. */
yading@11 240 vld1.32 {q0-q1}, [r1,:128]!
yading@11 241 vld1.32 {q2-q3}, [r2,:128], r12
yading@11 242 vst1.32 {q8-q9}, [r0,:128]!
yading@11 243 b 1b
yading@11 244 2: vst1.32 {q8-q9}, [r0,:128]!
yading@11 245 bx lr
yading@11 246 endfunc
yading@11 247
/*
 * ff_butterflies_float_neon
 *
 * In-place butterfly on two float vectors: for each element the vector
 * behind r0 receives the sum and the vector behind r1 receives the
 * difference (r0 element minus r1 element).
 *
 * Register usage, as read from the code:
 *   r0  first vector, read then overwritten with the sums
 *   r1  second vector, read then overwritten with the differences
 *   r2  element count, decremented by 4 per pass
 * Every load/store carries the :128 qualifier (16-byte aligned addresses).
 *
 * The loop body runs before the count is tested, so 4 elements are
 * processed even if r2 is zero or negative on entry.
 * NOTE(review): presumably callers pass a non-zero multiple of 4 --
 * confirm against the C-side contract.
 */
yading@11 248 function ff_butterflies_float_neon, export=1
/* Loads do not advance the pointers; the stores below do. */
yading@11 249 1: vld1.32 {q0},[r0,:128]
yading@11 250 vld1.32 {q1},[r1,:128]
/* The difference goes to q2 so that q1 is still intact for the sum. */
yading@11 251 vsub.f32 q2, q0, q1
yading@11 252 vadd.f32 q1, q0, q1
yading@11 253 vst1.32 {q2},[r1,:128]!
yading@11 254 vst1.32 {q1},[r0,:128]!
yading@11 255 subs r2, r2, #4
yading@11 256 bgt 1b
yading@11 257 bx lr
yading@11 258 endfunc
yading@11 259
/*
 * ff_scalarproduct_float_neon
 *
 * Dot product of two float vectors.
 *
 * Register usage, as read from the code:
 *   r0  first input pointer (loads, post-incremented)
 *   r1  second input pointer (loads, post-incremented)
 *   r2  element count, decremented by 4 per pass
 *   q2  four running partial sums, one per lane
 * Every load carries the :128 qualifier (16-byte aligned addresses).
 *
 * Result: the total ends up in both lanes of d0. The NOVFP-prefixed line
 * additionally copies it to r0. NOVFP is a macro defined outside this file.
 * NOTE(review): presumably that line is only emitted when floats are
 * returned in core registers -- confirm in asm.S.
 *
 * The loop body runs before the count is tested, so 4 elements are
 * processed even if r2 is zero or negative on entry.
 * NOTE(review): presumably callers pass a non-zero multiple of 4 --
 * confirm against the C-side contract.
 */
yading@11 260 function ff_scalarproduct_float_neon, export=1
/* Clear the four partial sums. */
yading@11 261 vmov.f32 q2, #0.0
yading@11 262 1: vld1.32 {q0},[r0,:128]!
yading@11 263 vld1.32 {q1},[r1,:128]!
yading@11 264 vmla.f32 q2, q0, q1
yading@11 265 subs r2, r2, #4
yading@11 266 bgt 1b
/* Horizontal reduction: 4 partial sums -> 2 -> 1. */
yading@11 267 vadd.f32 d0, d4, d5
yading@11 268 vpadd.f32 d0, d0, d0
yading@11 269 NOVFP vmov.32 r0, d0[0]
yading@11 270 bx lr
yading@11 271 endfunc