annotate ffmpeg/libavcodec/arm/fmtconvert_neon.S @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 /*
yading@10 2 * ARM NEON optimised Format Conversion Utils
yading@10 3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
yading@10 4 *
yading@10 5 * This file is part of FFmpeg.
yading@10 6 *
yading@10 7 * FFmpeg is free software; you can redistribute it and/or
yading@10 8 * modify it under the terms of the GNU Lesser General Public
yading@10 9 * License as published by the Free Software Foundation; either
yading@10 10 * version 2.1 of the License, or (at your option) any later version.
yading@10 11 *
yading@10 12 * FFmpeg is distributed in the hope that it will be useful,
yading@10 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 15 * Lesser General Public License for more details.
yading@10 16 *
yading@10 17 * You should have received a copy of the GNU Lesser General Public
yading@10 18 * License along with FFmpeg; if not, write to the Free Software
yading@10 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 20 */
yading@10 21
yading@10 22 #include "config.h"
yading@10 23 #include "libavutil/arm/asm.S"
yading@10 24
/*
 * ff_float_to_int16_neon: convert a block of 32-bit floats to 16-bit
 * integers.
 *
 * Register use, as read from the code below:
 *   r0      destination pointer, written 16 bytes at a time (:128 hint)
 *   r1      source pointer, read 16 bytes at a time (:128 hint)
 *   r2      element count
 *   ip      main-loop counter
 *   q0-q3, q8, q9   NEON scratch
 *
 * Each value goes through vcvt.s32.f32 #16 (signed 32-bit fixed point with
 * 16 fraction bits) and is then narrowed by vshrn.s32 #16, which keeps the
 * upper 16 bits of every 32-bit lane.
 *
 * NOTE(review): the code works in whole groups of 8 elements and has no
 * path for a count of 0 (r2 - 8 would then feed the main loop counter).
 * Presumably callers pass a positive multiple of 8 and 16-byte aligned
 * buffers - confirm.
 */
yading@10 25 function ff_float_to_int16_neon, export=1
@ Prime the pipeline: convert the first 8 floats into q8/q9.
yading@10 26 subs r2, r2, #8
yading@10 27 vld1.64 {d0-d1}, [r1,:128]!
yading@10 28 vcvt.s32.f32 q8, q0, #16
yading@10 29 vld1.64 {d2-d3}, [r1,:128]!
yading@10 30 vcvt.s32.f32 q9, q1, #16
@ Flags are still those of the subs above: the count was exactly 8.
yading@10 31 beq 3f
@ ip = remaining count rounded down to a multiple of 16; zero skips the
@ main loop.
yading@10 32 bics ip, r2, #15
yading@10 33 beq 2f
@ Main loop: every pass stores 16 results and leaves the next 8 converted
@ values in q8/q9. The bne at the bottom tests the flags of the subs at the
@ top; nothing in between updates them.
yading@10 34 1: subs ip, ip, #16
yading@10 35 vshrn.s32 d4, q8, #16
yading@10 36 vld1.64 {d0-d1}, [r1,:128]!
yading@10 37 vcvt.s32.f32 q0, q0, #16
yading@10 38 vshrn.s32 d5, q9, #16
yading@10 39 vld1.64 {d2-d3}, [r1,:128]!
yading@10 40 vcvt.s32.f32 q1, q1, #16
yading@10 41 vshrn.s32 d6, q0, #16
yading@10 42 vst1.64 {d4-d5}, [r0,:128]!
yading@10 43 vshrn.s32 d7, q1, #16
yading@10 44 vld1.64 {d16-d17},[r1,:128]!
yading@10 45 vcvt.s32.f32 q8, q8, #16
yading@10 46 vld1.64 {d18-d19},[r1,:128]!
yading@10 47 vcvt.s32.f32 q9, q9, #16
yading@10 48 vst1.64 {d6-d7}, [r0,:128]!
yading@10 49 bne 1b
@ A non-zero r2 & 15 means one more group of 8 has to be loaded (label 2);
@ otherwise only the values already in q8/q9 remain (label 3).
yading@10 50 ands r2, r2, #15
yading@10 51 beq 3f
@ Tail with 16 results outstanding: q8/q9 plus one freshly loaded group.
yading@10 52 2: vld1.64 {d0-d1}, [r1,:128]!
yading@10 53 vshrn.s32 d4, q8, #16
yading@10 54 vcvt.s32.f32 q0, q0, #16
yading@10 55 vld1.64 {d2-d3}, [r1,:128]!
yading@10 56 vshrn.s32 d5, q9, #16
yading@10 57 vcvt.s32.f32 q1, q1, #16
yading@10 58 vshrn.s32 d6, q0, #16
yading@10 59 vst1.64 {d4-d5}, [r0,:128]!
yading@10 60 vshrn.s32 d7, q1, #16
yading@10 61 vst1.64 {d6-d7}, [r0,:128]!
yading@10 62 bx lr
@ Tail with 8 results outstanding: narrow and store q8/q9 only.
yading@10 63 3: vshrn.s32 d4, q8, #16
yading@10 64 vshrn.s32 d5, q9, #16
yading@10 65 vst1.64 {d4-d5}, [r0,:128]!
yading@10 66 bx lr
yading@10 67 endfunc
yading@10 68
/*
 * ff_float_to_int16_interleave_neon: convert several separate float
 * channels to 16-bit integers and interleave them into one output buffer.
 *
 * Register use on entry, as read from the code below:
 *   r0   destination pointer
 *   r1   pointer to a table of per-channel source pointers
 *   r2   number of frames (samples per channel)
 *   r3   number of channels
 *
 * Dispatch on r3:
 *   r3 <  2   load the first source pointer into r1 and branch (without
 *             link, so lr is untouched) to ff_float_to_int16_neon
 *   r3 == 2   dedicated two-channel path, no stack use
 *   r3 >  2   generic path: saves r4-r8 and lr, sets ip = 2 * r3 (byte
 *             distance between consecutive output frames at 2 bytes per
 *             sample) and handles the channels in groups of 4, then 2,
 *             then 1
 *
 * Every sample is converted with vcvt.s32.f32 #16 (signed 32-bit fixed
 * point, 16 fraction bits); only the upper 16 bits of each result are
 * written out, either by merging with vsri.32 #16 or by storing the odd
 * 16-bit lanes.
 *
 * NOTE(review): every path consumes the input in whole groups of 8 frames
 * and loads it with :128 alignment hints. Presumably r2 must be a positive
 * multiple of 8 and every source buffer 16-byte aligned - confirm against
 * the callers. The dedicated two-channel path also stores with :128 hints.
 */
yading@10 69 function ff_float_to_int16_interleave_neon, export=1
@ Dispatch on the channel count.
yading@10 70 cmp r3, #2
yading@10 71 itt lt
yading@10 72 ldrlt r1, [r1]
yading@10 73 blt ff_float_to_int16_neon
yading@10 74 bne 4f
yading@10 75
@ Dedicated two-channel path: r3 = source pointer 0, r1 = source pointer 1.
yading@10 76 ldr r3, [r1]
yading@10 77 ldr r1, [r1, #4]
yading@10 78
@ Prime the pipeline with 8 frames: q8/q9 from source 0, q10/q11 from
@ source 1.
yading@10 79 subs r2, r2, #8
yading@10 80 vld1.64 {d0-d1}, [r3,:128]!
yading@10 81 vcvt.s32.f32 q8, q0, #16
yading@10 82 vld1.64 {d2-d3}, [r3,:128]!
yading@10 83 vcvt.s32.f32 q9, q1, #16
yading@10 84 vld1.64 {d20-d21},[r1,:128]!
yading@10 85 vcvt.s32.f32 q10, q10, #16
yading@10 86 vld1.64 {d22-d23},[r1,:128]!
yading@10 87 vcvt.s32.f32 q11, q11, #16
@ Flags are still those of the subs above: exactly 8 frames were requested.
yading@10 88 beq 3f
@ ip = remaining frames rounded down to a multiple of 16.
yading@10 89 bics ip, r2, #15
yading@10 90 beq 2f
@ Main loop: 16 frames stored per pass. vsri.32 #16 shifts each source-0
@ lane right by 16 and inserts it below the source-1 lane, so every 32-bit
@ lane becomes one frame (source 0 in the low halfword, source 1 in the
@ high halfword).
yading@10 91 1: subs ip, ip, #16
yading@10 92 vld1.64 {d0-d1}, [r3,:128]!
yading@10 93 vcvt.s32.f32 q0, q0, #16
yading@10 94 vsri.32 q10, q8, #16
yading@10 95 vld1.64 {d2-d3}, [r3,:128]!
yading@10 96 vcvt.s32.f32 q1, q1, #16
yading@10 97 vld1.64 {d24-d25},[r1,:128]!
yading@10 98 vcvt.s32.f32 q12, q12, #16
yading@10 99 vld1.64 {d26-d27},[r1,:128]!
yading@10 100 vsri.32 q11, q9, #16
yading@10 101 vst1.64 {d20-d21},[r0,:128]!
yading@10 102 vcvt.s32.f32 q13, q13, #16
yading@10 103 vst1.64 {d22-d23},[r0,:128]!
yading@10 104 vsri.32 q12, q0, #16
yading@10 105 vld1.64 {d16-d17},[r3,:128]!
yading@10 106 vsri.32 q13, q1, #16
yading@10 107 vst1.64 {d24-d25},[r0,:128]!
yading@10 108 vcvt.s32.f32 q8, q8, #16
yading@10 109 vld1.64 {d18-d19},[r3,:128]!
yading@10 110 vcvt.s32.f32 q9, q9, #16
yading@10 111 vld1.64 {d20-d21},[r1,:128]!
yading@10 112 vcvt.s32.f32 q10, q10, #16
yading@10 113 vld1.64 {d22-d23},[r1,:128]!
yading@10 114 vcvt.s32.f32 q11, q11, #16
yading@10 115 vst1.64 {d26-d27},[r0,:128]!
yading@10 116 bne 1b
@ A non-zero r2 & 15 means one more group of 8 frames has to be loaded.
yading@10 117 ands r2, r2, #15
yading@10 118 beq 3f
@ Tail with 16 frames outstanding: q8-q11 plus one freshly loaded group.
yading@10 119 2: vsri.32 q10, q8, #16
yading@10 120 vld1.64 {d0-d1}, [r3,:128]!
yading@10 121 vcvt.s32.f32 q0, q0, #16
yading@10 122 vld1.64 {d2-d3}, [r3,:128]!
yading@10 123 vcvt.s32.f32 q1, q1, #16
yading@10 124 vld1.64 {d24-d25},[r1,:128]!
yading@10 125 vcvt.s32.f32 q12, q12, #16
yading@10 126 vsri.32 q11, q9, #16
yading@10 127 vld1.64 {d26-d27},[r1,:128]!
yading@10 128 vcvt.s32.f32 q13, q13, #16
yading@10 129 vst1.64 {d20-d21},[r0,:128]!
yading@10 130 vsri.32 q12, q0, #16
yading@10 131 vst1.64 {d22-d23},[r0,:128]!
yading@10 132 vsri.32 q13, q1, #16
yading@10 133 vst1.64 {d24-d27},[r0,:128]!
yading@10 134 bx lr
@ Tail with 8 frames outstanding: merge and store q8-q11 only.
yading@10 135 3: vsri.32 q10, q8, #16
yading@10 136 vsri.32 q11, q9, #16
yading@10 137 vst1.64 {d20-d23},[r0,:128]!
yading@10 138 bx lr
yading@10 139
@ Generic path for more than two channels. ip = 2 * r3 is the byte
@ distance between consecutive output frames. The blt uses the flags of
@ the cmp (the lsl does not update them): with fewer than 4 channels the
@ 4-channel stage is skipped.
yading@10 140 4: push {r4-r8,lr}
yading@10 141 cmp r3, #4
yading@10 142 lsl ip, r3, #1
yading@10 143 blt 4f
yading@10 144
yading@10 145 @ 4 channels
@ r4-r7 = next four source pointers (r1 advances past them), lr = frames
@ left, r8 = output cursor for this channel group. The first 4 frames of
@ each channel are converted into q8-q11 before the loop.
yading@10 146 5: ldmia r1!, {r4-r7}
yading@10 147 mov lr, r2
yading@10 148 mov r8, r0
yading@10 149 vld1.64 {d16-d17},[r4,:128]!
yading@10 150 vcvt.s32.f32 q8, q8, #16
yading@10 151 vld1.64 {d18-d19},[r5,:128]!
yading@10 152 vcvt.s32.f32 q9, q9, #16
yading@10 153 vld1.64 {d20-d21},[r6,:128]!
yading@10 154 vcvt.s32.f32 q10, q10, #16
yading@10 155 vld1.64 {d22-d23},[r7,:128]!
yading@10 156 vcvt.s32.f32 q11, q11, #16
@ Loop: 8 frames per pass. vsri.32 pairs the channels (first with second,
@ third with fourth) inside each 32-bit lane; vzip.32 then brings the two
@ pairs of the same frame into one d register, so each 8-byte store writes
@ the four samples of one frame. r8 steps by ip after every store.
yading@10 157 6: subs lr, lr, #8
yading@10 158 vld1.64 {d0-d1}, [r4,:128]!
yading@10 159 vcvt.s32.f32 q0, q0, #16
yading@10 160 vsri.32 q9, q8, #16
yading@10 161 vld1.64 {d2-d3}, [r5,:128]!
yading@10 162 vcvt.s32.f32 q1, q1, #16
yading@10 163 vsri.32 q11, q10, #16
yading@10 164 vld1.64 {d4-d5}, [r6,:128]!
yading@10 165 vcvt.s32.f32 q2, q2, #16
yading@10 166 vzip.32 d18, d22
yading@10 167 vld1.64 {d6-d7}, [r7,:128]!
yading@10 168 vcvt.s32.f32 q3, q3, #16
yading@10 169 vzip.32 d19, d23
yading@10 170 vst1.64 {d18}, [r8], ip
yading@10 171 vsri.32 q1, q0, #16
yading@10 172 vst1.64 {d22}, [r8], ip
yading@10 173 vsri.32 q3, q2, #16
yading@10 174 vst1.64 {d19}, [r8], ip
yading@10 175 vzip.32 d2, d6
yading@10 176 vst1.64 {d23}, [r8], ip
yading@10 177 vzip.32 d3, d7
@ Flags are those of the subs lr at the top of the loop: when no input is
@ left, the second half of this pass is flushed at label 7.
yading@10 178 beq 7f
yading@10 179 vld1.64 {d16-d17},[r4,:128]!
yading@10 180 vcvt.s32.f32 q8, q8, #16
yading@10 181 vst1.64 {d2}, [r8], ip
yading@10 182 vld1.64 {d18-d19},[r5,:128]!
yading@10 183 vcvt.s32.f32 q9, q9, #16
yading@10 184 vst1.64 {d6}, [r8], ip
yading@10 185 vld1.64 {d20-d21},[r6,:128]!
yading@10 186 vcvt.s32.f32 q10, q10, #16
yading@10 187 vst1.64 {d3}, [r8], ip
yading@10 188 vld1.64 {d22-d23},[r7,:128]!
yading@10 189 vcvt.s32.f32 q11, q11, #16
yading@10 190 vst1.64 {d7}, [r8], ip
yading@10 191 b 6b
@ Flush the last 4 frames, then account for the 4 finished channels:
@ return if none remain, otherwise advance r0 by 8 bytes (4 channels x 2
@ bytes) and repeat while at least 4 channels are left. The bge uses the
@ flags of the cmp; the add does not update them.
yading@10 192 7: vst1.64 {d2}, [r8], ip
yading@10 193 vst1.64 {d6}, [r8], ip
yading@10 194 vst1.64 {d3}, [r8], ip
yading@10 195 vst1.64 {d7}, [r8], ip
yading@10 196 subs r3, r3, #4
yading@10 197 it eq
yading@10 198 popeq {r4-r8,pc}
yading@10 199 cmp r3, #4
yading@10 200 add r0, r0, #8
yading@10 201 bge 5b
yading@10 202
yading@10 203 @ 2 channels
@ Skipped when fewer than 2 channels remain. r4/r5 = next two source
@ pointers, lr = frames left, r8 = output cursor. The tst checks bit 3 of
@ the frame count; its flags are consumed by the beq 6f after the initial
@ loads and decide whether a leading group of 8 frames is handled before
@ the 16-frame loop.
yading@10 204 4: cmp r3, #2
yading@10 205 blt 4f
yading@10 206 ldmia r1!, {r4-r5}
yading@10 207 mov lr, r2
yading@10 208 mov r8, r0
yading@10 209 tst lr, #8
yading@10 210 vld1.64 {d16-d17},[r4,:128]!
yading@10 211 vcvt.s32.f32 q8, q8, #16
yading@10 212 vld1.64 {d18-d19},[r5,:128]!
yading@10 213 vcvt.s32.f32 q9, q9, #16
yading@10 214 vld1.64 {d20-d21},[r4,:128]!
yading@10 215 vcvt.s32.f32 q10, q10, #16
yading@10 216 vld1.64 {d22-d23},[r5,:128]!
yading@10 217 vcvt.s32.f32 q11, q11, #16
yading@10 218 beq 6f
@ Leading group of 8 frames. If that was the whole input, label 7 stores
@ it without loading anything else; otherwise the 8 frames are stored here
@ while q8-q11 are refilled for the main loop.
yading@10 219 subs lr, lr, #8
yading@10 220 beq 7f
yading@10 221 vsri.32 d18, d16, #16
yading@10 222 vsri.32 d19, d17, #16
yading@10 223 vld1.64 {d16-d17},[r4,:128]!
yading@10 224 vcvt.s32.f32 q8, q8, #16
yading@10 225 vst1.32 {d18[0]}, [r8], ip
yading@10 226 vsri.32 d22, d20, #16
yading@10 227 vst1.32 {d18[1]}, [r8], ip
yading@10 228 vsri.32 d23, d21, #16
yading@10 229 vst1.32 {d19[0]}, [r8], ip
yading@10 230 vst1.32 {d19[1]}, [r8], ip
yading@10 231 vld1.64 {d18-d19},[r5,:128]!
yading@10 232 vcvt.s32.f32 q9, q9, #16
yading@10 233 vst1.32 {d22[0]}, [r8], ip
yading@10 234 vst1.32 {d22[1]}, [r8], ip
yading@10 235 vld1.64 {d20-d21},[r4,:128]!
yading@10 236 vcvt.s32.f32 q10, q10, #16
yading@10 237 vst1.32 {d23[0]}, [r8], ip
yading@10 238 vst1.32 {d23[1]}, [r8], ip
yading@10 239 vld1.64 {d22-d23},[r5,:128]!
yading@10 240 vcvt.s32.f32 q11, q11, #16
@ Main loop: 16 frames per pass. Each 32-bit store writes one frame (two
@ samples) and steps r8 by ip.
yading@10 241 6: subs lr, lr, #16
yading@10 242 vld1.64 {d0-d1}, [r4,:128]!
yading@10 243 vcvt.s32.f32 q0, q0, #16
yading@10 244 vsri.32 d18, d16, #16
yading@10 245 vld1.64 {d2-d3}, [r5,:128]!
yading@10 246 vcvt.s32.f32 q1, q1, #16
yading@10 247 vsri.32 d19, d17, #16
yading@10 248 vld1.64 {d4-d5}, [r4,:128]!
yading@10 249 vcvt.s32.f32 q2, q2, #16
yading@10 250 vld1.64 {d6-d7}, [r5,:128]!
yading@10 251 vcvt.s32.f32 q3, q3, #16
yading@10 252 vst1.32 {d18[0]}, [r8], ip
yading@10 253 vsri.32 d22, d20, #16
yading@10 254 vst1.32 {d18[1]}, [r8], ip
yading@10 255 vsri.32 d23, d21, #16
yading@10 256 vst1.32 {d19[0]}, [r8], ip
yading@10 257 vsri.32 d2, d0, #16
yading@10 258 vst1.32 {d19[1]}, [r8], ip
yading@10 259 vsri.32 d3, d1, #16
yading@10 260 vst1.32 {d22[0]}, [r8], ip
yading@10 261 vsri.32 d6, d4, #16
yading@10 262 vst1.32 {d22[1]}, [r8], ip
yading@10 263 vsri.32 d7, d5, #16
yading@10 264 vst1.32 {d23[0]}, [r8], ip
yading@10 265 vst1.32 {d23[1]}, [r8], ip
@ Flags are those of the subs lr at the top of the loop: when the input is
@ exhausted, the remaining 8 frames in q1/q3 are flushed at the next
@ label 6 without further loads.
yading@10 266 beq 6f
yading@10 267 vld1.64 {d16-d17},[r4,:128]!
yading@10 268 vcvt.s32.f32 q8, q8, #16
yading@10 269 vst1.32 {d2[0]}, [r8], ip
yading@10 270 vst1.32 {d2[1]}, [r8], ip
yading@10 271 vld1.64 {d18-d19},[r5,:128]!
yading@10 272 vcvt.s32.f32 q9, q9, #16
yading@10 273 vst1.32 {d3[0]}, [r8], ip
yading@10 274 vst1.32 {d3[1]}, [r8], ip
yading@10 275 vld1.64 {d20-d21},[r4,:128]!
yading@10 276 vcvt.s32.f32 q10, q10, #16
yading@10 277 vst1.32 {d6[0]}, [r8], ip
yading@10 278 vst1.32 {d6[1]}, [r8], ip
yading@10 279 vld1.64 {d22-d23},[r5,:128]!
yading@10 280 vcvt.s32.f32 q11, q11, #16
yading@10 281 vst1.32 {d7[0]}, [r8], ip
yading@10 282 vst1.32 {d7[1]}, [r8], ip
yading@10 283 bgt 6b
@ Flush the last 8 frames held in q1/q3.
yading@10 284 6: vst1.32 {d2[0]}, [r8], ip
yading@10 285 vst1.32 {d2[1]}, [r8], ip
yading@10 286 vst1.32 {d3[0]}, [r8], ip
yading@10 287 vst1.32 {d3[1]}, [r8], ip
yading@10 288 vst1.32 {d6[0]}, [r8], ip
yading@10 289 vst1.32 {d6[1]}, [r8], ip
yading@10 290 vst1.32 {d7[0]}, [r8], ip
yading@10 291 vst1.32 {d7[1]}, [r8], ip
yading@10 292 b 8f
@ Only 8 frames in total: merge and store the values already in q8-q11.
yading@10 293 7: vsri.32 d18, d16, #16
yading@10 294 vsri.32 d19, d17, #16
yading@10 295 vst1.32 {d18[0]}, [r8], ip
yading@10 296 vsri.32 d22, d20, #16
yading@10 297 vst1.32 {d18[1]}, [r8], ip
yading@10 298 vsri.32 d23, d21, #16
yading@10 299 vst1.32 {d19[0]}, [r8], ip
yading@10 300 vst1.32 {d19[1]}, [r8], ip
yading@10 301 vst1.32 {d22[0]}, [r8], ip
yading@10 302 vst1.32 {d22[1]}, [r8], ip
yading@10 303 vst1.32 {d23[0]}, [r8], ip
yading@10 304 vst1.32 {d23[1]}, [r8], ip
@ Two channels done: advance r0 by 4 bytes (2 channels x 2 bytes) and
@ return if no channel is left. The popeq uses the flags of the subs; the
@ add does not update them.
yading@10 305 8: subs r3, r3, #2
yading@10 306 add r0, r0, #4
yading@10 307 it eq
yading@10 308 popeq {r4-r8,pc}
yading@10 309
yading@10 310 @ 1 channel
@ r4 = source pointer, lr = frames left, r5 = output cursor. The tst checks
@ bit 3 of the frame count; its flags are consumed by the bne 8f after the
@ initial loads.
yading@10 311 4: ldr r4, [r1],#4
yading@10 312 tst r2, #8
yading@10 313 mov lr, r2
yading@10 314 mov r5, r0
yading@10 315 vld1.64 {d0-d1}, [r4,:128]!
yading@10 316 vcvt.s32.f32 q0, q0, #16
yading@10 317 vld1.64 {d2-d3}, [r4,:128]!
yading@10 318 vcvt.s32.f32 q1, q1, #16
yading@10 319 bne 8f
@ Main loop: 16 samples per pass. Lanes [1] and [3] of a d register are
@ the upper halfwords of its two 32-bit fixed-point values, so each 16-bit
@ store writes one finished sample and steps r5 by ip.
yading@10 320 6: subs lr, lr, #16
yading@10 321 vld1.64 {d4-d5}, [r4,:128]!
yading@10 322 vcvt.s32.f32 q2, q2, #16
yading@10 323 vld1.64 {d6-d7}, [r4,:128]!
yading@10 324 vcvt.s32.f32 q3, q3, #16
yading@10 325 vst1.16 {d0[1]}, [r5,:16], ip
yading@10 326 vst1.16 {d0[3]}, [r5,:16], ip
yading@10 327 vst1.16 {d1[1]}, [r5,:16], ip
yading@10 328 vst1.16 {d1[3]}, [r5,:16], ip
yading@10 329 vst1.16 {d2[1]}, [r5,:16], ip
yading@10 330 vst1.16 {d2[3]}, [r5,:16], ip
yading@10 331 vst1.16 {d3[1]}, [r5,:16], ip
yading@10 332 vst1.16 {d3[3]}, [r5,:16], ip
@ Flags are those of the subs lr above: skip the reload of q0/q1 when no
@ input is left. The bgt after label 7 tests the same flags.
yading@10 333 beq 7f
yading@10 334 vld1.64 {d0-d1}, [r4,:128]!
yading@10 335 vcvt.s32.f32 q0, q0, #16
yading@10 336 vld1.64 {d2-d3}, [r4,:128]!
yading@10 337 vcvt.s32.f32 q1, q1, #16
yading@10 338 7: vst1.16 {d4[1]}, [r5,:16], ip
yading@10 339 vst1.16 {d4[3]}, [r5,:16], ip
yading@10 340 vst1.16 {d5[1]}, [r5,:16], ip
yading@10 341 vst1.16 {d5[3]}, [r5,:16], ip
yading@10 342 vst1.16 {d6[1]}, [r5,:16], ip
yading@10 343 vst1.16 {d6[3]}, [r5,:16], ip
yading@10 344 vst1.16 {d7[1]}, [r5,:16], ip
yading@10 345 vst1.16 {d7[3]}, [r5,:16], ip
yading@10 346 bgt 6b
yading@10 347 pop {r4-r8,pc}
@ Leading group of 8 samples, taken when bit 3 of the frame count is set.
@ Returns if that was the whole input, otherwise refills q0/q1 and joins
@ the main loop.
yading@10 348 8: subs lr, lr, #8
yading@10 349 vst1.16 {d0[1]}, [r5,:16], ip
yading@10 350 vst1.16 {d0[3]}, [r5,:16], ip
yading@10 351 vst1.16 {d1[1]}, [r5,:16], ip
yading@10 352 vst1.16 {d1[3]}, [r5,:16], ip
yading@10 353 vst1.16 {d2[1]}, [r5,:16], ip
yading@10 354 vst1.16 {d2[3]}, [r5,:16], ip
yading@10 355 vst1.16 {d3[1]}, [r5,:16], ip
yading@10 356 vst1.16 {d3[3]}, [r5,:16], ip
yading@10 357 it eq
yading@10 358 popeq {r4-r8,pc}
yading@10 359 vld1.64 {d0-d1}, [r4,:128]!
yading@10 360 vcvt.s32.f32 q0, q0, #16
yading@10 361 vld1.64 {d2-d3}, [r4,:128]!
yading@10 362 vcvt.s32.f32 q1, q1, #16
yading@10 363 b 6b
yading@10 364 endfunc
yading@10 365
/*
 * ff_int32_to_float_fmul_scalar_neon: convert 32-bit signed integers to
 * float and multiply every result by one scalar.
 *
 * Register use, as read from the code below:
 *   r0   destination pointer, written 16 bytes at a time (:128 hint)
 *   r1   source pointer, read 16 bytes at a time (:128 hint)
 *   q0   the scalar, broadcast to all four lanes
 *   len  element count, aliased to a core register (see below)
 *   q1-q3, q8-q10   NEON scratch
 *
 * The VFP / NOVFP line prefixes are not defined in this file.
 * NOTE(review): presumably they come from libavutil/arm/asm.S and keep a
 * line only for the hard-float or the soft-float calling convention
 * respectively - confirm. As written:
 *   VFP     scalar taken from d0[0], len is r2
 *   NOVFP   scalar taken from r2,    len is r3
 *
 * NOTE(review): the loop leaves only when len reaches exactly zero after
 * subtracting 8, so len presumably has to be a positive multiple of 8 and
 * both buffers 16-byte aligned - confirm against the callers.
 */
yading@10 366 function ff_int32_to_float_fmul_scalar_neon, export=1
@ Broadcast the scalar into q0 and give the length register a name.
yading@10 367 VFP vdup.32 q0, d0[0]
yading@10 368 VFP len .req r2
yading@10 369 NOVFP vdup.32 q0, r2
yading@10 370 NOVFP len .req r3
yading@10 371
@ Prime the pipeline: the first 8 integers are converted into q3/q8.
yading@10 372 vld1.32 {q1},[r1,:128]!
yading@10 373 vcvt.f32.s32 q3, q1
yading@10 374 vld1.32 {q2},[r1,:128]!
yading@10 375 vcvt.f32.s32 q8, q2
@ Loop: multiply the current 8 values into q9/q10, then (unless this was
@ the last group) load and convert the next 8 before storing the current
@ results. The beq tests the flags of the subs; pld and vmul leave them
@ unchanged.
yading@10 376 1: subs len, len, #8
yading@10 377 pld [r1, #16]
yading@10 378 vmul.f32 q9, q3, q0
yading@10 379 vmul.f32 q10, q8, q0
yading@10 380 beq 2f
yading@10 381 vld1.32 {q1},[r1,:128]!
yading@10 382 vcvt.f32.s32 q3, q1
yading@10 383 vld1.32 {q2},[r1,:128]!
yading@10 384 vcvt.f32.s32 q8, q2
yading@10 385 vst1.32 {q9}, [r0,:128]!
yading@10 386 vst1.32 {q10},[r0,:128]!
yading@10 387 b 1b
@ Last group: store the final 8 products and return.
yading@10 388 2: vst1.32 {q9}, [r0,:128]!
yading@10 389 vst1.32 {q10},[r0,:128]!
yading@10 390 bx lr
yading@10 391 .unreq len
yading@10 392 endfunc