annotate ffmpeg/libavcodec/arm/sbrdsp_neon.S @ 13:844d341cf643 tip

Back up before ISMIR
author Yading Song <yading.song@eecs.qmul.ac.uk>
date Thu, 31 Oct 2013 13:17:06 +0000
parents 6840f77b83aa
children
rev   line source
yading@10 1 /*
yading@10 2 * Copyright (c) 2012 Mans Rullgard
yading@10 3 *
yading@10 4 * This file is part of Libav.
yading@10 5 *
yading@10 6 * Libav is free software; you can redistribute it and/or
yading@10 7 * modify it under the terms of the GNU Lesser General Public
yading@10 8 * License as published by the Free Software Foundation; either
yading@10 9 * version 2.1 of the License, or (at your option) any later version.
yading@10 10 *
yading@10 11 * Libav is distributed in the hope that it will be useful,
yading@10 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
yading@10 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
yading@10 14 * Lesser General Public License for more details.
yading@10 15 *
yading@10 16 * You should have received a copy of the GNU Lesser General Public
yading@10 17 * License along with Libav; if not, write to the Free Software
yading@10 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
yading@10 19 */
yading@10 20
yading@10 21 #include "libavutil/arm/asm.S"
yading@10 22
yading@10 23 function ff_sbr_sum64x5_neon, export=1 /* In place, for k = 0..63: z[k] = z[k] + z[k+64] + z[k+128] + z[k+192] + z[k+256] (32-bit floats). r0 = z; the :128 hints require 16-byte alignment. */
yading@10 24 push {lr} /* lr is used below as a fifth data pointer, so save the return address */
yading@10 25 add r1, r0, # 64*4 /* r1 = &z[64] */
yading@10 26 add r2, r0, #128*4 /* r2 = &z[128] */
yading@10 27 add r3, r0, #192*4 /* r3 = &z[192] */
yading@10 28 add lr, r0, #256*4 /* lr = &z[256] */
yading@10 29 mov r12, #64 /* r12 = floats left to produce */
yading@10 30 1: /* each pass produces 4 output floats */
yading@10 31 vld1.32 {q0}, [r0,:128] /* no writeback: r0 is advanced by the store at the end of the pass */
yading@10 32 vld1.32 {q1}, [r1,:128]!
yading@10 33 vadd.f32 q0, q0, q1 /* terms are accumulated left to right */
yading@10 34 vld1.32 {q2}, [r2,:128]!
yading@10 35 vadd.f32 q0, q0, q2
yading@10 36 vld1.32 {q3}, [r3,:128]!
yading@10 37 vadd.f32 q0, q0, q3
yading@10 38 vld1.32 {q8}, [lr,:128]!
yading@10 39 vadd.f32 q0, q0, q8
yading@10 40 vst1.32 {q0}, [r0,:128]! /* write the sum back over z[k..k+3] and advance r0 */
yading@10 41 subs r12, #4
yading@10 42 bgt 1b
yading@10 43 pop {pc} /* return by popping the saved lr into pc */
yading@10 44 endfunc
yading@10 45
yading@10 46 function ff_sbr_sum_square_neon, export=1 /* Sum of squares of the floats at r0. r1 is a count in units of two floats (presumably complex samples; confirm against the C prototype). Result is left in d0[0] (s0). */
yading@10 47 vmov.f32 q0, #0.0 /* four parallel partial sums */
yading@10 48 1: /* body runs before the count is tested, so at least 4 floats are always read */
yading@10 49 vld1.32 {q1}, [r0,:128]! /* 4 floats per pass; address must be 16-byte aligned */
yading@10 50 vmla.f32 q0, q1, q1 /* q0 += q1 * q1, lane by lane */
yading@10 51 subs r1, r1, #2 /* 4 floats consumed = 2 count units */
yading@10 52 bgt 1b
yading@10 53 vadd.f32 d0, d0, d1 /* fold the four partial sums into two */
yading@10 54 vpadd.f32 d0, d0, d0 /* ...and into one; both lanes of d0 now hold the total */
yading@10 55 NOVFP vmov.32 r0, d0[0] /* NOVFP comes from the included asm.S; presumably emitted only when floats are returned in core registers (confirm) */
yading@10 56 bx lr
yading@10 57 endfunc
yading@10 58
yading@10 59 function ff_sbr_neg_odd_64_neon, export=1 /* In place: flip the sign of every odd-indexed float among the 64 floats at r0 (16-byte aligned). */
yading@10 60 mov r1, r0 /* r1 = store pointer; r0 is the load pointer and runs ahead */
yading@10 61 vmov.i32 q8, #1<<31 /* IEEE-754 sign-bit mask in every lane */
yading@10 62 vld2.32 {q0,q1}, [r0,:128]! /* de-interleave 8 floats: q0 = even-indexed, q1 = odd-indexed */
yading@10 63 veor q1, q1, q8 /* negate the odd-indexed values by toggling the sign bit */
yading@10 64 vld2.32 {q2,q3}, [r0,:128]!
yading@10 65 .rept 3 /* unrolled: each copy stores two groups of 8 floats and loads the next two */
yading@10 66 vst2.32 {q0,q1}, [r1,:128]! /* re-interleave and write back */
yading@10 67 veor q3, q3, q8
yading@10 68 vld2.32 {q0,q1}, [r0,:128]!
yading@10 69 vst2.32 {q2,q3}, [r1,:128]!
yading@10 70 veor q1, q1, q8
yading@10 71 vld2.32 {q2,q3}, [r0,:128]!
yading@10 72 .endr
yading@10 73 veor q3, q3, q8
yading@10 74 vst2.32 {q0,q1}, [r1,:128]! /* drain the last two groups; 8 loads and 8 stores of 8 floats in total */
yading@10 75 vst2.32 {q2,q3}, [r1,:128]!
yading@10 76 bx lr
yading@10 77 endfunc
yading@10 78
yading@10 79 function ff_sbr_qmf_pre_shuffle_neon, export=1 /* r0 = z (floats). Writes z[64], z[65] = z[0], z[1], then 31 pairs (-z[64-k], z[k+1]) for k = 1..31 starting at z[66]. */
yading@10 80 add r1, r0, #60*4 /* r1 = &z[60]: source read downwards, 4 floats at a time */
yading@10 81 add r2, r0, #64*4 /* r2 = &z[64]: output pointer */
yading@10 82 vld1.32 {d0}, [r0,:64]!
yading@10 83 vst1.32 {d0}, [r2,:64]! /* z[64..65] = z[0..1] */
yading@10 84 mov r3, #-16 /* post-index step for r1: back 4 floats */
yading@10 85 mov r12, #24 /* pairs produced by the loop (8 per pass, 3 passes); the remaining 7 are produced after it */
yading@10 86 vmov.i32 q8, #1<<31 /* sign-bit mask */
yading@10 87 vld1.32 {q0}, [r1,:128], r3 /* preload z[60..63] */
yading@10 88 vld1.32 {d2}, [r0,:64]! /* preload z[2..3] */
yading@10 89 1:
yading@10 90 vld1.32 {d3,d4}, [r0,:128]! /* next 4 ascending values; d3 completes q1 */
yading@10 91 vrev64.32 q0, q0 /* vrev64 + vswp together reverse the 4 descending-side floats */
yading@10 92 vld1.32 {q9}, [r1,:128], r3
yading@10 93 veor q0, q0, q8 /* negate */
yading@10 94 vld1.32 {d5,d6}, [r0,:128]!
yading@10 95 vswp d0, d1
yading@10 96 vrev64.32 q9, q9
yading@10 97 vst2.32 {q0,q1}, [r2,:64]! /* interleave: negated reversed value, then ascending value */
yading@10 98 vmov q10, q2 /* copy the ascending values so q2 can be reloaded next pass */
yading@10 99 veor q9, q9, q8
yading@10 100 vmov d2, d6 /* carry the last 2 ascending floats into the next pass */
yading@10 101 vswp d18, d19
yading@10 102 vld1.32 {q0}, [r1,:128], r3 /* preload the descending side for the next pass */
yading@10 103 vst2.32 {q9,q10}, [r2,:64]!
yading@10 104 subs r12, r12, #8
yading@10 105 bgt 1b
yading@10 106 vld1.32 {d3,d4}, [r0,:128]! /* tail: the final 7 pairs, written with 4-, 2- and 1-pair stores */
yading@10 107 vrev64.32 q0, q0
yading@10 108 vld1.32 {q9}, [r1,:128], r3
yading@10 109 veor q0, q0, q8
yading@10 110 vld1.32 {d5}, [r0,:64]!
yading@10 111 vswp d0, d1
yading@10 112 vrev64.32 q9, q9
yading@10 113 vst2.32 {q0,q1}, [r2,:64]! /* 4 pairs */
yading@10 114 vswp d4, d5 /* put the ascending floats in the order the next two stores consume them */
yading@10 115 veor q1, q9, q8 /* negate; no d-register swap here, so d3 holds the higher-addressed half */
yading@10 116 vst2.32 {d3,d5}, [r2,:64]! /* 2 pairs */
yading@10 117 vst2.32 {d2[0],d4[0]}, [r2,:64]! /* last pair, single lane */
yading@10 118 bx lr
yading@10 119 endfunc
yading@10 120
yading@10 121 function ff_sbr_qmf_post_shuffle_neon, export=1 /* r0 = output, r1 = z (64 floats). Writes 32 pairs: pair k = (-z[63-k], z[k]). */
yading@10 122 add r2, r1, #60*4 /* r2 = &z[60]: read downwards */
yading@10 123 mov r3, #-16 /* post-index step for r2: back 4 floats */
yading@10 124 mov r12, #32 /* output pairs remaining; 8 per pass */
yading@10 125 vmov.i32 q8, #1<<31 /* sign-bit mask */
yading@10 126 vld1.32 {q0}, [r2,:128], r3 /* preload top 4 floats */
yading@10 127 vld1.32 {q1}, [r1,:128]! /* preload bottom 4 floats */
yading@10 128 1:
yading@10 129 pld [r2, #-32] /* prefetch hint for the descending stream */
yading@10 130 vrev64.32 q0, q0 /* swap floats within each half of the descending block */
yading@10 131 vswp d2, d3 /* swap halves of the ascending block so the d-register pairing below lines up */
yading@10 132 veor q0, q0, q8 /* negate */
yading@10 133 vld1.32 {q2}, [r2,:128], r3
yading@10 134 vld1.32 {q3}, [r1,:128]!
yading@10 135 vst2.32 {d1,d3}, [r0,:128]! /* high half first: this is what reverses the descending block */
yading@10 136 vst2.32 {d0,d2}, [r0,:128]!
yading@10 137 pld [r2, #-32]
yading@10 138 vrev64.32 q2, q2 /* same steps on the second register set */
yading@10 139 vswp d6, d7
yading@10 140 veor q2, q2, q8
yading@10 141 vld1.32 {q0}, [r2,:128], r3 /* preload for the next pass (also executed, unused, on the last pass) */
yading@10 142 vld1.32 {q1}, [r1,:128]!
yading@10 143 vst2.32 {d5,d7}, [r0,:128]!
yading@10 144 vst2.32 {d4,d6}, [r0,:128]!
yading@10 145 subs r12, r12, #8
yading@10 146 bgt 1b
yading@10 147 bx lr
yading@10 148 endfunc
yading@10 149
yading@10 150 function ff_sbr_qmf_deint_neg_neon, export=1 /* r0 = v (64 floats out), r1 = src (64 floats in). Odd-indexed src values go to v[0..31] in reverse source order; even-indexed src values are negated and written from v[63] downwards. */
yading@10 151 add r1, r1, #60*4 /* start at src[60] and walk down */
yading@10 152 add r2, r0, #62*4 /* r2 = &v[62]: descending output pointer */
yading@10 153 mov r3, #-16 /* post-index step for r1: back 4 floats */
yading@10 154 mov r12, #32 /* 2 is subtracted per pass, giving 16 passes of 4 input floats */
yading@10 155 vmov.i32 d2, #1<<31 /* sign-bit mask */
yading@10 156 1:
yading@10 157 vld2.32 {d0,d1}, [r1,:128], r3 /* de-interleave: d0 = even-indexed pair, d1 = odd-indexed pair */
yading@10 158 veor d0, d0, d2 /* negate the even-indexed values */
yading@10 159 vrev64.32 d1, d1 /* reverse the odd-indexed pair for ascending output */
yading@10 160 vst1.32 {d0}, [r2,:64] /* stored in memory order at a descending address */
yading@10 161 vst1.32 {d1}, [r0,:64]!
yading@10 162 sub r2, r2, #8 /* step the descending pointer back 2 floats */
yading@10 163 subs r12, r12, #2
yading@10 164 bgt 1b
yading@10 165 bx lr
yading@10 166 endfunc
yading@10 167
yading@10 168 function ff_sbr_qmf_deint_bfly_neon, export=1 /* r0 = v (128 floats out), r1 = src0, r2 = src1 (64 floats each). For i = 0..63: v[i] = src0[i] - src1[63-i], v[127-i] = src0[i] + src1[63-i]. */
yading@10 169 push {lr} /* lr is reused as the negative stride register */
yading@10 170 add r2, r2, #60*4 /* src1 is read from the top down */
yading@10 171 add r3, r0, #124*4 /* r3 = &v[124]: descending output for the sums */
yading@10 172 mov r12, #64 /* input floats remaining; 4 per pass */
yading@10 173 mov lr, #-16 /* step back 4 floats */
yading@10 174 1:
yading@10 175 vld1.32 {q0}, [r1,:128]! /* q0 = src0[i..i+3] */
yading@10 176 vld1.32 {q1}, [r2,:128], lr /* q1 = src1[60-i..63-i] */
yading@10 177 vrev64.32 q2, q0 /* element-swapped copies; combined with the crossed d-register */
yading@10 178 vrev64.32 q3, q1 /* pairing below this gives a full 4-element reversal */
yading@10 179 vadd.f32 d3, d4, d3 /* sums, laid out in descending-i order for the store through r3 */
yading@10 180 vadd.f32 d2, d5, d2
yading@10 181 vsub.f32 d0, d0, d7 /* differences, in ascending-i order */
yading@10 182 vsub.f32 d1, d1, d6
yading@10 183 vst1.32 {q1}, [r3,:128], lr
yading@10 184 vst1.32 {q0}, [r0,:128]!
yading@10 185 subs r12, r12, #4
yading@10 186 bgt 1b
yading@10 187 pop {pc}
yading@10 188 endfunc
yading@10 189
yading@10 190 function ff_sbr_hf_g_filt_neon, export=1 /* r0 = output (float pairs), r1 = strided input (float pairs), r2 = per-element scalar gains, r3 = element count, [sp] = start index into the input. Output pair m = input pair m * gain m. */
yading@10 191 ldr r12, [sp] /* first stack argument: start index */
yading@10 192 add r1, r1, r12, lsl #3 /* skip that many 8-byte pairs */
yading@10 193 mov r12, #40*2*4 /* byte stride between consecutive input pairs (40 float pairs per row) */
yading@10 194 sub r3, r3, #1 /* bias the count so the flags after the loop tell 0 from 1 leftover element */
yading@10 195 vld2.32 {d2[],d3[]},[r2,:64]! /* load two gains, each broadcast across its own d register */
yading@10 196 vld1.32 {d0}, [r1,:64], r12 /* first input pair */
yading@10 197 1: /* two elements per pass; the body runs before any count test */
yading@10 198 vld1.32 {d1}, [r1,:64], r12 /* second input pair */
yading@10 199 vmul.f32 q3, q0, q1 /* scale both pairs by their gains */
yading@10 200 vld2.32 {d2[],d3[]},[r2,:64]! /* preload the next two gains */
yading@10 201 vld1.32 {d0}, [r1,:64], r12 /* preload the next input pair */
yading@10 202 vst1.32 {q3}, [r0,:64]!
yading@10 203 subs r3, r3, #2
yading@10 204 bgt 1b
yading@10 205 it lt /* IT block so the conditional return is valid when assembled as Thumb-2 */
yading@10 206 bxlt lr /* negative: nothing left over */
yading@10 207 vmul.f32 d0, d0, d2 /* zero: one element left, already preloaded in d0 and d2 */
yading@10 208 vst1.32 {d0}, [r0,:64]!
yading@10 209 bx lr
yading@10 210 endfunc
yading@10 211
yading@10 212 function ff_sbr_hf_gen_neon, export=1 /* r0 = output float pairs, r1 = input float pairs, r2 and r3 = pointers to two 2-float coefficients, one float scale (in s0 or on the stack), then two ints on the stack (start and end index, judging by their use below). */
yading@10 213 NOVFP vld1.32 {d1[]}, [sp,:32] /* scale taken from the stack and broadcast; NOVFP/VFP come from asm.S and presumably select by float calling convention (confirm) */
yading@10 214 VFP vdup.32 d1, d0[0] /* scale taken from s0 and broadcast */
yading@10 215 vmul.f32 d0, d1, d1 /* d0 = scale squared, d1 = scale */
yading@10 216 vld1.32 {d3}, [r2,:64] /* coefficient at r2 */
yading@10 217 vld1.32 {d2}, [r3,:64] /* coefficient at r3 */
yading@10 218 vmul.f32 q0, q0, q1 /* d0 = [r3] coeff * scale^2, d1 = [r2] coeff * scale */
yading@10 219 ldrd r2, r3, [sp, #4*!HAVE_VFP_ARGS] /* the two int stack args; offset skips the float slot when it was passed on the stack */
yading@10 220 vtrn.32 d0, d1 /* regroup: d0 = first components, d1 = second components */
yading@10 221 vneg.f32 d18, d1
yading@10 222 vtrn.32 d18, d1 /* d18 and d1 now each hold a (negated, plain) copy of one second component */
yading@10 223 add r0, r0, r2, lsl #3 /* output starts at pair index r2 */
yading@10 224 add r1, r1, r2, lsl #3
yading@10 225 sub r1, r1, #2*8 /* input starts two pairs earlier */
yading@10 226 sub r3, r3, r2 /* r3 = number of pairs to produce */
yading@10 227 vld1.32 {q1}, [r1,:128]! /* preload the two preceding input pairs */
yading@10 228 1: /* two output pairs per pass; body runs before the count test */
yading@10 229 vld1.32 {q3}, [r1,:128]! /* current two input pairs; also the accumulator seed */
yading@10 230 vrev64.32 q2, q1 /* component-swapped copies of the inputs, used with d18/d1 */
yading@10 231 vmov q8, q3 /* keep the unmodified inputs for the next pass */
yading@10 232 vrev64.32 d20, d3
yading@10 233 vrev64.32 d21, d6
yading@10 234 vmla.f32 q3, q1, d0[0] /* accumulate the scaled terms from the earlier input pairs */
yading@10 235 vmla.f32 d6, d4, d18
yading@10 236 vmla.f32 d7, d20, d18
yading@10 237 vmla.f32 d6, d3, d0[1]
yading@10 238 vmla.f32 d7, d16, d0[1]
yading@10 239 vmla.f32 d6, d5, d1
yading@10 240 vmla.f32 d7, d21, d1
yading@10 241 vmov q1, q8 /* slide the input window forward by two pairs */
yading@10 242 vst1.32 {q3}, [r0,:128]!
yading@10 243 subs r3, r3, #2
yading@10 244 bgt 1b
yading@10 245 bx lr
yading@10 246 endfunc
yading@10 247
yading@10 248 function ff_sbr_autocorrelate_neon, export=1 /* r0 = input float pairs (16-byte aligned), r1 = output floats. Accumulates products of each pair with itself and with nearby pairs, then stores the sums at float offsets 0..3, 4, 6..7 and 10 from r1. */
yading@10 249 vld1.32 {q0}, [r0,:128]! /* first two input pairs */
yading@10 250 vmov.f32 q1, #0.0 /* q1 and q3: product accumulators */
yading@10 251 vmov.f32 q3, #0.0
yading@10 252 vmov.f32 d20, #0.0 /* q10: squared-magnitude accumulator... */
yading@10 253 vmul.f32 d21, d1, d1 /* ...seeded with the square of the second pair only */
yading@10 254 vmov q8, q0 /* keep the first two pairs for the edge terms after the loop */
yading@10 255 vmov q11, q0
yading@10 256 mov r12, #36 /* 2 subtracted per pass: 18 passes, two pairs each */
yading@10 257 1:
yading@10 258 vld1.32 {q2}, [r0,:128]! /* next two pairs */
yading@10 259 vrev64.32 q12, q2 /* component-swapped copy for the cross terms */
yading@10 260 vmla.f32 q10, q2, q2
yading@10 261 vmla.f32 d2, d1, d4 /* d2/d3: products of pairs one apart (plain / swapped) */
yading@10 262 vmla.f32 d3, d1, d24
yading@10 263 vmla.f32 d6, d0, d4 /* d6/d7: products of pairs two apart (plain / swapped) */
yading@10 264 vmla.f32 d7, d0, d24
yading@10 265 vmla.f32 d2, d4, d5
yading@10 266 vmla.f32 d3, d4, d25
yading@10 267 vmla.f32 d6, d1, d5
yading@10 268 vmla.f32 d7, d1, d25
yading@10 269 vmov q0, q2 /* current pairs become the previous pairs */
yading@10 270 subs r12, r12, #2
yading@10 271 bgt 1b
yading@10 272 vld1.32 {q2}, [r0,:128]! /* final two pairs, handled outside the loop */
yading@10 273 vrev64.32 q12, q2
yading@10 274 vmla.f32 d2, d1, d4
yading@10 275 vmla.f32 d3, d1, d24
yading@10 276 vmla.f32 d6, d0, d4
yading@10 277 vmla.f32 d7, d0, d24
yading@10 278 vadd.f32 d20, d20, d21 /* fold the squared-magnitude accumulator */
yading@10 279 vrev64.32 d18, d17
yading@10 280 vmla.f32 d6, d1, d5
yading@10 281 vmla.f32 d7, d1, d25
yading@10 282 vmov q0, q1 /* fork the one-apart sums: q0 also gets the term from the first two pairs */
yading@10 283 vmla.f32 d0, d16, d17
yading@10 284 vmla.f32 d1, d16, d18
yading@10 285 vmla.f32 d2, d4, d5 /* ...and q1 the term from the last two pairs */
yading@10 286 vmla.f32 d3, d4, d25
yading@10 287 vneg.f32 s15, s15 /* s15 is the upper lane of d7 */
yading@10 288 vmov d21, d20
yading@10 289 vpadd.f32 d0, d0, d2 /* horizontal adds collapse lane pairs into single sums */
yading@10 290 vpadd.f32 d7, d6, d7
yading@10 291 vtrn.32 d1, d3
yading@10 292 vsub.f32 d6, d1, d3 /* difference of the swapped-product sums */
yading@10 293 vmla.f32 d20, d22, d22 /* two variants of the squared-magnitude sum: one adds the first pair (saved in q11), */
yading@10 294 vmla.f32 d21, d4, d4 /* the other adds the first of the final two pairs */
yading@10 295 vtrn.32 d0, d6
yading@10 296 vpadd.f32 d20, d20, d21
yading@10 297 vst1.32 {q3}, [r1,:128]! /* output floats 0..3 */
yading@10 298 vst1.32 {d20[1]}, [r1,:32] /* output float 4 */
yading@10 299 add r1, r1, #2*4
yading@10 300 vst1.32 {d0}, [r1,:64] /* output floats 6..7 */
yading@10 301 add r1, r1, #4*4
yading@10 302 vst1.32 {d20[0]}, [r1,:32] /* output float 10 */
yading@10 303 bx lr
yading@10 304 endfunc
yading@10 305
yading@10 306 function ff_sbr_hf_apply_noise_0_neon, export=1 /* r0 = Y (float pairs, updated in place), r1 = per-element addend, r2 = per-element noise scale, r3 = noise index, second stack word = element count. Per element: if the addend is non-zero, first component += addend with its sign bit XORed with d3; otherwise both components += noise table entry * scale. */
yading@10 307 vmov.i32 d3, #0 /* sign mask 0: addend is applied unchanged */
yading@10 308 .Lhf_apply_noise_0: /* shared entry: ff_sbr_hf_apply_noise_2_neon branches here with its own d3 */
yading@10 309 push {r4,lr}
yading@10 310 movrelx r4, X(ff_sbr_noise_table) /* r4 = table base, 8 bytes per entry */
yading@10 311 ldr r12, [sp, #12] /* element count: second stack argument (offset 4 before the 8-byte push) */
yading@10 312 add r3, r3, #1
yading@10 313 bfc r3, #9, #23 /* noise index = (index + 1) & 0x1ff */
yading@10 314 sub r12, r12, #1 /* bias the count so the flags after the loop tell 0 from 1 leftover element */
yading@10 315 1: /* two elements per pass; body runs before any count test */
yading@10 316 add lr, r4, r3, lsl #3 /* lr = address of the current table entry */
yading@10 317 vld2.32 {q0}, [r0,:64] /* de-interleave two Y pairs: d0 = first components, d1 = second components */
yading@10 318 vld2.32 {q3}, [lr,:64] /* two table entries, de-interleaved the same way into d6/d7 */
yading@10 319 vld1.32 {d2}, [r1,:64]! /* two addends */
yading@10 320 vld1.32 {d18}, [r2,:64]! /* two noise scales */
yading@10 321 vceq.f32 d16, d2, #0 /* per lane: all ones where the addend is zero */
yading@10 322 veor d2, d2, d3 /* apply the sign mask to the addends */
yading@10 323 vmov q2, q0 /* second copy of Y for the addend path */
yading@10 324 vmla.f32 d0, d6, d18 /* noise path */
yading@10 325 vmla.f32 d1, d7, d18
yading@10 326 vadd.f32 d4, d4, d2 /* addend path: first components only */
yading@10 327 add r3, r3, #2
yading@10 328 bfc r3, #9, #23 /* advance the noise index by two, modulo 512 */
yading@10 329 vbif d0, d4, d16 /* where the addend was non-zero, take the addend-path result */
yading@10 330 vbif d1, d5, d16
yading@10 331 vst2.32 {q0}, [r0,:64]!
yading@10 332 subs r12, r12, #2
yading@10 333 bgt 1b
yading@10 334 blt 2f /* negative: nothing left; zero: exactly one element left */
yading@10 335 add lr, r4, r3, lsl #3
yading@10 336 vld1.32 {d0}, [r0,:64] /* one Y pair, not de-interleaved */
yading@10 337 vld1.32 {d6}, [lr,:64] /* one table entry */
yading@10 338 vld1.32 {d2[]}, [r1,:32]! /* addend, broadcast */
yading@10 339 vld1.32 {d18[]}, [r2,:32]! /* noise scale, broadcast; must not go into d3, which still holds the sign mask needed by the veor below */
yading@10 340 vceq.f32 d4, d2, #0
yading@10 341 veor d2, d2, d3 /* apply the sign mask, as in the main loop */
yading@10 342 vmov d1, d0 /* copy of Y for the addend path */
yading@10 343 vmla.f32 d0, d6, d18 /* noise path */
yading@10 344 vadd.f32 s2, s2, s4 /* addend path: first component only (s2 = d1[0], s4 = d2[0]) */
yading@10 345 vbif d0, d1, d4
yading@10 346 vst1.32 {d0}, [r0,:64]!
yading@10 347 2:
yading@10 348 pop {r4,pc}
yading@10 349 endfunc
yading@10 350
yading@10 351 function ff_sbr_hf_apply_noise_1_neon, export=1 /* Same register and stack arguments as ff_sbr_hf_apply_noise_0_neon plus a first stack word whose bit 0 selects the starting sign. Per element: if the addend is non-zero, second component += addend with a sign that alternates from element to element; otherwise both components += noise table entry * scale. */
yading@10 352 ldr r12, [sp] /* first stack argument, read before the push moves sp */
yading@10 353 push {r4,lr}
yading@10 354 lsl r12, r12, #31 /* bit 0 becomes a sign-bit mask */
yading@10 355 eor lr, r12, #1<<31 /* the opposite sign */
yading@10 356 vmov d3, r12, lr /* d3[0] = sign for even elements, d3[1] = sign for odd elements */
yading@10 357 .Lhf_apply_noise_1: /* shared entry: ff_sbr_hf_apply_noise_3_neon branches here with d3 lanes swapped */
yading@10 358 movrelx r4, X(ff_sbr_noise_table) /* r4 = table base, 8 bytes per entry */
yading@10 359 ldr r12, [sp, #12] /* element count: second stack argument (offset 4 before the 8-byte push) */
yading@10 360 add r3, r3, #1
yading@10 361 bfc r3, #9, #23 /* noise index = (index + 1) & 0x1ff */
yading@10 362 sub r12, r12, #1 /* bias the count so the flags after the loop tell 0 from 1 leftover element */
yading@10 363 1: /* two elements per pass; body runs before any count test */
yading@10 364 add lr, r4, r3, lsl #3 /* lr = address of the current table entry */
yading@10 365 vld2.32 {q0}, [r0,:64] /* de-interleave two Y pairs: d0 = first components, d1 = second components */
yading@10 366 vld2.32 {q3}, [lr,:64] /* two table entries, de-interleaved the same way into d6/d7 */
yading@10 367 vld1.32 {d2}, [r1,:64]! /* two addends */
yading@10 368 vld1.32 {d18}, [r2,:64]! /* two noise scales */
yading@10 369 vceq.f32 d16, d2, #0 /* per lane: all ones where the addend is zero */
yading@10 370 veor d2, d2, d3 /* lane 0 (even element) gets d3[0], lane 1 (odd element) gets d3[1] */
yading@10 371 vmov q2, q0 /* second copy of Y for the addend path */
yading@10 372 vmla.f32 d0, d6, d18 /* noise path */
yading@10 373 vmla.f32 d1, d7, d18
yading@10 374 vadd.f32 d5, d5, d2 /* addend path: second components only */
yading@10 375 add r3, r3, #2
yading@10 376 bfc r3, #9, #23 /* advance the noise index by two, modulo 512 */
yading@10 377 vbif d0, d4, d16 /* where the addend was non-zero, take the addend-path result */
yading@10 378 vbif d1, d5, d16
yading@10 379 vst2.32 {q0}, [r0,:64]!
yading@10 380 subs r12, r12, #2
yading@10 381 bgt 1b
yading@10 382 blt 2f /* negative: nothing left; zero: exactly one element left */
yading@10 383 add lr, r4, r3, lsl #3
yading@10 384 vld1.32 {d0}, [r0,:64] /* one Y pair, not de-interleaved */
yading@10 385 vld1.32 {d6}, [lr,:64] /* one table entry */
yading@10 386 vld1.32 {d2[]}, [r1,:32]! /* addend, broadcast to both lanes */
yading@10 387 vld1.32 {d18[]}, [r2,:32]! /* noise scale, broadcast */
yading@10 388 vceq.f32 d4, d2, #0
yading@10 389 veor d2, d2, d3 /* d2[0] = addend with the even-element sign, d2[1] = with the odd-element sign */
yading@10 390 vmov d1, d0 /* copy of Y for the addend path */
yading@10 391 vmla.f32 d0, d6, d18 /* noise path */
yading@10 392 vadd.f32 s3, s3, s4 /* second component (s3 = d1[1]) += d2[0]: the leftover element follows an even number of processed elements, so it takes the even-element sign */
yading@10 393 vbif d0, d1, d4
yading@10 394 vst1.32 {d0}, [r0,:64]!
yading@10 395 2:
yading@10 396 pop {r4,pc}
yading@10 397 endfunc
yading@10 398
yading@10 399 function ff_sbr_hf_apply_noise_2_neon, export=1 /* Variant of ff_sbr_hf_apply_noise_0_neon with the sign bit set in the mask, so non-zero addends are applied negated. */
yading@10 400 vmov.i32 d3, #1<<31 /* sign mask in both lanes */
yading@10 401 b .Lhf_apply_noise_0 /* tail-branch into the shared body; the label precedes its push, so no prologue is needed here */
yading@10 402 endfunc
yading@10 403
yading@10 404 function ff_sbr_hf_apply_noise_3_neon, export=1 /* Variant of ff_sbr_hf_apply_noise_1_neon with the two sign lanes exchanged, i.e. the alternating sign starts with the opposite polarity. */
yading@10 405 ldr r12, [sp] /* first stack argument, read before the push moves sp */
yading@10 406 push {r4,lr} /* the shared label sits after the push in noise_1, so the prologue is repeated here */
yading@10 407 lsl r12, r12, #31 /* bit 0 becomes a sign-bit mask */
yading@10 408 eor lr, r12, #1<<31 /* the opposite sign */
yading@10 409 vmov d3, lr, r12 /* d3[0] = inverted sign, d3[1] = plain sign (reverse of noise_1) */
yading@10 410 b .Lhf_apply_noise_1
yading@10 411 endfunc