/*
 * Copyright (c) 2012 Mans Rullgard
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * ARM (AArch32) NEON routines, GNU as syntax, run through the C
 * preprocessor.  The function/endfunc, movrelx, X(), VFP and NOVFP helpers
 * come from libavutil/arm/asm.S.  Argument names used in the comments below
 * are guesses at the C prototypes, which are declared outside this file;
 * confirm them against the C-side declarations.
 */

#include "libavutil/arm/asm.S"

/*
 * In:  r0 = pointer to floats, 16-byte aligned (presumably "z").
 * Does z[k] += z[k+64] + z[k+128] + z[k+192] + z[k+256] for k = 0..63,
 * four floats per pass.  lr is used as a fifth pointer, hence push/pop.
 */
function ff_sbr_sum64x5_neon, export=1
        push            {lr}
        add             r1,  r0,  # 64*4
        add             r2,  r0,  #128*4
        add             r3,  r0,  #192*4
        add             lr,  r0,  #256*4
        mov             r12, #64                  @ floats left to produce
1:
        vld1.32         {q0},     [r0,:128]
        vld1.32         {q1},     [r1,:128]!
        vadd.f32        q0,  q0,  q1
        vld1.32         {q2},     [r2,:128]!
        vadd.f32        q0,  q0,  q2
        vld1.32         {q3},     [r3,:128]!
        vadd.f32        q0,  q0,  q3
        vld1.32         {q8},     [lr,:128]!
        vadd.f32        q0,  q0,  q8
        vst1.32         {q0},     [r0,:128]!
        subs            r12, #4
        bgt             1b
        pop             {pc}
endfunc

/*
 * In:  r0 = pointer to floats, 16-byte aligned; r1 = count.
 * Out: sum of squares in d0[0] (s0); additionally copied to r0 on the
 *      NOVFP path.
 * Each pass consumes four floats and lowers r1 by 2, so r1 counts pairs of
 * floats (presumably complex samples).  The body runs before the count is
 * tested, so at least four floats are always read.
 */
function ff_sbr_sum_square_neon, export=1
        vmov.f32        q0,  #0.0
1:
        vld1.32         {q1},     [r0,:128]!
        vmla.f32        q0,  q1,  q1
        subs            r1,  r1,  #2
        bgt             1b
        vadd.f32        d0,  d0,  d1              @ fold four partial sums ...
        vpadd.f32       d0,  d0,  d0              @ ... down to one
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

/*
 * In:  r0 = pointer to 64 floats, 16-byte aligned, modified in place.
 * Flips the sign bit of every odd-indexed float.  vld2 splits even/odd
 * elements into separate registers so only the odd half is XORed with
 * 0x80000000.  Loads run one step ahead of stores (r1 trails r0).
 */
function ff_sbr_neg_odd_64_neon, export=1
        mov             r1,  r0
        vmov.i32        q8,  #1<<31
        vld2.32         {q0,q1},  [r0,:128]!
        veor            q1,  q1,  q8
        vld2.32         {q2,q3},  [r0,:128]!
    .rept 3
        vst2.32         {q0,q1},  [r1,:128]!
        veor            q3,  q3,  q8
        vld2.32         {q0,q1},  [r0,:128]!
        vst2.32         {q2,q3},  [r1,:128]!
        veor            q1,  q1,  q8
        vld2.32         {q2,q3},  [r0,:128]!
    .endr
        veor            q3,  q3,  q8
        vst2.32         {q0,q1},  [r1,:128]!
        vst2.32         {q2,q3},  [r1,:128]!
        bx              lr
endfunc

/*
 * In:  r0 = pointer to floats, 16-byte aligned (presumably "z"); reads
 *      z[0..63], writes z[64..127].
 * Copies z[0..1] to z[64..65], then for k = 1..31 stores
 *      z[64+2k]   = -z[64-k]
 *      z[64+2k+1] =  z[k+1]
 * r1 walks downwards from z+60 (post-index -16 bytes), r0 walks upwards,
 * r2 is the output cursor.  The loop covers k = 1..24; the code after it
 * handles the remaining seven pairs with progressively narrower stores.
 */
function ff_sbr_qmf_pre_shuffle_neon, export=1
        add             r1,  r0,  #60*4
        add             r2,  r0,  #64*4
        vld1.32         {d0},     [r0,:64]!
        vst1.32         {d0},     [r2,:64]!
        mov             r3,  #-16
        mov             r12, #24
        vmov.i32        q8,  #1<<31
        vld1.32         {q0},     [r1,:128], r3
        vld1.32         {d2},     [r0,:64]!
1:
        vld1.32         {d3,d4},  [r0,:128]!
        vrev64.32       q0,  q0
        vld1.32         {q9},     [r1,:128], r3
        veor            q0,  q0,  q8
        vld1.32         {d5,d6},  [r0,:128]!
        vswp            d0,  d1                   @ rev64 + swp = full reversal
        vrev64.32       q9,  q9
        vst2.32         {q0,q1},  [r2,:64]!
        vmov            q10, q2
        veor            q9,  q9,  q8
        vmov            d2,  d6                   @ carry into next pass
        vswp            d18, d19
        vld1.32         {q0},     [r1,:128], r3
        vst2.32         {q9,q10}, [r2,:64]!
        subs            r12, r12, #8
        bgt             1b
        vld1.32         {d3,d4},  [r0,:128]!
        vrev64.32       q0,  q0
        vld1.32         {q9},     [r1,:128], r3
        veor            q0,  q0,  q8
        vld1.32         {d5},     [r0,:64]!
        vswp            d0,  d1
        vrev64.32       q9,  q9
        vst2.32         {q0,q1},  [r2,:64]!
        vswp            d4,  d5
        veor            q1,  q9,  q8
        vst2.32         {d3,d5},  [r2,:64]!
        vst2.32         {d2[0],d4[0]}, [r2,:64]!
        bx              lr
endfunc

/*
 * In:  r0 = output, 16-byte aligned (presumably "W", 32 float pairs)
 *      r1 = input,  16-byte aligned (presumably "z", 64 floats)
 * Stores W[k] = { -z[63-k], z[k] } for k = 0..31.
 * r2 reads z downwards from z+60, r1 reads upwards.  The loop is unrolled
 * twice with loads for the next half issued before the current stores.
 */
function ff_sbr_qmf_post_shuffle_neon, export=1
        add             r2,  r1,  #60*4
        mov             r3,  #-16
        mov             r12, #32
        vmov.i32        q8,  #1<<31
        vld1.32         {q0},     [r2,:128], r3
        vld1.32         {q1},     [r1,:128]!
1:
        pld             [r2, #-32]
        vrev64.32       q0,  q0
        vswp            d2,  d3
        veor            q0,  q0,  q8
        vld1.32         {q2},     [r2,:128], r3
        vld1.32         {q3},     [r1,:128]!
        vst2.32         {d1,d3},  [r0,:128]!
        vst2.32         {d0,d2},  [r0,:128]!
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vswp            d6,  d7
        veor            q2,  q2,  q8
        vld1.32         {q0},     [r2,:128], r3
        vld1.32         {q1},     [r1,:128]!
        vst2.32         {d5,d7},  [r0,:128]!
        vst2.32         {d4,d6},  [r0,:128]!
        subs            r12, r12, #8
        bgt             1b
        bx              lr
endfunc

/*
 * In:  r0 = output (presumably "v", 64 floats), r1 = input (presumably
 *      "src", 64 floats, 16-byte aligned).
 * Reads src from the top downwards, four floats per pass.  vld2 separates
 * even- and odd-indexed inputs:
 *      odd inputs,  order reversed -> stored upwards from v[0]
 *      even inputs, sign flipped   -> stored downwards from v[62]
 */
function ff_sbr_qmf_deint_neg_neon, export=1
        add             r1,  r1,  #60*4
        add             r2,  r0,  #62*4
        mov             r3,  #-16
        mov             r12, #32
        vmov.i32        d2,  #1<<31
1:
        vld2.32         {d0,d1},  [r1,:128], r3
        veor            d0,  d0,  d2
        vrev64.32       d1,  d1
        vst1.32         {d0},     [r2,:64]
        vst1.32         {d1},     [r0,:64]!
        sub             r2,  r2,  #8
        subs            r12, r12, #2
        bgt             1b
        bx              lr
endfunc

/*
 * In:  r0 = output (presumably "v", 128 floats), r1 = first input read
 *      upwards (presumably "src0"), r2 = second input read downwards from
 *      element 60 (presumably "src1").  All 16-byte aligned.
 * For i = 0..63:
 *      v[i]       = src0[i] - src1[63-i]
 *      v[127 - i] = src0[i] + src1[63-i]
 * lr holds the -16 byte stride, hence push/pop.
 */
function ff_sbr_qmf_deint_bfly_neon, export=1
        push            {lr}
        add             r2,  r2,  #60*4
        add             r3,  r0,  #124*4
        mov             r12, #64
        mov             lr,  #-16
1:
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q1},     [r2,:128], lr
        vrev64.32       q2,  q0
        vrev64.32       q3,  q1
        vadd.f32        d3,  d4,  d3
        vadd.f32        d2,  d5,  d2
        vsub.f32        d0,  d0,  d7
        vsub.f32        d1,  d1,  d6
        vst1.32         {q1},     [r3,:128], lr
        vst1.32         {q0},     [r0,:128]!
        subs            r12, r12, #4
        bgt             1b
        pop             {pc}
endfunc

/*
 * In:  r0 = output float pairs, r1 = input base, r2 = per-output gain
 *      floats, r3 = number of outputs, [sp] = column index (presumably
 *      Y, X_high, g_filt, m_max, ixh).
 * Output m is the float pair at r1 + ixh*8 + m*(40*2*4) with both halves
 * multiplied by gain m.  Two outputs are produced per pass; the code after
 * the loop handles one leftover output.
 * NOTE(review): the loop body runs before the count is tested and always
 * stores 16 bytes, and it preloads the next gain pair and input row, so
 * with a count of 1 a second output is still written.  Callers presumably
 * provide padded buffers; confirm.
 */
function ff_sbr_hf_g_filt_neon, export=1
        ldr             r12, [sp]
        add             r1,  r1,  r12, lsl #3
        mov             r12, #40*2*4              @ byte stride between rows
        sub             r3,  r3,  #1
        vld2.32         {d2[],d3[]},[r2,:64]!     @ d2 = g[m] x2, d3 = g[m+1] x2
        vld1.32         {d0},     [r1,:64], r12
1:
        vld1.32         {d1},     [r1,:64], r12
        vmul.f32        q3,  q0,  q1
        vld2.32         {d2[],d3[]},[r2,:64]!
        vld1.32         {d0},     [r1,:64], r12
        vst1.32         {q3},     [r0,:64]!
        subs            r3,  r3,  #2
        bgt             1b
        it              lt
        bxlt            lr                        @ even count: done
        vmul.f32        d0,  d0,  d2              @ odd count: last output
        vst1.32         {d0},     [r0,:64]!
        bx              lr
endfunc

/*
 * In:  r0 = output float pairs, r1 = input float pairs, r2 and r3 = two
 *      2-float coefficient pairs (presumably X_high, X_low, alpha0, alpha1).
 *      A float scale (presumably "bw") arrives in d0[0] on the VFP path or
 *      at [sp] on the NOVFP path.  Two ints follow on the stack at
 *      [sp, #4*!HAVE_VFP_ARGS] (presumably "start" and "end").
 * Setup scales the r3 pair by bw*bw and the r2 pair by bw, then rearranges
 * them (vtrn/vneg) into operands for the multiply-accumulates in the loop.
 * r0 is advanced by start*8 and r1 by start*8 - 16.  The loop produces two
 * output pairs per pass from the current and the two preceding input
 * pairs; its body runs before the count (end - start) is tested.
 */
function ff_sbr_hf_gen_neon, export=1
NOVFP   vld1.32         {d1[]},   [sp,:32]
VFP     vdup.32         d1,  d0[0]
        vmul.f32        d0,  d1,  d1
        vld1.32         {d3},     [r2,:64]
        vld1.32         {d2},     [r3,:64]
        vmul.f32        q0,  q0,  q1
        ldrd            r2,  r3,  [sp, #4*!HAVE_VFP_ARGS]
        vtrn.32         d0,  d1
        vneg.f32        d18, d1
        vtrn.32         d18, d1
        add             r0,  r0,  r2,  lsl #3
        add             r1,  r1,  r2,  lsl #3
        sub             r1,  r1,  #2*8
        sub             r3,  r3,  r2
        vld1.32         {q1},     [r1,:128]!
1:
        vld1.32         {q3},     [r1,:128]!
        vrev64.32       q2,  q1
        vmov            q8,  q3
        vrev64.32       d20, d3
        vrev64.32       d21, d6
        vmla.f32        q3,  q1,  d0[0]
        vmla.f32        d6,  d4,  d18
        vmla.f32        d7,  d20, d18
        vmla.f32        d6,  d3,  d0[1]
        vmla.f32        d7,  d16, d0[1]
        vmla.f32        d6,  d5,  d1
        vmla.f32        d7,  d21, d1
        vmov            q1,  q8                   @ slide input window
        vst1.32         {q3},     [r0,:128]!
        subs            r3,  r3,  #2
        bgt             1b
        bx              lr
endfunc

/*
 * In:  r0 = input, 16-byte aligned; 80 floats are read in order (one load
 *      before the loop, 18 in it, one after; presumably 40 complex samples)
 *      r1 = output, 16-byte aligned (presumably "phi").
 * Accumulates sums of products between each sample and the next one or two
 * samples, plus sums of squares, then combines the partial sums.
 * Output is written at byte offsets 0-15, 16-19, 24-31 and 40-43 from r1;
 * the remaining bytes of the destination are left untouched.
 */
function ff_sbr_autocorrelate_neon, export=1
        vld1.32         {q0},     [r0,:128]!
        vmov.f32        q1,  #0.0
        vmov.f32        q3,  #0.0
        vmov.f32        d20, #0.0
        vmul.f32        d21, d1,  d1
        vmov            q8,  q0                   @ keep first two samples
        vmov            q11, q0
        mov             r12, #36
1:
        vld1.32         {q2},     [r0,:128]!
        vrev64.32       q12, q2
        vmla.f32        q10, q2,  q2
        vmla.f32        d2,  d1,  d4
        vmla.f32        d3,  d1,  d24
        vmla.f32        d6,  d0,  d4
        vmla.f32        d7,  d0,  d24
        vmla.f32        d2,  d4,  d5
        vmla.f32        d3,  d4,  d25
        vmla.f32        d6,  d1,  d5
        vmla.f32        d7,  d1,  d25
        vmov            q0,  q2
        subs            r12, r12, #2
        bgt             1b
        vld1.32         {q2},     [r0,:128]!
        vrev64.32       q12, q2
        vmla.f32        d2,  d1,  d4
        vmla.f32        d3,  d1,  d24
        vmla.f32        d6,  d0,  d4
        vmla.f32        d7,  d0,  d24
        vadd.f32        d20, d20, d21
        vrev64.32       d18, d17
        vmla.f32        d6,  d1,  d5
        vmla.f32        d7,  d1,  d25
        vmov            q0,  q1
        vmla.f32        d0,  d16, d17
        vmla.f32        d1,  d16, d18
        vmla.f32        d2,  d4,  d5
        vmla.f32        d3,  d4,  d25
        vneg.f32        s15, s15
        vmov            d21, d20
        vpadd.f32       d0,  d0,  d2
        vpadd.f32       d7,  d6,  d7
        vtrn.32         d1,  d3
        vsub.f32        d6,  d1,  d3
        vmla.f32        d20, d22, d22
        vmla.f32        d21, d4,  d4
        vtrn.32         d0,  d6
        vpadd.f32       d20, d20, d21
        vst1.32         {q3},     [r1,:128]!
        vst1.32         {d20[1]}, [r1,:32]
        add             r1,  r1,  #2*4
        vst1.32         {d0},     [r1,:64]
        add             r1,  r1,  #4*4
        vst1.32         {d20[0]}, [r1,:32]
        bx              lr
endfunc

/*
 * ff_sbr_hf_apply_noise_{0,1,2,3}_neon
 *
 * In:  r0 = float pairs updated in place, r1 = per-element floats tested
 *      against zero, r2 = per-element noise gains, r3 = noise table index,
 *      [sp] = first stack int, [sp, #4] = element count (presumably
 *      Y, s_m, q_filt, noise, kx, m_max).
 *
 * For each element m the noise index is advanced modulo 512 and then
 *      if s_m[m] == 0:  Y[m] += q_filt[m] * ff_sbr_noise_table[index]
 *      else:            one half of Y[m] += s_m[m] XOR sign mask
 * Variants 0 and 2 add to the first float of the pair, variants 1 and 3 to
 * the second.  d3 holds the sign mask, lane 0 for even m, lane 1 for odd m:
 *      _0: 0 / 0            _2: sign bit / sign bit
 *      _1: bit 0 of the first stack int moved to the sign bit in lane 0,
 *          its inverse in lane 1
 *      _3: the same two values with the lanes swapped
 * Variants 2 and 3 only set up d3 and branch into the bodies of 0 and 1.
 * The main loop handles two elements per pass and runs before the count is
 * tested; the code after "blt 2f" handles one leftover element.
 */
function ff_sbr_hf_apply_noise_0_neon, export=1
        vmov.i32        d3,  #0
.Lhf_apply_noise_0:
        push            {r4,lr}
        movrelx         r4,  X(ff_sbr_noise_table)
        ldr             r12, [sp, #12]            @ second stack arg (8 bytes pushed)
        add             r3,  r3,  #1
        bfc             r3,  #9,  #23             @ index &= 0x1ff
        sub             r12, r12, #1
1:
        add             lr,  r4,  r3,  lsl #3
        vld2.32         {q0},     [r0,:64]        @ d0 = first halves, d1 = second
        vld2.32         {q3},     [lr,:64]
        vld1.32         {d2},     [r1,:64]!
        vld1.32         {d18},    [r2,:64]!
        vceq.f32        d16, d2,  #0
        veor            d2,  d2,  d3
        vmov            q2,  q0
        vmla.f32        d0,  d6,  d18
        vmla.f32        d1,  d7,  d18
        vadd.f32        d4,  d4,  d2
        add             r3,  r3,  #2
        bfc             r3,  #9,  #23
        vbif            d0,  d4,  d16             @ keep noise result only where s_m == 0
        vbif            d1,  d5,  d16
        vst2.32         {q0},     [r0,:64]!
        subs            r12, r12, #2
        bgt             1b
        blt             2f
        add             lr,  r4,  r3,  lsl #3
        vld1.32         {d0},     [r0,:64]
        vld1.32         {d6},     [lr,:64]
        vld1.32         {d2[]},   [r1,:32]!
        vld1.32         {d18[]},  [r2,:32]!       @ d18, not d3: sign mask still needed below
        vceq.f32        d4,  d2,  #0
        veor            d2,  d2,  d3
        vmov            d1,  d0
        vmla.f32        d0,  d6,  d18
        vadd.f32        s2,  s2,  s4
        vbif            d0,  d1,  d4
        vst1.32         {d0},     [r0,:64]!
2:
        pop             {r4,pc}
endfunc

function ff_sbr_hf_apply_noise_1_neon, export=1
        ldr             r12, [sp]
        push            {r4,lr}
        lsl             r12, r12, #31
        eor             lr,  r12, #1<<31
        vmov            d3,  r12, lr
.Lhf_apply_noise_1:
        movrelx         r4,  X(ff_sbr_noise_table)
        ldr             r12, [sp, #12]
        add             r3,  r3,  #1
        bfc             r3,  #9,  #23
        sub             r12, r12, #1
1:
        add             lr,  r4,  r3,  lsl #3
        vld2.32         {q0},     [r0,:64]
        vld2.32         {q3},     [lr,:64]
        vld1.32         {d2},     [r1,:64]!
        vld1.32         {d18},    [r2,:64]!
        vceq.f32        d16, d2,  #0
        veor            d2,  d2,  d3
        vmov            q2,  q0
        vmla.f32        d0,  d6,  d18
        vmla.f32        d1,  d7,  d18
        vadd.f32        d5,  d5,  d2
        add             r3,  r3,  #2
        bfc             r3,  #9,  #23
        vbif            d0,  d4,  d16
        vbif            d1,  d5,  d16
        vst2.32         {q0},     [r0,:64]!
        subs            r12, r12, #2
        bgt             1b
        blt             2f
        add             lr,  r4,  r3,  lsl #3
        vld1.32         {d0},     [r0,:64]
        vld1.32         {d6},     [lr,:64]
        vld1.32         {d2[]},   [r1,:32]!
        vld1.32         {d18[]},  [r2,:32]!
        vceq.f32        d4,  d2,  #0
        veor            d2,  d2,  d3
        vmov            d1,  d0
        vmla.f32        d0,  d6,  d18
        vadd.f32        s3,  s3,  s4              @ leftover m is even: use mask lane 0
        vbif            d0,  d1,  d4
        vst1.32         {d0},     [r0,:64]!
2:
        pop             {r4,pc}
endfunc

function ff_sbr_hf_apply_noise_2_neon, export=1
        vmov.i32        d3,  #1<<31
        b               .Lhf_apply_noise_0
endfunc

function ff_sbr_hf_apply_noise_3_neon, export=1
        ldr             r12, [sp]
        push            {r4,lr}
        lsl             r12, r12, #31
        eor             lr,  r12, #1<<31
        vmov            d3,  lr,  r12
        b               .Lhf_apply_noise_1
endfunc