/*
 * ARM NEON optimised RDFT
 * Copyright (c) 2009 Mans Rullgard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

function ff_rdft_calc_neon, export=1
        push            {r4-r8,lr}

        ldr             r6,  [r0, #4]           @ inverse
        mov             r4,  r0
        mov             r5,  r1

        lsls            r6,  r6,  #31
        bne             1f
        add             r0,  r4,  #20
        bl              X(ff_fft_permute_neon)
        add             r0,  r4,  #20
        mov             r1,  r5
        bl              X(ff_fft_calc_neon)
1:
        ldr             r12, [r4, #0]           @ nbits
        mov             r2,  #1
        lsl             r12, r2,  r12
        add             r0,  r5,  #8
        add             r1,  r5,  r12, lsl #2
        lsr             r12, r12, #2
        ldr             r2,  [r4, #12]          @ tcos
        sub             r12, r12, #2
        ldr             r3,  [r4, #16]          @ tsin
        mov             r7,  r0
        sub             r1,  r1,  #8
        mov             lr,  r1
        mov             r8,  #-8
        vld1.32         {d0},     [r0,:64]!     @ d1[0,1]
        vld1.32         {d1},     [r1,:64], r8  @ d2[0,1]
        vld1.32         {d4},     [r2,:64]!     @ tcos[i]
        vld1.32         {d5},     [r3,:64]!     @ tsin[i]
        vmov.f32        d18, #0.5               @ k1
        vdup.32         d19, r6
        pld             [r0, #32]
        veor            d19, d18, d19           @ k2
        vmov.i32        d16, #0
        vmov.i32        d17, #1<<31
        pld             [r1, #-32]
        vtrn.32         d16, d17
        pld             [r2, #32]
        vrev64.32       d16, d16                @ d16=1,0 d17=0,1
        pld             [r3, #32]
2:
        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
        vld1.32         {d24},    [r0,:64]!     @ d1[0,1]
        vadd.f32        d0,  d0,  d3            @ d1[0]+d2[0], d1[1]-d2[1]
        vld1.32         {d25},    [r1,:64], r8  @ d2[0,1]
        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
        veor            q3,  q12, q8            @ -d1[0],d1[1], d2[0],-d2[1]
        pld             [r0, #32]
        vmul.f32        q10, q0,  q9            @ ev.re, ev.im, od.im, od.re
        pld             [r1, #-32]
        vadd.f32        d0,  d24, d7            @ d1[0]+d2[0], d1[1]-d2[1]
        vadd.f32        d1,  d6,  d25           @ -d1[0]+d2[0], d1[1]+d2[1]
        vmul.f32        q11, q0,  q9            @ ev.re, ev.im, od.im, od.re
        veor            d7,  d21, d16           @ -od.im, od.re
        vrev64.32       d3,  d21                @ od.re, od.im
        veor            d6,  d20, d17           @ ev.re,-ev.im
        veor            d2,  d3,  d16           @ -od.re, od.im
        vmla.f32        d20, d3,  d4[1]
        vmla.f32        d20, d7,  d5[1]
        vmla.f32        d6,  d2,  d4[1]
        vmla.f32        d6,  d21, d5[1]
        vld1.32         {d4},     [r2,:64]!     @ tcos[i]
        veor            d7,  d23, d16           @ -od.im, od.re
        vld1.32         {d5},     [r3,:64]!     @ tsin[i]
        veor            d24, d22, d17           @ ev.re,-ev.im
        vrev64.32       d3,  d23                @ od.re, od.im
        pld             [r2, #32]
        veor            d2,  d3,  d16           @ -od.re, od.im
        pld             [r3, #32]
        vmla.f32        d22, d3,  d4[0]
        vmla.f32        d22, d7,  d5[0]
        vmla.f32        d24, d2,  d4[0]
        vmla.f32        d24, d23, d5[0]
        vld1.32         {d0},     [r0,:64]!     @ d1[0,1]
        vld1.32         {d1},     [r1,:64], r8  @ d2[0,1]
        vst1.32         {d20},    [r7,:64]!
        vst1.32         {d6},     [lr,:64], r8
        vst1.32         {d22},    [r7,:64]!
        vst1.32         {d24},    [lr,:64], r8
        subs            r12, r12, #2
        bgt             2b

        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
        vadd.f32        d0,  d0,  d3            @ d1[0]+d2[0], d1[1]-d2[1]
        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
        ldr             r2,  [r4, #8]           @ sign_convention
        vmul.f32        q10, q0,  q9            @ ev.re, ev.im, od.im, od.re
        add             r0,  r0,  #4
        bfc             r2,  #0,  #31
        vld1.32         {d0[0]},  [r0,:32]
        veor            d7,  d21, d16           @ -od.im, od.re
        vrev64.32       d3,  d21                @ od.re, od.im
        veor            d6,  d20, d17           @ ev.re,-ev.im
        vld1.32         {d22},    [r5,:64]
        vdup.32         d1,  r2
        vmov            d23, d22
        veor            d2,  d3,  d16           @ -od.re, od.im
        vtrn.32         d22, d23
        veor            d0,  d0,  d1
        veor            d23, d23, d17
        vmla.f32        d20, d3,  d4[1]
        vmla.f32        d20, d7,  d5[1]
        vmla.f32        d6,  d2,  d4[1]
        vmla.f32        d6,  d21, d5[1]
        vadd.f32        d22, d22, d23
        vst1.32         {d20},    [r7,:64]
        vst1.32         {d6},     [lr,:64]
        vst1.32         {d0[0]},  [r0,:32]
        vst1.32         {d22},    [r5,:64]

        cmp             r6,  #0
        it              eq
        popeq           {r4-r8,pc}

        vmul.f32        d22, d22, d18
        vst1.32         {d22},    [r5,:64]
        add             r0,  r4,  #20
        mov             r1,  r5
        bl              X(ff_fft_permute_neon)
        add             r0,  r4,  #20
        mov             r1,  r5
        pop             {r4-r8,lr}
        b               X(ff_fft_calc_neon)
endfunc
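
@ A rough scalar sketch of the recombination performed by the main loop
@ above (label 2:), reconstructed from the register comments. Names such
@ as z, ev, od, odsum, k1, k2 and n are illustrative only; this is an
@ approximation of the vectorised data flow, not the reference C code.
@
@   k1 = 0.5;  k2 = inverse ? -0.5 : 0.5;
@   for (i = 1; i < n/4; i++) {
@       z1 = z[i];  z2 = z[n/2 - i];
@       ev.re = k1 * (z1.re + z2.re);                  /* even part    */
@       ev.im = k1 * (z1.im - z2.im);
@       od.re = k2 * (z1.im + z2.im);                  /* odd part     */
@       od.im = k2 * (z2.re - z1.re);
@       odsum.re = od.re * tcos[i] - od.im * tsin[i];  /* twiddle      */
@       odsum.im = od.im * tcos[i] + od.re * tsin[i];
@       z[i].re       = ev.re + odsum.re;              /* stored via r7 */
@       z[i].im       = ev.im + odsum.im;
@       z[n/2 - i].re = ev.re - odsum.re;              /* stored via lr */
@       z[n/2 - i].im = odsum.im - ev.im;
@   }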