@ ffmpeg/libavcodec/arm/mdct_fixed_neon.S
/*
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

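/*
 * prerot: MDCT pre-rotation stage. This header is a summary inferred
 * from the code below and from its callers in this file, not part of
 * the original source. Expected register assignments on entry:
 *   r2  = input samples (16-bit), r4 = revtab, r6 = n (mdct_size),
 *   r7  = tcos (interleaved 16-bit cos/sin twiddles),
 *   \dst = destination FFT buffer, \rt = scratch register for a
 *          second, descending revtab pointer.
 * Each iteration folds opposite quarters of the input with halving
 * subtracts (vhsub, which keeps the result in 16-bit range),
 * multiplies by the twiddles using 32-bit intermediates (vmull/vmlal),
 * narrows back to Q15 (vshrn #15), and scatters the complex results
 * to bit-reversed positions read from revtab.
 */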
.macro prerot dst, rt
        lsr             r3,  r6,  #2            @ n4
        add             \rt, r4,  r6,  lsr #1   @ revtab + n4
        add             r9,  r3,  r3,  lsl #1   @ n3
        add             r8,  r7,  r6            @ tcos + n4
        add             r3,  r2,  r6,  lsr #1   @ in + n4
        add             r9,  r2,  r9,  lsl #1   @ in + n3
        sub             r8,  r8,  #16
        sub             r10, r3,  #16
        sub             r11, r9,  #16
        mov             r12, #-16
1:
        vld2.16         {d0,d1},   [r9, :128]!
        vld2.16         {d2,d3},   [r11,:128], r12
        vld2.16         {d4,d5},   [r3, :128]!
        vld2.16         {d6,d7},   [r10,:128], r12
        vld2.16         {d16,d17}, [r7, :128]!  @ cos, sin
        vld2.16         {d18,d19}, [r8, :128], r12
        vrev64.16       q1,  q1
        vrev64.16       q3,  q3
        vrev64.16       q9,  q9
        vneg.s16        d0,  d0
        vneg.s16        d2,  d2
        vneg.s16        d16, d16
        vneg.s16        d18, d18
        vhsub.s16       d0,  d0,  d3            @ re
        vhsub.s16       d4,  d7,  d4            @ im
        vhsub.s16       d6,  d6,  d5
        vhsub.s16       d2,  d2,  d1
        vmull.s16       q10, d0,  d16
        vmlsl.s16       q10, d4,  d17
        vmull.s16       q11, d0,  d17
        vmlal.s16       q11, d4,  d16
        vmull.s16       q12, d6,  d18
        vmlsl.s16       q12, d2,  d19
        vmull.s16       q13, d6,  d19
        vmlal.s16       q13, d2,  d18
        vshrn.s32       d0,  q10, #15
        vshrn.s32       d1,  q11, #15
        vshrn.s32       d2,  q12, #15
        vshrn.s32       d3,  q13, #15
        vzip.16         d0,  d1
        vzip.16         d2,  d3
        ldrh            lr,  [r4], #2
        ldrh            r2,  [\rt, #-2]!
        add             lr,  \dst, lr,  lsl #2
        add             r2,  \dst, r2,  lsl #2
        vst1.32         {d0[0]},   [lr,:32]
        vst1.32         {d2[0]},   [r2,:32]
        ldrh            lr,  [r4], #2
        ldrh            r2,  [\rt, #-2]!
        add             lr,  \dst, lr,  lsl #2
        add             r2,  \dst, r2,  lsl #2
        vst1.32         {d0[1]},   [lr,:32]
        vst1.32         {d2[1]},   [r2,:32]
        ldrh            lr,  [r4], #2
        ldrh            r2,  [\rt, #-2]!
        add             lr,  \dst, lr,  lsl #2
        add             r2,  \dst, r2,  lsl #2
        vst1.32         {d1[0]},   [lr,:32]
        vst1.32         {d3[0]},   [r2,:32]
        ldrh            lr,  [r4], #2
        ldrh            r2,  [\rt, #-2]!
        add             lr,  \dst, lr,  lsl #2
        add             r2,  \dst, r2,  lsl #2
        vst1.32         {d1[1]},   [lr,:32]
        vst1.32         {d3[1]},   [r2,:32]
        subs            r6,  r6,  #32
        bgt             1b
.endm

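/*
 * Assumed C prototype (mirrors the generic mdct_calc callback of the
 * fixed-point build; not stated in this file):
 *   void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *out,
 *                                const FFTSample *in)
 * Pre-rotates the input into out, runs the FFT in place via
 * ff_fft_fixed_calc_neon, then post-rotates the result. The context
 * offsets #8/#16/#24 are the revtab/mdct_size/tcos fields, per the
 * comments on the loads below.
 */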
function ff_mdct_fixed_calc_neon, export=1
        push            {r1,r4-r11,lr}

        ldr             r4,  [r0, #8]           @ revtab
        ldr             r6,  [r0, #16]          @ mdct_size; n
        ldr             r7,  [r0, #24]          @ tcos

        prerot          r1,  r5

        mov             r4,  r0
        bl              X(ff_fft_fixed_calc_neon)

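        @ Post-rotation: walk the FFT output from both ends at once,
        @ multiply by the tcos twiddles with 32-bit intermediates
        @ (vmull/vmlal), narrow back to Q15 (vshrn #15), and store the
        @ rotated values in place, with the descending half reversed.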
        pop             {r5}
        mov             r12, #-16
        ldr             r6,  [r4, #16]          @ mdct_size; n
        ldr             r7,  [r4, #24]          @ tcos
        add             r5,  r5,  r6,  lsr #1
        add             r7,  r7,  r6,  lsr #1
        sub             r1,  r5,  #16
        sub             r2,  r7,  #16
1:
        vld2.16         {d4,d5},  [r7,:128]!
        vld2.16         {d6,d7},  [r2,:128], r12
        vld2.16         {d0,d1},  [r5,:128]
        vld2.16         {d2,d3},  [r1,:128]
        vrev64.16       q3,  q3
        vrev64.16       q1,  q1
        vneg.s16        q3,  q3
        vneg.s16        q2,  q2
        vmull.s16       q11, d2,  d6
        vmlal.s16       q11, d3,  d7
        vmull.s16       q8,  d0,  d5
        vmlsl.s16       q8,  d1,  d4
        vmull.s16       q9,  d0,  d4
        vmlal.s16       q9,  d1,  d5
        vmull.s16       q10, d2,  d7
        vmlsl.s16       q10, d3,  d6
        vshrn.s32       d0,  q11, #15
        vshrn.s32       d1,  q8,  #15
        vshrn.s32       d2,  q9,  #15
        vshrn.s32       d3,  q10, #15
        vrev64.16       q0,  q0
        vst2.16         {d2,d3},  [r5,:128]!
        vst2.16         {d0,d1},  [r1,:128], r12
        subs            r6,  r6,  #32
        bgt             1b

        pop             {r4-r11,pc}
endfunc

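/*
 * Assumed C prototype (mirrors the generic mdct_calcw callback, which
 * produces word-sized output in the fixed-point build; not stated in
 * this file):
 *   void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *out,
 *                                 const FFTSample *in)
 * Same flow as ff_mdct_fixed_calc_neon, except that the FFT runs in
 * the context's tmp_buf (loaded together with revtab by the ldrd
 * below) and the post-rotation stores the full 32-bit products
 * instead of narrowing them back to 16 bits.
 */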
function ff_mdct_fixed_calcw_neon, export=1
        push            {r1,r4-r11,lr}

        ldrd            r4,  r5,  [r0, #8]      @ revtab, tmp_buf
        ldr             r6,  [r0, #16]          @ mdct_size; n
        ldr             r7,  [r0, #24]          @ tcos

        prerot          r5,  r1

        mov             r4,  r0
        mov             r1,  r5
        bl              X(ff_fft_fixed_calc_neon)

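        @ Post-rotation with 32-bit output: the same both-ends walk as
        @ above, but the widened vmull/vmlal products are stored
        @ directly as words (vst2.32) rather than narrowed with vshrn.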
        pop             {r7}
        mov             r12, #-16
        ldr             r6,  [r4, #16]          @ mdct_size; n
        ldr             r9,  [r4, #24]          @ tcos
        add             r5,  r5,  r6,  lsr #1
        add             r7,  r7,  r6
        add             r9,  r9,  r6,  lsr #1
        sub             r3,  r5,  #16
        sub             r1,  r7,  #16
        sub             r2,  r9,  #16
1:
        vld2.16         {d4,d5},  [r9,:128]!
        vld2.16         {d6,d7},  [r2,:128], r12
        vld2.16         {d0,d1},  [r5,:128]!
        vld2.16         {d2,d3},  [r3,:128], r12
        vrev64.16       q3,  q3
        vrev64.16       q1,  q1
        vneg.s16        q3,  q3
        vneg.s16        q2,  q2
        vmull.s16       q8,  d2,  d6
        vmlal.s16       q8,  d3,  d7
        vmull.s16       q9,  d0,  d5
        vmlsl.s16       q9,  d1,  d4
        vmull.s16       q10, d0,  d4
        vmlal.s16       q10, d1,  d5
        vmull.s16       q11, d2,  d7
        vmlsl.s16       q11, d3,  d6
        vrev64.32       q8,  q8
        vrev64.32       q9,  q9
        vst2.32         {q10,q11}, [r7,:128]!
        vst2.32         {d16,d18}, [r1,:128], r12
        vst2.32         {d17,d19}, [r1,:128], r12
        subs            r6,  r6,  #32
        bgt             1b

        pop             {r4-r11,pc}
endfunc