@
@ audio_routines.S
@
@ NEON-based functions for time-critical audio processing
@
@ Andrew McPherson 2014
@ Queen Mary University of London
@

@ Assembler configuration for everything that follows in this file.
    .syntax unified             @ Unified ARM/Thumb (UAL) instruction syntax
    .arch   armv7-a             @ Target architecture: ARMv7-A
    .fpu    neon                @ Enable the NEON (Advanced SIMD) instruction set

@ void oscillator_bank_neon(int numAudioFrames, float *audioOut,
@                           int activePartialNum, int lookupTableSize,
@                           float *phases, float *frequencies, float *amplitudes,
@                           float *freqDerivatives, float *ampDerivatives,
@                           float *lookupTable);
@
@ Renders a bank of wavetable oscillators, four at a time, and ADDS the sum of
@ each group of four to both channels of the interleaved stereo buffer audioOut
@ (the buffer is read, accumulated into and written back; it is not cleared).
@ For every frame each oscillator does a linear interpolation between two
@ adjacent table entries, then phase += frequency, frequency += freqDerivative
@ and amplitude += ampDerivative.  The updated phases, frequencies and
@ amplitudes are written back to their arrays; the derivative arrays are only
@ read.
@
@ Registers:
@    r0: numAudioFrames      How many frames to render
@    r1: audioOut            Buffer for audio output samples [stereo]
@    r2: activePartialNum    How many active partials to render
@    r3: lookupTableSize     Size of lookup table
@    ---- other arguments start on the stack and are moved: -----
@    r4: phases              Phase of each oscillator (pointer)
@    r5: frequencies         Normalised frequency of each oscillator (pointer)
@    r6: amplitudes          Normalised amplitude of each oscillator (pointer)
@    r7: freqDerivatives     Derivative of frequency for each oscillator (pointer)
@    r8: ampDerivatives      Derivative of amplitude for each oscillator (pointer)
@    r9: lookupTable         Lookup table containing one oscillation
@
@ Alignment requirements:
@    audioOut:        8-byte boundary
@    phases:          16-byte boundary
@    frequencies:     16-byte boundary
@    amplitudes:      16-byte boundary
@    freqDerivatives: 16-byte boundary
@    ampDerivatives:  16-byte boundary
@    lookupTable:     4-byte boundary (TODO: check this)
@
@ Behaviour callers need to know about:
@    - Returns without touching memory if numAudioFrames <= 0 or
@      activePartialNum < 4.
@    - Oscillators are processed only in complete groups of 4.  If
@      activePartialNum is not a multiple of 4, the trailing 1-3 partials are
@      NOT rendered and their array slots are never read or written.
@    - Phase wrap-around is done by AND-ing the integer phase with
@      lookupTableSize, so the size has to be a single set bit (a power of
@      two) and a phase may exceed the table size by less than one table
@      length per frame.
@    - Two consecutive floats are loaded at each table index, so index
@      lookupTableSize-1 also reads lookupTable[lookupTableSize]; the table
@      needs one extra (guard) sample after its last entry.
@    - Phases are converted with an unsigned float->int conversion.
@      NOTE(review): negative phases/frequencies therefore look unsupported;
@      confirm against callers.
@
@ Clobbers: r2, r11 is used as a temporary but restored with r4-r11 on exit;
@           NEON q0-q2, d6, q8-q15; condition flags.

    .align  2
    .global oscillator_bank_neon
    .thumb
    .thumb_func
    .type   oscillator_bank_neon, %function
oscillator_bank_neon:

@ Typed aliases for the NEON registers that stay live across the inner loop.
dSample     .dn D6.F32          @ Current stereo output frame (L, R)
qPhases     .qn Q8.F32
dPhases_0   .dn D16.F32
dPhases_1   .dn D17.F32
qFreqs      .qn Q9.F32
dFreqs_0    .dn D18.F32
dFreqs_1    .dn D19.F32
qAmps       .qn Q10.F32
dAmps_0     .dn D20.F32
dAmps_1     .dn D21.F32
qFreqDs     .qn Q11.F32
dFreqDs_0   .dn D22.F32
dFreqDs_1   .dn D23.F32
qAmpDs      .qn Q12.F32
dAmpDs_0    .dn D24.F32
dAmpDs_1    .dn D25.F32

qBaseInts   .qn Q13.U32         @ Base indexes: unsigned ints x4
dBaseInts_0 .dn D26.U32
dBaseInts_1 .dn D27.U32
qFractions  .qn Q14.F32         @ Fraction indexes: floats x4
qTableBase  .qn Q15.U32         @ Base of lookup table

    cmp     r0, #0              @ Trivial case 1: no frames to render.  Negative
    it      le                  @ counts are rejected too: the inner loop tests
    bxle    lr                  @ its counter only after rendering a frame.
    cmp     r2, #4              @ Trivial case 2: fewer than 4 oscillators, i.e.
    it      lt                  @ not even one complete group of 4
    bxlt    lr                  @ Return if that's the case

    push    {r4-r11}            @ Now arguments start 32 bytes above SP
    add     r11, sp, #32        @ Pointer to 32 bytes into the stack
    ldm     r11, {r4-r9}        @ Load 6 arguments into registers

    vdup    qTableBase, r9      @ Move lookup table base index into 4 ints

    @ Outer loop: iterate over the number of oscillators, choosing 4 at a
    @ time to work with.
oscbank_oscillator_loop:
    vld1    {dPhases_0, dPhases_1}, [r4]    @ no increment; will store at end of sample loop
    vld1    {dFreqs_0, dFreqs_1}, [r5]
    vld1    {dAmps_0, dAmps_1}, [r6]
    vld1    {dFreqDs_0, dFreqDs_1}, [r7]!   @ increment; won't update at end of sample loop
    vld1    {dAmpDs_0, dAmpDs_1}, [r8]!

    @ The inner loop counts r0 down, advances r1 and reuses r4-r7 as table
    @ addresses, so save them (r7/r8 are saved already incremented).
    push    {r0-r1,r4-r8}
    @ --- inner loop: iterate over the number of samples ---
oscbank_sample_loop:
    vcvt    qBaseInts, qPhases              @ Take floor(phases)
    vmov    q2.f32, #1.0                    @ Load 1.0 into every slot of q2
    vshl    q0.U32, qBaseInts, #2           @ Shift the indexes left 2 (*4 for float addressing)
    vcvt    qFractions, qBaseInts           @ int back to float
    vadd    q0.U32, q0.U32, qTableBase      @ Find memory addresses

    vmov    r4, r5, d0                      @ Move two indexes to ARM registers
    vmov    r6, r7, d1                      @ Move two more indexes to ARM registers
    vsub    qFractions, qPhases, qFractions @ fraction = phase - floor(phase)

    vldr.64 d0, [r4]                        @ Load two consecutive floats at each location
    vldr.64 d1, [r5]                        @ These hold the previous and following samples in the table
    vldr.64 d2, [r6]                        @ TODO: check whether these work at 4-byte alignment
    vldr.64 d3, [r7]

    @ Format at this point:
    @ Osc0(before) Osc0(after) Osc1(before) Osc1(after) Osc2(before) Osc2(after) Osc3(before) Osc3(after)
    @ We want:
    @ Osc0(before) Osc1(before) Osc2(before) Osc3(before) Osc0(after) Osc1(after) Osc2(after) Osc3(after)

    vuzp.32 q0, q1                          @ Now q0 contains before, q1 contains after
    vsub    q2.f32, q2.f32, qFractions      @ q2 = 1.0 - fraction
    vmul    q1.f32, q1.f32, qFractions      @ q1 = fraction * after
    vmul    q0.f32, q0.f32, q2.f32          @ q0 = (1.0 - fraction) * before

    vadd    qPhases, qPhases, qFreqs        @ Update phases
    vadd    qFreqs, qFreqs, qFreqDs         @ Update frequencies

    vadd    q0.f32, q0.f32, q1.f32          @ Add two interpolated components to get the final sample
    vdup    q2.u32, r3                      @ Put lookup table size into each element of q2
    vcvt    qBaseInts, qPhases              @ Take floor of new phases
    vmul    q0.f32, q0.f32, qAmps           @ Multiply samples by current amplitude

    vld1    dSample, [r1]                   @ Load the current stereo samples
    vpadd   d2.f32, d0.f32, d1.f32          @ Pairwise accumulate q0 (output sample) into d2

    vand    q2, q2, qBaseInts               @ q2 = table size where the new phase has that bit set
                                            @ (phase >= table size), otherwise 0
    vpadd   d3.f32, d2.f32, d2.f32          @ Pairwise accumulate d2 into d3 --> d3[0] and d3[1] both hold total of 4 oscillators
    vadd    qAmps, qAmps, qAmpDs            @ Update amplitudes
    vcvt    q0.f32, q2.u32                  @ Convert int back to float after AND operation

    vadd    dSample, dSample, d3.f32        @ Add oscillator outputs to each channel

    subs    r0, r0, #1                      @ numFrames-- (NEON ops below leave the flags alone)
    vsub    qPhases, qPhases, q0.f32        @ Keep phases in table range
    vst1    dSample, [r1]!                  @ Store back in buffer and increment by 8

    it      gt
    bgt     oscbank_sample_loop             @ Loop if numFrames > 0

    @ --- end inner loop ---
    pop     {r0-r1,r4-r8}       @ Restore registers: restores audioOut and numFrames, among others

    vst1    {dPhases_0, dPhases_1}, [r4]!   @ Store phases back to array
    vst1    {dFreqs_0, dFreqs_1}, [r5]!     @ Store frequencies back to array
    vst1    {dAmps_0, dAmps_1}, [r6]!       @ Store amplitudes back to array
    @ No need to update r7, r8

    subs    r2, r2, #4          @ numPartials -= 4
    cmp     r2, #4              @ Is there another COMPLETE group of 4 left?
    it      ge                  @ A partial group must not be processed: it would
    bge     oscbank_oscillator_loop @ access array slots past activePartialNum

    pop     {r4-r11}
    bx      lr


@ void wavetable_interpolate_neon(int numSamplesIn, int numSamplesOut,
@                                 float *tableIn, float *tableOut);
@
@ NOT IMPLEMENTED YET: the body is a placeholder that returns immediately.
@ None of the arguments are read and neither table is accessed.
@
@ Arguments on entry:
@    r0 = numSamplesIn     Size of the input table
@    r1 = numSamplesOut    Size of the output table
@    r2 = tableIn          Pointer to input table
@    r3 = tableOut         Pointer to output table
@
@ Alignment the eventual implementation is documented to require:
@    tableIn:  8-byte boundary
@    tableOut: 8-byte boundary

    .p2align 2
    .globl  wavetable_interpolate_neon
    .thumb
    .thumb_func
    .type   wavetable_interpolate_neon, %function
wavetable_interpolate_neon:
    @ TODO: implement the interpolation
    bx      lr                  @ Return to the caller without doing any work