Mercurial > hg > beaglert
comparison examples/04-Audio/oscillator-bank/audio_routines.S @ 464:8fcfbfb32aa0 prerelease
Examples reorder with subdirectories. Added header to each project. Moved Doxygen to bottom of render.cpp.
author | Robert Jack <robert.h.jack@gmail.com> |
---|---|
date | Mon, 20 Jun 2016 16:20:38 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
463:c47709e8b5c9 | 464:8fcfbfb32aa0 |
---|---|
1 @ | |
2 @ audio_routines.S | |
3 @ | |
4 @ NEON-based functions for time-critical audio processing | |
5 @ | |
6 @ Andrew McPherson 2014 | |
7 @ Queen Mary University of London | |
8 @ | |
9 | |
10 .syntax unified | |
11 .arch armv7-a | |
12 .fpu neon | |
13 | |
14 @ void oscillator_bank_neon(int numAudioFrames, float *audioOut, | |
15 @ int activePartialNum, int lookupTableSize, | |
16 @ float *phases, float *frequencies, float *amplitudes, | |
17 @ float *freqDerivatives, float *ampDerivatives, | |
18 @ float *lookupTable); | |
19 | |
20 @ Registers: | |
21 @ r0: numAudioFrames How many frames to render | |
22 @ r1: audioOut Buffer for audio output samples [stereo] | |
23 @ r2: activePartialNum How many active partials to render | |
24 @ r3: lookupTableSize Size of lookup table | |
25 @ ---- other arguments start on the stack and are moved: ----- | |
26 @ r4: phases Phase of each oscillator (pointer) | |
27 @ r5: frequencies Normalised frequency of each oscillator (pointer) | |
28 @ r6: amplitudes Normalised amplitude of each oscillator (pointer) | |
29 @ r7: freqDerivatives Derivative of frequency for each oscillator (pointer) | |
30 @ r8: ampDerivatives Derivative of amplitude for each oscillator (pointer) | |
31 @ r9: lookupTable Lookup table containing one oscillation (interpolation reads entries i and i+1, so presumably lookupTableSize + 1 floats must be readable - TODO confirm) | |
32 @ Behaviour: oscillators are rendered in groups of 4; for every frame the sum of the group is added to both floats of that frame in audioOut, and the updated phases, frequencies and amplitudes are written back. Returns at once if numAudioFrames == 0 or activePartialNum < 4. | |
33 @ Alignment requirements: | |
34 @ audioOut: 8-byte boundary | |
35 @ phases: 16-byte boundary | |
36 @ frequencies: 16-byte boundary | |
37 @ amplitudes: 16-byte boundary | |
38 @ freqDerivatives: 16-byte boundary | |
39 @ ampDerivatives: 16-byte boundary | |
40 @ lookupTable: 4-byte boundary (TODO: check this) | |
41 | |
42 .align 2 | |
43 .global oscillator_bank_neon | |
44 .thumb | |
45 .thumb_func | |
46 .type oscillator_bank_neon, %function | |
47 oscillator_bank_neon: | |
48 | |
49 | |
50 dSample .dn D6.F32 | |
51 qPhases .qn Q8.F32 | |
52 dPhases_0 .dn D16.F32 | |
53 dPhases_1 .dn D17.F32 | |
54 qFreqs .qn Q9.F32 | |
55 dFreqs_0 .dn D18.F32 | |
56 dFreqs_1 .dn D19.F32 | |
57 qAmps .qn Q10.F32 | |
58 dAmps_0 .dn D20.F32 | |
59 dAmps_1 .dn D21.F32 | |
60 qFreqDs .qn Q11.F32 | |
61 dFreqDs_0 .dn D22.F32 | |
62 dFreqDs_1 .dn D23.F32 | |
63 qAmpDs .qn Q12.F32 | |
64 dAmpDs_0 .dn D24.F32 | |
65 dAmpDs_1 .dn D25.F32 | |
66 | |
67 qBaseInts .qn Q13.U32 @ Base indexes: unsigned ints x4 | |
68 dBaseInts_0 .dn D26.U32 | |
69 dBaseInts_1 .dn D27.U32 | |
70 qFractions .qn Q14.F32 @ Fractional parts of the phases: floats x4 | |
71 qTableBase .qn Q15.U32 @ Lookup table base address, replicated x4 | |
72 | |
73 cmp r0, #0 @ Check for trivial case 1: zero frames (NOTE(review): a negative count is not rejected and would run the sample loop once per group) | |
74 it eq | |
75 bxeq lr @ Return if that's the case (otherwise might have odd behaviour) | |
76 cmp r2, #4 @ Check for trivial case 2: fewer than 4 oscillators (one full group of 4 is the minimum) | |
77 it lt | |
78 bxlt lr @ Return if that's the case | |
79 | |
80 push {r4-r11} @ Now arguments start 32 bytes above SP | |
81 add r11, sp, #32 @ Pointer to 32 bytes into the stack | |
82 ldm r11, {r4-r9} @ Load 6 arguments into registers | |
83 | |
84 vdup qTableBase, r9 @ Broadcast lookup table base address into all 4 lanes | |
85 | |
86 @ Outer loop: iterate over the number of oscillators, choosing 4 at a | |
87 @ time to work with. | |
88 oscbank_oscillator_loop: | |
89 vld1 {dPhases_0, dPhases_1}, [r4] @ no increment; will store at end of sample loop | |
90 vld1 {dFreqs_0, dFreqs_1}, [r5] | |
91 vld1 {dAmps_0, dAmps_1}, [r6] | |
92 vld1 {dFreqDs_0, dFreqDs_1}, [r7]! @ increment; won't update at end of sample loop | |
93 vld1 {dAmpDs_0, dAmpDs_1}, [r8]! | |
94 | |
95 push {r0-r1,r4-r8} | |
96 @ --- inner loop: iterate over the number of samples (r0, r1 and r4-r7 are modified here and restored by the pop after the loop) --- | |
97 oscbank_sample_loop: | |
98 vcvt qBaseInts, qPhases @ Take floor(phases) via float -> unsigned int conversion | |
99 vmov q2.f32, #1.0 @ Load 1.0 into every slot of q2 | |
100 vshl q0.U32, qBaseInts, #2 @ Shift the indexes left 2 (*4 for float addressing) | |
101 vcvt qFractions, qBaseInts @ int back to float | |
102 vadd q0.U32, q0.U32, qTableBase @ Find memory addresses | |
103 | |
104 vmov r4, r5, d0 @ Move two indexes to ARM registers | |
105 vmov r6, r7, d1 @ Move two more indexes to ARM registers | |
106 vsub qFractions, qPhases, qFractions @ fraction = phase - floor(phase) | |
107 | |
108 vldr.64 d0, [r4] @ Load two consecutive floats at each location | |
109 vldr.64 d1, [r5] @ These hold the previous and following samples in the table | |
110 vldr.64 d2, [r6] @ TODO: check whether these work at 4-byte alignment | |
111 vldr.64 d3, [r7] | |
112 | |
113 @ Format at this point: | |
114 @ Osc0(before) Osc0(after) Osc1(before) Osc1(after) Osc2(before) Osc2(after) Osc3(before) Osc3(after) | |
115 @ We want: | |
116 @ Osc0(before) Osc1(before) Osc2(before) Osc3(before) Osc0(after) Osc1(after) Osc2(after) Osc3(after) | |
117 | |
118 vuzp.32 q0, q1 @ Now q0 contains before, q1 contains after | |
119 vsub q2.f32, q2.f32, qFractions @ q2 = 1.0 - fraction | |
120 vmul q1.f32, q1.f32, qFractions @ q1 = fraction * after | |
121 vmul q0.f32, q0.f32, q2.f32 @ q0 = (1.0 - fraction) * before | |
122 | |
123 vadd qPhases, qPhases, qFreqs @ Update phases | |
124 vadd qFreqs, qFreqs, qFreqDs @ Update frequencies | |
125 | |
126 vadd q0.f32, q0.f32, q1.f32 @ Add two interpolated components to get the final sample | |
127 vdup q2.u32, r3 @ Put lookup table size into each element of q2 | |
128 vcvt qBaseInts, qPhases @ Take floor of new phases | |
129 vmul q0.f32, q0.f32, qAmps @ Multiply samples by current amplitude | |
130 | |
131 vld1 dSample, [r1] @ Load the current stereo samples | |
132 vpadd d2.f32, d0.f32, d1.f32 @ Pairwise accumulate q0 (output sample) into d2 | |
133 | |
134 vand q2, q2, qBaseInts @ Logical AND of table size with new phase int: nonzero only if the size bit is set in the phase (NOTE(review): presumably relies on lookupTableSize being a power of two and phase < 2 * size - confirm) | |
135 vpadd d3.f32, d2.f32, d2.f32 @ Pairwise accumulate d2 into d3 --> d3[0] and d3[1] both hold total of 4 oscillators | |
136 vadd qAmps, qAmps, qAmpDs @ Update amplitudes | |
137 vcvt q0.f32, q2.u32 @ Convert int back to float after AND operation | |
138 | |
139 vadd dSample, dSample, d3.f32 @ Add oscillator outputs to each channel | |
140 | |
141 subs r0, r0, #1 @ numFrames-- | |
142 vsub qPhases, qPhases, q0.f32 @ Keep phases in table range | |
143 vst1 dSample, [r1]! @ Store back in buffer and increment by 8 | |
144 | |
145 it gt | |
146 bgt oscbank_sample_loop @ Loop if numFrames > 0 | |
147 | |
148 @ --- end inner loop --- | |
149 pop {r0-r1,r4-r8} @ Restore registers: restores audioOut and numFrames, among others | |
150 | |
151 vst1 {dPhases_0, dPhases_1}, [r4]! @ Store phases back to array | |
152 vst1 {dFreqs_0, dFreqs_1}, [r5]! @ Store frequencies back to array | |
153 vst1 {dAmps_0, dAmps_1}, [r6]! @ Store amplitudes back to array | |
154 @ No need to update r7, r8: their loads at the top of the outer loop post-incremented them before the push, so the pop restored the already advanced pointers | |
155 | |
156 subs r2, r2, #4 @ numPartials -= 4 | |
157 it gt | |
158 bgt oscbank_oscillator_loop @ Loop if numPartials > 0 (a count that is not a multiple of 4 still processes one more full group of 4) | |
159 | |
160 pop {r4-r11} | |
161 bx lr |