annotate projects/d-box/audio_routines.S @ 68:59edd5780fef

Changed d-box code to run cleanly when built on board. Updated Makefile to add ne10 include path on board. Some extra docs in Utilities.h
author andrewm
date Fri, 17 Jul 2015 16:57:08 +0100
parents 8a575ba3ab52
children
rev   line source
andrewm@0 1 @
andrewm@0 2 @ audio_routines.S
andrewm@0 3 @
andrewm@0 4 @ NEON-based functions for time-critical audio processing
andrewm@0 5 @
andrewm@0 6 @ Andrew McPherson 2014
andrewm@0 7 @ Queen Mary University of London
andrewm@0 8 @
andrewm@0 9
andrewm@0 10 .syntax unified
andrewm@0 11 .arch armv7-a
andrewm@0 12 .fpu neon
andrewm@0 13
andrewm@0 14 @ void oscillator_bank_neon(int numAudioFrames, float *audioOut,
andrewm@0 15 @ int activePartialNum, int lookupTableSize,
andrewm@0 16 @ float *phases, float *frequencies, float *amplitudes,
andrewm@0 17 @ float *freqDerivatives, float *ampDerivatives,
andrewm@0 18 @ float *lookupTable);
andrewm@0 19
andrewm@0 20 @ Registers:
andrewm@0 21 @ r0: numAudioFrames How many frames to render
andrewm@0 22 @ r1: audioOut Buffer for audio output samples [stereo]
andrewm@0 23 @ r2: activePartialNum How many active partials to render
andrewm@0 24 @ r3: lookupTableSize Size of lookup table
andrewm@0 25 @ ---- other arguments start on the stack and are moved: -----
andrewm@0 26 @ r4: phases Phase of each oscillator (pointer)
andrewm@0 27 @ r5: frequencies Normalised frequency of each oscillator (pointer)
andrewm@0 28 @ r6: amplitudes Normalised amplitude of each oscillator (pointer)
andrewm@0 29 @ r7: freqDerivatives Derivative of frequency for each oscillator (pointer)
andrewm@0 30 @ r8: ampDerivatives Derivative of amplitude for each oscillator (pointer)
andrewm@0 31 @ r9: lookupTable Lookup table containing one oscillation
andrewm@0 32 @
andrewm@0 33 @ Alignment requirements:
andrewm@0 34 @ audioOut: 8-byte boundary
andrewm@0 35 @ phases: 16-byte boundary
andrewm@0 36 @ frequencies: 16-byte boundary
andrewm@0 37 @ amplitudes: 16-byte boundary
andrewm@0 38 @ freqDerivatives: 16-byte boundary
andrewm@0 39 @ ampDerivatives: 16-byte boundary
andrewm@0 40 @ lookupTable: 4-byte boundary (TODO: check this)
andrewm@0 41
andrewm@0 42 .align 2
andrewm@0 43 .global oscillator_bank_neon
andrewm@0 44 .thumb
andrewm@0 45 .thumb_func
andrewm@0 46 .type oscillator_bank_neon, %function
andrewm@0 47 oscillator_bank_neon:
andrewm@0 48
andrewm@0 49
andrewm@0 50 dSample .dn D6.F32 @ One output frame loaded from / stored to audioOut (2 floats)
andrewm@0 51 qPhases .qn Q8.F32 @ Phases of the 4 oscillators in the current group
andrewm@0 52 dPhases_0 .dn D16.F32
andrewm@0 53 dPhases_1 .dn D17.F32
andrewm@0 54 qFreqs .qn Q9.F32 @ Frequencies (per-sample phase increments) x4
andrewm@0 55 dFreqs_0 .dn D18.F32
andrewm@0 56 dFreqs_1 .dn D19.F32
andrewm@0 57 qAmps .qn Q10.F32 @ Amplitudes x4
andrewm@0 58 dAmps_0 .dn D20.F32
andrewm@0 59 dAmps_1 .dn D21.F32
andrewm@0 60 qFreqDs .qn Q11.F32 @ Per-sample frequency increments x4
andrewm@0 61 dFreqDs_0 .dn D22.F32
andrewm@0 62 dFreqDs_1 .dn D23.F32
andrewm@0 63 qAmpDs .qn Q12.F32 @ Per-sample amplitude increments x4
andrewm@0 64 dAmpDs_0 .dn D24.F32
andrewm@0 65 dAmpDs_1 .dn D25.F32
andrewm@0 66
andrewm@0 67 qBaseInts .qn Q13.U32 @ Base indexes: unsigned ints x4
andrewm@0 68 dBaseInts_0 .dn D26.U32
andrewm@0 69 dBaseInts_1 .dn D27.U32
andrewm@0 70 qFractions .qn Q14.F32 @ Fraction indexes: floats x4
andrewm@0 71 qTableBase .qn Q15.U32 @ Base address of lookup table, replicated x4
andrewm@0 72
andrewm@0 73 cmp r0, #0 @ Check for trivial case 1: zero frames
andrewm@0 74 it eq
andrewm@0 75 bxeq lr @ Return if that's the case (otherwise might have odd behaviour)
andrewm@0 76 cmp r2, #4 @ Check for trivial case 2: fewer than 4 oscillators (they are processed 4 at a time)
andrewm@0 77 it lt
andrewm@0 78 bxlt lr @ Return if that's the case; nothing has been pushed yet
andrewm@0 79
andrewm@0 80 push {r4-r11} @ 8 registers saved: stack arguments now start 32 bytes above SP
andrewm@0 81 add r11, sp, #32 @ r11 -> first stack argument (phases)
andrewm@0 82 ldm r11, {r4-r9} @ Load 6 arguments into registers (r4=phases ... r9=lookupTable)
andrewm@0 83
andrewm@0 84 vdup qTableBase, r9 @ Replicate lookup table base address into 4 ints
andrewm@0 85
andrewm@0 86 @ Outer loop: iterate over the number of oscillators, choosing 4 at a
andrewm@0 87 @ time to work with.
andrewm@0 88 oscbank_oscillator_loop:
andrewm@0 89 vld1 {dPhases_0, dPhases_1}, [r4] @ no increment; will store at end of sample loop
andrewm@0 90 vld1 {dFreqs_0, dFreqs_1}, [r5]
andrewm@0 91 vld1 {dAmps_0, dAmps_1}, [r6]
andrewm@0 92 vld1 {dFreqDs_0, dFreqDs_1}, [r7]! @ increment; won't update at end of sample loop
andrewm@0 93 vld1 {dAmpDs_0, dAmpDs_1}, [r8]!
andrewm@0 94
andrewm@0 95 push {r0-r1,r4-r8} @ Save frame count, output pointer and array pointers; r0, r1 and r4-r7 are modified by the sample loop
andrewm@0 96 @ --- inner loop: iterate over the number of samples ---
andrewm@0 97 oscbank_sample_loop:
andrewm@0 98 vcvt qBaseInts, qPhases @ Convert phases to unsigned ints (integer part of each phase)
andrewm@0 99 vmov q2.f32, #1.0 @ Load 1.0 into every slot of q2
andrewm@0 100 vshl q0.U32, qBaseInts, #2 @ Shift the indexes left 2 (*4 for float addressing)
andrewm@0 101 vcvt qFractions, qBaseInts @ int back to float
andrewm@0 102 vadd q0.U32, q0.U32, qTableBase @ Find memory addresses
andrewm@0 103
andrewm@0 104 vmov r4, r5, d0 @ Move two addresses to ARM registers (r4-r7 are scratch here)
andrewm@0 105 vmov r6, r7, d1 @ Move two more addresses to ARM registers
andrewm@0 106 vsub qFractions, qPhases, qFractions @ fraction = phase - integer part of phase
andrewm@0 107
andrewm@0 108 vldr.64 d0, [r4] @ Load two consecutive floats at each location
andrewm@0 109 vldr.64 d1, [r5] @ These hold the previous and following samples in the table
andrewm@0 110 vldr.64 d2, [r6] @ TODO: check whether these work at 4-byte alignment
andrewm@0 111 vldr.64 d3, [r7] @ NOTE(review): reads index+1, so the table presumably needs one extra readable entry - confirm with caller
andrewm@0 112
andrewm@0 113 @ Format at this point:
andrewm@0 114 @ Osc0(before) Osc0(after) Osc1(before) Osc1(after) Osc2(before) Osc2(after) Osc3(before) Osc3(after)
andrewm@0 115 @ We want:
andrewm@0 116 @ Osc0(before) Osc1(before) Osc2(before) Osc3(before) Osc0(after) Osc1(after) Osc2(after) Osc3(after)
andrewm@0 117
andrewm@0 118 vuzp.32 q0, q1 @ Now q0 contains before, q1 contains after
andrewm@0 119 vsub q2.f32, q2.f32, qFractions @ q2 = 1.0 - fraction
andrewm@0 120 vmul q1.f32, q1.f32, qFractions @ q1 = fraction * after
andrewm@0 121 vmul q0.f32, q0.f32, q2.f32 @ q0 = (1.0 - fraction) * before
andrewm@0 122
andrewm@0 123 vadd qPhases, qPhases, qFreqs @ Update phases
andrewm@0 124 vadd qFreqs, qFreqs, qFreqDs @ Update frequencies
andrewm@0 125
andrewm@0 126 vadd q0.f32, q0.f32, q1.f32 @ Add two interpolated components to get the final sample
andrewm@0 127 vdup q2.u32, r3 @ Put lookup table size into each element of q2
andrewm@0 128 vcvt qBaseInts, qPhases @ Integer part of the new phases
andrewm@0 129 vmul q0.f32, q0.f32, qAmps @ Multiply samples by current amplitude
andrewm@0 130
andrewm@0 131 vld1 dSample, [r1] @ Load the current stereo samples
andrewm@0 132 vpadd d2.f32, d0.f32, d1.f32 @ Pairwise accumulate q0 (output sample) into d2
andrewm@0 133
andrewm@0 134 vand q2, q2, qBaseInts @ table size AND new phase int: nonzero only if the table-size bit is set in the phase (assumes power-of-two table size and phase < 2*size - TODO confirm)
andrewm@0 135 vpadd d3.f32, d2.f32, d2.f32 @ Pairwise accumulate d2 into d3 --> d3[0] and d3[1] both hold total of 4 oscillators
andrewm@0 136 vadd qAmps, qAmps, qAmpDs @ Update amplitudes
andrewm@0 137 vcvt q0.f32, q2.u32 @ Convert AND result back to float: the amount to subtract from each phase
andrewm@0 138
andrewm@0 139 vadd dSample, dSample, d3.f32 @ Add oscillator outputs to each channel
andrewm@0 140
andrewm@0 141 subs r0, r0, #1 @ numFrames-- (sets the flags tested by the bgt below)
andrewm@0 142 vsub qPhases, qPhases, q0.f32 @ Keep phases in table range
andrewm@0 143 vst1 dSample, [r1]! @ Store back in buffer and increment by 8
andrewm@0 144
andrewm@0 145 it gt
andrewm@0 146 bgt oscbank_sample_loop @ Loop if numFrames > 0
andrewm@0 147
andrewm@0 148 @ --- end inner loop ---
andrewm@0 149 pop {r0-r1,r4-r8} @ Restore registers: restores audioOut and numFrames, among others
andrewm@0 150
andrewm@0 151 vst1 {dPhases_0, dPhases_1}, [r4]! @ Store phases back to array and advance to the next 4 oscillators
andrewm@0 152 vst1 {dFreqs_0, dFreqs_1}, [r5]! @ Store frequencies back to array and advance
andrewm@0 153 vst1 {dAmps_0, dAmps_1}, [r6]! @ Store amplitudes back to array and advance
andrewm@0 154 @ No need to update r7, r8: they were post-incremented when loaded and the derivatives are not written back
andrewm@0 155
andrewm@0 156 subs r2, r2, #4 @ numPartials -= 4 (NOTE(review): a count that is not a multiple of 4 still runs a full group of 4 - confirm arrays are padded)
andrewm@0 157 it gt
andrewm@0 158 bgt oscbank_oscillator_loop @ Loop if numPartials > 0
andrewm@0 159
andrewm@0 160 pop {r4-r11} @ Restore the registers saved on entry
andrewm@0 161 bx lr
andrewm@0 162
andrewm@0 163
andrewm@0 164 @ void wavetable_interpolate_neon(int numSamplesIn, int numSamplesOut,
andrewm@0 165 @ float *tableIn, float *tableOut);
andrewm@0 166
andrewm@0 167 @ Registers:
andrewm@0 168 @ r0: numSamplesIn Size of the input table
andrewm@0 169 @ r1: numSamplesOut Size of the output table
andrewm@0 170 @ r2: tableIn Pointer to input table
andrewm@0 171 @ r3: tableOut Pointer to output table
andrewm@0 172
andrewm@0 173 @ Alignment requirements:
andrewm@0 174 @ tableIn: 8-byte boundary
andrewm@0 175 @ tableOut: 8-byte boundary
andrewm@0 176
andrewm@0 177 .align 2
andrewm@0 178 .global wavetable_interpolate_neon
andrewm@0 179 .thumb
andrewm@0 180 .thumb_func
andrewm@0 181 .type wavetable_interpolate_neon, %function
andrewm@0 182 wavetable_interpolate_neon:
andrewm@0 183 @ TODO: not implemented yet. The routine currently returns immediately
andrewm@0 184
andrewm@0 185 bx lr @ Return to caller without reading tableIn or writing tableOut