annotate examples/04-Audio/oscillator-bank/audio_routines.S @ 516:7eb66c7898cb prerelease

Fixed setup_ssh (got screwed by bela_common)
author Giulio Moro <giuliomoro@yahoo.it>
date Wed, 22 Jun 2016 13:34:23 +0100
parents 8fcfbfb32aa0
children
rev   line source
robert@464 1 @
robert@464 2 @ audio_routines.S
robert@464 3 @
robert@464 4 @ NEON-based functions for time-critical audio processing
robert@464 5 @
robert@464 6 @ Andrew McPherson 2014
robert@464 7 @ Queen Mary University of London
robert@464 8 @
robert@464 9
robert@464 10 .syntax unified
robert@464 11 .arch armv7-a
robert@464 12 .fpu neon
robert@464 13
robert@464 14 @ void oscillator_bank_neon(int numAudioFrames, float *audioOut,
robert@464 15 @ int activePartialNum, int lookupTableSize,
robert@464 16 @ float *phases, float *frequencies, float *amplitudes,
robert@464 17 @ float *freqDerivatives, float *ampDerivatives,
robert@464 18 @ float *lookupTable);
robert@464 19
robert@464 20 @ Registers:
robert@464 21 @ r0: numAudioFrames How many frames to render
robert@464 22 @ r1: audioOut Buffer for audio output samples [stereo]
robert@464 23 @ r2: activePartialNum How many active partials to render (processed in groups of 4; fewer than 4 renders nothing)
robert@464 24 @ r3: lookupTableSize Size of lookup table
robert@464 25 @ ---- other arguments start on the stack and are moved: -----
robert@464 26 @ r4: phases Phase of each oscillator (pointer)
robert@464 27 @ r5: frequencies Normalised frequency of each oscillator (pointer)
robert@464 28 @ r6: amplitudes Normalised amplitude of each oscillator (pointer)
robert@464 29 @ r7: freqDerivatives Derivative of frequency for each oscillator (pointer)
robert@464 30 @ r8: ampDerivatives Derivative of amplitude for each oscillator (pointer)
robert@464 31 @ r9: lookupTable Lookup table containing one oscillation
robert@464 32 @
robert@464 33 @ Alignment requirements:
robert@464 34 @ audioOut: 8-byte boundary
robert@464 35 @ phases: 16-byte boundary
robert@464 36 @ frequencies: 16-byte boundary
robert@464 37 @ amplitudes: 16-byte boundary
robert@464 38 @ freqDerivatives: 16-byte boundary
robert@464 39 @ ampDerivatives: 16-byte boundary
robert@464 40 @ lookupTable: 4-byte boundary (TODO: check this)
robert@464 41
robert@464 42 .align 2
robert@464 43 .global oscillator_bank_neon
robert@464 44 .thumb
robert@464 45 .thumb_func
robert@464 46 .type oscillator_bank_neon, %function
robert@464 47 oscillator_bank_neon:
robert@464 48 @ Typed aliases for the NEON registers used below (.dn = D register, .qn = Q register).
robert@464 49 @ Only q0-q2, d6 and q8-q15 are referenced; the callee-saved d8-d15 (q4-q7) are never touched.
robert@464 50 dSample .dn D6.F32 @ One stereo output frame (2 floats) loaded from / stored to [r1]
robert@464 51 qPhases .qn Q8.F32 @ Phases of the 4 oscillators in the current group
robert@464 52 dPhases_0 .dn D16.F32 @ Low half of qPhases
robert@464 53 dPhases_1 .dn D17.F32 @ High half of qPhases
robert@464 54 qFreqs .qn Q9.F32 @ Per-frame phase increments of the 4 oscillators
robert@464 55 dFreqs_0 .dn D18.F32
robert@464 56 dFreqs_1 .dn D19.F32
robert@464 57 qAmps .qn Q10.F32 @ Amplitudes of the 4 oscillators
robert@464 58 dAmps_0 .dn D20.F32
robert@464 59 dAmps_1 .dn D21.F32
robert@464 60 qFreqDs .qn Q11.F32 @ Per-frame increments added to qFreqs
robert@464 61 dFreqDs_0 .dn D22.F32
robert@464 62 dFreqDs_1 .dn D23.F32
robert@464 63 qAmpDs .qn Q12.F32 @ Per-frame increments added to qAmps
robert@464 64 dAmpDs_0 .dn D24.F32
robert@464 65 dAmpDs_1 .dn D25.F32
robert@464 66
robert@464 67 qBaseInts .qn Q13.U32 @ Base indexes: unsigned ints x4
robert@464 68 dBaseInts_0 .dn D26.U32
robert@464 69 dBaseInts_1 .dn D27.U32
robert@464 70 qFractions .qn Q14.F32 @ Fraction indexes: floats x4
robert@464 71 qTableBase .qn Q15.U32 @ Base address of lookup table, replicated x4
robert@464 72
robert@464 73 cmp r0, #0 @ Check for trivial case 1: zero frames
robert@464 74 it eq
robert@464 75 bxeq lr @ Return if that's the case (otherwise might have odd behaviour)
robert@464 76 cmp r2, #4 @ Check for trivial case 2: fewer than 4 oscillators (signed compare; partials are handled 4 at a time)
robert@464 77 it lt
robert@464 78 bxlt lr @ Return if that's the case
robert@464 79 @ NOTE(review): the frame check above is 'eq' only, so a negative numAudioFrames would still run the sample loop once -- confirm callers never pass one
robert@464 80 push {r4-r11} @ Save 8 registers (32 bytes); stack arguments now start 32 bytes above SP
robert@464 81 add r11, sp, #32 @ Pointer to 32 bytes into the stack
robert@464 82 ldm r11, {r4-r9} @ Load the 6 stack arguments into r4-r9
robert@464 83
robert@464 84 vdup qTableBase, r9 @ Broadcast lookup table base address into all 4 lanes
robert@464 85
robert@464 86 @ Outer loop: iterate over the number of oscillators, choosing 4 at a
robert@464 87 @ time to work with.
robert@464 88 oscbank_oscillator_loop:
robert@464 89 vld1 {dPhases_0, dPhases_1}, [r4] @ no increment; will store at end of sample loop
robert@464 90 vld1 {dFreqs_0, dFreqs_1}, [r5] @ no increment; stored back after the sample loop
robert@464 91 vld1 {dAmps_0, dAmps_1}, [r6] @ no increment; stored back after the sample loop
robert@464 92 vld1 {dFreqDs_0, dFreqDs_1}, [r7]! @ increment; won't update at end of sample loop
robert@464 93 vld1 {dAmpDs_0, dAmpDs_1}, [r8]! @ increment; likewise never written back
robert@464 94
robert@464 95 push {r0-r1,r4-r8} @ Save frame count, output pointer and array pointers: r0, r1 and r4-r7 are overwritten inside the sample loop
robert@464 96 @ --- inner loop: iterate over the number of samples ---
robert@464 97 oscbank_sample_loop:
robert@464 98 vcvt qBaseInts, qPhases @ Convert phases to unsigned ints (floor for non-negative phases)
robert@464 99 vmov q2.f32, #1.0 @ Load 1.0 into every slot of q2
robert@464 100 vshl q0.U32, qBaseInts, #2 @ Shift the indexes left 2 (*4 for float addressing)
robert@464 101 vcvt qFractions, qBaseInts @ int back to float
robert@464 102 vadd q0.U32, q0.U32, qTableBase @ Find memory addresses
robert@464 103
robert@464 104 vmov r4, r5, d0 @ Move two table addresses to ARM registers
robert@464 105 vmov r6, r7, d1 @ Move two more table addresses to ARM registers
robert@464 106 vsub qFractions, qPhases, qFractions @ fraction = phase - floor(phase)
robert@464 107
robert@464 108 vldr.64 d0, [r4] @ Load two consecutive floats at each location
robert@464 109 vldr.64 d1, [r5] @ These hold the previous and following samples in the table
robert@464 110 vldr.64 d2, [r6] @ TODO: check whether these work at 4-byte alignment
robert@464 111 vldr.64 d3, [r7]
robert@464 112 @ NOTE(review): each load reads table[i] and table[i+1], so presumably the table holds a guard sample past the last index -- confirm with the caller
robert@464 113 @ Format at this point:
robert@464 114 @ Osc0(before) Osc0(after) Osc1(before) Osc1(after) Osc2(before) Osc2(after) Osc3(before) Osc3(after)
robert@464 115 @ We want:
robert@464 116 @ Osc0(before) Osc1(before) Osc2(before) Osc3(before) Osc0(after) Osc1(after) Osc2(after) Osc3(after)
robert@464 117
robert@464 118 vuzp.32 q0, q1 @ Now q0 contains before, q1 contains after
robert@464 119 vsub q2.f32, q2.f32, qFractions @ q2 = 1.0 - fraction
robert@464 120 vmul q1.f32, q1.f32, qFractions @ q1 = fraction * after
robert@464 121 vmul q0.f32, q0.f32, q2.f32 @ q0 = (1.0 - fraction) * before
robert@464 122
robert@464 123 vadd qPhases, qPhases, qFreqs @ Update phases
robert@464 124 vadd qFreqs, qFreqs, qFreqDs @ Update frequencies
robert@464 125
robert@464 126 vadd q0.f32, q0.f32, q1.f32 @ Add two interpolated components to get the final sample
robert@464 127 vdup q2.u32, r3 @ Put lookup table size into each element of q2
robert@464 128 vcvt qBaseInts, qPhases @ Take floor of new phases
robert@464 129 vmul q0.f32, q0.f32, qAmps @ Multiply samples by current amplitude
robert@464 130
robert@464 131 vld1 dSample, [r1] @ Load the current stereo samples
robert@464 132 vpadd d2.f32, d0.f32, d1.f32 @ Pairwise accumulate q0 (output sample) into d2
robert@464 133 @ NOTE(review): the AND below acts as "phase >= table size" only if the table size is a power of two and the phase is below 2x the size -- confirm
robert@464 134 vand q2, q2, qBaseInts @ Logical AND of new phase int leaves 1 bit set only if phase >= table size
robert@464 135 vpadd d3.f32, d2.f32, d2.f32 @ Pairwise accumulate d2 into d3 --> d3[0] and d3[1] both hold total of 4 oscillators
robert@464 136 vadd qAmps, qAmps, qAmpDs @ Update amplitudes
robert@464 137 vcvt q0.f32, q2.u32 @ Convert int back to float after AND operation: the amount to subtract from each phase
robert@464 138
robert@464 139 vadd dSample, dSample, d3.f32 @ Add oscillator outputs to each channel
robert@464 140
robert@464 141 subs r0, r0, #1 @ numFrames--
robert@464 142 vsub qPhases, qPhases, q0.f32 @ Keep phases in table range
robert@464 143 vst1 dSample, [r1]! @ Store back in buffer and increment by 8
robert@464 144
robert@464 145 it gt
robert@464 146 bgt oscbank_sample_loop @ Loop if numFrames > 0
robert@464 147
robert@464 148 @ --- end inner loop ---
robert@464 149 pop {r0-r1,r4-r8} @ Restore registers: restores audioOut and numFrames, among others
robert@464 150
robert@464 151 vst1 {dPhases_0, dPhases_1}, [r4]! @ Store phases back to array
robert@464 152 vst1 {dFreqs_0, dFreqs_1}, [r5]! @ Store frequencies back to array
robert@464 153 vst1 {dAmps_0, dAmps_1}, [r6]! @ Store amplitudes back to array
robert@464 154 @ No need to update r7, r8: the popped values already include the post-increment from the loads at the top of this loop
robert@464 155 @ NOTE(review): a partial count that is not a multiple of 4 still runs one more full group of 4 for the remainder
robert@464 156 subs r2, r2, #4 @ numPartials -= 4
robert@464 157 it gt
robert@464 158 bgt oscbank_oscillator_loop @ Loop if numPartials > 0
robert@464 159
robert@464 160 pop {r4-r11} @ Restore the registers saved on entry
robert@464 161 bx lr @ Return to caller