andrewm@0: @
andrewm@0: @ audio_routines.S
andrewm@0: @
andrewm@0: @ NEON-based functions for time-critical audio processing
andrewm@0: @
andrewm@0: @ Andrew McPherson 2014
andrewm@0: @ Queen Mary University of London
andrewm@0: @
andrewm@0: 
andrewm@0: 	.syntax unified				@ Use the unified ARM/Thumb assembler syntax
andrewm@0: 	.arch armv7-a				@ Target the ARMv7-A architecture
andrewm@0: 	.fpu neon					@ Allow NEON (Advanced SIMD) instructions
andrewm@0: 
andrewm@0: @ 	void oscillator_bank_neon(int numAudioFrames, float *audioOut,
andrewm@0: @							  int activePartialNum, int lookupTableSize,
andrewm@0: @							  float *phases, float *frequencies, float *amplitudes,
andrewm@0: @							  float *freqDerivatives, float *ampDerivatives,
andrewm@0: @							  float *lookupTable);
andrewm@0: @ Renders a bank of lookup-table oscillators (linear interpolation), 4 partials per pass, adding their sum into audioOut.
andrewm@0: @ Registers:
andrewm@0: @    r0: numAudioFrames        How many frames to render (returns immediately if 0)
andrewm@0: @    r1: audioOut              Buffer for audio output samples [stereo]; the same sum is added to both channels
andrewm@0: @    r2: activePartialNum      How many active partials to render (returns immediately if < 4; see NOTE at end of outer loop)
andrewm@0: @    r3: lookupTableSize       Size of lookup table (phase wrap uses a bitwise AND: presumably must be a power of two -- TODO confirm)
andrewm@0: @    ---- other arguments start on the stack and are moved: -----
andrewm@0: @    r4: phases                Phase of each oscillator (pointer); updated values are stored back
andrewm@0: @    r5: frequencies           Normalised frequency of each oscillator (pointer); updated values are stored back
andrewm@0: @    r6: amplitudes            Normalised amplitude of each oscillator (pointer); updated values are stored back
andrewm@0: @    r7: freqDerivatives       Derivative of frequency for each oscillator (pointer); only read
andrewm@0: @    r8: ampDerivatives        Derivative of amplitude for each oscillator (pointer); only read
andrewm@0: @    r9: lookupTable           Lookup table containing one oscillation; entries [i] and [i+1] are read for table index i
andrewm@0: @
andrewm@0: @ Alignment requirements:
andrewm@0: @    audioOut: 8-byte boundary
andrewm@0: @    phases: 16-byte boundary
andrewm@0: @    frequencies: 16-byte boundary
andrewm@0: @    amplitudes: 16-byte boundary
andrewm@0: @    freqDerivatives: 16-byte boundary
andrewm@0: @    ampDerivatives: 16-byte boundary
andrewm@0: @    lookupTable: 4-byte boundary (TODO: check this)
andrewm@0: @ NEON registers used: q0-q2, d6 and q8-q15 (d8-d15 are not touched).
andrewm@0: 	.align	2						@ Align the entry point to 2^2 = 4 bytes
andrewm@0: 	.global	oscillator_bank_neon	@ Make the symbol visible to the linker
andrewm@0: 	.thumb						@ Assemble the following code as Thumb
andrewm@0: 	.thumb_func					@ Mark the next label as a Thumb function entry point
andrewm@0: 	.type	oscillator_bank_neon, %function		@ ELF symbol type: function
andrewm@0: oscillator_bank_neon:
andrewm@0: @ Symbolic names for the NEON registers used below. Each q register holds one value for each of the
andrewm@0: @ 4 oscillators in the current group; the d names are the two halves of the q register above them.
andrewm@0: dSample		.dn		D6.F32		@ One stereo output frame (2 floats)
andrewm@0: qPhases		.qn		Q8.F32		@ Current phases (table index as float) x4
andrewm@0: dPhases_0	.dn		D16.F32
andrewm@0: dPhases_1	.dn		D17.F32
andrewm@0: qFreqs		.qn		Q9.F32		@ Per-sample phase increments x4
andrewm@0: dFreqs_0	.dn		D18.F32
andrewm@0: dFreqs_1	.dn		D19.F32
andrewm@0: qAmps		.qn		Q10.F32		@ Current amplitudes x4
andrewm@0: dAmps_0		.dn		D20.F32
andrewm@0: dAmps_1		.dn		D21.F32
andrewm@0: qFreqDs		.qn		Q11.F32		@ Per-sample frequency increments x4
andrewm@0: dFreqDs_0	.dn		D22.F32
andrewm@0: dFreqDs_1	.dn		D23.F32
andrewm@0: qAmpDs		.qn		Q12.F32		@ Per-sample amplitude increments x4
andrewm@0: dAmpDs_0	.dn		D24.F32
andrewm@0: dAmpDs_1	.dn		D25.F32
andrewm@0: 
andrewm@0: qBaseInts	.qn		Q13.U32		@ Base indexes: unsigned ints x4
andrewm@0: dBaseInts_0	.dn		D26.U32		@ Halves of qBaseInts (not referenced in this routine)
andrewm@0: dBaseInts_1	.dn		D27.U32
andrewm@0: qFractions  .qn     Q14.F32		@ Fraction indexes: floats x4
andrewm@0: qTableBase	.qn		Q15.U32		@ Base address of lookup table x4
andrewm@0: 
andrewm@0: 	cmp r0, #0					@ Check for trivial case 1: zero frames
andrewm@0: 	it eq						@ NOTE(review): only 0 is caught; a negative count would still render one frame
andrewm@0: 	bxeq lr						@ Return if that's the case (otherwise might have odd behaviour); nothing has been pushed yet
andrewm@0: 	cmp r2, #4					@ Check for trivial case 2: fewer than 4 oscillators (less than one full group)
andrewm@0: 	it lt
andrewm@0: 	bxlt lr						@ Return if that's the case
andrewm@0: 
andrewm@0: 	push {r4-r11}				@ Save r4-r11 (8 registers); now arguments start 32 bytes above SP
andrewm@0:     add r11, sp, #32			@ Pointer to 32 bytes into the stack, i.e. the first stack argument
andrewm@0:     ldm r11, {r4-r9}			@ Load 6 arguments into registers (phases ... lookupTable, in order)
andrewm@0: 
andrewm@0: 	vdup qTableBase, r9			@ Copy lookup table base address into all 4 lanes
andrewm@0: 
andrewm@0: 	@ Outer loop: iterate over the number of oscillators, choosing 4 at a
andrewm@0: 	@ time to work with.
andrewm@0: oscbank_oscillator_loop:
andrewm@0: 	vld1 {dPhases_0, dPhases_1}, [r4]		@ no increment; will store at end of sample loop
andrewm@0: 	vld1 {dFreqs_0, dFreqs_1}, [r5]			@ no increment; stored back after the sample loop
andrewm@0: 	vld1 {dAmps_0, dAmps_1}, [r6]			@ no increment; stored back after the sample loop
andrewm@0: 	vld1 {dFreqDs_0, dFreqDs_1}, [r7]!		@ increment; won't update at end of sample loop
andrewm@0: 	vld1 {dAmpDs_0, dAmpDs_1}, [r8]!		@ increment; only read, like the frequency derivatives
andrewm@0: 
andrewm@0: 	push {r0-r1,r4-r8}						@ Save frame count, output pointer and array pointers (r7/r8 already advanced)
andrewm@0: 	@ --- inner loop: iterate over the number of samples ---
andrewm@0: oscbank_sample_loop:
andrewm@0: 	vcvt qBaseInts, qPhases		     		@ Take floor(phases): float -> unsigned int conversion
andrewm@0: 	vmov q2.f32, #1.0						@ Load 1.0 into every slot of q2
andrewm@0: 	vshl q0.U32, qBaseInts, #2				@ Shift the indexes left 2 (*4 for float addressing)
andrewm@0: 	vcvt qFractions, qBaseInts				@ int back to float: floor(phase) as a float
andrewm@0: 	vadd q0.U32, q0.U32, qTableBase			@ Find memory addresses: table base + 4 * index
andrewm@0: 
andrewm@0: 	vmov r4, r5, d0							@ Move two addresses (oscillators 0, 1) to ARM registers
andrewm@0: 	vmov r6, r7, d1							@ Move two more addresses (oscillators 2, 3) to ARM registers
andrewm@0: 	vsub qFractions, qPhases, qFractions	@ fraction = phase - floor(phase)
andrewm@0: 
andrewm@0: 	vldr.64	d0, [r4]						@ Load two consecutive floats at each location
andrewm@0: 	vldr.64 d1, [r5]						@ These hold the previous and following samples in the table
andrewm@0: 	vldr.64	d2, [r6]						@ TODO: check whether these work at 4-byte alignment
andrewm@0: 	vldr.64 d3, [r7]
andrewm@0: 
andrewm@0: 	@ Format at this point:
andrewm@0: 	@ Osc0(before) Osc0(after) Osc1(before) Osc1(after) Osc2(before) Osc2(after) Osc3(before) Osc3(after)
andrewm@0: 	@ We want:
andrewm@0: 	@ Osc0(before) Osc1(before) Osc2(before) Osc3(before) Osc0(after) Osc1(after) Osc2(after) Osc3(after)
andrewm@0: 
andrewm@0: 	vuzp.32 q0, q1							@ Now q0 contains before, q1 contains after
andrewm@0: 	vsub q2.f32, q2.f32, qFractions			@ q2 = 1.0 - fraction
andrewm@0: 	vmul q1.f32, q1.f32, qFractions			@ q1 = fraction * after
andrewm@0: 	vmul q0.f32, q0.f32, q2.f32				@ q0 = (1.0 - fraction) * before
andrewm@0: 
andrewm@0: 	vadd qPhases, qPhases, qFreqs			@ Update phases: phase += frequency
andrewm@0: 	vadd qFreqs, qFreqs, qFreqDs			@ Update frequencies: frequency += frequency derivative
andrewm@0: 
andrewm@0: 	vadd q0.f32, q0.f32, q1.f32				@ Add two interpolated components to get the final sample
andrewm@0: 	vdup q2.u32, r3							@ Put lookup table size into each element of q2 (q2 is reused; 1.0 is reloaded next iteration)
andrewm@0: 	vcvt qBaseInts, qPhases					@ Take floor of new phases (before wrapping)
andrewm@0: 	vmul q0.f32, q0.f32, qAmps				@ Multiply samples by current amplitude
andrewm@0: 
andrewm@0: 	vld1 dSample, [r1]						@ Load the current stereo samples
andrewm@0: 	vpadd d2.f32, d0.f32, d1.f32			@ Pairwise accumulate q0 (output sample) into d2: {osc0+osc1, osc2+osc3}
andrewm@0: 
andrewm@0: 	vand q2, q2, qBaseInts					@ AND table size with floor(new phase): nonzero (= table size) only where that bit is set, taken to mean phase >= table size
andrewm@0: 	vpadd d3.f32, d2.f32, d2.f32			@ Pairwise accumulate d2 into d3 --> d3[0] and d3[1] both hold total of 4 oscillators
andrewm@0: 	vadd qAmps, qAmps, qAmpDs				@ Update amplitudes
andrewm@0: 	vcvt q0.f32, q2.u32						@ Convert int back to float after AND operation: amount to subtract from each phase
andrewm@0: 
andrewm@0: 	vadd  dSample, dSample, d3.f32			@ Add oscillator outputs to each channel
andrewm@0: 
andrewm@0: 	subs r0, r0, #1							@ numFrames-- (sets the flags tested by the branch below)
andrewm@0: 	vsub qPhases, qPhases, q0.f32			@ Keep phases in table range
andrewm@0: 	vst1 dSample, [r1]!						@ Store back in buffer and increment by 8
andrewm@0: 
andrewm@0: 	it gt
andrewm@0: 	bgt oscbank_sample_loop					@ Loop if numFrames > 0
andrewm@0: 
andrewm@0: 	@ --- end inner loop ---
andrewm@0: 	pop {r0-r1,r4-r8}						@ Restore registers: restores audioOut and numFrames, among others
andrewm@0: 
andrewm@0: 	vst1 {dPhases_0, dPhases_1}, [r4]!		@ Store phases back to array and advance to the next 4 oscillators
andrewm@0: 	vst1 {dFreqs_0, dFreqs_1}, [r5]!		@ Store frequencies back to array and advance
andrewm@0: 	vst1 {dAmps_0, dAmps_1}, [r6]!			@ Store amplitudes back to array and advance
andrewm@0: 											@ No need to update r7, r8 (already post-incremented by the loads at the top of this loop)
andrewm@0: @ NOTE(review): loops while the remaining count is > 0, so a count that is not a multiple of 4 processes one extra full group of 4 -- confirm arrays are padded
andrewm@0: 	subs r2, r2, #4							@ numPartials -= 4
andrewm@0: 	it  gt
andrewm@0: 	bgt oscbank_oscillator_loop	@ Loop if numPartials > 0
andrewm@0: 
andrewm@0:     pop {r4-r11}							@ Restore the registers saved on entry
andrewm@0: 	bx lr									@ Return to caller