diff examples/04-Audio/oscillator-bank/audio_routines.S @ 468:85cf9c0da052 prerelease

merge
author Giulio Moro <giuliomoro@yahoo.it>
date Mon, 20 Jun 2016 17:08:02 +0100
parents 8fcfbfb32aa0
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/examples/04-Audio/oscillator-bank/audio_routines.S	Mon Jun 20 17:08:02 2016 +0100
@@ -0,0 +1,161 @@
+@
+@ audio_routines.S
+@
+@ NEON-based functions for time-critical audio processing
+@
+@ Andrew McPherson 2014
+@ Queen Mary University of London
+@
+
+	.syntax unified
+	.arch armv7-a
+	.fpu neon
+
+@ 	void oscillator_bank_neon(int numAudioFrames, float *audioOut,
+@							  int activePartialNum, int lookupTableSize,
+@							  float *phases, float *frequencies, float *amplitudes,
+@							  float *freqDerivatives, float *ampDerivatives,
+@							  float *lookupTable);
+
+@ Registers:
+@    r0: numAudioFrames        How many frames to render
+@    r1: audioOut              Buffer for audio output samples [stereo]
+@    r2: activePartialNum      How many active partials to render
+@    r3: lookupTableSize       Size of lookup table
+@    ---- other arguments start on the stack and are moved: -----
+@    r4: phases                Phase of each oscillator (pointer)
+@    r5: frequencies           Normalised frequency of each oscillator (pointer)
+@    r6: amplitudes            Normalised amplitude of each oscillator (pointer)
+@    r7: freqDerivatives       Derivative of frequency for each oscillator (pointer)
+@    r8: ampDerivatives        Derivative of amplitude for each oscillator (pointer)
+@    r9: lookupTable           Lookup table containing one oscillation
+@
+@ Alignment requirements:
+@    audioOut: 8-byte boundary
+@    phases: 16-byte boundary
+@    frequencies: 16-byte boundary
+@    amplitudes: 16-byte boundary
+@    freqDerivatives: 16-byte boundary
+@    ampDerivatives: 16-byte boundary
+@    lookupTable: 4-byte boundary (TODO: check this)
+
+	.align	2
+	.global	oscillator_bank_neon
+	.thumb
+	.thumb_func
+	.type	oscillator_bank_neon, %function
+oscillator_bank_neon:
+	@ Register aliases. Each q alias and the two d aliases listed under it name the same NEON register.
+	@ q0-q2 are used without aliases as scratch in the sample loop; d2 and d3 are the two halves of q1.
+dSample		.dn		D6.F32		@ One stereo frame (2 floats) of audioOut
+qPhases		.qn		Q8.F32		@ Phases of the 4 oscillators in the current group
+dPhases_0	.dn		D16.F32
+dPhases_1	.dn		D17.F32
+qFreqs		.qn		Q9.F32		@ Amount added to each phase per sample
+dFreqs_0	.dn		D18.F32
+dFreqs_1	.dn		D19.F32
+qAmps		.qn		Q10.F32		@ Current amplitude of each oscillator
+dAmps_0		.dn		D20.F32
+dAmps_1		.dn		D21.F32
+qFreqDs		.qn		Q11.F32		@ Amount added to each frequency per sample
+dFreqDs_0	.dn		D22.F32
+dFreqDs_1	.dn		D23.F32
+qAmpDs		.qn		Q12.F32		@ Amount added to each amplitude per sample
+dAmpDs_0	.dn		D24.F32
+dAmpDs_1	.dn		D25.F32
+
+qBaseInts	.qn		Q13.U32		@ Integer part of each phase (table index): unsigned ints x4
+dBaseInts_0	.dn		D26.U32
+dBaseInts_1	.dn		D27.U32
+qFractions  .qn     Q14.F32		@ Fractional part of each phase (interpolation weight): floats x4
+qTableBase	.qn		Q15.U32		@ Address of the lookup table, replicated into all 4 lanes
+	@ Both loops below test their counters only after a full pass, so degenerate counts must exit here.
+	cmp r0, #0					@ Trivial case 1: no frames to render (numAudioFrames is a signed int)
+	it le
+	bxle lr						@ Return for zero or negative counts; a negative count would otherwise still render one frame
+	cmp r2, #4					@ Trivial case 2: fewer than one full group of 4 oscillators
+	it lt
+	bxlt lr						@ Return if that's the case (signed compare, so negative counts exit too)
+
+	push {r4-r11}				@ Save r4-r11 (32 bytes); the stacked arguments now start 32 bytes above SP
+    add r11, sp, #32			@ r11 = address of the first stacked argument (phases)
+    ldm r11, {r4-r9}			@ Load the 6 stacked arguments (phases .. lookupTable) into r4-r9
+
+	vdup qTableBase, r9			@ Replicate the lookup table base address into 4 ints
+
+	@ Outer loop: iterate over the oscillators, 4 at a time. The loop repeats while the remaining count
+	@ is > 0, so a count that is not a multiple of 4 runs one extra group (NOTE(review): presumably callers pass multiples of 4 - confirm).
+oscbank_oscillator_loop:
+	vld1 {dPhases_0, dPhases_1}, [r4]		@ No increment here: r4 advances when the phases are stored back after the sample loop
+	vld1 {dFreqs_0, dFreqs_1}, [r5]		@ Likewise for the frequencies...
+	vld1 {dAmps_0, dAmps_1}, [r6]		@ ...and the amplitudes
+	vld1 {dFreqDs_0, dFreqDs_1}, [r7]!		@ Increment now: the derivatives are only read, never stored back
+	vld1 {dAmpDs_0, dAmpDs_1}, [r8]!		@ Same for the amplitude derivatives
+
+	push {r0-r1,r4-r8}		@ Save frame count, output pointer and array pointers: the sample loop consumes r0/r1 and reuses r4-r7 as scratch
+	@ --- inner loop: iterate over the number of samples ---
+oscbank_sample_loop:
+	vcvt qBaseInts, qPhases		     		@ Convert phases to unsigned ints (floor, for non-negative phases)
+	vmov q2.f32, #1.0						@ Load 1.0 into every slot of q2
+	vshl q0.U32, qBaseInts, #2				@ Index * 4 = byte offset of each float within the table
+	vcvt qFractions, qBaseInts				@ Integer part back to float
+	vadd q0.U32, q0.U32, qTableBase			@ Add the table base: q0 now holds 4 element addresses
+
+	vmov r4, r5, d0							@ Addresses for oscillators 0 and 1 -> ARM registers
+	vmov r6, r7, d1							@ Addresses for oscillators 2 and 3 -> ARM registers
+	vsub qFractions, qPhases, qFractions	@ fraction = phase - floor(phase)
+
+	vldr.64	d0, [r4]						@ Load two consecutive floats at each location: table[i] and table[i+1]
+	vldr.64 d1, [r5]						@ These hold the previous and following samples in the table
+	vldr.64	d2, [r6]						@ TODO: check whether these work at 4-byte alignment
+	vldr.64 d3, [r7]						@ Reading table[i+1] needs one readable float past the highest index used
+
+	@ Format at this point (q0 = d0:d1, q1 = d2:d3):
+	@ Osc0(before) Osc0(after) Osc1(before) Osc1(after) Osc2(before) Osc2(after) Osc3(before) Osc3(after)
+	@ We want:
+	@ Osc0(before) Osc1(before) Osc2(before) Osc3(before) Osc0(after) Osc1(after) Osc2(after) Osc3(after)
+
+	vuzp.32 q0, q1							@ De-interleave: q0 now contains before, q1 contains after
+	vsub q2.f32, q2.f32, qFractions			@ q2 = 1.0 - fraction
+	vmul q1.f32, q1.f32, qFractions			@ q1 = fraction * after
+	vmul q0.f32, q0.f32, q2.f32				@ q0 = (1.0 - fraction) * before
+
+	vadd qPhases, qPhases, qFreqs			@ Advance phases by one sample
+	vadd qFreqs, qFreqs, qFreqDs			@ Update frequencies
+
+	vadd q0.f32, q0.f32, q1.f32				@ Add two interpolated components to get the final sample
+	vdup q2.u32, r3							@ Put lookup table size into each element of q2
+	vcvt qBaseInts, qPhases					@ Take floor of new phases
+	vmul q0.f32, q0.f32, qAmps				@ Multiply samples by current amplitude
+
+	vld1 dSample, [r1]						@ Load the existing stereo frame so the oscillators are mixed into it
+	vpadd d2.f32, d0.f32, d1.f32			@ Pairwise add: d2 = {osc0 + osc1, osc2 + osc3}
+
+	vand q2, q2, qBaseInts					@ size AND floor(phase): gives size when phase >= size, else 0 (NOTE(review): needs power-of-two size and phase < 2*size - confirm)
+	vpadd d3.f32, d2.f32, d2.f32			@ Pairwise add again: d3[0] and d3[1] both hold the total of the 4 oscillators
+	vadd qAmps, qAmps, qAmpDs				@ Update amplitudes
+	vcvt q0.f32, q2.u32						@ Convert the wrap amount (0 or size) back to float
+
+	vadd  dSample, dSample, d3.f32			@ Add the same oscillator total to each channel
+
+	subs r0, r0, #1							@ numFrames-- (sets the flags tested by the branch below)
+	vsub qPhases, qPhases, q0.f32			@ Subtract the wrap amount to keep phases in table range
+	vst1 dSample, [r1]!						@ Store back in buffer and increment by 8
+
+	it gt
+	bgt oscbank_sample_loop					@ Loop if numFrames > 0
+
+	@ --- end inner loop ---
+	pop {r0-r1,r4-r8}						@ Restore numFrames, audioOut and the array pointers (r7, r8 were already advanced before the push)
+
+	vst1 {dPhases_0, dPhases_1}, [r4]!		@ Store phases back to array and advance to the next group of 4
+	vst1 {dFreqs_0, dFreqs_1}, [r5]!		@ Store frequencies back to array
+	vst1 {dAmps_0, dAmps_1}, [r6]!			@ Store amplitudes back to array
+											@ No need to update r7, r8: they were post-incremented when loaded
+
+	subs r2, r2, #4							@ numPartials -= 4
+	it  gt
+	bgt oscbank_oscillator_loop	@ Loop if numPartials > 0
+
+    pop {r4-r11}		@ Restore the registers saved on entry
+	bx lr