changeset 318:f7b19ea31bbb prerelease

Added NEON vectorised float<->int converters. Curiously, their performance is worse than the C version. I guess clang is better at vectorising than we thought! The new code remains disabled for now.
author andrewm
date Mon, 30 May 2016 01:06:01 +0100
parents 52733b58bdf3
children 2c4ddf4277d1
files Makefile core/FormatConvert.S core/PRU.cpp
diffstat 3 files changed, 297 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Sat May 28 01:23:56 2016 +0100
+++ b/Makefile	Mon May 30 01:06:01 2016 +0100
@@ -88,6 +88,11 @@
 CORE_OBJS := $(addprefix build/core/,$(notdir $(CORE_CPP_SRCS:.cpp=.o)))
 CORE_CPP_DEPS := $(addprefix build/core/,$(notdir $(CORE_CPP_SRCS:.cpp=.d)))
 
+CORE_ASM_SRCS := $(wildcard core/*.S)
+CORE_ASM_OBJS := $(patsubst core/%.S,build/core/%.o,$(CORE_ASM_SRCS))
+CORE_ASM_DEPS := $(patsubst core/%.S,build/core/%.d,$(CORE_ASM_SRCS))
+
+
 # Objects for a system-supplied default main() file, if the user
 # only wants to provide the render functions.
 DEFAULT_MAIN_CPP_SRCS := ./core/default_main.cpp
@@ -117,6 +122,14 @@
 	@echo ' ...done'
 	@echo ' '
 
+# Rule for Bela core ASM files (AS defaults to 'as'; override it on the command line to cross-assemble)
+build/core/%.o: ./core/%.S
+	@echo 'Building $(notdir $<)...'
+# The assembler is invoked directly, so these .S files are not run through the C preprocessor.
+	@$(AS) -o "$@" "$<"
+	@echo ' ...done'
+	@echo ' '
+
 # Rule for user-supplied C++ files
 $(PROJECT_DIR)/build/%.o: $(PROJECT_DIR)/%.cpp
 	@echo 'Building $(notdir $<)...'
@@ -146,10 +159,10 @@
 # function, and conditionally call one of two recursive make targets depending on whether
 # we want to link in the default main file or not. The kludge is the mess of a shell script
 # line below. Surely there's a better way to do this?
-Bela: $(CORE_OBJS) $(ASM_OBJS) $(C_OBJS) $(CPP_OBJS) $(STATIC_LIBS) $(DEFAULT_MAIN_OBJS)
+Bela: $(CORE_ASM_OBJS) $(CORE_OBJS) $(ASM_OBJS) $(C_OBJS) $(CPP_OBJS) $(STATIC_LIBS) $(DEFAULT_MAIN_OBJS)
 	$(eval DEFAULT_MAIN_CONDITIONAL := $(shell bash -c 'if [ `nm $(PROJECT_DIR)/build/*.o | grep -w T | grep -w main | wc -l` == '0' ]; then echo "$(DEFAULT_MAIN_OBJS)"; else echo ""; fi'))
 	@echo 'Invoking: C++ linker'
-	@$(CXX) $(SYNTAX_FLAG) -L/usr/xenomai/lib -L/usr/arm-linux-gnueabihf/lib -L/usr/arm-linux-gnueabihf/lib/xenomai -L/usr/lib/arm-linux-gnueabihf -pthread -Wpointer-arith -o "$(PROJECT_DIR)/$(PROJECT)" $(CORE_OBJS) $(DEFAULT_MAIN_CONDITIONAL) $(ASM_OBJS) $(C_OBJS) $(CPP_OBJS) $(STATIC_LIBS) $(LIBS)
+	@$(CXX) $(SYNTAX_FLAG) -L/usr/xenomai/lib -L/usr/arm-linux-gnueabihf/lib -L/usr/arm-linux-gnueabihf/lib/xenomai -L/usr/lib/arm-linux-gnueabihf -pthread -Wpointer-arith -o "$(PROJECT_DIR)/$(PROJECT)" $(CORE_ASM_OBJS) $(CORE_OBJS) $(DEFAULT_MAIN_CONDITIONAL) $(ASM_OBJS) $(C_OBJS) $(CPP_OBJS) $(STATIC_LIBS) $(LIBS)
 	@echo 'Finished building target: $@'
 
 # Other Targets:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/core/FormatConvert.S	Mon May 30 01:06:01 2016 +0100
@@ -0,0 +1,204 @@
+@
+@ FormatConvert.S
+@
+@ NEON-based vector functions for converting data between int
+@ and float with clipping and optional level scaling. 
+@ Written in assembly for performance reasons.
+@
+@ Bela: http://bela.io
+@
+@ (c) 2016 Andrew McPherson
+@ Centre for Digital Music
+@ Queen Mary University of London
+@
+
+	.syntax unified
+	.arch armv7-a
+	.fpu neon
+
+@	void int16_to_float_audio(int numSamples, int16_t *inBuffer, float *outBuffer);
+@
+@	Convert 16-bit signed ints to floats between -1.0 and 1.0. Used for audio input.
+@
+@	numSamples is rounded down to a multiple of 4; if the result is <= 0 nothing is converted.
+@	inBuffer should be aligned on an 8-byte boundary.
+@ 	outBuffer should be aligned on a 16-byte boundary.
+
+@ Registers:
+@    r0: numSamples            How many samples to convert (counts down by 4 per iteration)
+@    r1: inBuffer              Buffer for input samples from ADC (post-incremented)
+@    r2: outBuffer             Buffer to store output samples for render function (post-incremented)
+
+dIFAu_Input		.dn		D0.S16
+qIFAu_Expanded	.qn		Q1.S32
+qIFAu_Output	.qn		Q2.F32
+dIFAu_Output_0	.dn		D4.F32
+dIFAu_Output_1	.dn		D5.F32
+
+	.align	2
+	.global	int16_to_float_audio
+	.thumb
+	.thumb_func
+	.type	int16_to_float_audio, %function
+int16_to_float_audio:
+	lsr r0, r0, #2
+	lsl r0, r0, #2				@ Clear the low two bits of numSamples (only multiples of 4 are processed)
+	cmp r0, #0					@ Check for trivial case: zero (or negative) samples
+	it le
+	bxle lr						@ Return without touching the buffers in that case
+
+	@ ---- loop: convert four samples per iteration ----
+int16_to_float_audio_loop:
+	vld1 {dIFAu_Input}, [r1]!						@ Load four 16-bit signed ints from inBuffer++
+	vmovl qIFAu_Expanded, dIFAu_Input				@ Sign-extend four 16-bit signed ints to 32-bit signed ints
+	vcvt qIFAu_Output, qIFAu_Expanded, #15			@ Convert four Q17.15 fixed points to floats (equiv. to / 32768)
+	vst1 {dIFAu_Output_0, dIFAu_Output_1}, [r2]!	@ Store four floats to outBuffer++
+
+	subs r0, r0, #4							@ numSamples -= 4
+	it gt
+	bgt int16_to_float_audio_loop			@ Loop if numSamples > 0
+
+	bx lr
+	
+
+
+@	void int16_to_float_analog(int numSamples, uint16_t *inBuffer, float *outBuffer);
+@
+@	Convert 16-bit unsigned ints to floats between 0.0 and 1.0. Used for analog input.
+@
+@	numSamples is rounded down to a multiple of 4; if the result is <= 0 nothing is converted.
+@	inBuffer should be aligned on an 8-byte boundary.
+@ 	outBuffer should be aligned on a 16-byte boundary.
+
+@ Registers:
+@    r0: numSamples            How many samples to convert (counts down by 4 per iteration)
+@    r1: inBuffer              Buffer for input samples from ADC (post-incremented)
+@    r2: outBuffer             Buffer to store output samples for render function (post-incremented)
+
+dIFAn_Input		.dn		D0.U16
+qIFAn_Expanded	.qn		Q1.U32
+qIFAn_Output	.qn		Q2.F32
+dIFAn_Output_0	.dn		D4.F32
+dIFAn_Output_1	.dn		D5.F32
+
+	.align	2
+	.global	int16_to_float_analog
+	.thumb
+	.thumb_func
+	.type	int16_to_float_analog, %function
+int16_to_float_analog:
+	lsr r0, r0, #2
+	lsl r0, r0, #2				@ Clear the low two bits of numSamples (only multiples of 4 are processed)
+	cmp r0, #0					@ Check for trivial case: zero (or negative) samples
+	it le
+	bxle lr						@ Return without touching the buffers in that case
+
+	@ ---- loop: convert four samples per iteration ----
+int16_to_float_analog_loop:
+	vld1 {dIFAn_Input}, [r1]!						@ Load four 16-bit unsigned ints from inBuffer++
+	vmovl qIFAn_Expanded, dIFAn_Input				@ Zero-extend four 16-bit unsigned ints to 32-bit unsigned ints
+	vcvt qIFAn_Output, qIFAn_Expanded, #16			@ Convert four Q16.16 fixed points to floats (equiv. to / 65536)
+	vst1 {dIFAn_Output_0, dIFAn_Output_1}, [r2]!	@ Store four floats to outBuffer++
+
+	subs r0, r0, #4							@ numSamples -= 4
+	it gt
+	bgt int16_to_float_analog_loop			@ Loop if numSamples > 0
+
+	bx lr
+	
+
+@	void float_to_int16_audio(int numSamples, float *inBuffer, int16_t *outBuffer);
+@
+@	Convert floats between -1.0 and 1.0 to 16-bit signed ints, with saturation.
+@	Used for audio output.
+@
+@	numSamples is rounded down to a multiple of 4; if the result is <= 0 nothing is converted.
+@	inBuffer should be aligned on a 16-byte boundary.
+@ 	outBuffer should be aligned on an 8-byte boundary.
+
+@ Registers:
+@    r0: numSamples            How many samples to convert (counts down by 4 per iteration)
+@    r1: inBuffer              Buffer for input samples from render function (post-incremented)
+@    r2: outBuffer             Buffer to store output samples for DAC (post-incremented)
+
+qFIAu_Input		.qn		Q0.F32
+dFIAu_Input_0	.dn		D0.F32
+dFIAu_Input_1	.dn		D1.F32
+qFIAu_Converted	.qn		Q1.S32
+dFIAu_Narrowed	.dn		D4.S16
+
+	.align	2
+	.global	float_to_int16_audio
+	.thumb
+	.thumb_func
+	.type	float_to_int16_audio, %function
+
+float_to_int16_audio:
+	lsr r0, r0, #2
+	lsl r0, r0, #2				@ Clear the low two bits of numSamples (only multiples of 4 are processed)
+	cmp r0, #0					@ Check for trivial case: zero (or negative) samples
+	it le
+	bxle lr						@ Return without touching the buffers in that case
+
+	@ ---- loop: convert four samples per iteration ----
+float_to_int16_audio_loop:
+	vld1 {dFIAu_Input_0, dFIAu_Input_1}, [r1]!		@ Load four floats from inBuffer++
+	vcvt qFIAu_Converted, qFIAu_Input, #15			@ Convert four floats into four Q17.15 fixed points (equiv. to * 32768)
+													@ Out-of-range results saturate to the signed 32-bit limits
+	vqmovn dFIAu_Narrowed, qFIAu_Converted			@ Narrow four 32-bit signed ints to 16-bit signed ints, with saturation
+	vst1 {dFIAu_Narrowed}, [r2]!					@ Store four 16-bit ints to outBuffer++
+
+	subs r0, r0, #4							@ numSamples -= 4
+	it gt
+	bgt float_to_int16_audio_loop			@ Loop if numSamples > 0
+
+	bx lr
+	
+
+@	void float_to_int16_analog(int numSamples, float *inBuffer, uint16_t *outBuffer);
+@
+@	Convert floats between 0.0 and 1.0 to 16-bit unsigned ints, with saturation.
+@	Used for analog output.
+@
+@	numSamples is rounded down to a multiple of 4 (i.e. integer numFrames at 4 channels); <= 0 converts nothing.
+@	inBuffer should be aligned on a 16-byte boundary.
+@ 	outBuffer should be aligned on an 8-byte boundary.
+
+@ Registers:
+@    r0: numSamples            How many samples to convert (counts down by 4 per iteration)
+@    r1: inBuffer              Buffer for input samples from render function (post-incremented)
+@    r2: outBuffer             Buffer to store output samples for DAC (post-incremented)
+
+qFIAn_Input		.qn		Q0.F32
+dFIAn_Input_0	.dn		D0.F32
+dFIAn_Input_1	.dn		D1.F32
+qFIAn_Converted	.qn		Q1.U32
+dFIAn_Narrowed	.dn		D4.U16
+
+	.align	2
+	.global	float_to_int16_analog
+	.thumb
+	.thumb_func
+	.type	float_to_int16_analog, %function
+
+float_to_int16_analog:
+	lsr r0, r0, #2
+	lsl r0, r0, #2				@ Clear the low two bits of numSamples (only multiples of 4 are processed)
+	cmp r0, #0					@ Check for trivial case: zero (or negative) samples
+	it le
+	bxle lr						@ Return without touching the buffers in that case
+
+	@ ---- loop: convert four samples per iteration ----
+float_to_int16_analog_loop:
+	vld1 {dFIAn_Input_0, dFIAn_Input_1}, [r1]!		@ Load four floats from inBuffer++
+	vcvt qFIAn_Converted, qFIAn_Input, #16			@ Convert four floats into four Q16.16 fixed points (equiv. to * 65536)
+													@ Out-of-range results saturate to the unsigned 32-bit limits
+	vqmovn dFIAn_Narrowed, qFIAn_Converted			@ Narrow four 32-bit unsigned ints to 16-bit unsigned ints, with saturation
+	vst1 {dFIAn_Narrowed}, [r2]!					@ Store four 16-bit ints to outBuffer++
+
+	subs r0, r0, #4							@ numSamples -= 4
+	it gt
+	bgt float_to_int16_analog_loop			@ Loop if numSamples > 0
+
+	bx lr
+	
--- a/core/PRU.cpp	Sat May 28 01:23:56 2016 +0100
+++ b/core/PRU.cpp	Mon May 30 01:06:01 2016 +0100
@@ -37,6 +37,11 @@
 
 using namespace std;
 
+// Select whether to use NEON-based sample conversion
+// (this will probably go away in a future commit once its performance
+//  is verified over extended use)
+#undef USE_NEON_FORMAT_CONVERSION
+
 // PRU memory: PRU0 and PRU1 RAM are 8kB (0x2000) long each
 //             PRU-SHARED RAM is 12kB (0x3000) long
 
@@ -110,6 +115,14 @@
 extern int gShouldStop;
 extern int gRTAudioVerbose;
 
+// NEON sample-format converters written in assembly (core/FormatConvert.S); C linkage so the names match the assembly labels. Each processes samples in groups of 4.
+extern "C" {
+	void int16_to_float_audio(int numSamples, int16_t *inBuffer, float *outBuffer);		// signed 16-bit -> float, scaled by 1/32768
+	void int16_to_float_analog(int numSamples, uint16_t *inBuffer, float *outBuffer);	// unsigned 16-bit -> float, scaled by 1/65536
+	void float_to_int16_audio(int numSamples, float *inBuffer, int16_t *outBuffer);		// float -> signed 16-bit, scaled by 32768, saturating
+	void float_to_int16_analog(int numSamples, float *inBuffer, uint16_t *outBuffer);	// float -> unsigned 16-bit, scaled by 65536, saturating
+}
+
 // Constructor: specify a PRU number (0 or 1)
 PRU::PRU(InternalBelaContext *input_context)
 : context(input_context), pru_number(0), running(false), analog_enabled(false),
@@ -407,17 +420,56 @@
 			}
 		}
 	}
+	
+	// TESTING
+	// if(posix_memalign((void **)&testing_float, 16, 8 * context->audioFrames * sizeof(float))) {
+	// 	printf("Error allocating float buffers\n");
+	// 	return false;
+	// }
+	// if(posix_memalign((void **)&testing_int16, 8, 8 * context->audioFrames * sizeof(int16_t))) {
+	// 	printf("Error allocating float buffers\n");
+	// 	return false;
+	// }
 
 	// Allocate audio buffers
+#ifdef USE_NEON_FORMAT_CONVERSION
+	if(posix_memalign((void **)&context->audioIn, 16, 2 * context->audioFrames * sizeof(float))) {
+		printf("Error allocating audio input buffer\n");
+		return 1;
+	}
+	if(posix_memalign((void **)&context->audioOut, 16, 2 * context->audioFrames * sizeof(float))) {
+		printf("Error allocating audio output buffer\n");
+		return 1;
+	}
+#else
 	context->audioIn = (float *)malloc(2 * context->audioFrames * sizeof(float));
 	context->audioOut = (float *)malloc(2 * context->audioFrames * sizeof(float));
 	if(context->audioIn == 0 || context->audioOut == 0) {
 		rt_printf("Error: couldn't allocate audio buffers\n");
 		return 1;
 	}
-
+#endif
+	
 	// Allocate analog buffers
 	if(analog_enabled) {
+#ifdef USE_NEON_FORMAT_CONVERSION
+		if(posix_memalign((void **)&context->analogIn, 16, 
+							context->analogChannels * context->analogFrames * sizeof(float))) {
+			printf("Error allocating analog input buffer\n");
+			return 1;
+		}
+		if(posix_memalign((void **)&context->analogOut, 16, 
+							context->analogChannels * context->analogFrames * sizeof(float))) {
+			printf("Error allocating analog output buffer\n");
+			return 1;
+		}
+		last_analog_out_frame = (float *)malloc(context->analogChannels * sizeof(float));
+
+		if(last_analog_out_frame == 0) {
+			rt_printf("Error: couldn't allocate analog persistence buffer\n");
+			return 1;
+		}		
+#else
 		context->analogIn = (float *)malloc(context->analogChannels * context->analogFrames * sizeof(float));
 		context->analogOut = (float *)malloc(context->analogChannels * context->analogFrames * sizeof(float));
 		last_analog_out_frame = (float *)malloc(context->analogChannels * sizeof(float));
@@ -426,7 +478,8 @@
 			rt_printf("Error: couldn't allocate analog buffers\n");
 			return 1;
 		}
-
+#endif
+		
 		memset(last_analog_out_frame, 0, context->analogChannels * sizeof(float));
 	}
 
@@ -580,10 +633,14 @@
 		}
 
 		// Convert short (16-bit) samples to float
-		// TODO: NEON
-		for(unsigned int n = 0; n < 2 * context->audioFrames; n++)
+#ifdef USE_NEON_FORMAT_CONVERSION
+		int16_to_float_audio(2 * context->audioFrames, &pru_buffer_audio_adc[pru_audio_offset], context->audioIn);
+#else
+		for(unsigned int n = 0; n < 2 * context->audioFrames; n++) {
 			context->audioIn[n] = (float)pru_buffer_audio_adc[n + pru_audio_offset] / 32768.0f;
-
+		}
+#endif
+		
 		if(analog_enabled) {
 			if(mux_channels != 0) {
 				// If multiplexer is enabled, find out which channels we have by pulling out
@@ -593,9 +650,14 @@
 				// TODO
 			}
 			
-			// TODO: NEON
-			for(unsigned int n = 0; n < context->analogChannels * context->analogFrames; n++)
+#ifdef USE_NEON_FORMAT_CONVERSION
+			int16_to_float_analog(context->analogChannels * context->analogFrames, 
+									&pru_buffer_spi_adc[pru_spi_offset], context->analogIn);
+#else	
+			for(unsigned int n = 0; n < context->analogChannels * context->analogFrames; n++) {
 				context->analogIn[n] = (float)pru_buffer_spi_adc[n + pru_spi_offset] / 65536.0f;
+			}
+#endif
 
 			if(context->flags & BELA_FLAG_ANALOG_OUTPUTS_PERSIST) {
 				// Initialize the output buffer with the values that were in the last frame of the previous output
@@ -642,12 +704,17 @@
 			}
 
 			// Convert float back to short for SPI output
+#ifdef USE_NEON_FORMAT_CONVERSION
+			float_to_int16_analog(context->analogChannels * context->analogFrames, 
+								  context->analogOut, (uint16_t*)&pru_buffer_spi_dac[pru_spi_offset]);
+#else		
 			for(unsigned int n = 0; n < context->analogChannels * context->analogFrames; n++) {
 				int out = context->analogOut[n] * 65536.0f;
 				if(out < 0) out = 0;
 				else if(out > 65535) out = 65535;
 				pru_buffer_spi_dac[n + pru_spi_offset] = (uint16_t)out;
 			}
+#endif
 		}
 
 		if(digital_enabled) { // keep track of past digital values
@@ -657,13 +724,16 @@
 		}
 
         // Convert float back to short for audio
-		// TODO: NEON
+#ifdef USE_NEON_FORMAT_CONVERSION
+		float_to_int16_audio(2 * context->audioFrames, context->audioOut, &pru_buffer_audio_dac[pru_audio_offset]);
+#else	
 		for(unsigned int n = 0; n < 2 * context->audioFrames; n++) {
 			int out = context->audioOut[n] * 32768.0f;
 			if(out < -32768) out = -32768;
 			else if(out > 32767) out = 32767;
 			pru_buffer_audio_dac[n + pru_audio_offset] = (int16_t)out;
 		}
+#endif
 
 		// Increment total number of samples that have elapsed
 		context->audioFramesElapsed += context->audioFrames;