Mercurial > hg > beaglert
changeset 318:f7b19ea31bbb prerelease
Added NEON vectorised float<->int converters. Curiously, their performance is worse than the C version. I guess clang is better at vectorising than we thought! The new code remains disabled for now.
author | andrewm |
---|---|
date | Mon, 30 May 2016 01:06:01 +0100 |
parents | 52733b58bdf3 |
children | 2c4ddf4277d1 |
files | Makefile core/FormatConvert.S core/PRU.cpp |
diffstat | 3 files changed, 297 insertions(+), 10 deletions(-) [+] |
line wrap: on
line diff
--- a/Makefile Sat May 28 01:23:56 2016 +0100 +++ b/Makefile Mon May 30 01:06:01 2016 +0100 @@ -88,6 +88,11 @@ CORE_OBJS := $(addprefix build/core/,$(notdir $(CORE_CPP_SRCS:.cpp=.o))) CORE_CPP_DEPS := $(addprefix build/core/,$(notdir $(CORE_CPP_SRCS:.cpp=.d))) +CORE_ASM_SRCS := $(wildcard core/*.S) +CORE_ASM_OBJS := $(addprefix build/core/,$(notdir $(CORE_ASM_SRCS:.S=.o))) +CORE_ASM_DEPS := $(addprefix build/core/,$(notdir $(CORE_ASM_SRCS:.S=.d))) + + # Objects for a system-supplied default main() file, if the user # only wants to provide the render functions. DEFAULT_MAIN_CPP_SRCS := ./core/default_main.cpp @@ -117,6 +122,14 @@ @echo ' ...done' @echo ' ' +# Rule for Bela core ASM files +build/core/%.o: ./core/%.S + @echo 'Building $(notdir $<)...' +# @echo 'Invoking: GCC Assembler' + @as -o "$@" "$<" + @echo ' ...done' + @echo ' ' + # Rule for user-supplied C++ files $(PROJECT_DIR)/build/%.o: $(PROJECT_DIR)/%.cpp @echo 'Building $(notdir $<)...' @@ -146,10 +159,10 @@ # function, and conditionally call one of two recursive make targets depending on whether # we want to link in the default main file or not. The kludge is the mess of a shell script # line below. Surely there's a better way to do this? 
-Bela: $(CORE_OBJS) $(ASM_OBJS) $(C_OBJS) $(CPP_OBJS) $(STATIC_LIBS) $(DEFAULT_MAIN_OBJS) +Bela: $(CORE_ASM_OBJS) $(CORE_OBJS) $(ASM_OBJS) $(C_OBJS) $(CPP_OBJS) $(STATIC_LIBS) $(DEFAULT_MAIN_OBJS) $(eval DEFAULT_MAIN_CONDITIONAL := $(shell bash -c 'if [ `nm $(PROJECT_DIR)/build/*.o | grep -w T | grep -w main | wc -l` == '0' ]; then echo "$(DEFAULT_MAIN_OBJS)"; else echo ""; fi')) @echo 'Invoking: C++ linker' - @$(CXX) $(SYNTAX_FLAG) -L/usr/xenomai/lib -L/usr/arm-linux-gnueabihf/lib -L/usr/arm-linux-gnueabihf/lib/xenomai -L/usr/lib/arm-linux-gnueabihf -pthread -Wpointer-arith -o "$(PROJECT_DIR)/$(PROJECT)" $(CORE_OBJS) $(DEFAULT_MAIN_CONDITIONAL) $(ASM_OBJS) $(C_OBJS) $(CPP_OBJS) $(STATIC_LIBS) $(LIBS) + @$(CXX) $(SYNTAX_FLAG) -L/usr/xenomai/lib -L/usr/arm-linux-gnueabihf/lib -L/usr/arm-linux-gnueabihf/lib/xenomai -L/usr/lib/arm-linux-gnueabihf -pthread -Wpointer-arith -o "$(PROJECT_DIR)/$(PROJECT)" $(CORE_ASM_OBJS) $(CORE_OBJS) $(DEFAULT_MAIN_CONDITIONAL) $(ASM_OBJS) $(C_OBJS) $(CPP_OBJS) $(STATIC_LIBS) $(LIBS) @echo 'Finished building target: $@' # Other Targets:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/core/FormatConvert.S	Mon May 30 01:06:01 2016 +0100
@@ -0,0 +1,204 @@
+@
+@ FormatConvert.S
+@
+@ NEON-based vector functions for converting data between int
+@ and float with clipping and optional level scaling.
+@ Written in assembly for performance reasons.
+@
+@ Bela: http://bela.io
+@
+@ (c) 2016 Andrew McPherson
+@ Centre for Digital Music
+@ Queen Mary University of London
+@
+
+	.syntax unified
+	.arch armv7-a
+	.fpu neon
+
+@ void int16_to_float_audio(int numSamples, int16_t *inBuffer, float *outBuffer);
+@
+@ Convert 16-bit signed ints to floats between -1.0 and 1.0. Used for audio input.
+@
+@ numSamples should be a multiple of 4 (i.e. with stereo audio, frames should be even)
+@ inBuffer should be aligned on an 8-byte boundary.
+@ outBuffer should be aligned on a 16-byte boundary.
+
+@ Registers:
+@    r0: numSamples	How many samples to convert (rounded down to a multiple of 4)
+@    r1: inBuffer	Buffer for input samples from ADC
+@    r2: outBuffer	Buffer to store output samples for render function
+
+dIFAu_Input		.dn D0.S16
+qIFAu_Expanded	.qn Q1.S32
+qIFAu_Output	.qn Q2.F32
+dIFAu_Output_0	.dn D4.F32
+dIFAu_Output_1	.dn D5.F32
+
+	.align 2
+	.global int16_to_float_audio
+	.thumb
+	.thumb_func
+	.type int16_to_float_audio, %function
+int16_to_float_audio:
+	lsr r0, r0, #2
+	lsl r0, r0, #2						@ Clip off the last two bits of numSamples (only multiples of 4 allowed)
+	cmp r0, #0							@ Check for trivial case: zero samples
+	it eq
+	bxeq lr								@ Return if that's the case
+
+	@ ---- loop: iterate over the number of samples ----
+int16_to_float_audio_loop:
+	vld1 {dIFAu_Input}, [r1]!					@ Load four 16-bit signed ints from inBuffer++
+	vmovl qIFAu_Expanded, dIFAu_Input			@ Widen four 16-bit signed ints to 32-bit signed ints
+	vcvt qIFAu_Output, qIFAu_Expanded, #15		@ Convert four Q17.15 fixed points to floats (equiv. to / 32768)
+	vst1 {dIFAu_Output_0, dIFAu_Output_1}, [r2]!	@ Store floats back into memory at outBuffer++
+
+	subs r0, r0, #4						@ numSamples -= 4
+	it gt
+	bgt int16_to_float_audio_loop		@ Loop if numSamples > 0
+
+	bx lr
+
+
+
+@ void int16_to_float_analog(int numSamples, uint16_t *inBuffer, float *outBuffer);
+@
+@ Convert 16-bit unsigned ints to floats between 0.0 and 1.0. Used for analog input.
+@
+@ numSamples should be a multiple of 4 (i.e. integer numFrames at 4 channels)
+@ inBuffer should be aligned on an 8-byte boundary.
+@ outBuffer should be aligned on a 16-byte boundary.
+
+@ Registers:
+@    r0: numSamples	How many samples to convert (rounded down to a multiple of 4)
+@    r1: inBuffer	Buffer for input samples from ADC
+@    r2: outBuffer	Buffer to store output samples for render function
+
+dIFAn_Input		.dn D0.U16
+qIFAn_Expanded	.qn Q1.U32
+qIFAn_Output	.qn Q2.F32
+dIFAn_Output_0	.dn D4.F32
+dIFAn_Output_1	.dn D5.F32
+
+	.align 2
+	.global int16_to_float_analog
+	.thumb
+	.thumb_func
+	.type int16_to_float_analog, %function
+int16_to_float_analog:
+	lsr r0, r0, #2
+	lsl r0, r0, #2						@ Clip off the last two bits of numSamples (only multiples of 4 allowed)
+	cmp r0, #0							@ Check for trivial case: zero samples
+	it eq
+	bxeq lr								@ Return if that's the case
+
+	@ ---- loop: iterate over the number of samples ----
+int16_to_float_analog_loop:
+	vld1 {dIFAn_Input}, [r1]!					@ Load four 16-bit unsigned ints from inBuffer++
+	vmovl qIFAn_Expanded, dIFAn_Input			@ Widen four 16-bit unsigned ints to 32-bit unsigned ints
+	vcvt qIFAn_Output, qIFAn_Expanded, #16		@ Convert four Q16.16 fixed points to floats (equiv. to / 65536)
+	vst1 {dIFAn_Output_0, dIFAn_Output_1}, [r2]!	@ Store floats back into memory at outBuffer++
+
+	subs r0, r0, #4						@ numSamples -= 4
+	it gt
+	bgt int16_to_float_analog_loop		@ Loop if numSamples > 0
+
+	bx lr
+
+
+@ void float_to_int16_audio(int numSamples, float *inBuffer, int16_t *outBuffer);
+@
+@ Convert floats between -1.0 and 1.0 to 16-bit signed ints, with saturation.
+@ Used for audio output.
+@
+@ numSamples should be a multiple of 4 (i.e. with stereo audio, frames should be even)
+@ inBuffer should be aligned on a 16-byte boundary.
+@ outBuffer should be aligned on an 8-byte boundary.
+
+@ Registers:
+@    r0: numSamples	How many samples to convert (rounded down to a multiple of 4)
+@    r1: inBuffer	Buffer for input samples from render function
+@    r2: outBuffer	Buffer to store output samples for DAC
+
+qFIAu_Input		.qn Q0.F32
+dFIAu_Input_0	.dn D0.F32
+dFIAu_Input_1	.dn D1.F32
+qFIAu_Converted	.qn Q1.S32
+dFIAu_Narrowed	.dn D4.S16
+
+	.align 2
+	.global float_to_int16_audio
+	.thumb
+	.thumb_func
+	.type float_to_int16_audio, %function
+
+float_to_int16_audio:
+	lsr r0, r0, #2
+	lsl r0, r0, #2						@ Clip off the last two bits of numSamples (only multiples of 4 allowed)
+	cmp r0, #0							@ Check for trivial case: zero samples
+	it eq
+	bxeq lr								@ Return if that's the case
+
+	@ ---- loop: iterate over the number of samples ----
+float_to_int16_audio_loop:
+	vld1 {dFIAu_Input_0, dFIAu_Input_1}, [r1]!	@ Load four floats from inBuffer++
+	vcvt qFIAu_Converted, qFIAu_Input, #15		@ Convert four floats into four Q17.15 fixed points (equiv. to * 32768)
+												@ This will saturate the result to a 32-bit representable value
+	vqmovn dFIAu_Narrowed, qFIAu_Converted		@ Narrow four 32-bit signed ints to 16-bit signed ints, with saturation
+	vst1 {dFIAu_Narrowed}, [r2]!				@ Store ints back into memory at outBuffer++
+
+	subs r0, r0, #4						@ numSamples -= 4
+	it gt
+	bgt float_to_int16_audio_loop		@ Loop if numSamples > 0
+
+	bx lr
+
+
+@ void float_to_int16_analog(int numSamples, float *inBuffer, uint16_t *outBuffer);
+@
+@ Convert floats between 0.0 and 1.0 to 16-bit unsigned ints, with saturation.
+@ Used for analog output.
+@
+@ numSamples should be a multiple of 4 (i.e. integer numFrames at 4 channels)
+@ inBuffer should be aligned on a 16-byte boundary.
+@ outBuffer should be aligned on an 8-byte boundary.
+
+@ Registers:
+@    r0: numSamples	How many samples to convert (rounded down to a multiple of 4)
+@    r1: inBuffer	Buffer for input samples from render function
+@    r2: outBuffer	Buffer to store output samples for DAC
+
+qFIAn_Input		.qn Q0.F32
+dFIAn_Input_0	.dn D0.F32
+dFIAn_Input_1	.dn D1.F32
+qFIAn_Converted	.qn Q1.U32
+dFIAn_Narrowed	.dn D4.U16
+
+	.align 2
+	.global float_to_int16_analog
+	.thumb
+	.thumb_func
+	.type float_to_int16_analog, %function
+
+float_to_int16_analog:
+	lsr r0, r0, #2
+	lsl r0, r0, #2						@ Clip off the last two bits of numSamples (only multiples of 4 allowed)
+	cmp r0, #0							@ Check for trivial case: zero samples
+	it eq
+	bxeq lr								@ Return if that's the case
+
+	@ ---- loop: iterate over the number of samples ----
+float_to_int16_analog_loop:
+	vld1 {dFIAn_Input_0, dFIAn_Input_1}, [r1]!	@ Load four floats from inBuffer++
+	vcvt qFIAn_Converted, qFIAn_Input, #16		@ Convert four floats into four Q16.16 fixed points (equiv. to * 65536)
+												@ This will saturate the result to a 32-bit representable value
+	vqmovn dFIAn_Narrowed, qFIAn_Converted		@ Narrow four 32-bit unsigned ints to 16-bit unsigned ints, with saturation
+	vst1 {dFIAn_Narrowed}, [r2]!				@ Store ints back into memory at outBuffer++
+
+	subs r0, r0, #4						@ numSamples -= 4
+	it gt
+	bgt float_to_int16_analog_loop		@ Loop if numSamples > 0
+
+	bx lr
+
--- a/core/PRU.cpp Sat May 28 01:23:56 2016 +0100 +++ b/core/PRU.cpp Mon May 30 01:06:01 2016 +0100 @@ -37,6 +37,11 @@ using namespace std; +// Select whether to use NEON-based sample conversion +// (this will probably go away in a future commit once its performance +// is verified over extended use) +#undef USE_NEON_FORMAT_CONVERSION + // PRU memory: PRU0 and PRU1 RAM are 8kB (0x2000) long each // PRU-SHARED RAM is 12kB (0x3000) long @@ -110,6 +115,14 @@ extern int gShouldStop; extern int gRTAudioVerbose; +// These four functions are written in assembly in FormatConvert.S +extern "C" { + void int16_to_float_audio(int numSamples, int16_t *inBuffer, float *outBuffer); + void int16_to_float_analog(int numSamples, uint16_t *inBuffer, float *outBuffer); + void float_to_int16_audio(int numSamples, float *inBuffer, int16_t *outBuffer); + void float_to_int16_analog(int numSamples, float *inBuffer, uint16_t *outBuffer); +} + // Constructor: specify a PRU number (0 or 1) PRU::PRU(InternalBelaContext *input_context) : context(input_context), pru_number(0), running(false), analog_enabled(false), @@ -407,17 +420,56 @@ } } } + + // TESTING + // if(posix_memalign((void **)&testing_float, 16, 8 * context->audioFrames * sizeof(float))) { + // printf("Error allocating float buffers\n"); + // return false; + // } + // if(posix_memalign((void **)&testing_int16, 8, 8 * context->audioFrames * sizeof(int16_t))) { + // printf("Error allocating float buffers\n"); + // return false; + // } // Allocate audio buffers +#ifdef USE_NEON_FORMAT_CONVERSION + if(posix_memalign((void **)&context->audioIn, 16, 2 * context->audioFrames * sizeof(float))) { + printf("Error allocating audio input buffer\n"); + return 1; + } + if(posix_memalign((void **)&context->audioOut, 16, 2 * context->audioFrames * sizeof(float))) { + printf("Error allocating audio output buffer\n"); + return 1; + } +#else context->audioIn = (float *)malloc(2 * context->audioFrames * sizeof(float)); context->audioOut = (float 
*)malloc(2 * context->audioFrames * sizeof(float)); if(context->audioIn == 0 || context->audioOut == 0) { rt_printf("Error: couldn't allocate audio buffers\n"); return 1; } - +#endif + // Allocate analog buffers if(analog_enabled) { +#ifdef USE_NEON_FORMAT_CONVERSION + if(posix_memalign((void **)&context->analogIn, 16, + context->analogChannels * context->analogFrames * sizeof(float))) { + printf("Error allocating analog input buffer\n"); + return 1; + } + if(posix_memalign((void **)&context->analogOut, 16, + context->analogChannels * context->analogFrames * sizeof(float))) { + printf("Error allocating analog output buffer\n"); + return 1; + } + last_analog_out_frame = (float *)malloc(context->analogChannels * sizeof(float)); + + if(last_analog_out_frame == 0) { + rt_printf("Error: couldn't allocate analog persistence buffer\n"); + return 1; + } +#else context->analogIn = (float *)malloc(context->analogChannels * context->analogFrames * sizeof(float)); context->analogOut = (float *)malloc(context->analogChannels * context->analogFrames * sizeof(float)); last_analog_out_frame = (float *)malloc(context->analogChannels * sizeof(float)); @@ -426,7 +478,8 @@ rt_printf("Error: couldn't allocate analog buffers\n"); return 1; } - +#endif + memset(last_analog_out_frame, 0, context->analogChannels * sizeof(float)); } @@ -580,10 +633,14 @@ } // Convert short (16-bit) samples to float - // TODO: NEON - for(unsigned int n = 0; n < 2 * context->audioFrames; n++) +#ifdef USE_NEON_FORMAT_CONVERSION + int16_to_float_audio(2 * context->audioFrames, &pru_buffer_audio_adc[pru_audio_offset], context->audioIn); +#else + for(unsigned int n = 0; n < 2 * context->audioFrames; n++) { context->audioIn[n] = (float)pru_buffer_audio_adc[n + pru_audio_offset] / 32768.0f; - + } +#endif + if(analog_enabled) { if(mux_channels != 0) { // If multiplexer is enabled, find out which channels we have by pulling out @@ -593,9 +650,14 @@ // TODO } - // TODO: NEON - for(unsigned int n = 0; n < 
context->analogChannels * context->analogFrames; n++) +#ifdef USE_NEON_FORMAT_CONVERSION + int16_to_float_analog(context->analogChannels * context->analogFrames, + &pru_buffer_spi_adc[pru_spi_offset], context->analogIn); +#else + for(unsigned int n = 0; n < context->analogChannels * context->analogFrames; n++) { context->analogIn[n] = (float)pru_buffer_spi_adc[n + pru_spi_offset] / 65536.0f; + } +#endif if(context->flags & BELA_FLAG_ANALOG_OUTPUTS_PERSIST) { // Initialize the output buffer with the values that were in the last frame of the previous output @@ -642,12 +704,17 @@ } // Convert float back to short for SPI output +#ifdef USE_NEON_FORMAT_CONVERSION + float_to_int16_analog(context->analogChannels * context->analogFrames, + context->analogOut, (uint16_t*)&pru_buffer_spi_dac[pru_spi_offset]); +#else for(unsigned int n = 0; n < context->analogChannels * context->analogFrames; n++) { int out = context->analogOut[n] * 65536.0f; if(out < 0) out = 0; else if(out > 65535) out = 65535; pru_buffer_spi_dac[n + pru_spi_offset] = (uint16_t)out; } +#endif } if(digital_enabled) { // keep track of past digital values @@ -657,13 +724,16 @@ } // Convert float back to short for audio - // TODO: NEON +#ifdef USE_NEON_FORMAT_CONVERSION + float_to_int16_audio(2 * context->audioFrames, context->audioOut, &pru_buffer_audio_dac[pru_audio_offset]); +#else for(unsigned int n = 0; n < 2 * context->audioFrames; n++) { int out = context->audioOut[n] * 32768.0f; if(out < -32768) out = -32768; else if(out > 32767) out = 32767; pru_buffer_audio_dac[n + pru_audio_offset] = (int16_t)out; } +#endif // Increment total number of samples that have elapsed context->audioFramesElapsed += context->audioFrames;