diff core/PRU.cpp @ 318:f7b19ea31bbb prerelease

Added NEON vectorised float<->int converters. Curiously, their performance is worse than that of the C version. I guess clang is better at vectorising than we thought! The new code remains disabled for now.
author andrewm
date Mon, 30 May 2016 01:06:01 +0100
parents 493a07f6ec09
children 2c4ddf4277d1
line wrap: on
line diff
--- a/core/PRU.cpp	Sat May 28 01:23:56 2016 +0100
+++ b/core/PRU.cpp	Mon May 30 01:06:01 2016 +0100
@@ -37,6 +37,11 @@
 
 using namespace std;
 
+// Select whether to use NEON-based sample conversion
+// (this will probably go away in a future commit once its performance
+//  is verified over extended use)
+#undef USE_NEON_FORMAT_CONVERSION
+
 // PRU memory: PRU0 and PRU1 RAM are 8kB (0x2000) long each
 //             PRU-SHARED RAM is 12kB (0x3000) long
 
@@ -110,6 +115,14 @@
 extern int gShouldStop;
 extern int gRTAudioVerbose;
 
+// These four functions are written in assembly in FormatConvert.S
+extern "C" {
+	void int16_to_float_audio(int numSamples, int16_t *inBuffer, float *outBuffer);
+	void int16_to_float_analog(int numSamples, uint16_t *inBuffer, float *outBuffer);
+	void float_to_int16_audio(int numSamples, float *inBuffer, int16_t *outBuffer);
+	void float_to_int16_analog(int numSamples, float *inBuffer, uint16_t *outBuffer);
+}
+
 // Constructor: specify a PRU number (0 or 1)
 PRU::PRU(InternalBelaContext *input_context)
 : context(input_context), pru_number(0), running(false), analog_enabled(false),
@@ -407,17 +420,56 @@
 			}
 		}
 	}
+	
+	// TESTING
+	// if(posix_memalign((void **)&testing_float, 16, 8 * context->audioFrames * sizeof(float))) {
+	// 	printf("Error allocating float buffers\n");
+	// 	return false;
+	// }
+	// if(posix_memalign((void **)&testing_int16, 8, 8 * context->audioFrames * sizeof(int16_t))) {
+	// 	printf("Error allocating int16 buffers\n");
+	// 	return false;
+	// }
 
 	// Allocate audio buffers
+#ifdef USE_NEON_FORMAT_CONVERSION
+	if(posix_memalign((void **)&context->audioIn, 16, 2 * context->audioFrames * sizeof(float))) {
+		printf("Error allocating audio input buffer\n");
+		return 1;
+	}
+	if(posix_memalign((void **)&context->audioOut, 16, 2 * context->audioFrames * sizeof(float))) {
+		printf("Error allocating audio output buffer\n");
+		return 1;
+	}
+#else
 	context->audioIn = (float *)malloc(2 * context->audioFrames * sizeof(float));
 	context->audioOut = (float *)malloc(2 * context->audioFrames * sizeof(float));
 	if(context->audioIn == 0 || context->audioOut == 0) {
 		rt_printf("Error: couldn't allocate audio buffers\n");
 		return 1;
 	}
-
+#endif
+	
 	// Allocate analog buffers
 	if(analog_enabled) {
+#ifdef USE_NEON_FORMAT_CONVERSION
+		if(posix_memalign((void **)&context->analogIn, 16, 
+							context->analogChannels * context->analogFrames * sizeof(float))) {
+			printf("Error allocating analog input buffer\n");
+			return 1;
+		}
+		if(posix_memalign((void **)&context->analogOut, 16, 
+							context->analogChannels * context->analogFrames * sizeof(float))) {
+			printf("Error allocating analog output buffer\n");
+			return 1;
+		}
+		last_analog_out_frame = (float *)malloc(context->analogChannels * sizeof(float));
+
+		if(last_analog_out_frame == 0) {
+			rt_printf("Error: couldn't allocate analog persistence buffer\n");
+			return 1;
+		}		
+#else
 		context->analogIn = (float *)malloc(context->analogChannels * context->analogFrames * sizeof(float));
 		context->analogOut = (float *)malloc(context->analogChannels * context->analogFrames * sizeof(float));
 		last_analog_out_frame = (float *)malloc(context->analogChannels * sizeof(float));
@@ -426,7 +478,8 @@
 			rt_printf("Error: couldn't allocate analog buffers\n");
 			return 1;
 		}
-
+#endif
+		
 		memset(last_analog_out_frame, 0, context->analogChannels * sizeof(float));
 	}
 
@@ -580,10 +633,14 @@
 		}
 
 		// Convert short (16-bit) samples to float
-		// TODO: NEON
-		for(unsigned int n = 0; n < 2 * context->audioFrames; n++)
+#ifdef USE_NEON_FORMAT_CONVERSION
+		int16_to_float_audio(2 * context->audioFrames, &pru_buffer_audio_adc[pru_audio_offset], context->audioIn);
+#else
+		for(unsigned int n = 0; n < 2 * context->audioFrames; n++) {
 			context->audioIn[n] = (float)pru_buffer_audio_adc[n + pru_audio_offset] / 32768.0f;
-
+		}
+#endif
+		
 		if(analog_enabled) {
 			if(mux_channels != 0) {
 				// If multiplexer is enabled, find out which channels we have by pulling out
@@ -593,9 +650,14 @@
 				// TODO
 			}
 			
-			// TODO: NEON
-			for(unsigned int n = 0; n < context->analogChannels * context->analogFrames; n++)
+#ifdef USE_NEON_FORMAT_CONVERSION
+			int16_to_float_analog(context->analogChannels * context->analogFrames, 
+									&pru_buffer_spi_adc[pru_spi_offset], context->analogIn);
+#else	
+			for(unsigned int n = 0; n < context->analogChannels * context->analogFrames; n++) {
 				context->analogIn[n] = (float)pru_buffer_spi_adc[n + pru_spi_offset] / 65536.0f;
+			}
+#endif
 
 			if(context->flags & BELA_FLAG_ANALOG_OUTPUTS_PERSIST) {
 				// Initialize the output buffer with the values that were in the last frame of the previous output
@@ -642,12 +704,17 @@
 			}
 
 			// Convert float back to short for SPI output
+#ifdef USE_NEON_FORMAT_CONVERSION
+			float_to_int16_analog(context->analogChannels * context->analogFrames, 
+								  context->analogOut, (uint16_t*)&pru_buffer_spi_dac[pru_spi_offset]);
+#else		
 			for(unsigned int n = 0; n < context->analogChannels * context->analogFrames; n++) {
 				int out = context->analogOut[n] * 65536.0f;
 				if(out < 0) out = 0;
 				else if(out > 65535) out = 65535;
 				pru_buffer_spi_dac[n + pru_spi_offset] = (uint16_t)out;
 			}
+#endif
 		}
 
 		if(digital_enabled) { // keep track of past digital values
@@ -657,13 +724,16 @@
 		}
 
         // Convert float back to short for audio
-		// TODO: NEON
+#ifdef USE_NEON_FORMAT_CONVERSION
+		float_to_int16_audio(2 * context->audioFrames, context->audioOut, &pru_buffer_audio_dac[pru_audio_offset]);
+#else	
 		for(unsigned int n = 0; n < 2 * context->audioFrames; n++) {
 			int out = context->audioOut[n] * 32768.0f;
 			if(out < -32768) out = -32768;
 			else if(out > 32767) out = 32767;
 			pru_buffer_audio_dac[n + pru_audio_offset] = (int16_t)out;
 		}
+#endif
 
 		// Increment total number of samples that have elapsed
 		context->audioFramesElapsed += context->audioFrames;