beaglert: core/PRU.cpp comparison

comparison core/PRU.cpp @ 318:f7b19ea31bbb prerelease

Added NEON vectorised float<->int converters. Curiously, their performance is worse than that of the C version. I guess clang is better at vectorising than we thought! The new code remains disabled for now.

author	andrewm
date	Mon, 30 May 2016 01:06:01 +0100
parents	493a07f6ec09
children	2c4ddf4277d1

comparison

equal deleted inserted replaced

-:52733b58bdf3
+:f7b19ea31bbb
 #include <native/task.h>
 #include <native/timer.h>
 #include <rtdk.h>
 using namespace std;
+// Select whether to use NEON-based sample conversion
+// (this will probably go away in a future commit once its performance
+//  is verified over extended use)
+#undef USE_NEON_FORMAT_CONVERSION
 // PRU memory: PRU0 and PRU1 RAM are 8kB (0x2000) long each
 //             PRU-SHARED RAM is 12kB (0x3000) long
 #define PRU_MEM_MCASP_OFFSET 0x2000  // Offset within PRU-SHARED RAM
 const unsigned int PRU::kPruGPIOTestPin2 = 31;	// GPIO0(31); P9-13
 const unsigned int PRU::kPruGPIOTestPin3 = 26;	// GPIO0(26); P8-14
 extern int gShouldStop;
 extern int gRTAudioVerbose;
+// These four functions are written in assembly in FormatConvert.S
+extern "C" {
+	void int16_to_float_audio(int numSamples, int16_t *inBuffer, float *outBuffer);
+	void int16_to_float_analog(int numSamples, uint16_t *inBuffer, float *outBuffer);
+	void float_to_int16_audio(int numSamples, float *inBuffer, int16_t *outBuffer);
+	void float_to_int16_analog(int numSamples, float *inBuffer, uint16_t *outBuffer);
+}
 // Constructor: takes the internal Bela context; pru_number is initialised to 0
 PRU::PRU(InternalBelaContext *input_context)
 : context(input_context), pru_number(0), running(false), analog_enabled(false),
 digital_enabled(false), gpio_enabled(false), led_enabled(false),
 				close(xenomai_gpio_fd);
 				xenomai_gpio_fd = -1;
 			}
 		}
 	}
+	// TESTING
+	// if(posix_memalign((void **)&testing_float, 16, 8 * context->audioFrames * sizeof(float))) {
+	// 	printf("Error allocating float buffers\n");
+	// 	return false;
+	// }
+	// if(posix_memalign((void **)&testing_int16, 8, 8 * context->audioFrames * sizeof(int16_t))) {
+	// 	printf("Error allocating float buffers\n");
+	// 	return false;
+	// }
 	// Allocate audio buffers
+#ifdef USE_NEON_FORMAT_CONVERSION
+	if(posix_memalign((void **)&context->audioIn, 16, 2 * context->audioFrames * sizeof(float))) {
+		printf("Error allocating audio input buffer\n");
+		return 1;
+	}
+	if(posix_memalign((void **)&context->audioOut, 16, 2 * context->audioFrames * sizeof(float))) {
+		printf("Error allocating audio output buffer\n");
+		return 1;
+	}
+#else
 	context->audioIn = (float *)malloc(2 * context->audioFrames * sizeof(float));
 	context->audioOut = (float *)malloc(2 * context->audioFrames * sizeof(float));
 	if(context->audioIn == 0 || context->audioOut == 0) {
 		rt_printf("Error: couldn't allocate audio buffers\n");
 		return 1;
 	}
+#endif
 	// Allocate analog buffers
 	if(analog_enabled) {
+#ifdef USE_NEON_FORMAT_CONVERSION
+		if(posix_memalign((void **)&context->analogIn, 16,
+							context->analogChannels * context->analogFrames * sizeof(float))) {
+			printf("Error allocating analog input buffer\n");
+			return 1;
+		}
+		if(posix_memalign((void **)&context->analogOut, 16,
+							context->analogChannels * context->analogFrames * sizeof(float))) {
+			printf("Error allocating analog output buffer\n");
+			return 1;
+		}
+		last_analog_out_frame = (float *)malloc(context->analogChannels * sizeof(float));
+		if(last_analog_out_frame == 0) {
+			rt_printf("Error: couldn't allocate analog persistence buffer\n");
+			return 1;
+		}
+#else
 		context->analogIn = (float *)malloc(context->analogChannels * context->analogFrames * sizeof(float));
 		context->analogOut = (float *)malloc(context->analogChannels * context->analogFrames * sizeof(float));
 		last_analog_out_frame = (float *)malloc(context->analogChannels * sizeof(float));
 		if(context->analogIn == 0 || context->analogOut == 0 || last_analog_out_frame == 0) {
 			rt_printf("Error: couldn't allocate analog buffers\n");
 			return 1;
 		}
+#endif
 		memset(last_analog_out_frame, 0, context->analogChannels * sizeof(float));
 	}
 	// Allocate digital buffers
 	digital_buffer0 = pru_buffer_digital;
 			// Set the test pin high
 			xenomai_gpio[GPIO_SETDATAOUT] = TEST_PIN_MASK;
 		}
 		// Convert short (16-bit) samples to float
-		// TODO: NEON
+#ifdef USE_NEON_FORMAT_CONVERSION
-		for(unsigned int n = 0; n < 2 * context->audioFrames; n++)
+		int16_to_float_audio(2 * context->audioFrames, &pru_buffer_audio_adc[pru_audio_offset], context->audioIn);
+#else
+		for(unsigned int n = 0; n < 2 * context->audioFrames; n++) {
 			context->audioIn[n] = (float)pru_buffer_audio_adc[n + pru_audio_offset] / 32768.0f;
+		}
+#endif
 		if(analog_enabled) {
 			if(mux_channels != 0) {
 				// If the multiplexer is enabled, work out which channels we have from the
 				// channel on which the multiplexer ended.
 				// int lastMuxChannel = pru_buffer_comm[PRU_MUX_END_CHANNEL];
 				// TODO
 			}
-			// TODO: NEON
+#ifdef USE_NEON_FORMAT_CONVERSION
-			for(unsigned int n = 0; n < context->analogChannels * context->analogFrames; n++)
+			int16_to_float_analog(context->analogChannels * context->analogFrames,
+									&pru_buffer_spi_adc[pru_spi_offset], context->analogIn);
+#else
+			for(unsigned int n = 0; n < context->analogChannels * context->analogFrames; n++) {
 				context->analogIn[n] = (float)pru_buffer_spi_adc[n + pru_spi_offset] / 65536.0f;
+			}
+#endif
 			if(context->flags & BELA_FLAG_ANALOG_OUTPUTS_PERSIST) {
 				// Initialize the output buffer with the values that were in the last frame of the previous output
 				for(unsigned int ch = 0; ch < context->analogChannels; ch++){
 					for(unsigned int n = 0; n < context->analogFrames; n++){
 					last_analog_out_frame[ch] = context->analogOut[context->analogChannels * (context->analogFrames - 1) + ch];
 				}
 			}
 			// Convert float back to short for SPI output
+#ifdef USE_NEON_FORMAT_CONVERSION
+			float_to_int16_analog(context->analogChannels * context->analogFrames,
+								  context->analogOut, (uint16_t*)&pru_buffer_spi_dac[pru_spi_offset]);
+#else
 			for(unsigned int n = 0; n < context->analogChannels * context->analogFrames; n++) {
 				int out = context->analogOut[n] * 65536.0f;
 				if(out < 0) out = 0;
 				else if(out > 65535) out = 65535;
 				pru_buffer_spi_dac[n + pru_spi_offset] = (uint16_t)out;
 			}
+#endif
 		}
 		if(digital_enabled) { // keep track of past digital values
 			for(unsigned int n = 0; n < context->digitalFrames; n++){
 				last_digital_buffer[n] = context->digital[n];
 			}
 		}
 // Convert float back to short for audio
-		// TODO: NEON
+#ifdef USE_NEON_FORMAT_CONVERSION
+		float_to_int16_audio(2 * context->audioFrames, context->audioOut, &pru_buffer_audio_dac[pru_audio_offset]);
+#else
 		for(unsigned int n = 0; n < 2 * context->audioFrames; n++) {
 			int out = context->audioOut[n] * 32768.0f;
 			if(out < -32768) out = -32768;
 			else if(out > 32767) out = 32767;
 			pru_buffer_audio_dac[n + pru_audio_offset] = (int16_t)out;
 		}
+#endif
 		// Increment total number of samples that have elapsed
 		context->audioFramesElapsed += context->audioFrames;
 		if(xenomai_gpio != 0) {

Mercurial > hg > beaglert

comparison core/PRU.cpp @ 318:f7b19ea31bbb prerelease