Merge pull request #1022 from martinling/sgpio-cleanup

M0 SGPIO code cleanup & optimisation
2022-01-11 12:55:24 +00:00
parent aec108be46 98df8c23be
commit 60cfd0fe74
7 changed files with 229 additions and 86 deletions
--- a/firmware/common/LPC43xx_M4_memory.ld
+++ b/firmware/common/LPC43xx_M4_memory.ld
@ -34,6 +34,5 @@ MEMORY
 }

 usb_bulk_buffer = ORIGIN(ram_usb);
-usb_bulk_buffer_offset = ORIGIN(ram_shared);
-usb_bulk_buffer_tx = ORIGIN(ram_shared)+4;
+m0_state = ORIGIN(ram_shared);
 PROVIDE(__ram_m0_start__ = ORIGIN(ram_m0));
--- a/firmware/hackrf_usb/CMakeLists.txt
+++ b/firmware/hackrf_usb/CMakeLists.txt
@ -35,7 +35,6 @@ set(SRC_M4
 	hackrf_usb.c
 	"${PATH_HACKRF_FIRMWARE_COMMON}/tuning.c"
 	"${PATH_HACKRF_FIRMWARE_COMMON}/streaming.c"
-	usb_bulk_buffer.c
 	"${PATH_HACKRF_FIRMWARE_COMMON}/usb.c"
 	"${PATH_HACKRF_FIRMWARE_COMMON}/usb_request.c"
 	"${PATH_HACKRF_FIRMWARE_COMMON}/usb_standard_request.c"
--- a/firmware/hackrf_usb/usb_bulk_buffer.c
+++ b/firmware/hackrf_usb/usb_bulk_buffer.c
@ -1,6 +1,5 @@
 /*
- * Copyright 2012 Jared Boone
- * Copyright 2013 Benjamin Vernoux
+ * Copyright 2022 Great Scott Gadgets
 *
 * This file is part of HackRF.
 *
@ -20,6 +19,18 @@
 * Boston, MA 02110-1301, USA.
 */

-#include "usb_bulk_buffer.h"
+#ifndef __M0_STATE_H__
+#define __M0_STATE_H__

-volatile uint32_t usb_bulk_buffer_offset = 0;
+struct m0_state {
+	uint32_t offset; /* Byte offset into usb_bulk_buffer; the M0 advances it after each SGPIO exchange and wraps it with mask 0x7fff. Must stay at 0x00 (OFFSET in sgpio_m0.s). */
+	uint32_t tx; /* Direction flag; the M0 tests bit 0: set = TX (buffer to SGPIO), clear = RX (SGPIO to buffer). Must stay at 0x04 (TX in sgpio_m0.s). */
+};
+
+/* Address of m0_state is set in ldscripts. If you change the name of this
+ * variable, it won't be where it needs to be in the processor's address space,
+ * unless you also adjust the ldscripts.
+ */
+extern volatile struct m0_state m0_state;
+
+#endif/*__M0_STATE_H__*/
--- a/firmware/hackrf_usb/sgpio_m0.s
+++ b/firmware/hackrf_usb/sgpio_m0.s
@ -1,108 +1,244 @@
 /*
- * This file is part of GreatFET
+ * Copyright 2019-2022 Great Scott Gadgets
 *
- * Specialized SGPIO interrupt handler for Rhododendron.
+ * This file is part of HackRF.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
 */

+/*
+
+Introduction
+============
+
+This file contains the code that runs on the Cortex-M0 core of the LPC43xx.
+
+The M0 core is used to implement all the timing-critical usage of the SGPIO
+peripheral, which interfaces to the MAX5864 ADC/DAC via the CPLD.
+
+The M0 reads or writes 32 bytes at a time from the SGPIO registers,
+transferring these bytes to or from a shared USB bulk buffer. The M4 core
+handles transferring data between this buffer and the USB host.
+
+The SGPIO peripheral is set up and enabled by the M4 core. All the M0 needs to
+do is handle the SGPIO exchange interrupt, which indicates that new data can
+now be read from or written to the SGPIO shadow registers.
+
+Timing
+======
+
+This code has tight timing constraints.
+
+We have to complete a read or write from SGPIO every 163 cycles.
+
+The CPU clock is 204MHz. We exchange 32 bytes at a time in the SGPIO
+registers, which is 16 samples worth of IQ data. At the maximum sample rate of
+20MHz, the SGPIO update rate is 20 / 16 = 1.25MHz. So we have 204 / 1.25 =
+163.2 cycles available.
+
+Access to the SGPIO peripheral is slow, due to the asynchronous bridge that
+connects it to the AHB bus matrix. Section 20.4.1 of the LPC43xx user manual
+(UM10503) specifies the access latencies as:
+
+Read:  4 x MCLK + 4 x CLK_PERIPH_SGPIO
+Write: 4 x MCLK + 2 x CLK_PERIPH_SGPIO
+
+In our case both these clocks are at 204MHz so reads add 8 cycles and writes
+add 6. These are latencies that add to the usual M0 instruction timings, so an
+ldr from SGPIO takes 10 cycles, and an str to SGPIO takes 8 cycles.
+
+These latencies are assumed to apply to all accesses to the SGPIO peripheral's
+address space, which includes its interrupt control registers as well as the
+shadow registers.
+
+There are two key code paths, with the following worst-case timings:
+
+RX:             140 cycles
+TX:             125 cycles
+
+Design
+======
+
+Due to the timing constraints, this code is highly optimised.
+
+This is the only code that runs on the M0, so it does not need to follow
+calling conventions, nor use features of the architecture in standard ways.
+
+The SGPIO handling does not run as an ISR. It polls the interrupt status.
+This saves the cycle costs of interrupt entry and exit, and allows all
+registers to be used freely.
+
+All possible registers, including the stack pointer and link register, can be
+used to store values needed in the code, to minimise memory loads and stores.
+
+There are no function calls. There is no stack usage. All values are in
+registers and fixed memory addresses.
+
+*/

 // Constants that point to registers we'll need to modify in the SGPIO block.
-.equ SGPIO_REGISTER_BLOCK_BASE,            0x40101000
 .equ SGPIO_SHADOW_REGISTERS_BASE,          0x40101100
-.equ SGPIO_EXCHANGE_INTERRUPT_CLEAR_REG,   0x40101F30
-.equ SGPIO_EXCHANGE_INTERRUPT_STATUS_REG,  0x40101F2C
-.equ SGPIO_GPIO_INPUT,                     0x40101210
+.equ SGPIO_EXCHANGE_INTERRUPT_BASE,        0x40101F00

+// Offsets into the interrupt control registers.
+.equ INT_CLEAR,                            0x30
+.equ INT_STATUS,                           0x2C

 // Buffer that we're funneling data to/from.
 .equ TARGET_DATA_BUFFER,                   0x20008000
-.equ TARGET_BUFFER_POSITION,               0x20007000
-.equ TARGET_BUFFER_TX,                     0x20007004
 .equ TARGET_BUFFER_MASK,                   0x7fff

+// Base address of the state structure.
+.equ STATE_BASE,                           0x20007000
+
+// Offsets into the state structure.
+.equ OFFSET,                               0x00
+.equ TX,                                   0x04
+
+// Our slice chain is set up as follows (ascending data age; arrows are reversed for flow):
+//     L  -> F  -> K  -> C -> J  -> E  -> I  -> A
+// Which has equivalent shadow register offsets:
+//     44 -> 20 -> 40 -> 8 -> 36 -> 16 -> 32 -> 0
+.equ SLICE0,                               44
+.equ SLICE1,                               20
+.equ SLICE2,                               40
+.equ SLICE3,                               8
+.equ SLICE4,                               36
+.equ SLICE5,                               16
+.equ SLICE6,                               32
+.equ SLICE7,                               0
+
+/* Allocations of single-use registers */
+
+state             .req r13
+buf_base          .req r12
+buf_mask          .req r11
+sgpio_data        .req r7
+sgpio_int         .req r6
+buf_ptr           .req r5
+
+// Entry point. At this point, the libopencm3 startup code has set things up as
+// normal; .data and .bss are initialised, the stack is set up, etc.  However,
+// we don't actually use any of that.  All the code in this file would work
+// fine if the M0 jumped straight to main at reset.
 .global main
 .thumb_func
-main:
+main:                                                                                           // Cycle counts:
+	// Initialise registers used for constant values.
+	value .req r0
+	ldr sgpio_int, =SGPIO_EXCHANGE_INTERRUPT_BASE   // sgpio_int = SGPIO_INT_BASE           // 2
+	ldr sgpio_data, =SGPIO_SHADOW_REGISTERS_BASE    // sgpio_data = SGPIO_REG_SS            // 2
+	ldr value, =TARGET_DATA_BUFFER                  // value = TARGET_DATA_BUFFER           // 2
+	mov buf_base, value                             // buf_base = value                     // 1
+	ldr value, =TARGET_BUFFER_MASK                  // value = TARGET_BUFFER_MASK           // 2
+	mov buf_mask, value                             // buf_mask = value                     // 1
+	ldr value, =STATE_BASE                          // value = STATE_BASE                   // 2
+	mov state, value                                // state = value                        // 1
+
+	// Initialise state (r0 is free again here, so it is re-aliased as 'zero').
+	zero .req r0
+	mov zero, #0                                    // zero = 0                             // 1
+	str zero, [state, #OFFSET]                      // state.offset = zero                  // 2
+	str zero, [state, #TX]                          // state.tx = zero                      // 2
+
+loop:
+	// The worst case timing is assumed to occur when reading the interrupt
+	// status register *just* misses the flag being set - so we include the
+	// cycles required to check it a second time.
+	//
+	// We also assume that we can spend a full 10 cycles doing an ldr from
+	// SGPIO the first time (2 for ldr, plus 8 for SGPIO-AHB bus latency),
+	// and still miss a flag that was set at the start of those 10 cycles.
+	//
+	// This latter assumption is probably slightly pessimistic, since the
+	// sampling of the flag on the SGPIO side must occur some time after
+	// the ldr instruction begins executing on the M0. However, we avoid
+	// relying on any assumptions about the timing details of a read over
+	// the SGPIO to AHB bridge.
+
+	int_status .req r0
+	scratch .req r1

 	// Spin until we're ready to handle an SGPIO packet:
-	// Grab the exchange interrupt staus...
-	ldr r0, =SGPIO_EXCHANGE_INTERRUPT_STATUS_REG
-	ldr r0, [r0]
+	// Grab the exchange interrupt status...
+	ldr int_status, [sgpio_int, #INT_STATUS]        // int_status = SGPIO_STATUS_1          // 10, twice

-	// ... check to see if it has any interrupt bits set...
-	lsr r0, #1
+	// ... check to see if bit #0 (slice A) was set, by shifting it into the carry bit...
+	lsr scratch, int_status, #1                     // scratch = int_status >> 1            // 1, twice

 	// ... and if not, jump back to the beginning.
-	bcc main
+	bcc loop                                        // if !carry: goto loop                 // 3, then 1

-	// Clear the interrupt pending bits for the SGPIO slices we're working with.
-	ldr r0, =SGPIO_EXCHANGE_INTERRUPT_CLEAR_REG
-	ldr r1, =0xffff
-	str r1, [r0]
-
-	// Grab the base address of the SGPIO shadow registers...
-	ldr r7, =SGPIO_SHADOW_REGISTERS_BASE
+	// Clear the interrupt pending bits that were set.
+	str int_status, [sgpio_int, #INT_CLEAR]         // SGPIO_CLR_STATUS_1 = int_status      // 8

 	// ... and grab the address of the buffer segment we want to write to / read from.
-	ldr r0, =TARGET_DATA_BUFFER       // r0 = &buffer
-	ldr r3, =TARGET_BUFFER_POSITION   // r3 = &position_in_buffer
-	ldr r2, [r3]                      // r2 = position_in_buffer
-	add r6, r0, r2                    // r6 = buffer_target = &buffer + position_in_buffer
+	ldr buf_ptr, [state, #OFFSET]                   // buf_ptr = state.offset               // 2
+	add buf_ptr, buf_base                           // buf_ptr += buf_base                  // 1

-	mov r8, r3                        // Store &position_in_buffer.
-
-	// Our slice chain is set up as follows (ascending data age; arrows are reversed for flow):
-	//     L  -> F  -> K  -> C -> J  -> E  -> I  -> A
-	// Which has equivalent shadow register offsets:
-	//     44 -> 20 -> 40 -> 8 -> 36 -> 16 -> 32 -> 0
+	tx .req r0

 	// Load direction (TX or RX)
-	ldr r0, =TARGET_BUFFER_TX
-	ldr r0, [r0]
+	ldr tx, [state, #TX]                            // tx = state.tx                        // 2

 	// TX?
-	lsr r0, #1
-	bcc direction_rx
+	lsr tx, #1                                      // tx >>= 1                             // 1
+	bcc direction_rx                                // if !carry: goto direction_rx         // 1 thru, 3 taken

 direction_tx:

-	ldm r6!, {r0-r5}
-	str r0,  [r7, #44]
-	str r1,  [r7, #20]
-	str r2,  [r7, #40]
-	str r3,  [r7, #8 ]
-	str r4,  [r7, #36]
-	str r5,  [r7, #16]
+	ldm buf_ptr!, {r0-r3}                           // r0-r3 = buf_ptr[0:16]; buf_ptr += 16 // 5
+	str r0, [sgpio_data, #SLICE0]                   // SGPIO_REG_SS[SLICE0] = r0            // 8
+	str r1, [sgpio_data, #SLICE1]                   // SGPIO_REG_SS[SLICE1] = r1            // 8
+	str r2, [sgpio_data, #SLICE2]                   // SGPIO_REG_SS[SLICE2] = r2            // 8
+	str r3, [sgpio_data, #SLICE3]                   // SGPIO_REG_SS[SLICE3] = r3            // 8

-	ldm r6!, {r0-r1}
-	str r0,  [r7, #32]
-	str r1,  [r7, #0]
+	ldm buf_ptr!, {r0-r3}                           // r0-r3 = buf_ptr[0:16]; buf_ptr += 16 // 5
+	str r0, [sgpio_data, #SLICE4]                   // SGPIO_REG_SS[SLICE4] = r0            // 8
+	str r1, [sgpio_data, #SLICE5]                   // SGPIO_REG_SS[SLICE5] = r1            // 8
+	str r2, [sgpio_data, #SLICE6]                   // SGPIO_REG_SS[SLICE6] = r2            // 8
+	str r3, [sgpio_data, #SLICE7]                   // SGPIO_REG_SS[SLICE7] = r3            // 8

-	b done
+	b done                                          // goto done                            // 3

 direction_rx:

-	// 8 cycles
-	ldr r0,  [r7, #44] // 2
-	ldr r1,  [r7, #20] // 2
-	ldr r2,  [r7, #40] // 2
-	ldr r3,  [r7, #8 ] // 2
-	ldr r4,  [r7, #36] // 2
-	ldr r5,  [r7, #16] // 2
-	stm r6!, {r0-r5}   // 7
+	ldr r0, [sgpio_data, #SLICE0]                   // r0 = SGPIO_REG_SS[SLICE0]            // 10
+	ldr r1, [sgpio_data, #SLICE1]                   // r1 = SGPIO_REG_SS[SLICE1]            // 10
+	ldr r2, [sgpio_data, #SLICE2]                   // r2 = SGPIO_REG_SS[SLICE2]            // 10
+	ldr r3, [sgpio_data, #SLICE3]                   // r3 = SGPIO_REG_SS[SLICE3]            // 10
+	stm buf_ptr!, {r0-r3}                           // buf_ptr[0:16] = r0-r3; buf_ptr += 16 // 5

-	// 6 cycles
-	ldr r0,  [r7, #32] // 2
-	ldr r1,  [r7, #0]  // 2
-	stm r6!, {r0-r1}
+	ldr r0, [sgpio_data, #SLICE4]                   // r0 = SGPIO_REG_SS[SLICE4]            // 10
+	ldr r1, [sgpio_data, #SLICE5]                   // r1 = SGPIO_REG_SS[SLICE5]            // 10
+	ldr r2, [sgpio_data, #SLICE6]                   // r2 = SGPIO_REG_SS[SLICE6]            // 10
+	ldr r3, [sgpio_data, #SLICE7]                   // r3 = SGPIO_REG_SS[SLICE7]            // 10
+	stm buf_ptr!, {r0-r3}                           // buf_ptr[0:16] = r0-r3; buf_ptr += 16 // 5

 done:
+	offset .req r0

 	// Finally, update the buffer location...
-	ldr r0, =TARGET_BUFFER_MASK
-	and r0, r6, r0         // r0 = (position_in_buffer + size_copied) % buffer_size
+	mov offset, buf_mask                            // offset = buf_mask                    // 1
+	and offset, buf_ptr                             // offset &= buf_ptr                    // 1

-	// ... restore &position_in_buffer, and store the new position there...
-	mov r1, r8
-	str r0, [r1]           // position_in_buffer = (position_in_buffer + size_copied) % buffer_size
+	// ... and store the new position.
+	str offset, [state, #OFFSET]                    // state.offset = offset                // 2

-	b main
+	b loop                                          // goto loop                            // 3
+
+// The assembler emits the literal pool here, so add a label for clearer objdump output:
+constants:
--- a/firmware/hackrf_usb/usb_api_sweep.c
+++ b/firmware/hackrf_usb/usb_api_sweep.c
@ -25,6 +25,7 @@
 #include <hackrf_core.h>
 #include "usb_api_transceiver.h"
 #include "usb_bulk_buffer.h"
+#include "m0_state.h"
 #include "tuning.h"
 #include "usb_endpoint.h"
 #include "streaming.h"
@ -99,7 +100,7 @@ void sweep_mode(void) {

 	while (TRANSCEIVER_MODE_RX_SWEEP == transceiver_mode()) {
 		// Set up IN transfer of buffer 0.
-		if ( usb_bulk_buffer_offset >= 16384 && phase == 1) {
+		if ( m0_state.offset >= 16384 && phase == 1) {
 			transfer = true;
 			buffer = &usb_bulk_buffer[0x0000];
 			phase = 0;
@ -107,7 +108,7 @@ void sweep_mode(void) {
 		}

 		// Set up IN transfer of buffer 1.
-		if ( usb_bulk_buffer_offset < 16384 && phase == 0) {
+		if ( m0_state.offset < 16384 && phase == 0) {
 			transfer = true;
 			buffer = &usb_bulk_buffer[0x4000];
 			phase = 1;
--- a/firmware/hackrf_usb/usb_api_transceiver.c
+++ b/firmware/hackrf_usb/usb_api_transceiver.c
@ -27,6 +27,7 @@

 #include <libopencm3/cm3/vector.h>
 #include "usb_bulk_buffer.h"
+#include "m0_state.h"

 #include "usb_api_cpld.h" // Remove when CPLD update is handled elsewhere

@ -262,20 +263,20 @@ void set_transceiver_mode(const transceiver_mode_t new_transceiver_mode) {
 		led_off(LED3);
 		led_on(LED2);
 		rf_path_set_direction(&rf_path, RF_PATH_DIRECTION_RX);
-		usb_bulk_buffer_tx = false;
+		m0_state.tx = false;
 		break;
 	case TRANSCEIVER_MODE_TX:
 		led_off(LED2);
 		led_on(LED3);
 		rf_path_set_direction(&rf_path, RF_PATH_DIRECTION_TX);
-		usb_bulk_buffer_tx = true;
+		m0_state.tx = true;
 		break;
 	case TRANSCEIVER_MODE_OFF:
 	default:
 		led_off(LED2);
 		led_off(LED3);
 		rf_path_set_direction(&rf_path, RF_PATH_DIRECTION_OFF);
-		usb_bulk_buffer_tx = false;
+		m0_state.tx = false;
 	}


@ -284,7 +285,7 @@ void set_transceiver_mode(const transceiver_mode_t new_transceiver_mode) {

        hw_sync_enable(_hw_sync_mode);

-		usb_bulk_buffer_offset = 0;
+		m0_state.offset = 0;
 	}
 }

@ -330,7 +331,7 @@ void rx_mode(void) {

 	while (TRANSCEIVER_MODE_RX == _transceiver_mode) {
 		// Set up IN transfer of buffer 0.
-		if (16384 <= usb_bulk_buffer_offset && 1 == phase) {
+		if (16384 <= m0_state.offset && 1 == phase) {
 			usb_transfer_schedule_block(
 				&usb_endpoint_bulk_in,
 				&usb_bulk_buffer[0x0000],
@ -340,7 +341,7 @@ void rx_mode(void) {
 			phase = 0;
 		}
 		// Set up IN transfer of buffer 1.
-		if (16384 > usb_bulk_buffer_offset && 0 == phase) {
+		if (16384 > m0_state.offset && 0 == phase) {
 			usb_transfer_schedule_block(
 				&usb_endpoint_bulk_in,
 				&usb_bulk_buffer[0x4000],
@ -368,7 +369,7 @@ void tx_mode(void) {

 	while (TRANSCEIVER_MODE_TX == _transceiver_mode) {
 		// Set up OUT transfer of buffer 0.
-		if (16384 <= usb_bulk_buffer_offset && 1 == phase) {
+		if (16384 <= m0_state.offset && 1 == phase) {
 			usb_transfer_schedule_block(
 				&usb_endpoint_bulk_out,
 				&usb_bulk_buffer[0x0000],
@ -378,7 +379,7 @@ void tx_mode(void) {
 			phase = 0;
 		}
 		// Set up OUT transfer of buffer 1.
-		if (16384 > usb_bulk_buffer_offset && 0 == phase) {
+		if (16384 > m0_state.offset && 0 == phase) {
 			usb_transfer_schedule_block(
 				&usb_endpoint_bulk_out,
 				&usb_bulk_buffer[0x4000],
--- a/firmware/hackrf_usb/usb_bulk_buffer.h
+++ b/firmware/hackrf_usb/usb_bulk_buffer.h
@ -32,8 +32,4 @@
 */
 extern uint8_t usb_bulk_buffer[32768];

-extern volatile uint32_t usb_bulk_buffer_offset;
-
-extern bool usb_bulk_buffer_tx;
-
 #endif/*__USB_BULK_BUFFER_H__*/