Merge pull request #1022 from martinling/sgpio-cleanup

M0 SGPIO code cleanup & optimisation
2022-01-11 12:55:24 +00:00
parent aec108be46 98df8c23be
commit 60cfd0fe74
7 changed files with 229 additions and 86 deletions
--- a/firmware/common/LPC43xx_M4_memory.ld
+++ b/firmware/common/LPC43xx_M4_memory.ld
@ -34,6 +34,5 @@ MEMORY
 }
 usb_bulk_buffer = ORIGIN(ram_usb);
-usb_bulk_buffer_offset = ORIGIN(ram_shared);
+m0_state = ORIGIN(ram_shared);
 usb_bulk_buffer_tx = ORIGIN(ram_shared)+4;
 PROVIDE(__ram_m0_start__ = ORIGIN(ram_m0));
--- a/firmware/hackrf_usb/CMakeLists.txt
+++ b/firmware/hackrf_usb/CMakeLists.txt
@ -35,7 +35,6 @@ set(SRC_M4
 	hackrf_usb.c
 	"${PATH_HACKRF_FIRMWARE_COMMON}/tuning.c"
 	"${PATH_HACKRF_FIRMWARE_COMMON}/streaming.c"
 	usb_bulk_buffer.c
 	"${PATH_HACKRF_FIRMWARE_COMMON}/usb.c"
 	"${PATH_HACKRF_FIRMWARE_COMMON}/usb_request.c"
 	"${PATH_HACKRF_FIRMWARE_COMMON}/usb_standard_request.c"
--- a/firmware/hackrf_usb/usb_bulk_buffer.c
+++ b/firmware/hackrf_usb/usb_bulk_buffer.c
@ -1,6 +1,5 @@
 /*
- * Copyright 2012 Jared Boone
+ * Copyright 2022 Great Scott Gadgets
 * Copyright 2013 Benjamin Vernoux
 *
 * This file is part of HackRF.
 *
@ -20,6 +19,18 @@
 * Boston, MA 02110-1301, USA.
 */
-#include "usb_bulk_buffer.h"
+#ifndef __M0_STATE_H__
 #define __M0_STATE_H__
-volatile uint32_t usb_bulk_buffer_offset = 0;
+struct m0_state {
 	uint32_t offset;
 	uint32_t tx;
 };
 /* Address of m0_state is set in ldscripts. If you change the name of this
 * variable, it won't be where it needs to be in the processor's address space,
 * unless you also adjust the ldscripts.
 */
 extern volatile struct m0_state m0_state;
 #endif/*__M0_STATE_H__*/
--- a/firmware/hackrf_usb/sgpio_m0.s
+++ b/firmware/hackrf_usb/sgpio_m0.s
@ -1,108 +1,244 @@
 /*
- * This file is part of GreatFET
+ * Copyright 2019-2022 Great Scott Gadgets
 *
- * Specialized SGPIO interrupt handler for Rhododendron.
+ * This file is part of HackRF.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */
 /*
 Introduction
 ============
 This file contains the code that runs on the Cortex-M0 core of the LPC43xx.
 The M0 core is used to implement all the timing-critical usage of the SGPIO
 peripheral, which interfaces to the MAX5864 ADC/DAC via the CPLD.
 The M0 reads or writes 32 bytes at a time from the SGPIO registers,
 transferring these bytes to or from a shared USB bulk buffer. The M4 core
 handles transferring data between this buffer and the USB host.
 The SGPIO peripheral is set up and enabled by the M4 core. All the M0 needs to
 do is handle the SGPIO exchange interrupt, which indicates that new data can
 now be read from or written to the SGPIO shadow registers.
 Timing
 ======
 This code has tight timing constraints.
 We have to complete a read or write from SGPIO every 163 cycles.
 The CPU clock is 204MHz. We exchange 32 bytes at a time in the SGPIO
 registers, which is 16 samples worth of IQ data. At the maximum sample rate of
 20MHz, the SGPIO update rate is 20 / 16 = 1.25MHz. So we have 204 / 1.25 =
 163.2 cycles available.
 Access to the SGPIO peripheral is slow, due to the asynchronous bridge that
 connects it to the AHB bus matrix. Section 20.4.1 of the LPC43xx user manual
 (UM10503) specifies the access latencies as:
 Read:  4 x MCLK + 4 x CLK_PERIPH_SGPIO
 Write: 4 x MCLK + 2 x CLK_PERIPH_SGPIO
 In our case both these clocks are at 204MHz so reads add 8 cycles and writes
 add 6. These are latencies that add to the usual M0 instruction timings, so an
 ldr from SGPIO takes 10 cycles, and an str to SGPIO takes 8 cycles.
 These latencies are assumed to apply to all accesses to the SGPIO peripheral's
 address space, which includes its interrupt control registers as well as the
 shadow registers.
 There are two key code paths, with the following worst-case timings:
 RX:             140 cycles
 TX:             125 cycles
 Design
 ======
 Due to the timing constraints, this code is highly optimised.
 This is the only code that runs on the M0, so it does not need to follow
 calling conventions, nor use features of the architecture in standard ways.
 The SGPIO handling does not run as an ISR. It polls the interrupt status.
 This saves the cycle costs of interrupt entry and exit, and allows all
 registers to be used freely.
 All possible registers, including the stack pointer and link register, can be
 used to store values needed in the code, to minimise memory loads and stores.
 There are no function calls. There is no stack usage. All values are in
 registers and fixed memory addresses.
 */
 // Constants that point to registers we'll need to modify in the SGPIO block.
 .equ SGPIO_REGISTER_BLOCK_BASE,            0x40101000
 .equ SGPIO_SHADOW_REGISTERS_BASE,          0x40101100
-.equ SGPIO_EXCHANGE_INTERRUPT_CLEAR_REG,   0x40101F30
+.equ SGPIO_EXCHANGE_INTERRUPT_BASE,        0x40101F00
 .equ SGPIO_EXCHANGE_INTERRUPT_STATUS_REG,  0x40101F2C
 .equ SGPIO_GPIO_INPUT,                     0x40101210
 // Offsets into the interrupt control registers.
 .equ INT_CLEAR,                            0x30
 .equ INT_STATUS,                           0x2C
 // Buffer that we're funneling data to/from.
 .equ TARGET_DATA_BUFFER,                   0x20008000
 .equ TARGET_BUFFER_POSITION,               0x20007000
 .equ TARGET_BUFFER_TX,                     0x20007004
 .equ TARGET_BUFFER_MASK,                   0x7fff
 // Base address of the state structure.
 .equ STATE_BASE,                           0x20007000
 // Offsets into the state structure.
 .equ OFFSET,                               0x00
 .equ TX,                                   0x04
 // Our slice chain is set up as follows (ascending data age; arrows are reversed for flow):
 //     L  -> F  -> K  -> C -> J  -> E  -> I  -> A
 // Which has equivalent shadow register offsets:
 //     44 -> 20 -> 40 -> 8 -> 36 -> 16 -> 32 -> 0
 .equ SLICE0,                               44
 .equ SLICE1,                               20
 .equ SLICE2,                               40
 .equ SLICE3,                               8
 .equ SLICE4,                               36
 .equ SLICE5,                               16
 .equ SLICE6,                               32
 .equ SLICE7,                               0
 /* Allocations of single-use registers */
 state             .req r13
 buf_base          .req r12
 buf_mask          .req r11
 sgpio_data        .req r7
 sgpio_int         .req r6
 buf_ptr           .req r5
 // Entry point. At this point, the libopencm3 startup code has set things up as
 // normal; .data and .bss are initialised, the stack is set up, etc.  However,
 // we don't actually use any of that.  All the code in this file would work
 // fine if the M0 jumped straight to main at reset.
 .global main
 .thumb_func
-main:
+main:                                                                                           // Cycle counts:
 	// Initialise registers used for constant values.
 	value .req r0
 	ldr sgpio_int, =SGPIO_EXCHANGE_INTERRUPT_BASE   // sgpio_int = SGPIO_INT_BASE           // 2
 	ldr sgpio_data, =SGPIO_SHADOW_REGISTERS_BASE    // sgpio_data = SGPIO_REG_SS            // 2
 	ldr value, =TARGET_DATA_BUFFER                  // value = TARGET_DATA_BUFFER           // 2
 	mov buf_base, value                             // buf_base = value                     // 1
 	ldr value, =TARGET_BUFFER_MASK                  // value = TARGET_DATA_MASK             // 2
 	mov buf_mask, value                             // buf_mask = value                     // 1
 	ldr value, =STATE_BASE                          // value = STATE_BASE                   // 2
 	mov state, value                                // state = value                        // 1
 	// Initialise state.
 	zero .req r0
 	mov zero, #0                                    // zero = 0                             // 1
 	str zero, [state, #OFFSET]                      // state.offset = zero                  // 2
 	str zero, [state, #TX]                          // state.tx = zero                      // 2
 loop:
 	// The worst case timing is assumed to occur when reading the interrupt
 	// status register *just* misses the flag being set - so we include the
 	// cycles required to check it a second time.
 	//
 	// We also assume that we can spend a full 10 cycles doing an ldr from
 	// SGPIO the first time (2 for ldr, plus 8 for SGPIO-AHB bus latency),
 	// and still miss a flag that was set at the start of those 10 cycles.
 	//
 	// This latter assumption is probably slightly pessimistic, since the
 	// sampling of the flag on the SGPIO side must occur some time after
 	// the ldr instruction begins executing on the M0. However, we avoid
 	// relying on any assumptions about the timing details of a read over
 	// the SGPIO to AHB bridge.
 	int_status .req r0
 	scratch .req r1
 	// Spin until we're ready to handle an SGPIO packet:
-	// Grab the exchange interrupt staus...
+	// Grab the exchange interrupt status...
-	ldr r0, =SGPIO_EXCHANGE_INTERRUPT_STATUS_REG
+	ldr int_status, [sgpio_int, #INT_STATUS]        // int_status = SGPIO_STATUS_1          // 10, twice
 	ldr r0, [r0]
-	// ... check to see if it has any interrupt bits set...
+	// ... check to see if bit #0 (slice A) was set, by shifting it into the carry bit...
-	lsr r0, #1
+	lsr scratch, int_status, #1                     // scratch = int_status >> 1            // 1, twice
 	// ... and if not, jump back to the beginning.
-	bcc main
+	bcc loop                                        // if !carry: goto loop                 // 3, then 1
-	// Clear the interrupt pending bits for the SGPIO slices we're working with.
+	// Clear the interrupt pending bits that were set.
-	ldr r0, =SGPIO_EXCHANGE_INTERRUPT_CLEAR_REG
+	str int_status, [sgpio_int, #INT_CLEAR]         // SGPIO_CLR_STATUS_1 = int_status      // 8
 	ldr r1, =0xffff
 	str r1, [r0]
 	// Grab the base address of the SGPIO shadow registers...
 	ldr r7, =SGPIO_SHADOW_REGISTERS_BASE
 	// ... and grab the address of the buffer segment we want to write to / read from.
-	ldr r0, =TARGET_DATA_BUFFER       // r0 = &buffer
+	ldr buf_ptr, [state, #OFFSET]                   // buf_ptr = state.offset               // 2
-	ldr r3, =TARGET_BUFFER_POSITION   // r3 = &position_in_buffer
+	add buf_ptr, buf_base                           // buf_ptr += buf_base                  // 1
 	ldr r2, [r3]                      // r2 = position_in_buffer
 	add r6, r0, r2                    // r6 = buffer_target = &buffer + position_in_buffer
-	mov r8, r3                        // Store &position_in_buffer.
+	tx .req r0
 	// Our slice chain is set up as follows (ascending data age; arrows are reversed for flow):
 	//     L  -> F  -> K  -> C -> J  -> E  -> I  -> A
 	// Which has equivalent shadow register offsets:
 	//     44 -> 20 -> 40 -> 8 -> 36 -> 16 -> 32 -> 0
 	// Load direction (TX or RX)
-	ldr r0, =TARGET_BUFFER_TX
+	ldr tx, [state, #TX]                            // tx = state.tx                        // 2
 	ldr r0, [r0]
 	// TX?
-	lsr r0, #1
+	lsr tx, #1                                      // tx >>= 1                             // 1
-	bcc direction_rx
+	bcc direction_rx                                // if !carry: goto direction_rx         // 1 thru, 3 taken
 direction_tx:
-	ldm r6!, {r0-r5}
+	ldm buf_ptr!, {r0-r3}                           // r0-r3 = buf_ptr[0:16]; buf_ptr += 16 // 5
-	str r0,  [r7, #44]
+	str r0, [sgpio_data, #SLICE0]                   // SGPIO_REG_SS[SLICE0] = r0            // 8
-	str r1,  [r7, #20]
+	str r1, [sgpio_data, #SLICE1]                   // SGPIO_REG_SS[SLICE1] = r1            // 8
-	str r2,  [r7, #40]
+	str r2, [sgpio_data, #SLICE2]                   // SGPIO_REG_SS[SLICE2] = r2            // 8
-	str r3,  [r7, #8 ]
+	str r3, [sgpio_data, #SLICE3]                   // SGPIO_REG_SS[SLICE3] = r3            // 8
 	str r4,  [r7, #36]
 	str r5,  [r7, #16]
-	ldm r6!, {r0-r1}
+	ldm buf_ptr!, {r0-r3}                           // r0-r3 = buf_ptr[0:16]; buf_ptr += 16 // 5
-	str r0,  [r7, #32]
+	str r0, [sgpio_data, #SLICE4]                   // SGPIO_REG_SS[SLICE4] = r0            // 8
-	str r1,  [r7, #0]
+	str r1, [sgpio_data, #SLICE5]                   // SGPIO_REG_SS[SLICE5] = r1            // 8
 	str r2, [sgpio_data, #SLICE6]                   // SGPIO_REG_SS[SLICE6] = r2            // 8
 	str r3, [sgpio_data, #SLICE7]                   // SGPIO_REG_SS[SLICE7] = r3            // 8
-	b done
+	b done                                          // goto done                            // 3
 direction_rx:
-	// 8 cycles
+	ldr r0, [sgpio_data, #SLICE0]                   // r0 = SGPIO_REG_SS[SLICE0]            // 10
-	ldr r0,  [r7, #44] // 2
+	ldr r1, [sgpio_data, #SLICE1]                   // r1 = SGPIO_REG_SS[SLICE1]            // 10
-	ldr r1,  [r7, #20] // 2
+	ldr r2, [sgpio_data, #SLICE2]                   // r2 = SGPIO_REG_SS[SLICE2]            // 10
-	ldr r2,  [r7, #40] // 2
+	ldr r3, [sgpio_data, #SLICE3]                   // r3 = SGPIO_REG_SS[SLICE3]            // 10
-	ldr r3,  [r7, #8 ] // 2
+	stm buf_ptr!, {r0-r3}                           // buf_ptr[0:16] = r0-r3; buf_ptr += 16 // 5
 	ldr r4,  [r7, #36] // 2
 	ldr r5,  [r7, #16] // 2
 	stm r6!, {r0-r5}   // 7
-	// 6 cycles
+	ldr r0, [sgpio_data, #SLICE4]                   // r0 = SGPIO_REG_SS[SLICE4]            // 10
-	ldr r0,  [r7, #32] // 2
+	ldr r1, [sgpio_data, #SLICE5]                   // r1 = SGPIO_REG_SS[SLICE5]            // 10
-	ldr r1,  [r7, #0]  // 2
+	ldr r2, [sgpio_data, #SLICE6]                   // r2 = SGPIO_REG_SS[SLICE6]            // 10
-	stm r6!, {r0-r1}
+	ldr r3, [sgpio_data, #SLICE7]                   // r3 = SGPIO_REG_SS[SLICE7]            // 10
 	stm buf_ptr!, {r0-r3}                           // buf_ptr[0:16] = r0-r3; buf_ptr += 16 // 5
 done:
 	offset .req r0
 	// Finally, update the buffer location...
-	ldr r0, =TARGET_BUFFER_MASK
+	mov offset, buf_mask                            // offset = buf_mask                    // 1
-	and r0, r6, r0         // r0 = (position_in_buffer + size_copied) % buffer_size
+	and offset, buf_ptr                             // offset &= buf_ptr                    // 1
-	// ... restore &position_in_buffer, and store the new position there...
+	// ... and store the new position.
-	mov r1, r8
+	str offset, [state, #OFFSET]                    // state.offset = offset                // 2
 	str r0, [r1]           // position_in_buffer = (position_in_buffer + size_copied) % buffer_size
-	b main
+	b loop                                          // goto loop                            // 3
 // The linker will put a literal pool here, so add a label for clearer objdump output:
 constants:
--- a/firmware/hackrf_usb/usb_api_sweep.c
+++ b/firmware/hackrf_usb/usb_api_sweep.c
@ -25,6 +25,7 @@
 #include <hackrf_core.h>
 #include "usb_api_transceiver.h"
 #include "usb_bulk_buffer.h"
 #include "m0_state.h"
 #include "tuning.h"
 #include "usb_endpoint.h"
 #include "streaming.h"
@ -99,7 +100,7 @@ void sweep_mode(void) {
 	while (TRANSCEIVER_MODE_RX_SWEEP == transceiver_mode()) {
 		// Set up IN transfer of buffer 0.
-		if ( usb_bulk_buffer_offset >= 16384 && phase == 1) {
+		if ( m0_state.offset >= 16384 && phase == 1) {
 			transfer = true;
 			buffer = &usb_bulk_buffer[0x0000];
 			phase = 0;
@ -107,7 +108,7 @@ void sweep_mode(void) {
 		}
 		// Set up IN transfer of buffer 1.
-		if ( usb_bulk_buffer_offset < 16384 && phase == 0) {
+		if ( m0_state.offset < 16384 && phase == 0) {
 			transfer = true;
 			buffer = &usb_bulk_buffer[0x4000];
 			phase = 1;
--- a/firmware/hackrf_usb/usb_api_transceiver.c
+++ b/firmware/hackrf_usb/usb_api_transceiver.c
@ -27,6 +27,7 @@
 #include <libopencm3/cm3/vector.h>
 #include "usb_bulk_buffer.h"
 #include "m0_state.h"
 #include "usb_api_cpld.h" // Remove when CPLD update is handled elsewhere
@ -262,20 +263,20 @@ void set_transceiver_mode(const transceiver_mode_t new_transceiver_mode) {
 		led_off(LED3);
 		led_on(LED2);
 		rf_path_set_direction(&rf_path, RF_PATH_DIRECTION_RX);
-		usb_bulk_buffer_tx = false;
+		m0_state.tx = false;
 		break;
 	case TRANSCEIVER_MODE_TX:
 		led_off(LED2);
 		led_on(LED3);
 		rf_path_set_direction(&rf_path, RF_PATH_DIRECTION_TX);
-		usb_bulk_buffer_tx = true;
+		m0_state.tx = true;
 		break;
 	case TRANSCEIVER_MODE_OFF:
 	default:
 		led_off(LED2);
 		led_off(LED3);
 		rf_path_set_direction(&rf_path, RF_PATH_DIRECTION_OFF);
-		usb_bulk_buffer_tx = false;
+		m0_state.tx = false;
 	}
@ -284,7 +285,7 @@ void set_transceiver_mode(const transceiver_mode_t new_transceiver_mode) {
        hw_sync_enable(_hw_sync_mode);
-		usb_bulk_buffer_offset = 0;
+		m0_state.offset = 0;
 	}
 }
@ -330,7 +331,7 @@ void rx_mode(void) {
 	while (TRANSCEIVER_MODE_RX == _transceiver_mode) {
 		// Set up IN transfer of buffer 0.
-		if (16384 <= usb_bulk_buffer_offset && 1 == phase) {
+		if (16384 <= m0_state.offset && 1 == phase) {
 			usb_transfer_schedule_block(
 				&usb_endpoint_bulk_in,
 				&usb_bulk_buffer[0x0000],
@ -340,7 +341,7 @@ void rx_mode(void) {
 			phase = 0;
 		}
 		// Set up IN transfer of buffer 1.
-		if (16384 > usb_bulk_buffer_offset && 0 == phase) {
+		if (16384 > m0_state.offset && 0 == phase) {
 			usb_transfer_schedule_block(
 				&usb_endpoint_bulk_in,
 				&usb_bulk_buffer[0x4000],
@ -368,7 +369,7 @@ void tx_mode(void) {
 	while (TRANSCEIVER_MODE_TX == _transceiver_mode) {
 		// Set up OUT transfer of buffer 0.
-		if (16384 <= usb_bulk_buffer_offset && 1 == phase) {
+		if (16384 <= m0_state.offset && 1 == phase) {
 			usb_transfer_schedule_block(
 				&usb_endpoint_bulk_out,
 				&usb_bulk_buffer[0x0000],
@ -378,7 +379,7 @@ void tx_mode(void) {
 			phase = 0;
 		}
 		// Set up OUT transfer of buffer 1.
-		if (16384 > usb_bulk_buffer_offset && 0 == phase) {
+		if (16384 > m0_state.offset && 0 == phase) {
 			usb_transfer_schedule_block(
 				&usb_endpoint_bulk_out,
 				&usb_bulk_buffer[0x4000],
--- a/firmware/hackrf_usb/usb_bulk_buffer.h
+++ b/firmware/hackrf_usb/usb_bulk_buffer.h
@ -32,8 +32,4 @@
 */
 extern uint8_t usb_bulk_buffer[32768];
 extern volatile uint32_t usb_bulk_buffer_offset;
 extern bool usb_bulk_buffer_tx;
 #endif/*__USB_BULK_BUFFER_H__*/