From 3d9802260e7f1fecf9bc75fddb87f810848869ba Mon Sep 17 00:00:00 2001 From: Martin Ling Date: Mon, 20 Dec 2021 10:55:22 +0000 Subject: [PATCH] Document purpose and timing of existing M0 code. This commit does not modify the code; it only updates comments. --- firmware/hackrf_usb/sgpio_m0.s | 195 +++++++++++++++++++++++++-------- 1 file changed, 147 insertions(+), 48 deletions(-) diff --git a/firmware/hackrf_usb/sgpio_m0.s b/firmware/hackrf_usb/sgpio_m0.s index 9bbca574..9423d363 100644 --- a/firmware/hackrf_usb/sgpio_m0.s +++ b/firmware/hackrf_usb/sgpio_m0.s @@ -1,9 +1,93 @@ /* - * This file is part of GreatFET + * Copyright 2019-2022 Great Scott Gadgets * - * Specialized SGPIO interrupt handler for Rhododendron. + * This file is part of HackRF. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. */ +/* + +Introduction +============ + +This file contains the code that runs on the Cortex-M0 core of the LPC43xx. + +The M0 core is used to implement all the timing-critical usage of the SGPIO +peripheral, which interfaces to the MAX5864 ADC/DAC via the CPLD. + +The M0 reads or writes 32 bytes at a time from the SGPIO registers, +transferring these bytes to or from a shared USB bulk buffer. The M4 core +handles transferring data between this buffer and the USB host. + +The SGPIO peripheral is set up and enabled by the M4 core. All the M0 needs to +do is handle the SGPIO exchange interrupt, which indicates that new data can +now be read from or written to the SGPIO shadow registers. + +Timing +====== + +This code has tight timing constraints. + +We have to complete a read or write from SGPIO every 163 cycles. + +The CPU clock is 204MHz. We exchange 32 bytes at a time in the SGPIO +registers, which is 16 samples worth of IQ data. At the maximum sample rate of +20MHz, the SGPIO update rate is 20 / 16 = 1.25MHz. So we have 204 / 1.25 = +163.2 cycles available. + +Access to the SGPIO peripheral is slow, due to the asynchronous bridge that +connects it to the AHB bus matrix. Section 20.4.1 of the LPC43xx user manual +(UM10503) specifies the access latencies as: + +Read: 4 x MCLK + 4 x CLK_PERIPH_SGPIO +Write: 4 x MCLK + 2 x CLK_PERIPH_SGPIO + +In our case both these clocks are at 204MHz so reads add 8 cycles and writes +add 6. These are latencies that add to the usual M0 instruction timings, so an +ldr from SGPIO takes 10 cycles, and an str to SGPIO takes 8 cycles. + +These latencies are assumed to apply to all accesses to the SGPIO peripheral's +address space, which includes its interrupt control registers as well as the +shadow registers. + +There are two key code paths, with the following worst-case timings: + +RX: 159 cycles +TX: 144 cycles + +Design +====== + +Due to the timing constraints, this code is highly optimised. + +This is the only code that runs on the M0, so it does not need to follow +calling conventions, nor use features of the architecture in standard ways. + +The SGPIO handling does not run as an ISR. It polls the interrupt status. +This saves the cycle costs of interrupt entry and exit, and allows all +registers to be used freely. + +All possible registers, including the stack pointer and link register, can be +used to store values needed in the code, to minimise memory loads and stores. + +There are no function calls. There is no stack usage. All values are in +registers and fixed memory addresses. + +*/ // Constants that point to registers we'll need to modify in the SGPIO block. .equ SGPIO_REGISTER_BLOCK_BASE, 0x40101000 @@ -19,36 +103,53 @@ .equ TARGET_BUFFER_TX, 0x20007004 .equ TARGET_BUFFER_MASK, 0x7fff +// Entry point. At this point, the libopencm3 startup code has set things up as +// normal; .data and .bss are initialised, the stack is set up, etc. However, +// we don't actually use any of that. All the code in this file would work +// fine if the M0 jumped straight to main at reset. .global main .thumb_func -main: +main: // Cycle counts: + // The worst case timing is assumed to occur when reading the interrupt + // status register *just* misses the flag being set - so we include the + // cycles required to check it a second time. + // + // We also assume that we can spend a full 10 cycles doing an ldr from + // SGPIO the first time (2 for ldr, plus 8 for SGPIO-AHB bus latency), + // and still miss a flag that was set at the start of those 10 cycles. + // + // This latter asssumption is probably slightly pessimistic, since the + // sampling of the flag on the SGPIO side must occur some time after + // the ldr instruction begins executing on the M0. However, we avoid + // relying on any assumptions about the timing details of a read over + // the SGPIO to AHB bridge. // Spin until we're ready to handle an SGPIO packet: // Grab the exchange interrupt staus... - ldr r0, =SGPIO_EXCHANGE_INTERRUPT_STATUS_REG - ldr r0, [r0] + ldr r0, =SGPIO_EXCHANGE_INTERRUPT_STATUS_REG // 2, twice + ldr r0, [r0] // 10, twice // ... check to see if it has any interrupt bits set... - lsr r0, #1 + lsr r0, #1 // 1, twice // ... and if not, jump back to the beginning. - bcc main + bcc main // 3, then 1 // Clear the interrupt pending bits for the SGPIO slices we're working with. - ldr r0, =SGPIO_EXCHANGE_INTERRUPT_CLEAR_REG - ldr r1, =0xffff - str r1, [r0] + ldr r0, =SGPIO_EXCHANGE_INTERRUPT_CLEAR_REG // 2 + ldr r1, =0xffff // 2 + str r1, [r0] // 8 // Grab the base address of the SGPIO shadow registers... - ldr r7, =SGPIO_SHADOW_REGISTERS_BASE + ldr r7, =SGPIO_SHADOW_REGISTERS_BASE // 2 // ... and grab the address of the buffer segment we want to write to / read from. - ldr r0, =TARGET_DATA_BUFFER // r0 = &buffer - ldr r3, =TARGET_BUFFER_POSITION // r3 = &position_in_buffer - ldr r2, [r3] // r2 = position_in_buffer - add r6, r0, r2 // r6 = buffer_target = &buffer + position_in_buffer + ldr r0, =TARGET_DATA_BUFFER // r0 = &buffer // 2 + ldr r3, =TARGET_BUFFER_POSITION // r3 = &position_in_buffer // 2 + ldr r2, [r3] // r2 = position_in_buffer // 2 + add r6, r0, r2 // r6 = buffer_target = &buffer + position_in_buffer // 1 - mov r8, r3 // Store &position_in_buffer. + mov r8, r3 // Store &position_in_buffer. // 1 // Our slice chain is set up as follows (ascending data age; arrows are reversed for flow): // L -> F -> K -> C -> J -> E -> I -> A @@ -56,53 +157,51 @@ main: // 44 -> 20 -> 40 -> 8 -> 36 -> 16 -> 32 -> 0 // Load direction (TX or RX) - ldr r0, =TARGET_BUFFER_TX - ldr r0, [r0] + ldr r0, =TARGET_BUFFER_TX // 2 + ldr r0, [r0] // 2 // TX? - lsr r0, #1 - bcc direction_rx + lsr r0, #1 // 1 + bcc direction_rx // 1 thru, 3 taken direction_tx: - ldm r6!, {r0-r5} - str r0, [r7, #44] - str r1, [r7, #20] - str r2, [r7, #40] - str r3, [r7, #8 ] - str r4, [r7, #36] - str r5, [r7, #16] + ldm r6!, {r0-r5} // 7 + str r0, [r7, #44] // 8 + str r1, [r7, #20] // 8 + str r2, [r7, #40] // 8 + str r3, [r7, #8 ] // 8 + str r4, [r7, #36] // 8 + str r5, [r7, #16] // 8 - ldm r6!, {r0-r1} - str r0, [r7, #32] - str r1, [r7, #0] + ldm r6!, {r0-r1} // 3 + str r0, [r7, #32] // 8 + str r1, [r7, #0] // 8 - b done + b done // 3 direction_rx: - // 8 cycles - ldr r0, [r7, #44] // 2 - ldr r1, [r7, #20] // 2 - ldr r2, [r7, #40] // 2 - ldr r3, [r7, #8 ] // 2 - ldr r4, [r7, #36] // 2 - ldr r5, [r7, #16] // 2 - stm r6!, {r0-r5} // 7 + ldr r0, [r7, #44] // 10 + ldr r1, [r7, #20] // 10 + ldr r2, [r7, #40] // 10 + ldr r3, [r7, #8 ] // 10 + ldr r4, [r7, #36] // 10 + ldr r5, [r7, #16] // 10 + stm r6!, {r0-r5} // 7 - // 6 cycles - ldr r0, [r7, #32] // 2 - ldr r1, [r7, #0] // 2 - stm r6!, {r0-r1} + ldr r0, [r7, #32] // 10 + ldr r1, [r7, #0] // 10 + stm r6!, {r0-r1} // 3 done: // Finally, update the buffer location... - ldr r0, =TARGET_BUFFER_MASK - and r0, r6, r0 // r0 = (position_in_buffer + size_copied) % buffer_size + ldr r0, =TARGET_BUFFER_MASK // 2 + and r0, r6, r0 // r0 = (pos_in_buffer + size_copied) % buffer_size // 1 // ... restore &position_in_buffer, and store the new position there... - mov r1, r8 - str r0, [r1] // position_in_buffer = (position_in_buffer + size_copied) % buffer_size + mov r1, r8 // 1 + str r0, [r1] // pos_in_buffer = (pos_in_buffer + size_copied) % buffer_size // 2 - b main + b main // 3