Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions agent/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ CFLAGS = -mcpu=$(CPU_TYPE) -marm -O2 -ffreestanding -nostdlib \
-DUART_BASE=$(UART_BASE) -DUART_CLOCK=$(UART_CLOCK) $(CPU_FLAG) \
-DFLASH_MEM=$(FLASH_MEM) -DFMC_BASE=$(FMC_BASE) -DRAM_BASE=$(RAM_BASE) -DWDT_BASE=$(WDT_BASE) \
-DCRG_BASE=$(CRG_BASE) -DSYSCTRL_REBOOT=$(SYSCTRL_REBOOT) \
-DAGENT_LOAD_ADDR=$(LOAD_ADDR) \
$(if $(UART_CKSEL_REG),-DUART_CKSEL_REG=$(UART_CKSEL_REG) -DUART_CKSEL_BIT=$(UART_CKSEL_BIT)) \
-mno-unaligned-access -Wall -Wextra

Expand Down
213 changes: 210 additions & 3 deletions agent/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,10 @@ static int addr_readable(uint32_t addr, uint32_t size) {
* v3 added: flash_mem at INFO bytes 24..27 (so the host knows which
* memory-mapped flash window CMD_CRC32/CMD_READ should target on
* SoCs where it isn't 0x14000000 — e.g. hi3520dv200 has it at
* 0x58000000). */
#define AGENT_VERSION 3
* 0x58000000).
* v4 added: CMD_MEMBW for bare-metal DDR bandwidth measurement
* (ARMv7 only; ACK_FLASH_ERROR on ARMv5). */
#define AGENT_VERSION 4

/* Capability flags — advertise supported features */
#define CAP_FLASH_STREAM (1 << 0) /* CMD_FLASH_STREAM with double-buffer */
Expand All @@ -134,9 +136,15 @@ static int addr_readable(uint32_t addr, uint32_t size) {
#define CAP_REBOOT (1 << 4) /* CMD_REBOOT */
#define CAP_SELFUPDATE (1 << 5) /* CMD_SELFUPDATE */
#define CAP_SCAN (1 << 6) /* CMD_SCAN */
#ifndef CPU_ARM926
#define CAP_MEMBW (1 << 7) /* CMD_MEMBW (ARMv7 PMU cycle counter) */
#else
#define CAP_MEMBW 0
#endif

#define AGENT_CAPS (CAP_FLASH_STREAM | CAP_SECTOR_BITMAP | CAP_PAGE_SKIP | \
CAP_SET_BAUD | CAP_REBOOT | CAP_SELFUPDATE | CAP_SCAN)
CAP_SET_BAUD | CAP_REBOOT | CAP_SELFUPDATE | CAP_SCAN | \
CAP_MEMBW)

static void handle_info(void) {
uint8_t resp[28];
Expand Down Expand Up @@ -240,6 +248,202 @@ static void handle_crc32_cmd(const uint8_t *data, uint32_t len) {
proto_send(RSP_CRC32, resp, 4);
}

/*
* CMD_MEMBW: DDR bandwidth test. ARMv7 (Cortex-A7) only.
*
* Request: [size:4LE][iters:4LE][addr:4LE]
* size = 0 → 4 MiB default; otherwise must be 256B-aligned, ≤ 16 MiB
* iters = 0 → 8 default; max 256
* addr = 0 → RAM_BASE + MEMBW_SCRATCH_OFF (auto-pick)
*
* Response: [base:4LE][size:4LE][iters:4LE][timer_hz:4LE]
* [memset_ticks:4LE][read_ticks:4LE][memcpy_ticks:4LE][cpu_arch:4LE]
*
* timer_hz = CCNT frequency in Hz, calibrated against the architectural
* generic timer; 0 if CNTFRQ wasn't set up. Host can still
* compute cycles/byte (CPU-clock-invariant) when timer_hz==0.
*
* Cache state: MMU is on with DDR mapped as write-back / write-allocate
* (see startup.S page-table fill). Test runs cached — apples-to-apples
* with userspace memcpy/memset, with the buffer sized well above L1+L2.
*/
#ifndef CPU_ARM926
/* Reset and start the ARMv7 PMU cycle counter (CCNT).
 *
 * Read-modify-writes PMCR (c9, c12, 0) so unrelated control bits survive,
 * then enables the cycle counter through PMCNTENSET (c9, c12, 1) and
 * issues an ISB so the new PMU state is in effect before the caller's
 * first pmccntr_read().
 *
 * The D bit (divide-by-64) is cleared explicitly: if an earlier boot stage
 * left it set, CCNT would advance once per 64 CPU cycles and every
 * "cycles" figure derived from it would be off by that factor. */
static inline void pmccntr_init(void) {
    uint32_t pmcr;
    asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r"(pmcr));
    pmcr |= (1u << 0);   /* E: enable all counters */
    pmcr |= (1u << 2);   /* C: reset CCNT */
    pmcr &= ~(1u << 3);  /* D: count every cycle, not every 64th */
    asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(pmcr));
    /* Bit 31 of the count-enable-set register selects the cycle counter. */
    asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(0x80000000u));
    asm volatile("isb");
}

/* Sample the PMU cycle counter (c9, c13, 0).
 *
 * The ISB ahead of the MRC is part of the same asm statement, so the
 * barrier and the counter read cannot be separated by the compiler.
 * Returns the raw 32-bit count; callers subtract two samples and rely on
 * unsigned wrap-around arithmetic. */
static inline uint32_t pmccntr_read(void) {
    uint32_t cycles;
    asm volatile(
        "isb\n\t"
        "mrc p15, 0, %0, c9, c13, 0"
        : "=r"(cycles)
    );
    return cycles;
}

/* Calibrate CCNT (CPU cycles) against CNTPCT (architectural timer, fixed
 * frequency from CNTFRQ) over a 10 ms window.
 *
 * Returns CCNT ticks per second, or 0 when no trustworthy figure can be
 * produced:
 *   - CNTFRQ looks uninitialised (outside 1 MHz..100 MHz);
 *   - CNTPCT does not advance by the 10 ms target before CCNT has counted
 *     more cycles than can be scaled to Hz in 32 bits. This also bounds
 *     the polling loop, so a programmed CNTFRQ with a stopped system
 *     counter yields "no calibration" instead of hanging the agent.
 *
 * Side effect: resets CCNT via pmccntr_init(). */
static uint32_t pmccntr_calibrate_hz(void) {
    uint32_t cntfrq;
    asm volatile("mrc p15, 0, %0, c14, c0, 0" : "=r"(cntfrq));
    /* Sanity: most hi-silicon BL1 sets this to 24 MHz. Anything outside
     * 1 MHz..100 MHz is almost certainly an uninitialised register. */
    if (cntfrq < 1000000u || cntfrq > 100000000u) return 0;

    /* Largest cycle delta whose x100 scaling still fits in uint32_t.
     * Reaching it inside a 10 ms window would need a > 4.29 GHz core, so
     * in practice it only trips when the timer is not ticking. */
    const uint32_t max_cycles = 0xFFFFFFFFu / 100u;

    uint32_t lo0, hi0, lo1, hi1;
    asm volatile("mrrc p15, 0, %0, %1, c14" : "=r"(lo0), "=r"(hi0));
    const uint32_t target = cntfrq / 100;   /* 10 ms window */
    pmccntr_init();
    const uint32_t c0 = pmccntr_read();

    uint32_t elapsed;
    do {
        asm volatile("mrrc p15, 0, %0, %1, c14" : "=r"(lo1), "=r"(hi1));
        /* Sampled right after the timer read, so when the loop exits this
         * is the cycle count that matches the end of the window. */
        elapsed = pmccntr_read() - c0;
        if (elapsed > max_cycles) return 0;
    } while ((lo1 - lo0) < target);     /* low word only; wrap-safe */

    return elapsed * 100u;
}

/* Store-bandwidth kernel: fills memory starting at addr with the word
 * pattern 0xA5A5A5A5, eight registers (32 B) per STMIA.
 *
 * The loop tests its exit condition after the store, so at least one
 * 32-byte burst is always written and the fill stops at the first burst
 * boundary at or beyond addr + bytes; callers pass a non-zero multiple of
 * 32. r4-r11 are AAPCS callee-saved; naming them as clobbers makes GCC
 * save and restore them around the asm. noinline keeps the call/return
 * cost identical on every invocation. */
static void __attribute__((noinline)) membw_memset(uint32_t addr, uint32_t bytes) {
    const uint32_t limit = addr + bytes;
    const uint32_t pattern = 0xA5A5A5A5u;
    uint32_t cursor = addr;

    asm volatile(
        "mov r4, %[pat]\n\t"
        "mov r5, %[pat]\n\t"
        "mov r6, %[pat]\n\t"
        "mov r7, %[pat]\n\t"
        "mov r8, %[pat]\n\t"
        "mov r9, %[pat]\n\t"
        "mov r10, %[pat]\n\t"
        "mov r11, %[pat]\n\t"
        "1:\n\t"
        "stmia %[cur]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
        "cmp %[cur], %[lim]\n\t"
        "blo 1b\n\t"
        : [cur] "+r"(cursor)
        : [lim] "r"(limit), [pat] "r"(pattern)
        : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
          "cc", "memory"
    );
}

/* Load-bandwidth kernel: streams memory starting at addr through r4-r11,
 * eight words (32 B) per LDMIA, and discards the data — no stores.
 *
 * The exit test follows the load, so at least one burst is always read
 * and reading stops at the first burst boundary at or beyond addr + bytes.
 * The "memory" clobber and volatile qualifier keep the compiler from
 * dropping or hoisting the otherwise result-less loop. */
static void __attribute__((noinline)) membw_read(uint32_t addr, uint32_t bytes) {
    const uint32_t limit = addr + bytes;
    uint32_t cursor = addr;

    asm volatile(
        "1:\n\t"
        "ldmia %[cur]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
        "cmp %[cur], %[lim]\n\t"
        "blo 1b\n\t"
        : [cur] "+r"(cursor)
        : [lim] "r"(limit)
        : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
          "cc", "memory"
    );
}

/* Copy-bandwidth kernel: moves data from src to dst through r4-r11, one
 * LDMIA/STMIA pair (32 B in, 32 B out) per pass.
 *
 * Termination is tracked on the source pointer only; the destination
 * advances in lock-step. The exit test follows the transfer, so at least
 * one 32-byte burst is always copied. The two regions are expected not to
 * overlap (the caller places dst directly after the source buffer). */
static void __attribute__((noinline)) membw_memcpy(uint32_t dst, uint32_t src, uint32_t bytes) {
    const uint32_t src_limit = src + bytes;
    uint32_t from = src;
    uint32_t to = dst;

    asm volatile(
        "1:\n\t"
        "ldmia %[in]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
        "stmia %[out]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t"
        "cmp %[in], %[lim]\n\t"
        "blo 1b\n\t"
        : [in] "+r"(from), [out] "+r"(to)
        : [lim] "r"(src_limit)
        : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
          "cc", "memory"
    );
}
#endif /* !CPU_ARM926 */

/* Upper bound on the per-buffer size accepted by handle_membw. The memcpy
 * phase uses two buffers of this size back to back, so the scratch span
 * can be up to twice this value. */
#define MAX_MEMBW_SIZE (16u * 1024u * 1024u)
/* Agent footprint guard: protect [AGENT_LOAD_ADDR - 64 KB, AGENT_LOAD_ADDR
 * + 8 MiB) from the test buffer. The lower margin covers the 16 KB stack
 * that lives below _start; the upper margin (8 MiB) is generous head-room
 * for .text/.data/.bss including the 16 KB-aligned page table. The default
 * scratch sits at AGENT_LOAD_ADDR + 8 MiB so even 16 MiB × memcpy (32 MiB
 * total span) fits inside the 128 MiB cached DDR window. */
/* Inclusive lower edge of the protected range.
 * NOTE(review): assumes AGENT_LOAD_ADDR >= 64 KB, otherwise the
 * subtraction wraps — confirm against the supported load addresses. */
#define MEMBW_AGENT_GUARD_LO ((uint32_t)AGENT_LOAD_ADDR - 0x10000u)
/* Exclusive upper edge of the protected range. */
#define MEMBW_AGENT_GUARD_HI ((uint32_t)AGENT_LOAD_ADDR + 0x800000u)
/* Scratch base used when the host passes addr = 0; equal to the guard's
 * upper edge, i.e. the first address outside the protected range. */
#define MEMBW_DEFAULT_ADDR ((uint32_t)AGENT_LOAD_ADDR + 0x800000u)

/*
 * Service CMD_MEMBW: time memset / read / memcpy kernels over a scratch
 * region of DDR and report the PMU cycle counts.
 *
 * data/len: request payload, [size:4LE][iters:4LE][addr:4LE] (12 bytes).
 *           A zero field selects its default (4 MiB, 8 iterations,
 *           MEMBW_DEFAULT_ADDR).
 *
 * Replies:
 *   RSP_MEMBW (32 bytes)  on success
 *   ACK_CRC_ERROR         when the payload is shorter than 12 bytes
 *   ACK_FLASH_ERROR       on ARM926 builds, or when a parameter is
 *                         rejected: iters > 256, size > MAX_MEMBW_SIZE or
 *                         not a multiple of 256, addr not word-aligned,
 *                         scratch span [addr, addr + 2*size) outside the
 *                         128 MiB window above RAM_BASE, or overlapping
 *                         the agent's guarded footprint.
 *
 * NOTE(review): each tick count is a 32-bit CCNT delta. At the largest
 * accepted parameters (256 x 16 MiB) a phase moves 2^32 bytes, so the
 * counter can wrap and under-report; hosts should keep size * iters well
 * below that.
 */
static void handle_membw(const uint8_t *data, uint32_t len) {
#ifdef CPU_ARM926
    (void)data; (void)len;
    /* ARMv5 (ARM926EJ-S) has a different PMU register layout. Out of
     * scope — the motivating use case (gk7205v300 DDR fabric audit) is
     * ARMv7. */
    proto_send_ack(ACK_FLASH_ERROR);
#else
    if (len < 12) { proto_send_ack(ACK_CRC_ERROR); return; }

    uint32_t size  = read_le32(&data[0]);
    uint32_t iters = read_le32(&data[4]);
    uint32_t addr  = read_le32(&data[8]);

    if (size == 0)  size  = 4u * 1024u * 1024u;
    if (iters == 0) iters = 8;
    if (addr == 0)  addr  = MEMBW_DEFAULT_ADDR;

    if (iters > 256 || size > MAX_MEMBW_SIZE || (size & 0xFFu) != 0) {
        proto_send_ack(ACK_FLASH_ERROR); return;
    }
    /* The kernels use LDM/STM, which take an alignment fault on any base
     * that is not word-aligned. Reject such a request up front rather
     * than crash the agent on a host-supplied address. */
    if ((addr & 0x3u) != 0) {
        proto_send_ack(ACK_FLASH_ERROR); return;
    }

    /* Fit dst = addr + size and src = addr inside the cached DDR
     * window (128 MiB from RAM_BASE per startup.S page-table fill).
     * span is at most 32 MiB, so it cannot overflow; the comparison is
     * arranged so that no intermediate sum can wrap either. */
    const uint32_t window = 128u * 1024u * 1024u;
    const uint32_t span = 2u * size;
    if (addr < RAM_BASE) { proto_send_ack(ACK_FLASH_ERROR); return; }
    uint32_t off = addr - RAM_BASE;
    if (off > window || span > window - off) {
        proto_send_ack(ACK_FLASH_ERROR); return;
    }

    /* Reject scratch ranges that would overlap the agent's own footprint
     * (its code, stack, page table). memcpy would otherwise overwrite
     * the running agent and the device would hang.
     * Overlap <=> addr < GUARD_HI && addr + span > GUARD_LO; the second
     * term is evaluated without forming addr + span. */
    if (addr < MEMBW_AGENT_GUARD_HI &&
        (addr >= MEMBW_AGENT_GUARD_LO ||
         MEMBW_AGENT_GUARD_LO - addr < span)) {
        proto_send_ack(ACK_FLASH_ERROR); return;
    }

    uint32_t timer_hz = pmccntr_calibrate_hz();

    uint32_t t0, t1;

    /* Each phase restarts CCNT from zero so the three deltas are
     * independent of one another. */
    pmccntr_init();
    t0 = pmccntr_read();
    for (uint32_t i = 0; i < iters; i++) membw_memset(addr, size);
    t1 = pmccntr_read();
    uint32_t memset_ticks = t1 - t0;

    pmccntr_init();
    t0 = pmccntr_read();
    for (uint32_t i = 0; i < iters; i++) membw_read(addr, size);
    t1 = pmccntr_read();
    uint32_t read_ticks = t1 - t0;

    pmccntr_init();
    t0 = pmccntr_read();
    for (uint32_t i = 0; i < iters; i++) membw_memcpy(addr + size, addr, size);
    t1 = pmccntr_read();
    uint32_t memcpy_ticks = t1 - t0;

    /* Response: eight little-endian words; the effective (post-default)
     * parameters are echoed back so the host knows what was measured. */
    uint8_t resp[32];
    write_le32(&resp[0],  addr);
    write_le32(&resp[4],  size);
    write_le32(&resp[8],  iters);
    write_le32(&resp[12], timer_hz);
    write_le32(&resp[16], memset_ticks);
    write_le32(&resp[20], read_ticks);
    write_le32(&resp[24], memcpy_ticks);
    write_le32(&resp[28], 1);          /* cpu_arch: 1 = ARMv7 Cortex-A */
    proto_send(RSP_MEMBW, resp, sizeof(resp));
#endif
}

/* Forward declaration */
static void handle_flash_write(const uint8_t *data, uint32_t len);

Expand Down Expand Up @@ -1131,6 +1335,9 @@ int main(void) {
case CMD_MARK_BAD:
handle_mark_bad(cmd_buf, data_len);
break;
case CMD_MEMBW:
handle_membw(cmd_buf, data_len);
break;
case CMD_SET_BAUD:
handle_set_baud(cmd_buf, data_len);
break;
Expand Down
2 changes: 2 additions & 0 deletions agent/protocol.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#define CMD_FLASH_PROGRAM 0x0A
#define CMD_FLASH_STREAM 0x0B
#define CMD_MARK_BAD 0x0C /* NAND only: write 0x00 to OOB[0] of page 0 of a block */
#define CMD_MEMBW 0x0D /* DDR bandwidth test (ARMv7 only): see handle_membw */

/* Responses (device → host) */
#define RSP_INFO 0x81
Expand All @@ -28,6 +29,7 @@
#define RSP_CRC32 0x84
#define RSP_READY 0x85
#define RSP_SCAN 0x86
#define RSP_MEMBW 0x87

/* ACK status codes */
#define ACK_OK 0x00
Expand Down
75 changes: 75 additions & 0 deletions agent/test_agent.c
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,79 @@ static void test_cobs_roundtrip_all_crc_patterns(void) {
}
}

/*
 * CMD_MEMBW request framing.
 *
 * The host sends three little-endian words: size, iters, addr. The
 * handler itself needs ARM hardware (CCNT), so this test only checks the
 * wire shape: a 12-byte payload pushed through proto_send must come back
 * out of proto_recv with the same opcode, length and bytes. That is the
 * check that catches format drift between the agent C and the host Python.
 */
static void test_proto_membw_request_framing(void) {
    mock_reset();

    /* Host → device: 12-byte payload, one 32-bit LE word per field. */
    const uint32_t words[3] = {
        4u * 1024u * 1024u,   /* size */
        8,                    /* iters */
        0x40400000u,          /* addr */
    };
    uint8_t req[12];
    for (int w = 0; w < 3; w++) {
        for (int b = 0; b < 4; b++) {
            req[w * 4 + b] = (words[w] >> (8 * b)) & 0xFF;
        }
    }

    proto_send(CMD_MEMBW, req, 12);

    /* Loop the transmitted frame back into the receive side of the mock. */
    memcpy(mock_rx, mock_tx, mock_tx_len);
    mock_rx_len = mock_tx_len;
    mock_rx_pos = 0;

    uint8_t buf[MAX_PAYLOAD + 16];
    uint32_t len = 0;
    uint8_t cmd = proto_recv(buf, &len, 1000);
    ASSERT(cmd == CMD_MEMBW, "membw request: command opcode");
    ASSERT(len == 12, "membw request: payload length");
    ASSERT(memcmp(buf, req, 12) == 0, "membw request: payload bytes");
}

/*
 * RSP_MEMBW response framing.
 *
 * Device → host: eight little-endian words (32 bytes). Synthetic values
 * populate every field; the frame is sent, looped back through the mock,
 * and must be received with the same opcode, length and bytes.
 */
static void test_proto_membw_response_framing(void) {
    mock_reset();

    const uint32_t fields[8] = {
        0x40400000u,   /* base */
        4u << 20,      /* size = 4 MiB */
        8u,            /* iters */
        24000000u,     /* timer_hz */
        123456u,       /* memset_ticks */
        654321u,       /* read_ticks */
        999999u,       /* memcpy_ticks */
        1u,            /* cpu_arch */
    };

    /* Serialise each field least-significant byte first. */
    uint8_t resp[32];
    uint8_t *out = resp;
    for (int i = 0; i < 8; i++) {
        for (int shift = 0; shift < 32; shift += 8) {
            *out++ = (fields[i] >> shift) & 0xFF;
        }
    }

    proto_send(RSP_MEMBW, resp, 32);

    /* Feed what was transmitted straight back into the receiver. */
    memcpy(mock_rx, mock_tx, mock_tx_len);
    mock_rx_len = mock_tx_len;
    mock_rx_pos = 0;

    uint8_t buf[MAX_PAYLOAD + 16];
    uint32_t len = 0;
    uint8_t cmd = proto_recv(buf, &len, 1000);
    ASSERT(cmd == RSP_MEMBW, "membw response: command opcode");
    ASSERT(len == 32, "membw response: payload length");
    ASSERT(memcmp(buf, resp, 32) == 0, "membw response: payload bytes");
}

/*
* page_is_ff helper: verify it correctly identifies all-0xFF pages
* and rejects pages with even a single non-0xFF byte.
Expand Down Expand Up @@ -548,6 +621,8 @@ int main(void) {
test_proto_recv_bad_crc();
test_proto_max_payload();
test_proto_multiple_packets();
test_proto_membw_request_framing();
test_proto_membw_response_framing();

printf("Cross-compatibility:\n");
test_cobs_matches_python();
Expand Down
Loading
Loading