SlugLab
diff --git a/‎lib/qemu‎ b/‎lib/qemu‎
diff --git a/‎qemu_integration/README.md‎
Lines changed: 73 additions & 0 deletions b/‎qemu_integration/README.md‎
Lines changed: 73 additions & 0 deletions
diff --git a/‎qemu_integration/guest_libcuda/cxl_bar_benchmark.c‎
Lines changed: 159 additions & 2 deletions b/‎qemu_integration/guest_libcuda/cxl_bar_benchmark.c‎
Lines changed: 159 additions & 2 deletions
diff --git a/‎qemu_integration/guest_libcuda/cxl_gpu_cmd.h‎
Lines changed: 33 additions & 0 deletions b/‎qemu_integration/guest_libcuda/cxl_gpu_cmd.h‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎qemu_integration/launch_qemu_vcs_dcd_gfam.sh‎
Lines changed: 10 additions & 2 deletions b/‎qemu_integration/launch_qemu_vcs_dcd_gfam.sh‎
Lines changed: 10 additions & 2 deletions
@@ -180,6 +180,79 @@ Query counters again from the host with:
 python3 ./zettai_host_dcd_gfam_test.py --query
 ```
 
+### Zettai Type2 tmatmul and CXL.mem ioctl test
+
+The Zettai switch CCI device (`7a74:a123`) creates a guest char device such as
+`/dev/zettai_cxl0d003`. The current Linux driver ABI for this device is
+`ioctl()`, not `io_uring_cmd`; `/tmp/zettai-qmp.sock` remains a host-side QMP
+socket used for bind/add/query orchestration.
+
+Build the guest helper:
+
+```bash
+gcc -O2 -Wall -Wextra -o zettai_tmatmul_ctl zettai_tmatmul_ctl.c
+```
+
+Check whether QEMU exposed the tmatmul CSR block:
+
+```bash
+./zettai_tmatmul_ctl --dev /dev/zettai_cxl0d003 --info
+```
+
+If dmesg reports `tmatmul=0` or the tool prints `tmatmul_present=no`, QEMU
+exposed only the switch CCI BAR, so tmatmul smoke runs will return `ENODEV`.
+CXL.mem read/write can still be tested by passing a real, nonzero HPA base taken
+from a CXL region or decoder resource:
+
+```bash
+cxl list -R -u
+./zettai_tmatmul_ctl --dev /dev/zettai_cxl0d003 \
+  --mem-write --hpa-base 0xYOUR_REGION_RESOURCE --hpa-size 0x10000000 \
+  --offset 0 --size 4096 --pattern 0x5a
+./zettai_tmatmul_ctl --dev /dev/zettai_cxl0d003 \
+  --mem-read --hpa-base 0xYOUR_REGION_RESOURCE --hpa-size 0x10000000 \
+  --offset 0 --size 64
+```
+
+Once the QEMU Zettai device exposes a BAR large enough for the tmatmul CSR window
+at `BAR0 + 0x1c0000`, run:
+
+```bash
+./zettai_tmatmul_ctl --dev /dev/zettai_cxl0d003 \
+  --smoke --hpa-base 0xYOUR_REGION_RESOURCE --hpa-size 0x10000000
+```
+
+### Zettai benchmark harness
+
+For a repeatable host-side smoke benchmark, use:
+
+```bash
+QEMU_NET_MODE=none \
+KERNEL_IMAGE=/path/to/bzImage \
+DISK_IMAGE=/path/to/rootfs.img \
+./zettai_benchmark.sh --launch --keep-qemu
+```
+
+The harness launches QEMU with a QMP socket, binds `cxl-dcd0`, adds a 256 MiB
+DCD extent, queries CXLMemSim DCD/GFAM counters, and writes logs under
+`build/zettai-bench/`. If QEMU is already running, omit `--launch` and keep the
+same `ZETTAI_QMP_SOCKET` value used by `QEMU_EXTRA_ARGS`.
+
+To include the in-guest DCD region setup and Type2 fabric-memory BAR benchmark,
+provide SSH access to the guest:
+
+```bash
+ZETTAI_GUEST_SSH="ssh root@192.168.122.10" \
+ZETTAI_GUEST_DIR=/root/CXLMemSim/qemu_integration \
+./zettai_benchmark.sh --guest --run-type2-bench
+```
+
+The Type2 benchmark is `guest_libcuda/cxl_bar_benchmark.c`. It discovers the
+`cxl-type2` endpoint (`8086:0d92`), reports BAR register and data-region
+latency/bandwidth, then exercises the Zettai fabric-memory controls exposed by
+QEMU: `DCD_GET_INFO`, optional DCD add/release when free capacity exists,
+`GFAM_GET_INFO`, and `MHSLD_GET_INFO`/`MHSLD_SET_HEAD`.
+
 ## Features
 
 - **Cacheline-granular access**: All memory operations are performed at 64-byte cacheline granularity
 
@@ -11,6 +11,7 @@
 #include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <inttypes.h>
 #include <pthread.h>
 #include <sched.h>
 #include <stdint.h>
@@ -101,7 +102,9 @@ static void enable_device(const char *bdf) {
     snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/enable", bdf);
     int fd = open(path, O_WRONLY);
     if (fd >= 0) {
-        write(fd, "1", 1);
+        if (write(fd, "1", 1) != 1) {
+            fprintf(stderr, "  Cannot enable %s: %s\n", bdf, strerror(errno));
+        }
         close(fd);
     }
 }
@@ -118,9 +121,11 @@ static int discover_devices(void) {
             continue;
         if (read_pci_id(ent->d_name, "device") != CXL_TYPE2_DEVICE)
             continue;
+        if (strlen(ent->d_name) >= sizeof(g_devs[g_num_devices].bdf))
+            continue;
 
         cxl_dev_t *d = &g_devs[g_num_devices];
-        strncpy(d->bdf, ent->d_name, sizeof(d->bdf) - 1);
+        strcpy(d->bdf, ent->d_name);
         enable_device(d->bdf);
 
         d->bar2_size = bar_range(d->bdf, 2);
@@ -182,6 +187,49 @@ static double time_ns(void) {
     return ts.tv_sec * 1e9 + ts.tv_nsec;
 }
 
+static inline uint64_t read_reg64(cxl_dev_t *d, size_t off) { return *(volatile uint64_t *)(d->bar2 + off); } /* volatile 64-bit load from d->bar2 + off */
+
+static inline void write_reg64(cxl_dev_t *d, size_t off, uint64_t val) { *(volatile uint64_t *)(d->bar2 + off) = val; } /* volatile 64-bit store of val to d->bar2 + off */
+
+static int issue_command(cxl_dev_t *d, uint32_t cmd) { /* Post cmd to the command register and busy-poll the status register for its outcome. */
+    volatile uint32_t *cmd_reg = (volatile uint32_t *)(d->bar2 + CXL_GPU_REG_CMD);
+    volatile uint32_t *status_reg = (volatile uint32_t *)(d->bar2 + CXL_GPU_REG_CMD_STATUS);
+    volatile uint32_t *result_reg = (volatile uint32_t *)(d->bar2 + CXL_GPU_REG_CMD_RESULT);
+
+    *cmd_reg = cmd; /* post the command code */
+    __sync_synchronize(); /* full barrier between the command write and the status reads below */
+
+    for (int timeout = 100000; timeout > 0; timeout--) { /* bounded poll: at most 100000 status reads, no sleep or back-off */
+        uint32_t st = *status_reg;
+
+        if (st == CXL_GPU_CMD_STATUS_COMPLETE) {
+            return (int)*result_reg; /* COMPLETE: return the result register unchanged */
+        }
+        if (st == CXL_GPU_CMD_STATUS_ERROR) {
+            return -(int)*result_reg; /* ERROR: negated result register; NOTE(review): a zero result here would return 0 - confirm the device never does that */
+        }
+    }
+
+    return -ETIMEDOUT; /* neither COMPLETE nor ERROR was observed within the poll budget */
+}
+
+static void measure_command_latency(cxl_dev_t *d, uint32_t cmd, const char *name, int iters) { /* Time iters back-to-back issue_command() calls and print the mean ns per call, labelled with name. */
+    double t0;
+    double t1;
+    int rc = 0;
+
+    t0 = time_ns();
+    for (int i = 0; i < iters; i++) {
+        rc = issue_command(d, cmd);
+        if (rc != CXL_GPU_SUCCESS) {
+            printf("  %-20s failed: rc=%d\n", name, rc);
+            return; /* stop at the first failing call; no timing line is printed */
+        }
+    }
+    t1 = time_ns();
+    printf("  %-20s %7.1f ns/op  (%d ops)\n", name, (t1 - t0) / iters, iters); /* mean = elapsed / iters; NOTE(review): iters == 0 is not guarded */
+}
+
 /*  Benchmark: Register Latency  */
 
 static void bench_register_latency(cxl_dev_t *d) {
@@ -453,6 +501,114 @@ static void bench_access_patterns(cxl_dev_t *d) {
     }
 }
 
+/*  Benchmark: DCD / GFAM / MH-SLD Fabric Memory Controls  */
+
+static void bench_fabric_memory(cxl_dev_t *d, uint32_t caps) { /* Query and lightly exercise the DCD, GFAM and MH-SLD commands selected by the caps bitmask, printing the results. */
+    uint64_t dcd_total = 0;
+    uint64_t dcd_alloc = 0;
+    uint64_t dcd_free = 0;
+    uint64_t dcd_extents = 0;
+    uint64_t gfam_hosts = 0;
+    uint64_t gfam_mappings = 0;
+    uint64_t gfam_allowed = 0;
+    uint64_t gfam_denied = 0;
+    uint64_t mhsld_heads = 0;
+    uint64_t mhsld_head_id = 0;
+    int rc;
+
+    if (!(caps & (CXL_GPU_CAP_DCD | CXL_GPU_CAP_GFAM | CXL_GPU_CAP_MHSLD))) { /* none of the three capability bits set: nothing to run */
+        printf("\n--- Fabric Memory Controls: skipped (no DCD/GFAM/MH-SLD caps) ---\n");
+        return;
+    }
+
+    printf("\n--- Fabric Memory Controls (device %s) ---\n", d->bdf);
+
+    if (caps & CXL_GPU_CAP_DCD) {
+        rc = issue_command(d, CXL_GPU_CMD_DCD_GET_INFO);
+        if (rc == CXL_GPU_SUCCESS) {
+            dcd_total = read_reg64(d, CXL_GPU_REG_RESULT0);
+            dcd_alloc = read_reg64(d, CXL_GPU_REG_RESULT1);
+            dcd_free = read_reg64(d, CXL_GPU_REG_RESULT2);
+            dcd_extents = read_reg64(d, CXL_GPU_REG_RESULT3);
+            printf("  DCD command info: total=%" PRIu64 " alloc=%" PRIu64 " free=%" PRIu64 " extents=%" PRIu64 "\n",
+                   dcd_total, dcd_alloc, dcd_free, dcd_extents);
+            printf("  DCD status regs:  total=%" PRIu64 " alloc=%" PRIu64 " free=%" PRIu64 " extents=%" PRIu64 "\n",
+                   read_reg64(d, CXL_GPU_REG_DCD_TOTAL), read_reg64(d, CXL_GPU_REG_DCD_ALLOCATED),
+                   read_reg64(d, CXL_GPU_REG_DCD_FREE), read_reg64(d, CXL_GPU_REG_DCD_EXTENTS)); /* same four counters, read from the status registers instead of the command results */
+            measure_command_latency(d, CXL_GPU_CMD_DCD_GET_INFO, "DCD_GET_INFO", 5000); /* 5000 timed repeats of the same query */
+        } else {
+            printf("  DCD_GET_INFO failed: rc=%d\n", rc);
+        }
+
+        if (dcd_free >= 1024 * 1024) { /* dcd_free stays 0 when DCD_GET_INFO failed, so that case also lands in the "skipped" branch */
+            write_reg64(d, CXL_GPU_REG_PARAM0, UINT64_MAX); /* base; NOTE(review): all-ones presumably asks the device to choose the base - confirm */
+            write_reg64(d, CXL_GPU_REG_PARAM1, 1024 * 1024); /* size: 1 MiB */
+            write_reg64(d, CXL_GPU_REG_PARAM2, 0); /* tag */
+            rc = issue_command(d, CXL_GPU_CMD_DCD_ADD);
+            if (rc == CXL_GPU_SUCCESS) {
+                uint64_t base = read_reg64(d, CXL_GPU_REG_RESULT0);
+                uint64_t size = read_reg64(d, CXL_GPU_REG_RESULT1);
+                uint64_t tag = read_reg64(d, CXL_GPU_REG_RESULT2);
+
+                printf("  DCD add/release:  base=0x%" PRIx64 " size=%" PRIu64 " tag=%" PRIu64 "\n", base, size, tag);
+                write_reg64(d, CXL_GPU_REG_PARAM0, base); /* release exactly the base/size/tag that the add reported */
+                write_reg64(d, CXL_GPU_REG_PARAM1, size);
+                write_reg64(d, CXL_GPU_REG_PARAM2, tag);
+                rc = issue_command(d, CXL_GPU_CMD_DCD_RELEASE);
+                if (rc != CXL_GPU_SUCCESS) {
+                    printf("  DCD_RELEASE failed after add: rc=%d\n", rc);
+                }
+            } else {
+                printf("  DCD_ADD skipped/failed: rc=%d\n", rc);
+            }
+        } else {
+            printf("  DCD add/release:  skipped (no free DCD capacity)\n");
+        }
+    }
+
+    if (caps & CXL_GPU_CAP_GFAM) {
+        rc = issue_command(d, CXL_GPU_CMD_GFAM_GET_INFO);
+        if (rc == CXL_GPU_SUCCESS) {
+            gfam_hosts = read_reg64(d, CXL_GPU_REG_RESULT0);
+            gfam_mappings = read_reg64(d, CXL_GPU_REG_RESULT1);
+            gfam_allowed = read_reg64(d, CXL_GPU_REG_RESULT2);
+            gfam_denied = read_reg64(d, CXL_GPU_REG_RESULT3);
+            printf("  GFAM command info: hosts=%" PRIu64 " mappings=%" PRIu64 " allowed=%" PRIu64 " denied=%" PRIu64
+                   "\n",
+                   gfam_hosts, gfam_mappings, gfam_allowed, gfam_denied);
+            printf("  GFAM status regs:  hosts=%" PRIu64 " mappings=%" PRIu64 " denied=%" PRIu64 "\n",
+                   read_reg64(d, CXL_GPU_REG_GFAM_HOSTS), read_reg64(d, CXL_GPU_REG_GFAM_MAPPINGS),
+                   read_reg64(d, CXL_GPU_REG_GFAM_DENIED)); /* only hosts, mappings and denied are read from status registers here */
+            measure_command_latency(d, CXL_GPU_CMD_GFAM_GET_INFO, "GFAM_GET_INFO", 5000); /* 5000 timed repeats of the same query */
+        } else {
+            printf("  GFAM_GET_INFO failed: rc=%d\n", rc);
+        }
+    }
+
+    if (caps & CXL_GPU_CAP_MHSLD) {
+        rc = issue_command(d, CXL_GPU_CMD_MHSLD_GET_INFO);
+        if (rc == CXL_GPU_SUCCESS) {
+            mhsld_heads = read_reg64(d, CXL_GPU_REG_RESULT0);
+            mhsld_head_id = read_reg64(d, CXL_GPU_REG_RESULT1);
+            printf("  MH-SLD command info: heads=%" PRIu64 " local=%" PRIu64 " reads=%" PRIu64 " writes=%" PRIu64 "\n",
+                   mhsld_heads, mhsld_head_id, read_reg64(d, CXL_GPU_REG_RESULT2), read_reg64(d, CXL_GPU_REG_RESULT3));
+            printf("  MH-SLD status regs:  heads=%" PRIu64 " local=%" PRIu64 " conflicts=%" PRIu64
+                   " invalidations=%" PRIu64 "\n",
+                   read_reg64(d, CXL_GPU_REG_MHSLD_HEADS), read_reg64(d, CXL_GPU_REG_MHSLD_HEAD_ID),
+                   read_reg64(d, CXL_GPU_REG_MHSLD_CONFLICTS), read_reg64(d, CXL_GPU_REG_MHSLD_INV));
+            measure_command_latency(d, CXL_GPU_CMD_MHSLD_GET_INFO, "MHSLD_GET_INFO", 5000); /* 5000 timed repeats of the same query */
+        } else {
+            printf("  MHSLD_GET_INFO failed: rc=%d\n", rc);
+        }
+
+        if (mhsld_heads > 0) { /* mhsld_heads is still 0 if MHSLD_GET_INFO failed, so this runs only after a successful query */
+            write_reg64(d, CXL_GPU_REG_PARAM0, mhsld_head_id); /* select the head id that was just reported */
+            rc = issue_command(d, CXL_GPU_CMD_MHSLD_SET_HEAD);
+            printf("  MH-SLD set-head:  head=%" PRIu64 " rc=%d\n", mhsld_head_id, rc);
+        }
+    }
+}
+
 /*  Benchmark: Dual-Device Concurrent Access  */
 
 typedef struct {
@@ -561,6 +717,7 @@ int main(void) {
 
         bench_register_latency(d);
         bench_cmd_latency(d);
+        bench_fabric_memory(d, caps);
         bench_data_region_bw(d);
         bench_access_patterns(d);
         bench_bar4_bulk_bw(d);
 
@@ -63,6 +63,9 @@
 #define CXL_GPU_CAP_DMA_ENGINE (1 << 2) /* Hardware DMA engine available */
 #define CXL_GPU_CAP_COHERENT_POOL (1 << 3) /* Coherent shared memory pool */
 #define CXL_GPU_CAP_DEVICE_BIAS (1 << 4) /* Device-biased directory mode */
+#define CXL_GPU_CAP_DCD (1 << 5) /* Dynamic Capacity Device model */
+#define CXL_GPU_CAP_GFAM (1 << 6) /* Global Fabric Attached Memory */
+#define CXL_GPU_CAP_MHSLD (1 << 7) /* Multi-headed SLD coherency */
 
 /* Magic number */
 #define CXL_GPU_MAGIC 0x43584C32 /* "CXL2" */
@@ -151,6 +154,16 @@ typedef enum {
     /* Coherency statistics commands */
     CXL_GPU_CMD_COH_GET_STATS = 0xB0, /* Get coherency statistics */
     CXL_GPU_CMD_COH_RESET_STATS = 0xB1, /* Reset coherency statistics */
+
+    /* DCD/GFAM/MH-SLD fabric-memory commands */
+    CXL_GPU_CMD_DCD_ADD = 0xC0, /* params: base, size, tag */
+    CXL_GPU_CMD_DCD_RELEASE = 0xC1, /* params: base, size, tag */
+    CXL_GPU_CMD_DCD_GET_INFO = 0xC2, /* results: total, alloc, free */
+    CXL_GPU_CMD_GFAM_GRANT = 0xC8, /* params: host, base, size, perms */
+    CXL_GPU_CMD_GFAM_REVOKE = 0xC9, /* params: host, base, size */
+    CXL_GPU_CMD_GFAM_GET_INFO = 0xCA, /* results: hosts, mappings, deny */
+    CXL_GPU_CMD_MHSLD_GET_INFO = 0xD0, /* results: heads, current, stats */
+    CXL_GPU_CMD_MHSLD_SET_HEAD = 0xD1, /* params: head_id */
 } CXLGPUCommand;
 
 /* Coherent pool register offsets (in GPU command region) */
@@ -160,6 +173,19 @@ typedef enum {
 #define CXL_GPU_REG_COH_DIR_SIZE 0x0318 /* Directory size (entries) */
 #define CXL_GPU_REG_COH_DIR_USED 0x0320 /* Directory used entries */
 
+/* DCD/GFAM/MH-SLD status registers */
+#define CXL_GPU_REG_DCD_TOTAL 0x0330 /* DCD total capacity */
+#define CXL_GPU_REG_DCD_ALLOCATED 0x0338 /* DCD allocated capacity */
+#define CXL_GPU_REG_DCD_FREE 0x0340 /* DCD free capacity */
+#define CXL_GPU_REG_DCD_EXTENTS 0x0348 /* Active DCD extent count */
+#define CXL_GPU_REG_GFAM_HOSTS 0x0350 /* Configured GFAM hosts */
+#define CXL_GPU_REG_GFAM_MAPPINGS 0x0358 /* Active GFAM mappings */
+#define CXL_GPU_REG_GFAM_DENIED 0x0360 /* Denied GFAM accesses */
+#define CXL_GPU_REG_MHSLD_HEADS 0x0370 /* MH-SLD head count */
+#define CXL_GPU_REG_MHSLD_HEAD_ID 0x0378 /* Local MH-SLD head id */
+#define CXL_GPU_REG_MHSLD_CONFLICTS 0x0380 /* MH-SLD coherency conflicts */
+#define CXL_GPU_REG_MHSLD_INV 0x0388 /* MH-SLD invalidations */
+
 /* Bias mode constants.
  * Legacy values 0/1 remain valid and imply 64B flit/cache-line granularity.
  * Extended encodings use low 8 bits for home domain and upper bits for
@@ -177,6 +203,13 @@ typedef enum {
 #define CXL_BIAS_MODE(encoded) ((uint8_t)((uint64_t)(encoded) & CXL_BIAS_MODE_MASK))
 #define CXL_BIAS_GRAN(encoded) ((uint64_t)(encoded) >> CXL_BIAS_GRAN_SHIFT)
 
+/* DCD/GFAM permission bits */
+#define CXL_DCD_PERM_READ (1 << 0)
+#define CXL_DCD_PERM_WRITE (1 << 1)
+#define CXL_DCD_PERM_ATOMIC (1 << 2)
+#define CXL_DCD_PERM_SHARED (1 << 3)
+#define CXL_DCD_PERM_ALL (CXL_DCD_PERM_READ | CXL_DCD_PERM_WRITE | CXL_DCD_PERM_ATOMIC | CXL_DCD_PERM_SHARED)
+
 /* P2P register offsets (in GPU command region) */
 #define CXL_GPU_REG_P2P_NUM_PEERS 0x0200 /* Number of discovered peers */
 #define CXL_GPU_REG_P2P_PEER_ID 0x0204 /* Current peer ID for queries */
 
@@ -35,12 +35,13 @@ VM_MAXMEM=${VM_MAXMEM:-32G}
 VM_SMP=${VM_SMP:-4}
 DISK_IMAGE=${DISK_IMAGE:-./qemu.img}
 DISK_FORMAT=${DISK_FORMAT:-auto}
-KERNEL_IMAGE=${KERNEL_IMAGE:-./bzImage}
+KERNEL_IMAGE=${KERNEL_IMAGE:-/root/linux-cxl-type2/arch/x86/boot/bzImage}
 KERNEL_APPEND=${KERNEL_APPEND:-"root=/dev/vda rw console=ttyS0,115200 nokaslr"}
 QEMU_DISK_DEVICE=${QEMU_DISK_DEVICE:-"virtio-blk-pci,drive=bootdisk,bus=pcie.0,id=bootdisk0"}
 QEMU_NET_MODE=${QEMU_NET_MODE:-none}
 QEMU_NETDEV=${QEMU_NETDEV:-}
 QEMU_NET_DEVICE=${QEMU_NET_DEVICE:-"virtio-net-pci,netdev=net0"}
+QEMU_STDIO_SIGNAL=${QEMU_STDIO_SIGNAL:-off}
 
 if [[ ! -x "$QEMU_BINARY" ]]; then
     echo "QEMU binary not found or not executable: $QEMU_BINARY" >&2
@@ -136,6 +137,13 @@ if [[ -n "$QEMU_NETDEV" ]]; then
     net_args=(-netdev "$QEMU_NETDEV" -device "$QEMU_NET_DEVICE")
 fi
 
+console_args=(
+    -display none
+    -chardev "stdio,id=char0,signal=$QEMU_STDIO_SIGNAL,mux=on"
+    -serial chardev:char0
+    -mon chardev=char0,mode=readline
+)
+
 mkdir -p "$RUN_DIR"
 truncate -s "$CXL_DC_SIZE" "$RUN_DIR/cxl-dcd0.raw"
 truncate -s "$CXL_DC_SIZE" "$RUN_DIR/cxl-dcd1.raw"
@@ -286,5 +294,5 @@ exec "$QEMU_BINARY" \
     -device cxl-type3,volatile-dc-memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-dcd1,sn=102,num-dc-regions=8,vcs=zettai0,dsppb=1,memsim-dcd=on,memsim-gfam=on,memsim-gfam-host-id=1 \
     -device cxl-type3,volatile-dc-memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-dcd2,sn=103,num-dc-regions=8,vcs=zettai0,dsppb=2,memsim-dcd=on,memsim-gfam=on,memsim-gfam-host-id=2 \
     -device cxl-type3,volatile-dc-memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-dcd3,sn=104,num-dc-regions=8,vcs=zettai0,dsppb=3,memsim-dcd=on,memsim-gfam=on,memsim-gfam-host-id=3 \
-    -nographic \
+    "${console_args[@]}" \
     "${extra_args[@]}"