From a51f0e5e5e5ca9bcd37e776877caba4c1500eb3c Mon Sep 17 00:00:00 2001 From: wangy10 Date: Thu, 14 May 2026 13:44:18 +0800 Subject: [PATCH 1/5] remote vdev just works?? all by copilot+deepseek v4-pro. --- .gitignore | 1 + cmd/CMakeLists.txt | 3 + cmd/zfs_remoted/CMakeLists.txt | 17 + cmd/zfs_remoted/zfs_remoted.c | 589 ++++++++++++++++++ cmd/zpool/zpool_vdev.c | 16 + .../Inno.Setup/ZFSInstall-ARM64-debug.iss | 2 + .../windows/Inno.Setup/ZFSInstall-debug.iss | 2 + include/sys/fs/zfs.h | 1 + include/sys/vdev_impl.h | 2 + include/sys/vdev_remote.h | 107 ++++ lib/libzpool/CMakeLists.txt | 1 + lib/libzpool/Makefile.am | 1 + module/os/windows/CMakeLists.txt | 1 + module/os/windows/zfs/CMakeLists.txt | 1 + module/os/windows/zfs/vdev_remote_os.c | 457 ++++++++++++++ module/zfs/CMakeLists.txt | 1 + module/zfs/spa_misc.c | 3 + module/zfs/vdev.c | 1 + module/zfs/vdev_remote.c | 426 +++++++++++++ 19 files changed, 1632 insertions(+) create mode 100644 cmd/zfs_remoted/CMakeLists.txt create mode 100644 cmd/zfs_remoted/zfs_remoted.c create mode 100644 include/sys/vdev_remote.h create mode 100644 module/os/windows/zfs/vdev_remote_os.c create mode 100644 module/zfs/vdev_remote.c diff --git a/.gitignore b/.gitignore index 2f29df6fe94e..ed1043265d4e 100644 --- a/.gitignore +++ b/.gitignore @@ -91,3 +91,4 @@ changelog *.orig *.tmp *.log +contrib/windows/OpenZFSOnWindows-debug-2.4.1* diff --git a/cmd/CMakeLists.txt b/cmd/CMakeLists.txt index 5931d4257447..a21f74847131 100644 --- a/cmd/CMakeLists.txt +++ b/cmd/CMakeLists.txt @@ -16,3 +16,6 @@ add_subdirectory(os/windows/zfs_tray) add_subdirectory(raidz_test) add_subdirectory(zinject) add_subdirectory(zed) +if(WIN32) + add_subdirectory(zfs_remoted) +endif() diff --git a/cmd/zfs_remoted/CMakeLists.txt b/cmd/zfs_remoted/CMakeLists.txt new file mode 100644 index 000000000000..26f30e52c0a9 --- /dev/null +++ b/cmd/zfs_remoted/CMakeLists.txt @@ -0,0 +1,17 @@ +# CMakeLists.txt for zfs_remoted - ZFS Remote Block Device Daemon +# Windows-only TCP daemon that serves a raw image file as a block device. +# +# This is a standalone Windows user-mode application. It does NOT link to +# any ZFS libraries and does NOT include ZFS kernel headers. + +add_executable(zfs_remoted + zfs_remoted.c +) + +# Clear inherited include directories from parent scope to avoid pulling +# in ZFS kernel headers (which cause include_next failures with MSVC). +set_target_properties(zfs_remoted PROPERTIES + INCLUDE_DIRECTORIES "" +) + +install(TARGETS zfs_remoted RUNTIME DESTINATION ${CMAKE_INSTALL_SBINDIR}) diff --git a/cmd/zfs_remoted/zfs_remoted.c b/cmd/zfs_remoted/zfs_remoted.c new file mode 100644 index 000000000000..983f48f5eb57 --- /dev/null +++ b/cmd/zfs_remoted/zfs_remoted.c @@ -0,0 +1,589 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2026, OpenZFS Remote VDEV contributors. + * + * zfs_remoted - ZFS Remote Block Device Daemon for Windows + * + * This daemon listens on a TCP port and serves a raw image file as a + * block device. It implements the remote VDEV RPC protocol used by the + * OpenZFS vdev_remote kernel module. + * + * Usage: zfs_remoted.exe -f -p + * + * Protocol: + * Header: [4 bytes cmd][4 bytes status][8 bytes offset][4 bytes size][4 bytes reserved] + * Commands: + * READ (0x52454144): server reads from img and sends [header + data] + * WRITE (0x57524954): client sends [header + data], server writes to img + * FLUSH (0x464C5553): server calls _commit / FlushFileBuffers + * TRIM (0x5452494D): server deallocates (not implemented for file, no-op) + * INFO (0x494E464F): server returns device size / block info + */ + +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS +#endif +#define WIN32_LEAN_AND_MEAN + +#include +#include +#include +#include +#include +#include +#include + +#pragma comment(lib, "Ws2_32.lib") + +/* + * RPC header - must match vdev_remote.h exactly. + */ +#pragma pack(push, 1) +typedef struct rpc_hdr { + uint32_t cmd; + uint32_t status; + uint64_t offset; + uint32_t size; + uint32_t reserved; +} rpc_hdr_t; +#pragma pack(pop) + +/* Command opcodes - must match vdev_remote.h */ +#define CMD_READ 0x52454144 +#define CMD_WRITE 0x57524954 +#define CMD_FLUSH 0x464C5553 +#define CMD_TRIM 0x5452494D +#define CMD_INFO 0x494E464F + +/* Status codes */ +#define STATUS_OK 0 +#define STATUS_ERR_IO 1 +#define STATUS_ERR_INVAL 2 +#define STATUS_ERR_NOSPC 3 + +static volatile LONG g_running = 1; +static HANDLE g_file_handle = INVALID_HANDLE_VALUE; +static CRITICAL_SECTION g_file_lock; + +static void +print_usage(const char *prog) +{ + fprintf(stderr, + "Usage: %s -f -p \n" + " -f Path to the raw image file to serve\n" + " -p TCP port to listen on\n" + " -h Show this help\n", + prog); +} + +static BOOL +ctrl_handler(DWORD ctrl_type) +{ + (void)ctrl_type; + InterlockedExchange(&g_running, 0); + return TRUE; +} + +/* + * Receive exactly 'len' bytes from socket. + * Returns 0 on success, -1 on error. + */ +static int +recv_all(SOCKET s, void *buf, int len) +{ + char *p = (char *)buf; + int remaining = len; + + while (remaining > 0) { + int n = recv(s, p, remaining, 0); + if (n <= 0) { + if (n == 0) + fprintf(stderr, "Connection closed by peer\n"); + else + fprintf(stderr, "recv error: %d\n", + WSAGetLastError()); + return (-1); + } + p += n; + remaining -= n; + } + return (0); +} + +/* + * Send exactly 'len' bytes to socket. + * Returns 0 on success, -1 on error. + */ +static int +send_all(SOCKET s, const void *buf, int len) +{ + const char *p = (const char *)buf; + int remaining = len; + + while (remaining > 0) { + int n = send(s, p, remaining, 0); + if (n <= 0) { + fprintf(stderr, "send error: %d\n", WSAGetLastError()); + return (-1); + } + p += n; + remaining -= n; + } + return (0); +} + +/* + * Handle a READ command. + * 1. Read data from the image file at the given offset + * 2. Send response header + * 3. Send the data + */ +static void +handle_read(SOCKET client_sock, rpc_hdr_t *hdr) +{ + rpc_hdr_t resp; + uint8_t *buf = NULL; + + if (hdr->size == 0 || hdr->size > (256 * 1024 * 1024)) { + /* Sanity: max 256MB per read */ + resp.cmd = CMD_READ; + resp.status = STATUS_ERR_INVAL; + resp.offset = hdr->offset; + resp.size = 0; + resp.reserved = 0; + send_all(client_sock, &resp, sizeof (resp)); + return; + } + + buf = (uint8_t *)malloc(hdr->size); + if (buf == NULL) { + resp.cmd = CMD_READ; + resp.status = STATUS_ERR_IO; + resp.offset = hdr->offset; + resp.size = 0; + resp.reserved = 0; + send_all(client_sock, &resp, sizeof (resp)); + return; + } + + EnterCriticalSection(&g_file_lock); + + LARGE_INTEGER liOffset; + liOffset.QuadPart = (LONGLONG)hdr->offset; + + /* Seek to offset */ + if (!SetFilePointerEx(g_file_handle, liOffset, NULL, FILE_BEGIN)) { + LeaveCriticalSection(&g_file_lock); + fprintf(stderr, "ReadFile seek to %llu failed: %lu\n", + (unsigned long long)hdr->offset, GetLastError()); + resp.cmd = CMD_READ; + resp.status = STATUS_ERR_IO; + resp.offset = hdr->offset; + resp.size = 0; + resp.reserved = 0; + send_all(client_sock, &resp, sizeof (resp)); + free(buf); + return; + } + + DWORD bytes_read = 0; + BOOL ok = ReadFile(g_file_handle, buf, hdr->size, &bytes_read, NULL); + + LeaveCriticalSection(&g_file_lock); + + if (!ok || bytes_read != hdr->size) { + fprintf(stderr, "ReadFile failed at offset %llu size %u: %lu\n", + (unsigned long long)hdr->offset, hdr->size, + ok ? 0 : GetLastError()); + resp.cmd = CMD_READ; + resp.status = STATUS_ERR_IO; + resp.offset = hdr->offset; + resp.size = 0; + resp.reserved = 0; + send_all(client_sock, &resp, sizeof (resp)); + free(buf); + return; + } + + /* Send success response header */ + resp.cmd = CMD_READ; + resp.status = STATUS_OK; + resp.offset = hdr->offset; + resp.size = bytes_read; + resp.reserved = 0; + + if (send_all(client_sock, &resp, sizeof (resp)) != 0) { + free(buf); + return; + } + + /* Send the data */ + send_all(client_sock, buf, bytes_read); + free(buf); +} + +/* + * Handle a WRITE command. + * 1. Receive data from socket + * 2. Write data to the image file + * 3. Send response header + */ +static void +handle_write(SOCKET client_sock, rpc_hdr_t *hdr) +{ + rpc_hdr_t resp; + uint8_t *buf = NULL; + + if (hdr->size == 0 || hdr->size > (256 * 1024 * 1024)) { + resp.cmd = CMD_WRITE; + resp.status = STATUS_ERR_INVAL; + resp.offset = hdr->offset; + resp.size = 0; + resp.reserved = 0; + send_all(client_sock, &resp, sizeof (resp)); + return; + } + + buf = (uint8_t *)malloc(hdr->size); + if (buf == NULL) { + resp.cmd = CMD_WRITE; + resp.status = STATUS_ERR_IO; + resp.offset = hdr->offset; + resp.size = 0; + resp.reserved = 0; + send_all(client_sock, &resp, sizeof (resp)); + return; + } + + /* Receive the write data */ + if (recv_all(client_sock, buf, hdr->size) != 0) { + free(buf); + return; + } + + EnterCriticalSection(&g_file_lock); + + LARGE_INTEGER liOffset; + liOffset.QuadPart = (LONGLONG)hdr->offset; + + /* Seek to offset */ + if (!SetFilePointerEx(g_file_handle, liOffset, NULL, FILE_BEGIN)) { + LeaveCriticalSection(&g_file_lock); + free(buf); + fprintf(stderr, "WriteFile seek to %llu failed: %lu\n", + (unsigned long long)hdr->offset, GetLastError()); + resp.cmd = CMD_WRITE; + resp.status = STATUS_ERR_IO; + resp.offset = hdr->offset; + resp.size = 0; + resp.reserved = 0; + send_all(client_sock, &resp, sizeof (resp)); + return; + } + + DWORD bytes_written = 0; + BOOL ok = WriteFile(g_file_handle, buf, hdr->size, &bytes_written, NULL); + + LeaveCriticalSection(&g_file_lock); + free(buf); + + if (!ok || bytes_written != hdr->size) { + fprintf(stderr, "WriteFile failed at offset %llu size %u: %lu\n", + (unsigned long long)hdr->offset, hdr->size, + ok ? 0 : GetLastError()); + resp.cmd = CMD_WRITE; + resp.status = STATUS_ERR_IO; + resp.offset = hdr->offset; + resp.size = 0; + resp.reserved = 0; + send_all(client_sock, &resp, sizeof (resp)); + return; + } + + resp.cmd = CMD_WRITE; + resp.status = STATUS_OK; + resp.offset = hdr->offset; + resp.size = bytes_written; + resp.reserved = 0; + send_all(client_sock, &resp, sizeof (resp)); +} + +/* + * Handle a FLUSH command - flush file buffers to disk. + */ +static void +handle_flush(SOCKET client_sock, rpc_hdr_t *hdr) +{ + rpc_hdr_t resp; + + EnterCriticalSection(&g_file_lock); + BOOL ok = FlushFileBuffers(g_file_handle); + LeaveCriticalSection(&g_file_lock); + + resp.cmd = CMD_FLUSH; + resp.status = ok ? STATUS_OK : STATUS_ERR_IO; + resp.offset = 0; + resp.size = 0; + resp.reserved = 0; + send_all(client_sock, &resp, sizeof (resp)); +} + +/* + * Handle TRIM - deallocate region. + * For file-backed storage on Windows, this is a no-op + * (could use FSCTL_SET_ZERO_DATA but that zeroes, not trims). + */ +static void +handle_trim(SOCKET client_sock, rpc_hdr_t *hdr) +{ + rpc_hdr_t resp; + + /* For files, TRIM is a no-op. Return success. */ + resp.cmd = CMD_TRIM; + resp.status = STATUS_OK; + resp.offset = hdr->offset; + resp.size = hdr->size; + resp.reserved = 0; + send_all(client_sock, &resp, sizeof (resp)); + + fprintf(stderr, "TRIM range [%llu, %llu] (no-op for file)\n", + (unsigned long long)hdr->offset, + (unsigned long long)(hdr->offset + hdr->size)); +} + +/* + * Handle INFO - return device information. + * Response: offset=file_size, size=logical_block, reserved=phys_block + */ +static void +handle_info(SOCKET client_sock) +{ + rpc_hdr_t resp; + + EnterCriticalSection(&g_file_lock); + + LARGE_INTEGER file_size; + BOOL ok = GetFileSizeEx(g_file_handle, &file_size); + + LeaveCriticalSection(&g_file_lock); + + if (!ok) { + resp.cmd = CMD_INFO; + resp.status = STATUS_ERR_IO; + resp.offset = 0; + resp.size = 0; + resp.reserved = 0; + } else { + resp.cmd = CMD_INFO; + resp.status = STATUS_OK; + resp.offset = (uint64_t)file_size.QuadPart; /* device size */ + resp.size = 512; /* logical block size */ + resp.reserved = 4096; /* physical block size */ + } + + send_all(client_sock, &resp, sizeof (resp)); +} + +/* + * Handle a single client connection. + */ +static void +handle_client(SOCKET client_sock) +{ + rpc_hdr_t hdr; + + fprintf(stderr, "Client connected\n"); + + while (g_running) { + /* Receive the RPC header */ + if (recv_all(client_sock, &hdr, sizeof (hdr)) != 0) + break; + + switch (hdr.cmd) { + case CMD_READ: + handle_read(client_sock, &hdr); + break; + + case CMD_WRITE: + handle_write(client_sock, &hdr); + break; + + case CMD_FLUSH: + handle_flush(client_sock, &hdr); + break; + + case CMD_TRIM: + handle_trim(client_sock, &hdr); + break; + + case CMD_INFO: + handle_info(client_sock); + break; + + default: + fprintf(stderr, "Unknown command: 0x%08X\n", hdr.cmd); + hdr.status = STATUS_ERR_INVAL; + send_all(client_sock, &hdr, sizeof (hdr)); + break; + } + } + + closesocket(client_sock); + fprintf(stderr, "Client disconnected\n"); +} + +/* + * Main: parse arguments, open image file, listen for connections. + */ +int +main(int argc, char *argv[]) +{ + const char *image_file = NULL; + uint16_t port = 0; + int opt; + + /* Parse arguments */ + for (opt = 1; opt < argc; opt++) { + if (strcmp(argv[opt], "-f") == 0 && opt + 1 < argc) { + image_file = argv[++opt]; + } else if (strcmp(argv[opt], "-p") == 0 && opt + 1 < argc) { + port = (uint16_t)atoi(argv[++opt]); + } else if (strcmp(argv[opt], "-h") == 0) { + print_usage(argv[0]); + return (0); + } else { + fprintf(stderr, "Unknown option: %s\n", argv[opt]); + print_usage(argv[0]); + return (1); + } + } + + if (image_file == NULL || port == 0) { + fprintf(stderr, "Error: -f and -p " + "are required\n"); + print_usage(argv[0]); + return (1); + } + + /* Open the image file for synchronous I/O */ + g_file_handle = CreateFileA( + image_file, + GENERIC_READ | GENERIC_WRITE, + 0, /* exclusive access */ + NULL, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, /* synchronous, buffered */ + NULL); + + if (g_file_handle == INVALID_HANDLE_VALUE) { + fprintf(stderr, "Failed to open image file '%s': %lu\n", + image_file, GetLastError()); + return (1); + } + + fprintf(stderr, "Opened image file: %s\n", image_file); + + InitializeCriticalSection(&g_file_lock); + + /* Initialize Winsock */ + WSADATA wsa_data; + if (WSAStartup(MAKEWORD(2, 2), &wsa_data) != 0) { + fprintf(stderr, "WSAStartup failed: %d\n", WSAGetLastError()); + CloseHandle(g_file_handle); + return (1); + } + + /* Create listening socket */ + SOCKET listen_sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (listen_sock == INVALID_SOCKET) { + fprintf(stderr, "socket failed: %d\n", WSAGetLastError()); + WSACleanup(); + CloseHandle(g_file_handle); + return (1); + } + + /* Set SO_REUSEADDR */ + int reuse = 1; + setsockopt(listen_sock, SOL_SOCKET, SO_REUSEADDR, + (const char *)&reuse, sizeof (reuse)); + + struct sockaddr_in server_addr; + memset(&server_addr, 0, sizeof (server_addr)); + server_addr.sin_family = AF_INET; + server_addr.sin_addr.s_addr = INADDR_ANY; + server_addr.sin_port = htons(port); + + if (bind(listen_sock, (struct sockaddr *)&server_addr, + sizeof (server_addr)) == SOCKET_ERROR) { + fprintf(stderr, "bind to port %u failed: %d\n", + port, WSAGetLastError()); + closesocket(listen_sock); + WSACleanup(); + CloseHandle(g_file_handle); + return (1); + } + + if (listen(listen_sock, SOMAXCONN) == SOCKET_ERROR) { + fprintf(stderr, "listen failed: %d\n", WSAGetLastError()); + closesocket(listen_sock); + WSACleanup(); + CloseHandle(g_file_handle); + return (1); + } + + /* Set Ctrl+C handler for graceful shutdown */ + SetConsoleCtrlHandler((PHANDLER_ROUTINE)ctrl_handler, TRUE); + + fprintf(stderr, "zfs_remoted: listening on port %u, serving '%s'\n", + port, image_file); + + while (g_running) { + struct sockaddr_in client_addr; + int client_len = sizeof (client_addr); + + /* Set a timeout so we can check g_running periodically */ + struct timeval tv; + tv.tv_sec = 1; + tv.tv_usec = 0; + + fd_set readfds; + FD_ZERO(&readfds); + FD_SET(listen_sock, &readfds); + + int sel_ret = select(0, &readfds, NULL, NULL, &tv); + if (sel_ret < 0) + break; + if (sel_ret == 0) + continue; /* timeout, check g_running */ + + SOCKET client_sock = accept(listen_sock, + (struct sockaddr *)&client_addr, &client_len); + if (client_sock == INVALID_SOCKET) + continue; + + handle_client(client_sock); + } + + fprintf(stderr, "Shutting down...\n"); + + closesocket(listen_sock); + WSACleanup(); + + DeleteCriticalSection(&g_file_lock); + CloseHandle(g_file_handle); + + return (0); +} diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 0b615f7c277c..c74359985a4e 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -284,6 +284,18 @@ make_leaf_vdev(const char *arg, boolean_t is_primary, uint64_t ashift) * 'path'. We detect whether this is a device of file afterwards by * checking the st_mode of the file. */ + + /* + * Remote block device (remote://host:port) - no local filesystem + * checks needed. The URI itself is the path. + */ + if (strncmp(arg, "remote://", 9) == 0) { + (void) strlcpy(path, arg, sizeof (path)); + type = VDEV_TYPE_REMOTE; + wholedisk = B_FALSE; + goto skip_type_detect; + } + #ifdef _WIN32 if (arg[0] == '/' || arg[0] == '\\') { #else @@ -352,6 +364,7 @@ make_leaf_vdev(const char *arg, boolean_t is_primary, uint64_t ashift) } } +skip_type_detect: if (type == NULL) { /* * Determine whether this is a device or a file. @@ -1199,6 +1212,9 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, else if (strcmp(type, VDEV_TYPE_FILE) == 0) ret = check_file(path, force, isspare); + else if (strcmp(type, VDEV_TYPE_REMOTE) == 0) + ret = 0; /* remote vdevs are always valid */ + return (ret != 0); } diff --git a/contrib/windows/Inno.Setup/ZFSInstall-ARM64-debug.iss b/contrib/windows/Inno.Setup/ZFSInstall-ARM64-debug.iss index fa41f4943357..e585c12530b6 100644 --- a/contrib/windows/Inno.Setup/ZFSInstall-ARM64-debug.iss +++ b/contrib/windows/Inno.Setup/ZFSInstall-ARM64-debug.iss @@ -184,6 +184,7 @@ Source: "{#Root}\out\build\x64-Debug\cmd\os\windows\zfs_tray\zfs_tray.exe"; Dest Source: "{#Root}\out\build\x64-Debug\cmd\zpool\zpool.exe"; DestDir: "{app}"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\zfs\zfs.exe"; DestDir: "{app}"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\zdb\zdb.exe"; DestDir: "{app}"; Flags: ignoreversion +Source: "{#Root}\out\build\x64-Debug\cmd\zfs_remoted\zfs_remoted.exe"; DestDir: "{app}"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\zed\zed.exe"; DestDir: "{app}"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\zstream\zstreamdump.exe"; DestDir: "{app}"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\raidz_test\raidz_test.exe"; DestDir: "{app}"; Flags: ignoreversion @@ -198,6 +199,7 @@ Source: "{#Root}\out\build\x64-Debug\cmd\zstream\*.pdb"; DestDir: "{app}\symbols Source: "{#Root}\out\build\x64-Debug\cmd\zpool\*.pdb"; DestDir: "{app}\symbols"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\zfs\*.pdb"; DestDir: "{app}\symbols"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\zdb\*.pdb"; DestDir: "{app}\symbols"; Flags: ignoreversion +Source: "{#Root}\out\build\x64-Debug\cmd\zfs_remoted\*.pdb"; DestDir: "{app}\symbols"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\zed\*.pdb"; DestDir: "{app}\symbols"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\raidz_test\*.pdb"; DestDir: "{app}\symbols"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\os\windows\zfsinstaller\*.pdb"; DestDir: "{app}\symbols"; Flags: ignoreversion diff --git a/contrib/windows/Inno.Setup/ZFSInstall-debug.iss b/contrib/windows/Inno.Setup/ZFSInstall-debug.iss index fa41f4943357..e585c12530b6 100644 --- a/contrib/windows/Inno.Setup/ZFSInstall-debug.iss +++ b/contrib/windows/Inno.Setup/ZFSInstall-debug.iss @@ -184,6 +184,7 @@ Source: "{#Root}\out\build\x64-Debug\cmd\os\windows\zfs_tray\zfs_tray.exe"; Dest Source: "{#Root}\out\build\x64-Debug\cmd\zpool\zpool.exe"; DestDir: "{app}"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\zfs\zfs.exe"; DestDir: "{app}"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\zdb\zdb.exe"; DestDir: "{app}"; Flags: ignoreversion +Source: "{#Root}\out\build\x64-Debug\cmd\zfs_remoted\zfs_remoted.exe"; DestDir: "{app}"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\zed\zed.exe"; DestDir: "{app}"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\zstream\zstreamdump.exe"; DestDir: "{app}"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\raidz_test\raidz_test.exe"; DestDir: "{app}"; Flags: ignoreversion @@ -198,6 +199,7 @@ Source: "{#Root}\out\build\x64-Debug\cmd\zstream\*.pdb"; DestDir: "{app}\symbols Source: "{#Root}\out\build\x64-Debug\cmd\zpool\*.pdb"; DestDir: "{app}\symbols"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\zfs\*.pdb"; DestDir: "{app}\symbols"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\zdb\*.pdb"; DestDir: "{app}\symbols"; Flags: ignoreversion +Source: "{#Root}\out\build\x64-Debug\cmd\zfs_remoted\*.pdb"; DestDir: "{app}\symbols"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\zed\*.pdb"; DestDir: "{app}\symbols"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\raidz_test\*.pdb"; DestDir: "{app}\symbols"; Flags: ignoreversion Source: "{#Root}\out\build\x64-Debug\cmd\os\windows\zfsinstaller\*.pdb"; DestDir: "{app}\symbols"; Flags: ignoreversion diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index c03a6f14f2bc..2679fb8b3206 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -925,6 +925,7 @@ typedef struct zpool_load_policy { #define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" +#define VDEV_TYPE_REMOTE "remote" #define VDEV_TYPE_MISSING "missing" #define VDEV_TYPE_HOLE "hole" #define VDEV_TYPE_SPARE "spare" diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index afaa401343d9..f2a1ae801aad 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #ifdef __cplusplus @@ -613,6 +614,7 @@ extern vdev_ops_t vdev_draid_ops; extern vdev_ops_t vdev_draid_spare_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; +extern vdev_ops_t vdev_remote_ops; extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; diff --git a/include/sys/vdev_remote.h b/include/sys/vdev_remote.h new file mode 100644 index 000000000000..49e338761fe9 --- /dev/null +++ b/include/sys/vdev_remote.h @@ -0,0 +1,107 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2026, OpenZFS Remote VDEV contributors. + */ + +#ifndef _SYS_VDEV_REMOTE_H +#define _SYS_VDEV_REMOTE_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Remote VDEV RPC protocol definitions. + * + * A remote VDEV communicates over TCP with a daemon that serves a raw + * image file as a block device. The protocol is a simple binary RPC + * with fixed-size header followed by optional data payload. + */ + +/* + * Maximum host:port string length for remote connections. + */ +#define VDEV_REMOTE_HOST_MAX 256 + +/* + * RPC header sent for every command. + */ +typedef struct vdev_remote_rpc_hdr { + uint32_t vr_cmd; /* command opcode */ + uint32_t vr_status; /* response status (0 = success) */ + uint64_t vr_offset; /* byte offset in the remote device */ + uint32_t vr_size; /* size of optional data payload */ + uint32_t vr_reserved; /* padding */ +} vdev_remote_rpc_hdr_t; + +/* + * RPC command opcodes. + */ +#define VDEV_REMOTE_CMD_READ 0x52454144 /* "READ" */ +#define VDEV_REMOTE_CMD_WRITE 0x57524954 /* "WRIT" */ +#define VDEV_REMOTE_CMD_FLUSH 0x464C5553 /* "FLUS" */ +#define VDEV_REMOTE_CMD_TRIM 0x5452494D /* "TRIM" */ +#define VDEV_REMOTE_CMD_INFO 0x494E464F /* "INFO" */ + +/* + * RPC status codes. + */ +#define VDEV_REMOTE_STATUS_OK 0 +#define VDEV_REMOTE_STATUS_ERR_IO 1 +#define VDEV_REMOTE_STATUS_ERR_INVAL 2 +#define VDEV_REMOTE_STATUS_ERR_NOSPC 3 + +/* + * Per-instance state for a remote VDEV. + */ +typedef struct vdev_remote { + char vr_host[VDEV_REMOTE_HOST_MAX]; + uint16_t vr_port; + void *vr_os_priv; /* OS-specific state */ + uint64_t vr_device_size; + uint32_t vr_block_size; + uint32_t vr_phys_block_size; + kmutex_t vr_lock; + boolean_t vr_connected; +} vdev_remote_t; + +/* + * Core remote vdev module lifecycle. + */ +extern void vdev_remote_init(void); +extern void vdev_remote_fini(void); + +/* + * OS-specific functions that must be implemented by each platform. + * + * These are declared here so the core vdev_remote.c can call them, + * and each OS layer (e.g. module/os/windows/zfs/vdev_remote_os.c) + * provides the implementation. + */ +int vdev_remote_os_connect(vdev_remote_t *vr); +void vdev_remote_os_disconnect(vdev_remote_t *vr); +int vdev_remote_os_io(vdev_remote_t *vr, uint32_t cmd, + uint64_t offset, void *data, uint32_t size); +int vdev_remote_os_info(vdev_remote_t *vr, + uint64_t *size, uint32_t *blksz, uint32_t *pblksz); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_REMOTE_H */ diff --git a/lib/libzpool/CMakeLists.txt b/lib/libzpool/CMakeLists.txt index 63419cac0930..78e7984a0012 100644 --- a/lib/libzpool/CMakeLists.txt +++ b/lib/libzpool/CMakeLists.txt @@ -123,6 +123,7 @@ add_library(libzpool "${MODULE_DIR}/zfs/vdev_draid.c" "${MODULE_DIR}/zfs/vdev_draid_rand.c" "${MODULE_DIR}/zfs/vdev_file.c" + "${MODULE_DIR}/zfs/vdev_remote.c" "${MODULE_DIR}/zfs/vdev_indirect.c" "${MODULE_DIR}/zfs/vdev_indirect_births.c" "${MODULE_DIR}/zfs/vdev_indirect_mapping.c" diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 6e2028093d9c..e534ae6fd1ae 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -143,6 +143,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/vdev_draid.c \ module/zfs/vdev_draid_rand.c \ module/zfs/vdev_file.c \ + module/zfs/vdev_remote.c \ module/zfs/vdev_indirect.c \ module/zfs/vdev_indirect_births.c \ module/zfs/vdev_indirect_mapping.c \ diff --git a/module/os/windows/CMakeLists.txt b/module/os/windows/CMakeLists.txt index 0f059f107117..c259d2468302 100644 --- a/module/os/windows/CMakeLists.txt +++ b/module/os/windows/CMakeLists.txt @@ -39,6 +39,7 @@ target_link_libraries(OpenZFS nvpairkern # WDK libraries WDK::WDMSEC + WDK::NETIO # WDK::STORPORT # WDK::SCSIWMI ) diff --git a/module/os/windows/zfs/CMakeLists.txt b/module/os/windows/zfs/CMakeLists.txt index 4bdd4e213f6c..a11b55bc525e 100644 --- a/module/os/windows/zfs/CMakeLists.txt +++ b/module/os/windows/zfs/CMakeLists.txt @@ -11,6 +11,7 @@ qat_crypt.c spa_misc_os.c sysctl_os.c vdev_disk.c +vdev_remote_os.c vdev_label_os.c zfs_acl.c zfs_racct.c diff --git a/module/os/windows/zfs/vdev_remote_os.c b/module/os/windows/zfs/vdev_remote_os.c new file mode 100644 index 000000000000..cbda3b03423c --- /dev/null +++ b/module/os/windows/zfs/vdev_remote_os.c @@ -0,0 +1,457 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2026, OpenZFS Remote VDEV contributors. + * + * Windows remote VDEV using Winsock Kernel (WSK). + * + * CRITICAL: All WSK calls require IRPs. Event-based synchronous wrappers + * used here are safe because taskq workers run at PASSIVE_LEVEL. + * + * WSK socket dispatch hierarchy: + * Global WSK_PROVIDER_DISPATCH: WskSocket, WskSocketConnect + * Socket WSK_PROVIDER_CONNECTION_DISPATCH: WskBind, WskConnect, + * WskSend, WskReceive, WskDisconnect, WskCloseSocket(Basic) + */ + +#include +#include +#include +#include + +#include +#include + +static __inline uint16_t +wsk_htons(uint16_t hostshort) +{ + return (RtlUshortByteSwap(hostshort)); +} + +/* + * Synchronous IRP completion routine. + * Signals the event, returns MORE_PROCESSING so I/O mgr doesn't free the IRP + * (we own it and free it after inspecting IoStatus). + */ +static NTSTATUS +wsk_sync_comp( + _In_ PDEVICE_OBJECT DeviceObject, + _In_ PIRP Irp, + _In_opt_ PVOID Context) +{ + (void)DeviceObject; + (void)Irp; + KeSetEvent((PKEVENT)Context, IO_NO_INCREMENT, FALSE); + return (STATUS_MORE_PROCESSING_REQUIRED); +} + +/* Issue a WSK call synchronously: wait on event if PENDING, return status */ +static NTSTATUS +wsk_sync(KEVENT *event, PIRP irp, NTSTATUS st) +{ + if (st == STATUS_PENDING) { + KeWaitForSingleObject(event, Executive, KernelMode, + FALSE, NULL); + st = irp->IoStatus.Status; + } + return (st); +} + +/* + * WSK global state with interlocked init. + * Using InterlockedCompareExchange avoids needing mutex_init before first use. + */ +static WSK_REGISTRATION wsk_reg = { 0 }; +static WSK_PROVIDER_NPI wsk_npi; +static WSK_CLIENT_NPI wsk_cli = { NULL, NULL }; +static LONG wsk_ready = 0; /* 0=not init, 1=initing, 2=ready */ + +static const WSK_CLIENT_DISPATCH wsk_disp = { + MAKE_WSK_VERSION(1, 0), 0, NULL +}; + +static int vdev_remote_wsk_init(void); +static int vdev_remote_wsk_connect(vdev_remote_t *vr, + PWSK_SOCKET *out); +static void vdev_remote_wsk_close(PWSK_SOCKET sock); +static int vdev_remote_wsk_send_recv(PWSK_SOCKET sock, + vdev_remote_rpc_hdr_t *hdr, void *data, uint32_t dlen, uint32_t cmd); + +/* + * Lazy one-shot WSK init. Interlocked state machine ensures exactly one + * thread does WskRegister/WskCaptureProviderNPI. Other threads spin-wait. + */ +static int +vdev_remote_wsk_init(void) +{ + LONG prev; + + /* Quick check: already ready? */ + if (wsk_ready == 2) + return (0); + + /* Try to claim the "initing" slot */ + prev = InterlockedCompareExchange(&wsk_ready, 1, 0); + if (prev == 2) + return (0); /* another thread finished while we were racing */ + + if (prev == 1) { + /* Another thread is initializing — wait for it */ + LONG spin = 0; + while (wsk_ready == 1 && spin < 100000) { + KeStallExecutionProcessor(10); + spin++; + } + if (wsk_ready != 2) { + dprintf("vdev_remote: WSK init timed out\n"); + return (SET_ERROR(ENODEV)); + } + return (0); + } + + /* We own the init slot (prev == 0) */ + wsk_cli.Dispatch = (PWSK_CLIENT_DISPATCH)&wsk_disp; + + NTSTATUS st = WskRegister(&wsk_cli, &wsk_reg); + if (!NT_SUCCESS(st)) { + wsk_ready = 0; + dprintf("vdev_remote: WskRegister: 0x%lx\n", + (unsigned long)st); + return (SET_ERROR(ENODEV)); + } + + st = WskCaptureProviderNPI(&wsk_reg, WSK_INFINITE_WAIT, &wsk_npi); + if (!NT_SUCCESS(st)) { + WskDeregister(&wsk_reg); + wsk_ready = 0; + dprintf("vdev_remote: WskCaptureProviderNPI: 0x%lx\n", + (unsigned long)st); + return (SET_ERROR(ENODEV)); + } + + wsk_ready = 2; + return (0); +} + +/* + * Connect to remote daemon using WskSocketConnect: creates socket, + * binds locally, and connects — all in one IRP round-trip. + */ +static int +vdev_remote_wsk_connect(vdev_remote_t *vr, PWSK_SOCKET *out) +{ + SOCKADDR_IN local = { 0 }; + SOCKADDR_IN remote = { 0 }; + PIRP irp; + KEVENT ev; + NTSTATUS st; + PCSTR term = NULL; + + *out = NULL; + + remote.sin_family = AF_INET; + remote.sin_port = wsk_htons(vr->vr_port); + + st = RtlIpv4StringToAddressA(vr->vr_host, TRUE, + &term, &remote.sin_addr); + if (!NT_SUCCESS(st)) { + dprintf("vdev_remote: bad IPv4 '%s': 0x%lx\n", + vr->vr_host, (unsigned long)st); + return (SET_ERROR(EINVAL)); + } + + local.sin_family = AF_INET; + local.sin_addr.s_addr = INADDR_ANY; + local.sin_port = 0; + + KeInitializeEvent(&ev, SynchronizationEvent, FALSE); + irp = IoAllocateIrp(1, FALSE); + if (!irp) return (SET_ERROR(ENOMEM)); + IoSetCompletionRoutine(irp, wsk_sync_comp, &ev, TRUE, TRUE, TRUE); + + st = wsk_sync(&ev, irp, + wsk_npi.Dispatch->WskSocketConnect( + wsk_npi.Client, SOCK_STREAM, IPPROTO_TCP, + (PSOCKADDR)&local, (PSOCKADDR)&remote, + 0, NULL, NULL, NULL, NULL, NULL, irp)); + + if (!NT_SUCCESS(st)) { + dprintf("vdev_remote: WskSocketConnect %s:%u: 0x%lx\n", + vr->vr_host, vr->vr_port, (unsigned long)st); + IoFreeIrp(irp); + return (SET_ERROR(ECONNREFUSED)); + } + *out = (PWSK_SOCKET)irp->IoStatus.Information; + IoFreeIrp(irp); + return (0); +} + +/* + * Disconnect + close socket. + */ +static void +vdev_remote_wsk_close(PWSK_SOCKET sock) +{ + PIRP irp; + KEVENT ev; + PWSK_PROVIDER_CONNECTION_DISPATCH d; + + if (!sock) return; + d = (PWSK_PROVIDER_CONNECTION_DISPATCH)sock->Dispatch; + + /* Graceful disconnect */ + KeInitializeEvent(&ev, SynchronizationEvent, FALSE); + irp = IoAllocateIrp(1, FALSE); + if (irp) { + IoSetCompletionRoutine(irp, wsk_sync_comp, &ev, + TRUE, TRUE, TRUE); + (void)wsk_sync(&ev, irp, + d->WskDisconnect(sock, NULL, 0, irp)); + IoFreeIrp(irp); + } + + /* Close (Basic dispatch — IRP is optional but we pass one) */ + d->WskCloseSocket(sock, NULL); +} + +/* + * Round-trip RPC: send header [+data if write], recv header [+data if read]. + * + * WSK_BUF.Mdl == NULL → WSK uses Irp->MdlAddress. We build a simple MDL + * for the buffer pages to ensure correct transfer regardless of WSK provider + * implementation (some require MDL, some accept UserBuffer). + */ +static int +vdev_remote_wsk_send_recv(PWSK_SOCKET sock, + vdev_remote_rpc_hdr_t *hdr, void *data, uint32_t dlen, uint32_t cmd) +{ + NTSTATUS st; + WSK_BUF wb; + PIRP irp; + KEVENT ev; + PMDL mdl; + PWSK_PROVIDER_CONNECTION_DISPATCH d; + + d = (PWSK_PROVIDER_CONNECTION_DISPATCH)sock->Dispatch; + + /* Populate header */ + hdr->vr_cmd = cmd; + hdr->vr_status = 0; + hdr->vr_size = dlen; + hdr->vr_reserved = 0; + + /* + * -- Send header -- + * Build an MDL for the header so WSK always sees valid MdlAddress. + */ + mdl = IoAllocateMdl(hdr, (ULONG)sizeof(*hdr), FALSE, FALSE, NULL); + if (!mdl) return (SET_ERROR(ENOMEM)); + MmBuildMdlForNonPagedPool(mdl); + + wb.Mdl = mdl; + wb.Offset = 0; + wb.Length = sizeof(*hdr); + + KeInitializeEvent(&ev, SynchronizationEvent, FALSE); + irp = IoAllocateIrp(1, FALSE); + if (!irp) { IoFreeMdl(mdl); return (SET_ERROR(ENOMEM)); } + IoSetCompletionRoutine(irp, wsk_sync_comp, &ev, TRUE, TRUE, TRUE); + irp->MdlAddress = mdl; + + st = wsk_sync(&ev, irp, + d->WskSend(sock, &wb, WSK_FLAG_NODELAY, irp)); + IoFreeMdl(mdl); + IoFreeIrp(irp); + if (!NT_SUCCESS(st)) { + dprintf("vdev_remote: send hdr err 0x%lx\n", + (unsigned long)st); + return (SET_ERROR(EIO)); + } + + /* -- If writing, send data payload -- */ + if (cmd == VDEV_REMOTE_CMD_WRITE && data && dlen > 0) { + mdl = IoAllocateMdl(data, dlen, FALSE, FALSE, NULL); + if (!mdl) return (SET_ERROR(ENOMEM)); + MmBuildMdlForNonPagedPool(mdl); + + wb.Mdl = mdl; + wb.Offset = 0; + wb.Length = dlen; + + KeInitializeEvent(&ev, SynchronizationEvent, FALSE); + irp = IoAllocateIrp(1, FALSE); + if (!irp) { IoFreeMdl(mdl); return (SET_ERROR(ENOMEM)); } + IoSetCompletionRoutine(irp, wsk_sync_comp, &ev, + TRUE, TRUE, TRUE); + irp->MdlAddress = mdl; + + st = wsk_sync(&ev, irp, + d->WskSend(sock, &wb, WSK_FLAG_NODELAY, irp)); + IoFreeMdl(mdl); + IoFreeIrp(irp); + if (!NT_SUCCESS(st)) { + dprintf("vdev_remote: send dat err 0x%lx\n", + (unsigned long)st); + return (SET_ERROR(EIO)); + } + } + + /* -- Receive response header -- */ + mdl = IoAllocateMdl(hdr, (ULONG)sizeof(*hdr), FALSE, FALSE, NULL); + if (!mdl) return (SET_ERROR(ENOMEM)); + MmBuildMdlForNonPagedPool(mdl); + + wb.Mdl = mdl; + wb.Offset = 0; + wb.Length = sizeof(*hdr); + + KeInitializeEvent(&ev, SynchronizationEvent, FALSE); + irp = IoAllocateIrp(1, FALSE); + if (!irp) { IoFreeMdl(mdl); return (SET_ERROR(ENOMEM)); } + IoSetCompletionRoutine(irp, wsk_sync_comp, &ev, TRUE, TRUE, TRUE); + irp->MdlAddress = mdl; + + st = wsk_sync(&ev, irp, + d->WskReceive(sock, &wb, WSK_FLAG_WAITALL, irp)); + IoFreeMdl(mdl); + IoFreeIrp(irp); + if (!NT_SUCCESS(st)) { + dprintf("vdev_remote: recv hdr err 0x%lx\n", + (unsigned long)st); + return (SET_ERROR(EIO)); + } + + if (hdr->vr_status != VDEV_REMOTE_STATUS_OK) { + dprintf("vdev_remote: srv err %u\n", hdr->vr_status); + if (hdr->vr_status == VDEV_REMOTE_STATUS_ERR_NOSPC) + return (SET_ERROR(ENOSPC)); + return (SET_ERROR(EIO)); + } + + /* -- If reading, receive data payload -- */ + if (cmd == VDEV_REMOTE_CMD_READ && data && hdr->vr_size > 0) { + mdl = IoAllocateMdl(data, hdr->vr_size, FALSE, FALSE, NULL); + if (!mdl) return (SET_ERROR(ENOMEM)); + MmBuildMdlForNonPagedPool(mdl); + + wb.Mdl = mdl; + wb.Offset = 0; + wb.Length = hdr->vr_size; + + KeInitializeEvent(&ev, SynchronizationEvent, FALSE); + irp = IoAllocateIrp(1, FALSE); + if (!irp) { IoFreeMdl(mdl); return (SET_ERROR(ENOMEM)); } + IoSetCompletionRoutine(irp, wsk_sync_comp, &ev, + TRUE, TRUE, TRUE); + irp->MdlAddress = mdl; + + st = wsk_sync(&ev, irp, + d->WskReceive(sock, &wb, WSK_FLAG_WAITALL, irp)); + IoFreeMdl(mdl); + IoFreeIrp(irp); + if (!NT_SUCCESS(st)) { + dprintf("vdev_remote: recv dat err 0x%lx\n", + (unsigned long)st); + return (SET_ERROR(EIO)); + } + } + + return (0); +} + +/* ---- Public OS interface ---- */ + +int +vdev_remote_os_connect(vdev_remote_t *vr) +{ + int err; + PWSK_SOCKET sock = NULL; + + err = vdev_remote_wsk_init(); + if (err) return (err); + + err = vdev_remote_wsk_connect(vr, &sock); + if (err) return (err); + + vr->vr_os_priv = (void *)sock; + return (0); +} + +void +vdev_remote_os_disconnect(vdev_remote_t *vr) +{ + PWSK_SOCKET sock = (PWSK_SOCKET)vr->vr_os_priv; + if (sock) { + vdev_remote_wsk_close(sock); + vr->vr_os_priv = NULL; + } +} + +int +vdev_remote_os_io(vdev_remote_t *vr, uint32_t cmd, + uint64_t offset, void *data, uint32_t size) +{ + vdev_remote_rpc_hdr_t hdr; + PWSK_SOCKET sock; + int err; + + ASSERT(vr); + mutex_enter(&vr->vr_lock); + sock = (PWSK_SOCKET)vr->vr_os_priv; + if (!sock) { mutex_exit(&vr->vr_lock); return (SET_ERROR(ENXIO)); } + + if (cmd == VDEV_REMOTE_CMD_TRIM) { + memset(&hdr, 0, sizeof(hdr)); + hdr.vr_cmd = cmd; + hdr.vr_offset = offset; + hdr.vr_size = size; + err = vdev_remote_wsk_send_recv(sock, &hdr, NULL, 0, cmd); + mutex_exit(&vr->vr_lock); + return (err); + } + + memset(&hdr, 0, sizeof(hdr)); + hdr.vr_cmd = cmd; + hdr.vr_offset = offset; + hdr.vr_size = size; + err = vdev_remote_wsk_send_recv(sock, &hdr, data, size, cmd); + mutex_exit(&vr->vr_lock); + return (err); +} + +int +vdev_remote_os_info(vdev_remote_t *vr, + uint64_t *size, uint32_t *blksz, uint32_t *pblksz) +{ + vdev_remote_rpc_hdr_t hdr; + PWSK_SOCKET sock; + int err; + + ASSERT(vr); + mutex_enter(&vr->vr_lock); + sock = (PWSK_SOCKET)vr->vr_os_priv; + if (!sock) { mutex_exit(&vr->vr_lock); return (SET_ERROR(ENXIO)); } + + memset(&hdr, 0, sizeof(hdr)); + err = vdev_remote_wsk_send_recv(sock, &hdr, NULL, 0, + VDEV_REMOTE_CMD_INFO); + mutex_exit(&vr->vr_lock); + if (err == 0) { + *size = hdr.vr_offset; + *blksz = hdr.vr_size; + *pblksz = hdr.vr_reserved; + } + return (err); +} diff --git a/module/zfs/CMakeLists.txt b/module/zfs/CMakeLists.txt index 84e5272fac42..f38e11d02d27 100644 --- a/module/zfs/CMakeLists.txt +++ b/module/zfs/CMakeLists.txt @@ -83,6 +83,7 @@ wdk_add_library(zfskern vdev_draid.c vdev_draid_rand.c vdev_file.c + vdev_remote.c vdev_indirect.c vdev_indirect_births.c vdev_indirect_mapping.c diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 28a5ce1e3710..516d91954691 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -2662,6 +2663,7 @@ spa_init(spa_mode_t mode) vdev_mirror_stat_init(); vdev_raidz_math_init(); vdev_file_init(); + vdev_remote_init(); zfs_prop_init(); chksum_init(); zpool_prop_init(); @@ -2682,6 +2684,7 @@ spa_fini(void) spa_evict_all(); vdev_file_fini(); + vdev_remote_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); chksum_fini(); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 7c4dd4eced1b..ee6be4b2c4fb 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -276,6 +276,7 @@ static vdev_ops_t *const vdev_ops_table[] = { &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, + &vdev_remote_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, diff --git a/module/zfs/vdev_remote.c b/module/zfs/vdev_remote.c new file mode 100644 index 000000000000..e747b561cefa --- /dev/null +++ b/module/zfs/vdev_remote.c @@ -0,0 +1,426 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2026, OpenZFS Remote VDEV contributors. + * + * Virtual device vector for remote block devices accessed via TCP/RPC. + * + * A remote VDEV connects to a TCP daemon (zfs_remoted) that serves a raw + * image file as a block device. All I/O operations are marshalled over + * the RPC protocol to the remote daemon. + * + * NOTE: Currently only supported on Windows. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_WIN32) && defined(_KERNEL) +/* + * On Windows kernel, the OS-specific functions are implemented in + * module/os/windows/zfs/vdev_remote_os.c using Winsock Kernel (WSK). + */ +#else +/* + * Stubs for non-kernel or non-Windows platforms. + * In user-space (libzpool), remote VDEV TCP is handled by the + * kernel driver; these stubs satisfy the linker. + */ +int +vdev_remote_os_connect(vdev_remote_t *vr) +{ + (void) vr; + return (SET_ERROR(ENOTSUP)); +} + +void +vdev_remote_os_disconnect(vdev_remote_t *vr) +{ + (void) vr; +} + +int +vdev_remote_os_io(vdev_remote_t *vr, uint32_t cmd, + uint64_t offset, void *data, uint32_t size) +{ + (void) vr; + (void) cmd; + (void) offset; + (void) data; + (void) size; + return (SET_ERROR(ENOTSUP)); +} + +int +vdev_remote_os_info(vdev_remote_t *vr, + uint64_t *size, uint32_t *blksz, uint32_t *pblksz) +{ + (void) vr; + (void) size; + (void) blksz; + (void) pblksz; + return (SET_ERROR(ENOTSUP)); +} +#endif /* _WIN32 && _KERNEL */ + +/* + * Parse a "remote://host:port" URI into host and port components. + * Returns 0 on success, non-zero on parse error. + */ +static int +vdev_remote_parse_uri(const char *path, char *host, size_t hostlen, + uint16_t *port) +{ + const char *p; + size_t len; + + if (path == NULL) + return (SET_ERROR(EINVAL)); + + /* Must start with "remote://" */ + if (strncmp(path, "remote://", 9) != 0) + return (SET_ERROR(EINVAL)); + + p = path + 9; + + /* Find the colon separating host:port */ + const char *colon = strrchr(p, ':'); + if (colon == NULL || colon == p) + return (SET_ERROR(EINVAL)); + + /* Extract host */ + len = (size_t)(colon - p); + if (len >= hostlen) + return (SET_ERROR(EINVAL)); + + memcpy(host, p, len); + host[len] = '\0'; + + /* Parse port */ + unsigned long port_val = 0; + const char *port_str = colon + 1; + while (*port_str >= '0' && *port_str <= '9') { + port_val = port_val * 10 + (unsigned long)(*port_str - '0'); + port_str++; + } + if (port_val == 0 || port_val > 65535) + return (SET_ERROR(EINVAL)); + + *port = (uint16_t)port_val; + return (0); +} + +/* + * Task queue for asynchronous I/O dispatch. + * Remote I/O is dispatched to this taskq to avoid blocking the ZIO pipeline + * on network latency. + */ +static taskq_t *vdev_remote_taskq; + +void +vdev_remote_init(void) +{ + vdev_remote_taskq = taskq_create("z_vdev_remote", MAX(boot_ncpus, 16), + minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC); + VERIFY(vdev_remote_taskq); +} + +void +vdev_remote_fini(void) +{ + if (vdev_remote_taskq != NULL) { + taskq_destroy(vdev_remote_taskq); + vdev_remote_taskq = NULL; + } +} + +static void +vdev_remote_hold(vdev_t *vd) +{ + ASSERT3P(vd->vdev_path, !=, NULL); +} + +static void +vdev_remote_rele(vdev_t *vd) +{ + ASSERT3P(vd->vdev_path, !=, NULL); +} + +static int +vdev_remote_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + vdev_remote_t *vr; + + /* + * Remote devices are always non-rotational. + */ + vd->vdev_nonrot = B_TRUE; + + /* + * TRIM is supported via the remote protocol. + */ + vd->vdev_has_trim = B_TRUE; + vd->vdev_has_securetrim = B_FALSE; + + /* We must have a pathname starting with "remote://" */ + if (vd->vdev_path == NULL || + strncmp(vd->vdev_path, "remote://", 9) != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* Reopen: just re-read device info */ + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + vr = vd->vdev_tsd; + goto skip_open; + } + + vr = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_remote_t), KM_SLEEP); + mutex_init(&vr->vr_lock, NULL, MUTEX_DEFAULT, NULL); + + /* Parse host:port from the URI */ + int error = vdev_remote_parse_uri(vd->vdev_path, + vr->vr_host, sizeof (vr->vr_host), &vr->vr_port); + if (error != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + mutex_destroy(&vr->vr_lock); + kmem_free(vr, sizeof (vdev_remote_t)); + vd->vdev_tsd = NULL; + return (error); + } + + /* Connect to the remote daemon */ + error = vdev_remote_os_connect(vr); + if (error != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + mutex_destroy(&vr->vr_lock); + kmem_free(vr, sizeof (vdev_remote_t)); + vd->vdev_tsd = NULL; + return (error); + } + + vr->vr_connected = B_TRUE; + +skip_open: + /* Query device info from remote daemon */ + error = vdev_remote_os_info(vr, &vr->vr_device_size, + &vr->vr_block_size, &vr->vr_phys_block_size); + if (error != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + if (!vd->vdev_reopening) { + vdev_remote_os_disconnect(vr); + mutex_destroy(&vr->vr_lock); + kmem_free(vr, sizeof (vdev_remote_t)); + vd->vdev_tsd = NULL; + } + return (error); + } + + *psize = vr->vr_device_size; + *max_psize = vr->vr_device_size; + + /* Set ashift based on the remote block size */ + if (vr->vr_block_size == 0) + vr->vr_block_size = DEV_BSIZE; + if (vr->vr_phys_block_size == 0) + vr->vr_phys_block_size = vr->vr_block_size; + + *logical_ashift = highbit64(vr->vr_block_size) - 1; + *physical_ashift = highbit64(vr->vr_phys_block_size) - 1; + + return (0); +} + +static void +vdev_remote_close(vdev_t *vd) +{ + vdev_remote_t *vr = vd->vdev_tsd; + + if (vd->vdev_reopening || vr == NULL) + return; + + if (vr->vr_connected) { + vdev_remote_os_disconnect(vr); + vr->vr_connected = B_FALSE; + } + + mutex_destroy(&vr->vr_lock); + kmem_free(vr, sizeof (vdev_remote_t)); + vd->vdev_tsd = NULL; + vd->vdev_delayed_close = B_FALSE; +} + +/* + * Background I/O strategy: performs the actual RPC call. + * Runs on the vdev_remote_taskq. + */ +static void +vdev_remote_io_strategy(void *arg) +{ + zio_t *zio = (zio_t *)arg; + vdev_t *vd = zio->io_vd; + vdev_remote_t *vr = vd->vdev_tsd; + void *buf = NULL; + uint32_t size = (uint32_t)zio->io_size; + + if (vr == NULL || !vr->vr_connected) { + zio->io_error = SET_ERROR(ENXIO); + zio_delay_interrupt(zio); + return; + } + + if (zio->io_type == ZIO_TYPE_READ) { + buf = abd_borrow_buf(zio->io_abd, size); + zio->io_error = vdev_remote_os_io(vr, VDEV_REMOTE_CMD_READ, + zio->io_offset, buf, size); + if (zio->io_error == 0) + abd_return_buf_copy(zio->io_abd, buf, size); + else + abd_return_buf(zio->io_abd, buf, size); + } else { + /* ZIO_TYPE_WRITE */ + buf = abd_borrow_buf_copy(zio->io_abd, size); + zio->io_error = vdev_remote_os_io(vr, VDEV_REMOTE_CMD_WRITE, + zio->io_offset, buf, size); + abd_return_buf(zio->io_abd, buf, size); + } + + zio_delay_interrupt(zio); +} + +/* + * Flush (sync) operation: tells remote daemon to fsync. + */ +static void +vdev_remote_io_flush(void *arg) +{ + zio_t *zio = (zio_t *)arg; + vdev_t *vd = zio->io_vd; + vdev_remote_t *vr = vd->vdev_tsd; + + if (vr == NULL || !vr->vr_connected) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + zio->io_error = vdev_remote_os_io(vr, VDEV_REMOTE_CMD_FLUSH, 0, NULL, 0); + zio_interrupt(zio); +} + +/* + * TRIM/UNMAP operation: tells remote daemon to deallocate. + */ +static void +vdev_remote_io_trim(void *arg) +{ + zio_t *zio = (zio_t *)arg; + vdev_t *vd = zio->io_vd; + vdev_remote_t *vr = vd->vdev_tsd; + + if (vr == NULL || !vr->vr_connected) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + zio->io_error = vdev_remote_os_io(vr, VDEV_REMOTE_CMD_TRIM, + zio->io_offset, NULL, (uint32_t)zio->io_size); + zio_interrupt(zio); +} + +/* + * I/O start entry point: dispatches the appropriate async handler. + */ +static void +vdev_remote_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + if (zio->io_type == ZIO_TYPE_FLUSH) { + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + if (zfs_nocacheflush) { + zio_interrupt(zio); + return; + } + VERIFY3U(taskq_dispatch(vdev_remote_taskq, + vdev_remote_io_flush, zio, TQ_SLEEP), !=, + TASKQID_INVALID); + return; + } + + if (zio->io_type == ZIO_TYPE_TRIM) { + ASSERT3U(zio->io_size, !=, 0); + VERIFY3U(taskq_dispatch(vdev_remote_taskq, + vdev_remote_io_trim, zio, TQ_SLEEP), !=, + TASKQID_INVALID); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); + + VERIFY3U(taskq_dispatch(vdev_remote_taskq, + vdev_remote_io_strategy, zio, TQ_SLEEP), !=, + TASKQID_INVALID); +} + +static void +vdev_remote_io_done(zio_t *zio) +{ + (void) zio; +} + +/* + * Remote VDEV operations vector. + */ +vdev_ops_t vdev_remote_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_remote_open, + .vdev_op_close = vdev_remote_close, + .vdev_op_psize_to_asize = vdev_default_asize, + .vdev_op_asize_to_psize = vdev_default_psize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_remote_io_start, + .vdev_op_io_done = vdev_remote_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_remote_hold, + .vdev_op_rele = vdev_remote_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_REMOTE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ +}; From 74576341ab78c97793cb2f2bb657eeea7da7f0dd Mon Sep 17 00:00:00 2001 From: wangy10 Date: Thu, 14 May 2026 15:29:29 +0800 Subject: [PATCH 2/5] zpool import support -d with remote:// protocol. --- lib/libzutil/os/windows/zutil_import_os.c | 234 +++++++++++++++++++++- lib/libzutil/zutil_import.c | 38 +++- 2 files changed, 265 insertions(+), 7 deletions(-) diff --git a/lib/libzutil/os/windows/zutil_import_os.c b/lib/libzutil/os/windows/zutil_import_os.c index bb6876fa39b1..069fc15ca780 100644 --- a/lib/libzutil/os/windows/zutil_import_os.c +++ b/lib/libzutil/os/windows/zutil_import_os.c @@ -80,7 +80,10 @@ #include #include #include +#include +#include #pragma comment(lib, "setupapi.lib") +#pragma comment(lib, "ws2_32.lib") /* * We allow /dev/ to be search in DEBUG build @@ -225,17 +228,230 @@ zfs_slashes(char *s) *r = '/'; } +/* + * Remote VDEV RPC header - must match sys/vdev_remote.h exactly. + */ +#pragma pack(push, 1) +typedef struct { + uint32_t cmd; + uint32_t status; + uint64_t offset; + uint32_t size; + uint32_t reserved; +} remote_rpc_hdr_t; +#pragma pack(pop) + +#define REMOTE_CMD_READ 0x52454144 /* "READ" */ +#define REMOTE_STATUS_OK 0 + +/* + * Read a range of bytes from a remote VDEV daemon via TCP. + * Parses "remote://host:port" URI, connects, reads exactly 'size' bytes + * at 'offset', stores in 'buf'. Returns 0 on success, -1 on failure. + */ +static int +pread_remote(const char *uri, void *buf, size_t size, uint64_t offset, + uint64_t *dev_size) +{ + const char *p; + char host[256]; + uint16_t port = 0; + SOCKET sock = INVALID_SOCKET; + remote_rpc_hdr_t hdr; + int ret = -1; + + /* Parse remote://host:port */ + if (strncmp(uri, "remote://", 9) != 0) + return (-1); + p = uri + 9; + const char *colon = strrchr(p, ':'); + if (!colon || colon == p) + return (-1); + size_t hostlen = (size_t)(colon - p); + if (hostlen >= sizeof(host)) + return (-1); + memcpy(host, p, hostlen); + host[hostlen] = '\0'; + port = (uint16_t)atoi(colon + 1); + if (port == 0) + return (-1); + + /* Winsock init (thread-safe if already called) */ + WSADATA wsa; + if (WSAStartup(MAKEWORD(2, 2), &wsa) != 0) + return (-1); + + sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock == INVALID_SOCKET) + goto out; + + struct sockaddr_in addr; + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_port = htons(port); + addr.sin_addr.s_addr = inet_addr(host); + if (addr.sin_addr.s_addr == INADDR_NONE) + goto out; + + if (connect(sock, (struct sockaddr *)&addr, sizeof(addr)) != 0) + goto out; + + /* Send INFO command to get device size */ + memset(&hdr, 0, sizeof(hdr)); + hdr.cmd = 0x494E464F; /* "INFO" */ + if (send(sock, (const char *)&hdr, sizeof(hdr), 0) != sizeof(hdr)) + goto out; + if (recv(sock, (char *)&hdr, sizeof(hdr), 0) != sizeof(hdr)) + goto out; + if (dev_size) + *dev_size = hdr.offset; /* server puts size in offset field */ + + /* Send READ command */ + memset(&hdr, 0, sizeof(hdr)); + hdr.cmd = REMOTE_CMD_READ; + hdr.offset = offset; + hdr.size = (uint32_t)size; + if (send(sock, (const char *)&hdr, sizeof(hdr), 0) != sizeof(hdr)) + goto out; + + /* Receive response header */ + if (recv(sock, (char *)&hdr, sizeof(hdr), 0) != sizeof(hdr)) + goto out; + if (hdr.status != REMOTE_STATUS_OK) + goto out; + + /* Receive data */ + size_t total = 0; + char *dst = (char *)buf; + while (total < hdr.size) { + int n = recv(sock, dst + total, (int)(hdr.size - total), 0); + if (n <= 0) + goto out; + total += n; + } + + ret = 0; +out: + if (sock != INVALID_SOCKET) + closesocket(sock); + return (ret); +} + +/* + * Read ZFS labels from a remote VDEV via TCP. + * Mirrors zpool_read_label_win() but uses the daemon RPC protocol. + */ +static int +zpool_read_label_remote(const char *uri, + nvlist_t **config, int *num_labels) +{ + vdev_label_t *label; + nvlist_t *expected_config = NULL; + uint64_t expected_guid = 0, size; + int l, count = 0; + uint64_t devsize = 0; + + *config = NULL; + + label = (vdev_label_t *)malloc(sizeof(vdev_label_t)); + if (label == NULL) + return (-1); + + /* + * First read just the first label to determine device size. + * The label stores its own position, and we read all 4 labels. + * For remote vdevs we can't compute label offsets without size, + * so we query the daemon for total device size first. + */ + if (pread_remote(uri, label, sizeof(vdev_label_t), + 0, &devsize) != 0) { + free(label); + return (-1); + } + + size = P2ALIGN_TYPED(devsize, sizeof(vdev_label_t), uint64_t); + + for (l = 0; l < VDEV_LABELS; l++) { + uint64_t state, guid, txg; + uint64_t lo; + + if (l < VDEV_LABELS / 2) + lo = (uint64_t)l * sizeof(vdev_label_t); + else + lo = size - (uint64_t)(VDEV_LABELS - l) * + sizeof(vdev_label_t); + + if (pread_remote(uri, label, sizeof(vdev_label_t), + lo, NULL) != 0) + continue; + + if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, + sizeof(label->vl_vdev_phys.vp_nvlist), config, 0) != 0) + continue; + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID, + &guid) != 0 || guid == 0) { + nvlist_free(*config); + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state > POOL_STATE_L2CACHE) { + nvlist_free(*config); + continue; + } + + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0)) { + nvlist_free(*config); + continue; + } + + if (expected_guid) { + if (expected_guid == guid) + count++; + nvlist_free(*config); + } else { + expected_config = *config; + expected_guid = guid; + count++; + } + } + + if (num_labels != NULL) + *num_labels = count; + + free(label); + *config = expected_config; + return (0); +} + void zpool_open_func(void *arg) { rdsk_node_t *rn = arg; libpc_handle_t *hdl = rn->rn_hdl; - struct stat64 statbuf; nvlist_t *config; - char *bname, *dupname; uint64_t vdev_guid = 0; int error; int num_labels = 0; + + /* + * Handle remote:// vdevs via TCP to the daemon. + * No local file handle needed. + */ + if (strncmp(rn->rn_name, "remote://", 9) == 0) { + if (zpool_read_label_remote(rn->rn_name, + &config, &num_labels) != 0) { + (void) fprintf(stderr, + "error: cannot read labels from " + "remote vdev '%s'\n", rn->rn_name); + return; + } + goto remote_done; + } + HANDLE h; uint64_t offset = 0; uint64_t len = 0; @@ -316,6 +532,9 @@ zpool_open_func(void *arg) return; } + CloseHandle(h); + +remote_done: /* * Check that the vdev is for the expected guid. Additional entries * are speculatively added based on the paths stored in the labels. @@ -323,13 +542,10 @@ zpool_open_func(void *arg) */ error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) { - CloseHandle(h); nvlist_free(config); return; } - CloseHandle(h); - rn->rn_config = config; rn->rn_num_labels = num_labels; @@ -1038,6 +1254,14 @@ update_vdev_config_dev_strs(nvlist_t *nv) if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) return; + + /* + * Remote vdevs: path is a remote:// URI — no local device + * to open, no path rewriting needed. Leave as-is. + */ + if (strncmp(path, "remote://", 9) == 0) + return; + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); fprintf(stderr, "working on dev '%s'\n", path); fflush(stderr); diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index 56ac11e2029c..e318e5429dd4 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1396,6 +1396,34 @@ zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock, for (i = 0; i < dirs; i++) { struct stat sbuf; + /* + * Handle remote:// URIs directly — stat() won't work + * since these are network endpoints, not local paths. + */ + if (strncmp(dir[i], "remote://", 9) == 0) { + avl_index_t where; + rdsk_node_t *slice; + + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zutil_strdup(hdl, dir[i]); + slice->rn_vdev_guid = 0; + slice->rn_lock = lock; + slice->rn_avl = cache; + slice->rn_hdl = hdl; + slice->rn_order = i + IMPORT_ORDER_SCAN_OFFSET; + slice->rn_labelpaths = B_FALSE; + + pthread_mutex_lock(lock); + if (avl_find(cache, slice, &where)) { + free(slice->rn_name); + free(slice); + } else { + avl_insert(cache, slice, where); + } + pthread_mutex_unlock(lock); + continue; + } + if (stat(dir[i], &sbuf) != 0) { error = errno; if (error == ENOENT) @@ -1537,9 +1565,15 @@ zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg, * Under zdb, this step isn't required and * would prevent a zdb -e of active pools with * no cachefile. + * + * Remote vdevs skip local open() — they're + * network endpoints, not local files. */ - fd = open(slice->rn_name, - O_RDONLY | O_EXCL | O_CLOEXEC); + if (strncmp(slice->rn_name, "remote://", 9) == 0) + fd = 0; + else + fd = open(slice->rn_name, + O_RDONLY | O_EXCL | O_CLOEXEC); if (fd >= 0 || iarg->can_be_active) { if (fd >= 0) close(fd); From b237c39709c21e9ec8570528f6b94336b9359567 Mon Sep 17 00:00:00 2001 From: wangy10 Date: Thu, 14 May 2026 17:14:42 +0800 Subject: [PATCH 3/5] added support for remote disk, and refactoring. --- cmd/zfs_remoted/CMakeLists.txt | 25 +- cmd/zfs_remoted/backend_disk.c | 181 ++++++++++ cmd/zfs_remoted/backend_file.c | 96 +++++ cmd/zfs_remoted/zfs_remoted.c | 643 ++++++++++----------------------- cmd/zfs_remoted/zfs_remoted.h | 83 +++++ 5 files changed, 571 insertions(+), 457 deletions(-) create mode 100644 cmd/zfs_remoted/backend_disk.c create mode 100644 cmd/zfs_remoted/backend_file.c create mode 100644 cmd/zfs_remoted/zfs_remoted.h diff --git a/cmd/zfs_remoted/CMakeLists.txt b/cmd/zfs_remoted/CMakeLists.txt index 26f30e52c0a9..de020b899e89 100644 --- a/cmd/zfs_remoted/CMakeLists.txt +++ b/cmd/zfs_remoted/CMakeLists.txt @@ -1,17 +1,32 @@ # CMakeLists.txt for zfs_remoted - ZFS Remote Block Device Daemon -# Windows-only TCP daemon that serves a raw image file as a block device. # # This is a standalone Windows user-mode application. It does NOT link to # any ZFS libraries and does NOT include ZFS kernel headers. +# +# Source files: +# zfs_remoted.c - main: arg parsing, network, RPC dispatch +# backend_file.c - file backend (serves a raw image file) +# backend_disk.c - disk backend (serves a physical Windows disk) add_executable(zfs_remoted zfs_remoted.c + backend_file.c + backend_disk.c ) -# Clear inherited include directories from parent scope to avoid pulling -# in ZFS kernel headers (which cause include_next failures with MSVC). -set_target_properties(zfs_remoted PROPERTIES - INCLUDE_DIRECTORIES "" +# The parent cmake scope adds ZFS kernel headers globally via +# include_directories(), which shadows MSVC's system (ZFS's +# wrapper uses #include_next, unsupported by MSVC). We reset this target's +# includes to only the Windows SDK user-mode paths. +set_target_properties(zfs_remoted PROPERTIES INCLUDE_DIRECTORIES "") + +# Re-add Windows SDK UM paths so winsock2.h, winioctl.h etc. are found. +# WDK_VERSION and WDK_ROOT are already set by the parent build system. +target_include_directories(zfs_remoted PRIVATE + "${WDK_ROOT}/Include/${WDK_VERSION}/um" + "${WDK_ROOT}/Include/${WDK_VERSION}/shared" + "${WDK_ROOT}/Include/${WDK_VERSION}/ucrt" + "${CMAKE_CURRENT_SOURCE_DIR}" # for "zfs_remoted.h" ) install(TARGETS zfs_remoted RUNTIME DESTINATION ${CMAKE_INSTALL_SBINDIR}) diff --git a/cmd/zfs_remoted/backend_disk.c b/cmd/zfs_remoted/backend_disk.c new file mode 100644 index 000000000000..160909eeb4a5 --- /dev/null +++ b/cmd/zfs_remoted/backend_disk.c @@ -0,0 +1,181 @@ +/* + * CDDL HEADER START ... (see LICENSE) + * CDDL HEADER END + */ +/* + * Copyright (c) 2026, OpenZFS Remote VDEV contributors. + * + * Disk backend -- serves a physical disk (e.g. \\.\PhysicalDrive0). + * + * Disk specifier resolution: + * "0" -> \\.\PhysicalDrive0 + * "physicaldrive2" -> \\.\PhysicalDrive2 (case-insensitive) + * "\\.\PhysicalDrive3" -> literal NT path + */ + +#include "zfs_remoted.h" +#include +#include +#include +#include + +typedef struct { + HANDLE dh; +} disk_priv_t; + +static int +resolve_path(const char *spec, char *out, size_t outlen) +{ + /* Already a full NT path? */ + if (spec[0] == '\\' && spec[1] == '\\' && spec[2] == '.') { + strncpy(out, spec, outlen - 1); + out[outlen - 1] = '\0'; + return 0; + } + + /* "physicaldriveN" (case-insensitive) */ + if (_strnicmp(spec, "physicaldrive", 13) == 0) { + snprintf(out, outlen, "\\\\.\\%s", spec); + return 0; + } + + /* Plain number "N" -> PhysicalDriveN */ + int all_digits = 1; + for (const char *p = spec; *p; p++) { + if (!isdigit((unsigned char)*p)) { all_digits = 0; break; } + } + if (all_digits && spec[0] != '\0') { + snprintf(out, outlen, "\\\\.\\PhysicalDrive%s", spec); + return 0; + } + + return -1; +} + +static int +disk_open(block_backend_t *bb, const char *spec) +{ + char path[256]; + HANDLE h; + + if (resolve_path(spec, path, sizeof(path)) != 0) { + fprintf(stderr, "disk: cannot resolve '%s'\n", spec); + return -1; + } + + h = CreateFileA(path, + GENERIC_READ | GENERIC_WRITE, + 0, /* exclusive */ + NULL, OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, + NULL); + + if (h == INVALID_HANDLE_VALUE) { + fprintf(stderr, "disk: CreateFile(%s) failed: %lu\n", + path, GetLastError()); + return -1; + } + + /* --- disk size --- */ + GET_LENGTH_INFORMATION gli = { 0 }; + DWORD junk; + if (DeviceIoControl(h, IOCTL_DISK_GET_LENGTH_INFO, + NULL, 0, &gli, sizeof(gli), &junk, NULL)) { + bb->bb_dev_size = gli.Length.QuadPart; + } else { + DISK_GEOMETRY_EX dg = { 0 }; + if (DeviceIoControl(h, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, + NULL, 0, &dg, sizeof(dg), &junk, NULL)) { + bb->bb_dev_size = dg.DiskSize.QuadPart; + } else { + bb->bb_dev_size = 0; + } + } + + /* --- sector sizes --- */ + STORAGE_PROPERTY_QUERY spq = { 0 }; + spq.PropertyId = StorageAccessAlignmentProperty; + spq.QueryType = PropertyStandardQuery; + + STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR sa = { 0 }; + if (DeviceIoControl(h, IOCTL_STORAGE_QUERY_PROPERTY, + &spq, sizeof(spq), &sa, sizeof(sa), &junk, NULL)) { + bb->bb_lbasize = sa.BytesPerLogicalSector; + bb->bb_pbasize = sa.BytesPerPhysicalSector; + } else { + bb->bb_lbasize = 512; + bb->bb_pbasize = 4096; + } + + disk_priv_t *dp = (disk_priv_t *)calloc(1, sizeof(*dp)); + if (!dp) { CloseHandle(h); return -1; } + dp->dh = h; + bb->bb_priv = dp; + + fprintf(stderr, "disk: opened %s (%llu MiB, %u/%u byte sectors)\n", + path, + (unsigned long long)(bb->bb_dev_size / (1024 * 1024)), + bb->bb_lbasize, bb->bb_pbasize); + return 0; +} + +static void +disk_close(block_backend_t *bb) +{ + disk_priv_t *dp = (disk_priv_t *)bb->bb_priv; + if (dp) { + if (dp->dh != INVALID_HANDLE_VALUE) CloseHandle(dp->dh); + free(dp); + bb->bb_priv = NULL; + } +} + +/* Helper: overlapped I/O with GetOverlappedResult wait */ +static int +disk_io(HANDLE dh, void *buf, uint32_t size, uint64_t offset, + int is_write) +{ + OVERLAPPED ov = { 0 }; + DWORD bytes = 0; + + ov.Offset = (DWORD)(offset & 0xFFFFFFFF); + ov.OffsetHigh = (DWORD)(offset >> 32); + + BOOL ok = is_write + ? WriteFile(dh, buf, size, &bytes, &ov) + : ReadFile(dh, buf, size, &bytes, &ov); + + if (!ok) { + if (GetLastError() != ERROR_IO_PENDING || + !GetOverlappedResult(dh, &ov, &bytes, TRUE)) + return -1; + } + return (bytes == size) ? 0 : -1; +} + +static int +disk_read(block_backend_t *bb, void *buf, uint32_t size, uint64_t offset) +{ + return disk_io(((disk_priv_t *)bb->bb_priv)->dh, + buf, size, offset, 0); +} + +static int +disk_write(block_backend_t *bb, const void *buf, uint32_t size, + uint64_t offset) +{ + return disk_io(((disk_priv_t *)bb->bb_priv)->dh, + (void *)buf, size, offset, 1); +} + +static int +disk_flush(block_backend_t *bb) +{ + return FlushFileBuffers(((disk_priv_t *)bb->bb_priv)->dh) ? 0 : -1; +} + +const block_backend_t disk_backend = { + "disk", + disk_open, disk_close, disk_read, disk_write, disk_flush, + 0, 512, 4096, NULL +}; diff --git a/cmd/zfs_remoted/backend_file.c b/cmd/zfs_remoted/backend_file.c new file mode 100644 index 000000000000..7de53a35baf9 --- /dev/null +++ b/cmd/zfs_remoted/backend_file.c @@ -0,0 +1,96 @@ +/* + * CDDL HEADER START ... (see LICENSE) + * CDDL HEADER END + */ +/* + * Copyright (c) 2026, OpenZFS Remote VDEV contributors. + * + * File backend -- serves a raw image file. + */ +#include "zfs_remoted.h" +#include +#include + +typedef struct { + HANDLE fh; +} file_priv_t; + +static int +file_open(block_backend_t *bb, const char *path) +{ + fprintf(stderr, "file_open: '%s'\n", path); + fflush(stderr); + + file_priv_t *fp = (file_priv_t *)calloc(1, sizeof(*fp)); + if (!fp) { fprintf(stderr, "file_open: calloc failed\n"); return -1; } + + fp->fh = CreateFileA( + path, + GENERIC_READ | GENERIC_WRITE, + 0, /* exclusive */ + NULL, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, + NULL); + + if (fp->fh == INVALID_HANDLE_VALUE) { free(fp); return -1; } + + LARGE_INTEGER sz; + bb->bb_dev_size = (GetFileSizeEx(fp->fh, &sz)) + ? (uint64_t)sz.QuadPart : 0; + bb->bb_lbasize = 512; + bb->bb_pbasize = 4096; + bb->bb_priv = fp; + return 0; +} + +static void +file_close(block_backend_t *bb) +{ + file_priv_t *fp = (file_priv_t *)bb->bb_priv; + if (fp) { + if (fp->fh != INVALID_HANDLE_VALUE) CloseHandle(fp->fh); + free(fp); + bb->bb_priv = NULL; + } +} + +static int +file_read(block_backend_t *bb, void *buf, uint32_t size, uint64_t offset) +{ + file_priv_t *fp = (file_priv_t *)bb->bb_priv; + LARGE_INTEGER li; + DWORD bytes = 0; + + li.QuadPart = (LONGLONG)offset; + if (!SetFilePointerEx(fp->fh, li, NULL, FILE_BEGIN)) return -1; + if (!ReadFile(fp->fh, buf, size, &bytes, NULL)) return -1; + return (bytes == size) ? 0 : -1; +} + +static int +file_write(block_backend_t *bb, const void *buf, uint32_t size, + uint64_t offset) +{ + file_priv_t *fp = (file_priv_t *)bb->bb_priv; + LARGE_INTEGER li; + DWORD bytes = 0; + + li.QuadPart = (LONGLONG)offset; + if (!SetFilePointerEx(fp->fh, li, NULL, FILE_BEGIN)) return -1; + if (!WriteFile(fp->fh, buf, size, &bytes, NULL)) return -1; + return (bytes == size) ? 0 : -1; +} + +static int +file_flush(block_backend_t *bb) +{ + file_priv_t *fp = (file_priv_t *)bb->bb_priv; + return FlushFileBuffers(fp->fh) ? 0 : -1; +} + +const block_backend_t file_backend = { + "file", + file_open, file_close, file_read, file_write, file_flush, + 0, 512, 4096, NULL +}; diff --git a/cmd/zfs_remoted/zfs_remoted.c b/cmd/zfs_remoted/zfs_remoted.c index 983f48f5eb57..0b5f135ac8b9 100644 --- a/cmd/zfs_remoted/zfs_remoted.c +++ b/cmd/zfs_remoted/zfs_remoted.c @@ -1,15 +1,5 @@ /* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * + * CDDL HEADER START ... (see LICENSE) * CDDL HEADER END */ /* @@ -17,573 +7,322 @@ * * zfs_remoted - ZFS Remote Block Device Daemon for Windows * - * This daemon listens on a TCP port and serves a raw image file as a - * block device. It implements the remote VDEV RPC protocol used by the - * OpenZFS vdev_remote kernel module. - * - * Usage: zfs_remoted.exe -f -p + * Serves a block device over TCP using the remote VDEV RPC protocol. * - * Protocol: - * Header: [4 bytes cmd][4 bytes status][8 bytes offset][4 bytes size][4 bytes reserved] - * Commands: - * READ (0x52454144): server reads from img and sends [header + data] - * WRITE (0x57524954): client sends [header + data], server writes to img - * FLUSH (0x464C5553): server calls _commit / FlushFileBuffers - * TRIM (0x5452494D): server deallocates (not implemented for file, no-op) - * INFO (0x494E464F): server returns device size / block info + * Usage: + * zfs_remoted -f -p (file backend) + * zfs_remoted -d -p (disk backend) */ -#ifndef _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_WARNINGS -#endif -#define WIN32_LEAN_AND_MEAN - -#include -#include -#include +#include "zfs_remoted.h" #include #include #include -#include #pragma comment(lib, "Ws2_32.lib") -/* - * RPC header - must match vdev_remote.h exactly. - */ -#pragma pack(push, 1) -typedef struct rpc_hdr { - uint32_t cmd; - uint32_t status; - uint64_t offset; - uint32_t size; - uint32_t reserved; -} rpc_hdr_t; -#pragma pack(pop) - -/* Command opcodes - must match vdev_remote.h */ -#define CMD_READ 0x52454144 -#define CMD_WRITE 0x57524954 -#define CMD_FLUSH 0x464C5553 -#define CMD_TRIM 0x5452494D -#define CMD_INFO 0x494E464F - -/* Status codes */ -#define STATUS_OK 0 -#define STATUS_ERR_IO 1 -#define STATUS_ERR_INVAL 2 -#define STATUS_ERR_NOSPC 3 - -static volatile LONG g_running = 1; -static HANDLE g_file_handle = INVALID_HANDLE_VALUE; -static CRITICAL_SECTION g_file_lock; - -static void -print_usage(const char *prog) -{ - fprintf(stderr, - "Usage: %s -f -p \n" - " -f Path to the raw image file to serve\n" - " -p TCP port to listen on\n" - " -h Show this help\n", - prog); -} +/* ---- daemon state ---- */ +static volatile LONG g_running = 1; +static block_backend_t g_backend; +static CRITICAL_SECTION g_backend_lock; -static BOOL -ctrl_handler(DWORD ctrl_type) -{ - (void)ctrl_type; - InterlockedExchange(&g_running, 0); - return TRUE; -} - -/* - * Receive exactly 'len' bytes from socket. - * Returns 0 on success, -1 on error. - */ -static int +/* ---- network helpers ---- */ +int recv_all(SOCKET s, void *buf, int len) { char *p = (char *)buf; int remaining = len; - while (remaining > 0) { int n = recv(s, p, remaining, 0); - if (n <= 0) { - if (n == 0) - fprintf(stderr, "Connection closed by peer\n"); - else - fprintf(stderr, "recv error: %d\n", - WSAGetLastError()); - return (-1); - } - p += n; - remaining -= n; + if (n <= 0) return -1; + p += n; remaining -= n; } - return (0); + return 0; } -/* - * Send exactly 'len' bytes to socket. - * Returns 0 on success, -1 on error. - */ -static int +int send_all(SOCKET s, const void *buf, int len) { const char *p = (const char *)buf; int remaining = len; - while (remaining > 0) { int n = send(s, p, remaining, 0); - if (n <= 0) { - fprintf(stderr, "send error: %d\n", WSAGetLastError()); - return (-1); - } - p += n; - remaining -= n; + if (n <= 0) return -1; + p += n; remaining -= n; } - return (0); + return 0; } -/* - * Handle a READ command. - * 1. Read data from the image file at the given offset - * 2. Send response header - * 3. Send the data - */ -static void -handle_read(SOCKET client_sock, rpc_hdr_t *hdr) +/* ---- RPC command handlers ---- */ + +static void handle_read(SOCKET s, rpc_hdr_t *hdr) { rpc_hdr_t resp; - uint8_t *buf = NULL; if (hdr->size == 0 || hdr->size > (256 * 1024 * 1024)) { - /* Sanity: max 256MB per read */ - resp.cmd = CMD_READ; - resp.status = STATUS_ERR_INVAL; - resp.offset = hdr->offset; - resp.size = 0; - resp.reserved = 0; - send_all(client_sock, &resp, sizeof (resp)); - return; - } - - buf = (uint8_t *)malloc(hdr->size); - if (buf == NULL) { - resp.cmd = CMD_READ; - resp.status = STATUS_ERR_IO; - resp.offset = hdr->offset; - resp.size = 0; - resp.reserved = 0; - send_all(client_sock, &resp, sizeof (resp)); - return; - } - - EnterCriticalSection(&g_file_lock); - - LARGE_INTEGER liOffset; - liOffset.QuadPart = (LONGLONG)hdr->offset; - - /* Seek to offset */ - if (!SetFilePointerEx(g_file_handle, liOffset, NULL, FILE_BEGIN)) { - LeaveCriticalSection(&g_file_lock); - fprintf(stderr, "ReadFile seek to %llu failed: %lu\n", - (unsigned long long)hdr->offset, GetLastError()); - resp.cmd = CMD_READ; - resp.status = STATUS_ERR_IO; - resp.offset = hdr->offset; - resp.size = 0; - resp.reserved = 0; - send_all(client_sock, &resp, sizeof (resp)); - free(buf); + resp.cmd = CMD_READ; resp.status = STATUS_ERR_INVAL; + resp.offset = hdr->offset; resp.size = 0; resp.reserved = 0; + send_all(s, &resp, sizeof(resp)); return; } - DWORD bytes_read = 0; - BOOL ok = ReadFile(g_file_handle, buf, hdr->size, &bytes_read, NULL); - - LeaveCriticalSection(&g_file_lock); - - if (!ok || bytes_read != hdr->size) { - fprintf(stderr, "ReadFile failed at offset %llu size %u: %lu\n", - (unsigned long long)hdr->offset, hdr->size, - ok ? 0 : GetLastError()); - resp.cmd = CMD_READ; - resp.status = STATUS_ERR_IO; - resp.offset = hdr->offset; - resp.size = 0; - resp.reserved = 0; - send_all(client_sock, &resp, sizeof (resp)); - free(buf); + uint8_t *buf = (uint8_t *)malloc(hdr->size); + if (!buf) { + resp.cmd = CMD_READ; resp.status = STATUS_ERR_IO; + resp.offset = hdr->offset; resp.size = 0; resp.reserved = 0; + send_all(s, &resp, sizeof(resp)); return; } - /* Send success response header */ - resp.cmd = CMD_READ; - resp.status = STATUS_OK; - resp.offset = hdr->offset; - resp.size = bytes_read; - resp.reserved = 0; + EnterCriticalSection(&g_backend_lock); + int err = g_backend.bb_read(&g_backend, buf, hdr->size, hdr->offset); + LeaveCriticalSection(&g_backend_lock); - if (send_all(client_sock, &resp, sizeof (resp)) != 0) { + if (err) { + resp.cmd = CMD_READ; resp.status = STATUS_ERR_IO; + resp.offset = hdr->offset; resp.size = 0; resp.reserved = 0; + send_all(s, &resp, sizeof(resp)); free(buf); return; } - /* Send the data */ - send_all(client_sock, buf, bytes_read); + resp.cmd = CMD_READ; resp.status = STATUS_OK; + resp.offset = hdr->offset; resp.size = hdr->size; resp.reserved = 0; + if (send_all(s, &resp, sizeof(resp)) == 0) + send_all(s, buf, hdr->size); free(buf); } -/* - * Handle a WRITE command. - * 1. Receive data from socket - * 2. Write data to the image file - * 3. Send response header - */ -static void -handle_write(SOCKET client_sock, rpc_hdr_t *hdr) +static void handle_write(SOCKET s, rpc_hdr_t *hdr) { rpc_hdr_t resp; - uint8_t *buf = NULL; if (hdr->size == 0 || hdr->size > (256 * 1024 * 1024)) { - resp.cmd = CMD_WRITE; - resp.status = STATUS_ERR_INVAL; - resp.offset = hdr->offset; - resp.size = 0; - resp.reserved = 0; - send_all(client_sock, &resp, sizeof (resp)); + resp.cmd = CMD_WRITE; resp.status = STATUS_ERR_INVAL; + resp.offset = hdr->offset; resp.size = 0; resp.reserved = 0; + send_all(s, &resp, sizeof(resp)); return; } - buf = (uint8_t *)malloc(hdr->size); - if (buf == NULL) { - resp.cmd = CMD_WRITE; - resp.status = STATUS_ERR_IO; - resp.offset = hdr->offset; - resp.size = 0; - resp.reserved = 0; - send_all(client_sock, &resp, sizeof (resp)); + uint8_t *buf = (uint8_t *)malloc(hdr->size); + if (!buf) { + resp.cmd = CMD_WRITE; resp.status = STATUS_ERR_IO; + resp.offset = hdr->offset; resp.size = 0; resp.reserved = 0; + send_all(s, &resp, sizeof(resp)); return; } + if (recv_all(s, buf, hdr->size) != 0) { free(buf); return; } - /* Receive the write data */ - if (recv_all(client_sock, buf, hdr->size) != 0) { - free(buf); - return; - } - - EnterCriticalSection(&g_file_lock); - - LARGE_INTEGER liOffset; - liOffset.QuadPart = (LONGLONG)hdr->offset; - - /* Seek to offset */ - if (!SetFilePointerEx(g_file_handle, liOffset, NULL, FILE_BEGIN)) { - LeaveCriticalSection(&g_file_lock); - free(buf); - fprintf(stderr, "WriteFile seek to %llu failed: %lu\n", - (unsigned long long)hdr->offset, GetLastError()); - resp.cmd = CMD_WRITE; - resp.status = STATUS_ERR_IO; - resp.offset = hdr->offset; - resp.size = 0; - resp.reserved = 0; - send_all(client_sock, &resp, sizeof (resp)); - return; - } - - DWORD bytes_written = 0; - BOOL ok = WriteFile(g_file_handle, buf, hdr->size, &bytes_written, NULL); - - LeaveCriticalSection(&g_file_lock); + EnterCriticalSection(&g_backend_lock); + int err = g_backend.bb_write(&g_backend, buf, hdr->size, + hdr->offset); + LeaveCriticalSection(&g_backend_lock); free(buf); - if (!ok || bytes_written != hdr->size) { - fprintf(stderr, "WriteFile failed at offset %llu size %u: %lu\n", - (unsigned long long)hdr->offset, hdr->size, - ok ? 0 : GetLastError()); - resp.cmd = CMD_WRITE; - resp.status = STATUS_ERR_IO; - resp.offset = hdr->offset; - resp.size = 0; - resp.reserved = 0; - send_all(client_sock, &resp, sizeof (resp)); - return; - } - resp.cmd = CMD_WRITE; - resp.status = STATUS_OK; + resp.status = err ? STATUS_ERR_IO : STATUS_OK; resp.offset = hdr->offset; - resp.size = bytes_written; + resp.size = err ? 0 : hdr->size; resp.reserved = 0; - send_all(client_sock, &resp, sizeof (resp)); + send_all(s, &resp, sizeof(resp)); } -/* - * Handle a FLUSH command - flush file buffers to disk. - */ -static void -handle_flush(SOCKET client_sock, rpc_hdr_t *hdr) +static void handle_flush(SOCKET s) { rpc_hdr_t resp; - - EnterCriticalSection(&g_file_lock); - BOOL ok = FlushFileBuffers(g_file_handle); - LeaveCriticalSection(&g_file_lock); + EnterCriticalSection(&g_backend_lock); + int err = g_backend.bb_flush(&g_backend); + LeaveCriticalSection(&g_backend_lock); resp.cmd = CMD_FLUSH; - resp.status = ok ? STATUS_OK : STATUS_ERR_IO; - resp.offset = 0; - resp.size = 0; - resp.reserved = 0; - send_all(client_sock, &resp, sizeof (resp)); + resp.status = err ? STATUS_ERR_IO : STATUS_OK; + resp.offset = 0; resp.size = 0; resp.reserved = 0; + send_all(s, &resp, sizeof(resp)); } -/* - * Handle TRIM - deallocate region. - * For file-backed storage on Windows, this is a no-op - * (could use FSCTL_SET_ZERO_DATA but that zeroes, not trims). - */ -static void -handle_trim(SOCKET client_sock, rpc_hdr_t *hdr) +static void handle_trim(SOCKET s, rpc_hdr_t *hdr) { rpc_hdr_t resp; - - /* For files, TRIM is a no-op. Return success. */ - resp.cmd = CMD_TRIM; - resp.status = STATUS_OK; - resp.offset = hdr->offset; - resp.size = hdr->size; - resp.reserved = 0; - send_all(client_sock, &resp, sizeof (resp)); - - fprintf(stderr, "TRIM range [%llu, %llu] (no-op for file)\n", - (unsigned long long)hdr->offset, - (unsigned long long)(hdr->offset + hdr->size)); + resp.cmd = CMD_TRIM; resp.status = STATUS_OK; + resp.offset = hdr->offset; resp.size = hdr->size; resp.reserved = 0; + send_all(s, &resp, sizeof(resp)); } -/* - * Handle INFO - return device information. - * Response: offset=file_size, size=logical_block, reserved=phys_block - */ -static void -handle_info(SOCKET client_sock) +static void handle_info(SOCKET s) { rpc_hdr_t resp; - - EnterCriticalSection(&g_file_lock); - - LARGE_INTEGER file_size; - BOOL ok = GetFileSizeEx(g_file_handle, &file_size); - - LeaveCriticalSection(&g_file_lock); - - if (!ok) { - resp.cmd = CMD_INFO; - resp.status = STATUS_ERR_IO; - resp.offset = 0; - resp.size = 0; - resp.reserved = 0; - } else { - resp.cmd = CMD_INFO; - resp.status = STATUS_OK; - resp.offset = (uint64_t)file_size.QuadPart; /* device size */ - resp.size = 512; /* logical block size */ - resp.reserved = 4096; /* physical block size */ - } - - send_all(client_sock, &resp, sizeof (resp)); + resp.cmd = CMD_INFO; resp.status = STATUS_OK; + resp.offset = g_backend.bb_dev_size; + resp.size = g_backend.bb_lbasize; + resp.reserved = g_backend.bb_pbasize; + send_all(s, &resp, sizeof(resp)); } -/* - * Handle a single client connection. - */ +/* ---- client connection loop ---- */ static void handle_client(SOCKET client_sock) { rpc_hdr_t hdr; - - fprintf(stderr, "Client connected\n"); + fprintf(stderr, "Client connected (%s backend)\n", g_backend.bb_label); while (g_running) { - /* Receive the RPC header */ - if (recv_all(client_sock, &hdr, sizeof (hdr)) != 0) - break; + if (recv_all(client_sock, &hdr, sizeof(hdr)) != 0) break; switch (hdr.cmd) { - case CMD_READ: - handle_read(client_sock, &hdr); - break; - - case CMD_WRITE: - handle_write(client_sock, &hdr); - break; - - case CMD_FLUSH: - handle_flush(client_sock, &hdr); - break; - - case CMD_TRIM: - handle_trim(client_sock, &hdr); - break; - - case CMD_INFO: - handle_info(client_sock); - break; - + case CMD_READ: handle_read(client_sock, &hdr); break; + case CMD_WRITE: handle_write(client_sock, &hdr); break; + case CMD_FLUSH: handle_flush(client_sock); break; + case CMD_TRIM: handle_trim(client_sock, &hdr); break; + case CMD_INFO: handle_info(client_sock); break; default: - fprintf(stderr, "Unknown command: 0x%08X\n", hdr.cmd); + fprintf(stderr, "Unknown cmd 0x%08X\n", hdr.cmd); hdr.status = STATUS_ERR_INVAL; - send_all(client_sock, &hdr, sizeof (hdr)); + send_all(client_sock, &hdr, sizeof(hdr)); break; } } - closesocket(client_sock); fprintf(stderr, "Client disconnected\n"); } -/* - * Main: parse arguments, open image file, listen for connections. - */ +/* ---- usage / ctrl-c ---- */ +static void +print_usage(const char *prog) +{ + fprintf(stderr, + "Usage:\n" + " %s -f -p serve a raw image file\n" + " %s -d -p serve a physical disk\n" + "\nDisk examples: -d 0 -d physicaldrive1 -d \\\\.\\PhysicalDrive2\n", + prog, prog); +} + +static BOOL WINAPI +ctrl_handler(DWORD ctrl_type) +{ + (void)ctrl_type; + InterlockedExchange(&g_running, 0); + return TRUE; +} + +/* ---- main ---- */ int main(int argc, char *argv[]) { - const char *image_file = NULL; + const char *file_spec = NULL; + const char *disk_spec = NULL; uint16_t port = 0; - int opt; - - /* Parse arguments */ - for (opt = 1; opt < argc; opt++) { - if (strcmp(argv[opt], "-f") == 0 && opt + 1 < argc) { - image_file = argv[++opt]; - } else if (strcmp(argv[opt], "-p") == 0 && opt + 1 < argc) { - port = (uint16_t)atoi(argv[++opt]); - } else if (strcmp(argv[opt], "-h") == 0) { - print_usage(argv[0]); - return (0); - } else { - fprintf(stderr, "Unknown option: %s\n", argv[opt]); - print_usage(argv[0]); - return (1); - } - } - if (image_file == NULL || port == 0) { - fprintf(stderr, "Error: -f and -p " - "are required\n"); - print_usage(argv[0]); - return (1); + /* ensure output immediately visible */ + setvbuf(stderr, NULL, _IONBF, 0); + fprintf(stderr, "zfs_remoted starting...\n"); + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-f") == 0 && i + 1 < argc) + file_spec = argv[++i]; + else if (strcmp(argv[i], "-d") == 0 && i + 1 < argc) + disk_spec = argv[++i]; + else if (strcmp(argv[i], "-p") == 0 && i + 1 < argc) + port = (uint16_t)atoi(argv[++i]); + else if (strcmp(argv[i], "-h") == 0) + { print_usage(argv[0]); return 0; } + else { + fprintf(stderr, "Unknown: %s\n", argv[i]); + print_usage(argv[0]); return 1; + } } - /* Open the image file for synchronous I/O */ - g_file_handle = CreateFileA( - image_file, - GENERIC_READ | GENERIC_WRITE, - 0, /* exclusive access */ - NULL, - OPEN_EXISTING, - FILE_ATTRIBUTE_NORMAL, /* synchronous, buffered */ - NULL); - - if (g_file_handle == INVALID_HANDLE_VALUE) { - fprintf(stderr, "Failed to open image file '%s': %lu\n", - image_file, GetLastError()); - return (1); + if ((!file_spec && !disk_spec) || port == 0) + { print_usage(argv[0]); return 1; } + if (file_spec && disk_spec) + { fprintf(stderr, "Use -f OR -d, not both.\n"); return 1; } + + /* open backend */ + memset(&g_backend, 0, sizeof(g_backend)); + g_backend = file_spec ? file_backend : disk_backend; + fprintf(stderr, "Opening %s backend with '%s'...\n", + g_backend.bb_label, file_spec ? file_spec : disk_spec); + fflush(stderr); + + int rc = g_backend.bb_open(&g_backend, + file_spec ? file_spec : disk_spec); + fprintf(stderr, "bb_open returned %d\n", rc); + fflush(stderr); + + if (rc) { + fprintf(stderr, "Failed to open backend: %lu\n", GetLastError()); + return 1; } + fprintf(stderr, "Backend opened, %llu bytes\n", + (unsigned long long)g_backend.bb_dev_size); - fprintf(stderr, "Opened image file: %s\n", image_file); - - InitializeCriticalSection(&g_file_lock); + InitializeCriticalSection(&g_backend_lock); - /* Initialize Winsock */ - WSADATA wsa_data; - if (WSAStartup(MAKEWORD(2, 2), &wsa_data) != 0) { + /* winsock */ + WSADATA wsa; + if (WSAStartup(MAKEWORD(2, 2), &wsa) != 0) { fprintf(stderr, "WSAStartup failed: %d\n", WSAGetLastError()); - CloseHandle(g_file_handle); - return (1); + g_backend.bb_close(&g_backend); + return 1; } - /* Create listening socket */ - SOCKET listen_sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); - if (listen_sock == INVALID_SOCKET) { + SOCKET lsn = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (lsn == INVALID_SOCKET) { fprintf(stderr, "socket failed: %d\n", WSAGetLastError()); WSACleanup(); - CloseHandle(g_file_handle); - return (1); + g_backend.bb_close(&g_backend); + return 1; } - /* Set SO_REUSEADDR */ int reuse = 1; - setsockopt(listen_sock, SOL_SOCKET, SO_REUSEADDR, - (const char *)&reuse, sizeof (reuse)); - - struct sockaddr_in server_addr; - memset(&server_addr, 0, sizeof (server_addr)); - server_addr.sin_family = AF_INET; - server_addr.sin_addr.s_addr = INADDR_ANY; - server_addr.sin_port = htons(port); - - if (bind(listen_sock, (struct sockaddr *)&server_addr, - sizeof (server_addr)) == SOCKET_ERROR) { - fprintf(stderr, "bind to port %u failed: %d\n", + setsockopt(lsn, SOL_SOCKET, SO_REUSEADDR, + (const char *)&reuse, sizeof(reuse)); + + struct sockaddr_in srv = { 0 }; + srv.sin_family = AF_INET; + srv.sin_port = htons(port); + + if (bind(lsn, (struct sockaddr *)&srv, sizeof(srv)) != 0) { + fprintf(stderr, "bind port %u failed: %d\n", port, WSAGetLastError()); - closesocket(listen_sock); + closesocket(lsn); WSACleanup(); - CloseHandle(g_file_handle); - return (1); + g_backend.bb_close(&g_backend); + return 1; } - if (listen(listen_sock, SOMAXCONN) == SOCKET_ERROR) { + if (listen(lsn, SOMAXCONN) != 0) { fprintf(stderr, "listen failed: %d\n", WSAGetLastError()); - closesocket(listen_sock); + closesocket(lsn); WSACleanup(); - CloseHandle(g_file_handle); - return (1); + g_backend.bb_close(&g_backend); + return 1; } - /* Set Ctrl+C handler for graceful shutdown */ - SetConsoleCtrlHandler((PHANDLER_ROUTINE)ctrl_handler, TRUE); - - fprintf(stderr, "zfs_remoted: listening on port %u, serving '%s'\n", - port, image_file); + SetConsoleCtrlHandler(ctrl_handler, TRUE); + fprintf(stderr, "zfs_remoted: port %u, %s backend, %llu MiB\n", + port, g_backend.bb_label, + (unsigned long long)(g_backend.bb_dev_size / (1024*1024))); + fflush(stderr); while (g_running) { - struct sockaddr_in client_addr; - int client_len = sizeof (client_addr); - - /* Set a timeout so we can check g_running periodically */ - struct timeval tv; - tv.tv_sec = 1; - tv.tv_usec = 0; - - fd_set readfds; - FD_ZERO(&readfds); - FD_SET(listen_sock, &readfds); - - int sel_ret = select(0, &readfds, NULL, NULL, &tv); - if (sel_ret < 0) + struct timeval tv = { 1, 0 }; + fd_set rfds; FD_ZERO(&rfds); FD_SET(lsn, &rfds); + int sr = select(0, &rfds, NULL, NULL, &tv); + if (sr < 0) { + fprintf(stderr, "select error: %d\n", WSAGetLastError()); break; - if (sel_ret == 0) - continue; /* timeout, check g_running */ - - SOCKET client_sock = accept(listen_sock, - (struct sockaddr *)&client_addr, &client_len); - if (client_sock == INVALID_SOCKET) - continue; + } + if (sr == 0) continue; - handle_client(client_sock); + SOCKET cli = accept(lsn, NULL, NULL); + if (cli != INVALID_SOCKET) handle_client(cli); } fprintf(stderr, "Shutting down...\n"); - - closesocket(listen_sock); + closesocket(lsn); WSACleanup(); - - DeleteCriticalSection(&g_file_lock); - CloseHandle(g_file_handle); - - return (0); + DeleteCriticalSection(&g_backend_lock); + g_backend.bb_close(&g_backend); + return 0; } diff --git a/cmd/zfs_remoted/zfs_remoted.h b/cmd/zfs_remoted/zfs_remoted.h new file mode 100644 index 000000000000..ae85b7c39b2f --- /dev/null +++ b/cmd/zfs_remoted/zfs_remoted.h @@ -0,0 +1,83 @@ +/* + * CDDL HEADER START ... (see LICENSE) + * CDDL HEADER END + */ +/* + * Copyright (c) 2026, OpenZFS Remote VDEV contributors. + * + * Shared header for zfs_remoted: RPC protocol and block backend interface. + */ + +#ifndef ZFS_REMOTED_H +#define ZFS_REMOTED_H + +#define WIN32_LEAN_AND_MEAN +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* ================================================================ + * RPC Protocol -- must match sys/vdev_remote.h exactly + * ================================================================ */ +#pragma pack(push, 1) +typedef struct rpc_hdr { + uint32_t cmd; + uint32_t status; + uint64_t offset; + uint32_t size; + uint32_t reserved; +} rpc_hdr_t; +#pragma pack(pop) + +#define CMD_READ 0x52454144 /* "READ" */ +#define CMD_WRITE 0x57524954 /* "WRIT" */ +#define CMD_FLUSH 0x464C5553 /* "FLUS" */ +#define CMD_TRIM 0x5452494D /* "TRIM" */ +#define CMD_INFO 0x494E464F /* "INFO" */ + +#define STATUS_OK 0 +#define STATUS_ERR_IO 1 +#define STATUS_ERR_INVAL 2 +#define STATUS_ERR_NOSPC 3 + +/* ================================================================ + * Block device backend abstraction + * ================================================================ */ +typedef struct block_backend { + const char *bb_label; + + int (*bb_open)(struct block_backend *bb, const char *spec); + void (*bb_close)(struct block_backend *bb); + int (*bb_read)(struct block_backend *bb, + void *buf, uint32_t size, uint64_t offset); + int (*bb_write)(struct block_backend *bb, + const void *buf, uint32_t size, uint64_t offset); + int (*bb_flush)(struct block_backend *bb); + + /* read-only after open */ + uint64_t bb_dev_size; + uint32_t bb_lbasize; + uint32_t bb_pbasize; + + /* opaque private data */ + void *bb_priv; +} block_backend_t; + +/* Pre-built templates (caller copies them and calls bb_open) */ +extern const block_backend_t file_backend; +extern const block_backend_t disk_backend; + +/* Network helpers */ +int recv_all(SOCKET s, void *buf, int len); +int send_all(SOCKET s, const void *buf, int len); + +#ifdef __cplusplus +} +#endif + +#endif /* ZFS_REMOTED_H */ From 9daa9dff29f680c39c1845d8a7f0e5d4f75b9e6b Mon Sep 17 00:00:00 2001 From: wangy10 Date: Tue, 19 May 2026 20:13:04 +0800 Subject: [PATCH 4/5] Made zfs_remoted multiple clients connected available, and enhance vdev_remote module for error handling, keep stable when vdev connection is lost. --- cmd/zfs_remoted/zfs_remoted.c | 219 ++++++++++++++++---- include/sys/vdev_remote.h | 9 + module/os/windows/zfs/vdev_remote_os.c | 268 +++++++++++++++++++++---- module/zfs/vdev_remote.c | 32 ++- 4 files changed, 444 insertions(+), 84 deletions(-) diff --git a/cmd/zfs_remoted/zfs_remoted.c b/cmd/zfs_remoted/zfs_remoted.c index 0b5f135ac8b9..0043f376bdc2 100644 --- a/cmd/zfs_remoted/zfs_remoted.c +++ b/cmd/zfs_remoted/zfs_remoted.c @@ -8,6 +8,7 @@ * zfs_remoted - ZFS Remote Block Device Daemon for Windows * * Serves a block device over TCP using the remote VDEV RPC protocol. + * Supports up to 64 concurrent ZFS kernel clients. * * Usage: * zfs_remoted -f -p (file backend) @@ -18,14 +19,31 @@ #include #include #include +#include #pragma comment(lib, "Ws2_32.lib") +/* ---- constants ---- */ +#define MAX_CLIENTS 64 + /* ---- daemon state ---- */ static volatile LONG g_running = 1; static block_backend_t g_backend; static CRITICAL_SECTION g_backend_lock; +/* ---- client-thread tracking ---- */ +static HANDLE g_threads[MAX_CLIENTS]; +static LONG g_thread_count = 0; +static CRITICAL_SECTION g_threads_lock; + +/* ---- forward declarations ---- */ +static void handle_read(SOCKET s, rpc_hdr_t *hdr); +static void handle_write(SOCKET s, rpc_hdr_t *hdr); +static void handle_flush(SOCKET s); +static void handle_trim(SOCKET s, rpc_hdr_t *hdr); +static void handle_info(SOCKET s); +static DWORD WINAPI handle_client_thread(LPVOID param); + /* ---- network helpers ---- */ int recv_all(SOCKET s, void *buf, int len) @@ -148,41 +166,95 @@ static void handle_trim(SOCKET s, rpc_hdr_t *hdr) send_all(s, &resp, sizeof(resp)); } -static void handle_info(SOCKET s) +static void +handle_info(SOCKET s) { rpc_hdr_t resp; + EnterCriticalSection(&g_backend_lock); resp.cmd = CMD_INFO; resp.status = STATUS_OK; resp.offset = g_backend.bb_dev_size; resp.size = g_backend.bb_lbasize; resp.reserved = g_backend.bb_pbasize; + LeaveCriticalSection(&g_backend_lock); send_all(s, &resp, sizeof(resp)); } -/* ---- client connection loop ---- */ +/* ---- thread scavenger: reap finished client threads ---- */ static void -handle_client(SOCKET client_sock) +scavenge_threads(void) +{ + EnterCriticalSection(&g_threads_lock); + int write_idx = 0; + for (int i = 0; i < g_thread_count; i++) { + if (WaitForSingleObject(g_threads[i], 0) == WAIT_OBJECT_0) { + /* thread has exited — clean up */ + CloseHandle(g_threads[i]); + } else { + /* thread still running — keep it */ + g_threads[write_idx++] = g_threads[i]; + } + } + g_thread_count = write_idx; + LeaveCriticalSection(&g_threads_lock); +} + +/* ---- per-client thread entry point ---- */ +static DWORD WINAPI +handle_client_thread(LPVOID param) { + SOCKET client_sock = (SOCKET)(INT_PTR)param; rpc_hdr_t hdr; - fprintf(stderr, "Client connected (%s backend)\n", g_backend.bb_label); + DWORD tid = GetCurrentThreadId(); + + fprintf(stderr, "[tid %lu] connected (%s backend)\n", + tid, g_backend.bb_label); while (g_running) { - if (recv_all(client_sock, &hdr, sizeof(hdr)) != 0) break; + /* + * Wait for the next RPC header with a 1 s timeout so we + * re-check g_running periodically and can shut down cleanly. + */ + fd_set rfds; + FD_ZERO(&rfds); + FD_SET(client_sock, &rfds); + struct timeval tv = { 1, 0 }; + int sr = select(0, &rfds, NULL, NULL, &tv); + if (sr < 0) + break; /* socket error */ + if (sr == 0) + continue; /* timeout → re-check g_running */ + + if (recv_all(client_sock, &hdr, sizeof(hdr)) != 0) + break; /* client closed or error */ switch (hdr.cmd) { - case CMD_READ: handle_read(client_sock, &hdr); break; - case CMD_WRITE: handle_write(client_sock, &hdr); break; - case CMD_FLUSH: handle_flush(client_sock); break; - case CMD_TRIM: handle_trim(client_sock, &hdr); break; - case CMD_INFO: handle_info(client_sock); break; + case CMD_READ: + handle_read(client_sock, &hdr); + break; + case CMD_WRITE: + handle_write(client_sock, &hdr); + break; + case CMD_FLUSH: + handle_flush(client_sock); + break; + case CMD_TRIM: + handle_trim(client_sock, &hdr); + break; + case CMD_INFO: + handle_info(client_sock); + break; default: - fprintf(stderr, "Unknown cmd 0x%08X\n", hdr.cmd); + fprintf(stderr, "[tid %lu] unknown cmd 0x%08X\n", + tid, hdr.cmd); hdr.status = STATUS_ERR_INVAL; send_all(client_sock, &hdr, sizeof(hdr)); break; } } + closesocket(client_sock); - fprintf(stderr, "Client disconnected\n"); + fprintf(stderr, "[tid %lu] disconnected\n", tid); + return 0; } /* ---- usage / ctrl-c ---- */ @@ -215,7 +287,7 @@ main(int argc, char *argv[]) /* ensure output immediately visible */ setvbuf(stderr, NULL, _IONBF, 0); - fprintf(stderr, "zfs_remoted starting...\n"); + fprintf(stderr, "zfs_remoted starting (multi-client mode)...\n"); for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "-f") == 0 && i + 1 < argc) @@ -237,32 +309,33 @@ main(int argc, char *argv[]) if (file_spec && disk_spec) { fprintf(stderr, "Use -f OR -d, not both.\n"); return 1; } - /* open backend */ + /* ---- open backend ---- */ memset(&g_backend, 0, sizeof(g_backend)); g_backend = file_spec ? file_backend : disk_backend; - fprintf(stderr, "Opening %s backend with '%s'...\n", - g_backend.bb_label, file_spec ? file_spec : disk_spec); - fflush(stderr); - - int rc = g_backend.bb_open(&g_backend, - file_spec ? file_spec : disk_spec); - fprintf(stderr, "bb_open returned %d\n", rc); - fflush(stderr); - if (rc) { - fprintf(stderr, "Failed to open backend: %lu\n", GetLastError()); - return 1; + { + int rc = g_backend.bb_open(&g_backend, + file_spec ? file_spec : disk_spec); + if (rc) { + fprintf(stderr, "Failed to open backend: %lu\n", + GetLastError()); + return 1; + } } + fprintf(stderr, "Backend opened, %llu bytes\n", (unsigned long long)g_backend.bb_dev_size); InitializeCriticalSection(&g_backend_lock); + InitializeCriticalSection(&g_threads_lock); /* winsock */ WSADATA wsa; if (WSAStartup(MAKEWORD(2, 2), &wsa) != 0) { fprintf(stderr, "WSAStartup failed: %d\n", WSAGetLastError()); g_backend.bb_close(&g_backend); + DeleteCriticalSection(&g_backend_lock); + DeleteCriticalSection(&g_threads_lock); return 1; } @@ -271,12 +344,16 @@ main(int argc, char *argv[]) fprintf(stderr, "socket failed: %d\n", WSAGetLastError()); WSACleanup(); g_backend.bb_close(&g_backend); + DeleteCriticalSection(&g_backend_lock); + DeleteCriticalSection(&g_threads_lock); return 1; } - int reuse = 1; - setsockopt(lsn, SOL_SOCKET, SO_REUSEADDR, - (const char *)&reuse, sizeof(reuse)); + { + int reuse = 1; + setsockopt(lsn, SOL_SOCKET, SO_REUSEADDR, + (const char *)&reuse, sizeof(reuse)); + } struct sockaddr_in srv = { 0 }; srv.sin_family = AF_INET; @@ -288,6 +365,8 @@ main(int argc, char *argv[]) closesocket(lsn); WSACleanup(); g_backend.bb_close(&g_backend); + DeleteCriticalSection(&g_backend_lock); + DeleteCriticalSection(&g_threads_lock); return 1; } @@ -296,33 +375,99 @@ main(int argc, char *argv[]) closesocket(lsn); WSACleanup(); g_backend.bb_close(&g_backend); + DeleteCriticalSection(&g_backend_lock); + DeleteCriticalSection(&g_threads_lock); return 1; } SetConsoleCtrlHandler(ctrl_handler, TRUE); - fprintf(stderr, "zfs_remoted: port %u, %s backend, %llu MiB\n", + fprintf(stderr, + "zfs_remoted: port %u, %s backend, %llu MiB, max %d clients\n", port, g_backend.bb_label, - (unsigned long long)(g_backend.bb_dev_size / (1024*1024))); - fflush(stderr); + (unsigned long long)(g_backend.bb_dev_size / (1024 * 1024)), + MAX_CLIENTS); + /* ---- accept loop ---- */ while (g_running) { + fd_set rfds; + FD_ZERO(&rfds); + FD_SET(lsn, &rfds); struct timeval tv = { 1, 0 }; - fd_set rfds; FD_ZERO(&rfds); FD_SET(lsn, &rfds); int sr = select(0, &rfds, NULL, NULL, &tv); if (sr < 0) { - fprintf(stderr, "select error: %d\n", WSAGetLastError()); + fprintf(stderr, "select error: %d\n", + WSAGetLastError()); break; } - if (sr == 0) continue; + if (sr == 0) { + /* no incoming connections — reap finished threads */ + scavenge_threads(); + continue; + } SOCKET cli = accept(lsn, NULL, NULL); - if (cli != INVALID_SOCKET) handle_client(cli); + if (cli == INVALID_SOCKET) { + if (g_running) + fprintf(stderr, "accept error: %d\n", + WSAGetLastError()); + continue; + } + + /* spawn a worker thread for this client */ + EnterCriticalSection(&g_threads_lock); + if (g_thread_count >= MAX_CLIENTS) { + LeaveCriticalSection(&g_threads_lock); + fprintf(stderr, + "max clients (%d) reached, rejecting connection\n", + MAX_CLIENTS); + closesocket(cli); + continue; + } + + HANDLE h = CreateThread(NULL, 0, handle_client_thread, + (LPVOID)(INT_PTR)cli, 0, NULL); + if (h) { + g_threads[g_thread_count++] = h; + fprintf(stderr, "client accepted (%d/%d active)\n", + (int)g_thread_count, MAX_CLIENTS); + } else { + fprintf(stderr, "CreateThread failed: %lu\n", + GetLastError()); + closesocket(cli); + } + LeaveCriticalSection(&g_threads_lock); } - fprintf(stderr, "Shutting down...\n"); + /* ---- graceful shutdown ---- */ + fprintf(stderr, "Shutting down, waiting for %d client(s)...\n", + (int)g_thread_count); + + /* Close the listen socket so no new connections arrive. */ closesocket(lsn); - WSACleanup(); + + /* + * Wait for every client thread to finish. Each thread checks + * g_running at least once per second and will exit promptly. + * Use a 10 s per-thread timeout as a safety net. + */ + EnterCriticalSection(&g_threads_lock); + for (int i = 0; i < g_thread_count; i++) { + DWORD wr = WaitForSingleObject(g_threads[i], 10000); + if (wr == WAIT_TIMEOUT) { + fprintf(stderr, + "warning: thread %d did not exit in time\n", i); + TerminateThread(g_threads[i], 1); + } + CloseHandle(g_threads[i]); + } + g_thread_count = 0; + LeaveCriticalSection(&g_threads_lock); + + DeleteCriticalSection(&g_threads_lock); DeleteCriticalSection(&g_backend_lock); + WSACleanup(); g_backend.bb_close(&g_backend); + + fprintf(stderr, "zfs_remoted stopped.\n"); return 0; } diff --git a/include/sys/vdev_remote.h b/include/sys/vdev_remote.h index 49e338761fe9..37ecb49344e2 100644 --- a/include/sys/vdev_remote.h +++ b/include/sys/vdev_remote.h @@ -66,6 +66,13 @@ typedef struct vdev_remote_rpc_hdr { #define VDEV_REMOTE_STATUS_ERR_INVAL 2 #define VDEV_REMOTE_STATUS_ERR_NOSPC 3 +/* + * Reconnect / timeout constants (in milliseconds). + */ +#define VDEV_REMOTE_WSK_TIMEOUT_MS 30000 /* per-WSK-op timeout */ +#define VDEV_REMOTE_RECONNECT_BACKOFF_MIN_MS 1000 /* 1 s initial backoff */ +#define VDEV_REMOTE_RECONNECT_BACKOFF_MAX_MS 60000 /* 60 s max backoff */ + /* * Per-instance state for a remote VDEV. */ @@ -78,6 +85,8 @@ typedef struct vdev_remote { uint32_t vr_phys_block_size; kmutex_t vr_lock; boolean_t vr_connected; + hrtime_t vr_reconnect_until; /* backoff deadline */ + uint32_t vr_reconnect_backoff; /* current backoff in ms */ } vdev_remote_t; /* diff --git a/module/os/windows/zfs/vdev_remote_os.c b/module/os/windows/zfs/vdev_remote_os.c index cbda3b03423c..224249a0c650 100644 --- a/module/os/windows/zfs/vdev_remote_os.c +++ b/module/os/windows/zfs/vdev_remote_os.c @@ -57,13 +57,33 @@ wsk_sync_comp( return (STATUS_MORE_PROCESSING_REQUIRED); } -/* Issue a WSK call synchronously: wait on event if PENDING, return status */ +/* Issue a WSK call synchronously: wait on event if PENDING, return status. + * + * timeout_ms — if the IRP stays pending longer than this, IoCancelIrp is + * issued and we wait for the cancelled completion. This prevents an + * unresponsive remote peer from blocking a taskq thread indefinitely. + */ static NTSTATUS -wsk_sync(KEVENT *event, PIRP irp, NTSTATUS st) +wsk_sync_timeout(KEVENT *event, PIRP irp, NTSTATUS st, ULONG timeout_ms) { if (st == STATUS_PENDING) { - KeWaitForSingleObject(event, Executive, KernelMode, - FALSE, NULL); + LARGE_INTEGER li; + li.QuadPart = -(LONGLONG)timeout_ms * 10000; + NTSTATUS wait_st = KeWaitForSingleObject(event, Executive, + KernelMode, FALSE, &li); + if (wait_st == STATUS_TIMEOUT) { + /* + * Cancel the IRP. WSK owns the cancel routine so + * IoCancelIrp is safe. Returns TRUE if cancel was + * queued; in that case we must wait for completion. + * Returns FALSE if the IRP already completed (or is + * about to), so IoStatus is already valid. + */ + if (IoCancelIrp(irp)) { + KeWaitForSingleObject(event, Executive, + KernelMode, FALSE, NULL); + } + } st = irp->IoStatus.Status; } return (st); @@ -85,7 +105,7 @@ static const WSK_CLIENT_DISPATCH wsk_disp = { static int vdev_remote_wsk_init(void); static int vdev_remote_wsk_connect(vdev_remote_t *vr, PWSK_SOCKET *out); -static void vdev_remote_wsk_close(PWSK_SOCKET sock); +static void vdev_remote_wsk_close(PWSK_SOCKET sock, boolean_t is_dead); static int vdev_remote_wsk_send_recv(PWSK_SOCKET sock, vdev_remote_rpc_hdr_t *hdr, void *data, uint32_t dlen, uint32_t cmd); @@ -181,11 +201,12 @@ vdev_remote_wsk_connect(vdev_remote_t *vr, PWSK_SOCKET *out) if (!irp) return (SET_ERROR(ENOMEM)); IoSetCompletionRoutine(irp, wsk_sync_comp, &ev, TRUE, TRUE, TRUE); - st = wsk_sync(&ev, irp, + st = wsk_sync_timeout(&ev, irp, wsk_npi.Dispatch->WskSocketConnect( wsk_npi.Client, SOCK_STREAM, IPPROTO_TCP, (PSOCKADDR)&local, (PSOCKADDR)&remote, - 0, NULL, NULL, NULL, NULL, NULL, irp)); + 0, NULL, NULL, NULL, NULL, NULL, irp), + VDEV_REMOTE_WSK_TIMEOUT_MS); if (!NT_SUCCESS(st)) { dprintf("vdev_remote: WskSocketConnect %s:%u: 0x%lx\n", @@ -199,10 +220,14 @@ vdev_remote_wsk_connect(vdev_remote_t *vr, PWSK_SOCKET *out) } /* - * Disconnect + close socket. + * Close a WSK socket. + * + * When 'is_dead' is true the socket is known (or suspected) to be broken; + * skip the graceful WskDisconnect and go straight to WskCloseSocket to + * avoid blocking on a dead peer. */ static void -vdev_remote_wsk_close(PWSK_SOCKET sock) +vdev_remote_wsk_close(PWSK_SOCKET sock, boolean_t is_dead) { PIRP irp; KEVENT ev; @@ -211,15 +236,17 @@ vdev_remote_wsk_close(PWSK_SOCKET sock) if (!sock) return; d = (PWSK_PROVIDER_CONNECTION_DISPATCH)sock->Dispatch; - /* Graceful disconnect */ - KeInitializeEvent(&ev, SynchronizationEvent, FALSE); - irp = IoAllocateIrp(1, FALSE); - if (irp) { - IoSetCompletionRoutine(irp, wsk_sync_comp, &ev, - TRUE, TRUE, TRUE); - (void)wsk_sync(&ev, irp, - d->WskDisconnect(sock, NULL, 0, irp)); - IoFreeIrp(irp); + if (!is_dead) { + KeInitializeEvent(&ev, SynchronizationEvent, FALSE); + irp = IoAllocateIrp(1, FALSE); + if (irp) { + IoSetCompletionRoutine(irp, wsk_sync_comp, &ev, + TRUE, TRUE, TRUE); + (void)wsk_sync_timeout(&ev, irp, + d->WskDisconnect(sock, NULL, 0, irp), + VDEV_REMOTE_WSK_TIMEOUT_MS); + IoFreeIrp(irp); + } } /* Close (Basic dispatch — IRP is optional but we pass one) */ @@ -270,8 +297,9 @@ vdev_remote_wsk_send_recv(PWSK_SOCKET sock, IoSetCompletionRoutine(irp, wsk_sync_comp, &ev, TRUE, TRUE, TRUE); irp->MdlAddress = mdl; - st = wsk_sync(&ev, irp, - d->WskSend(sock, &wb, WSK_FLAG_NODELAY, irp)); + st = wsk_sync_timeout(&ev, irp, + d->WskSend(sock, &wb, WSK_FLAG_NODELAY, irp), + VDEV_REMOTE_WSK_TIMEOUT_MS); IoFreeMdl(mdl); IoFreeIrp(irp); if (!NT_SUCCESS(st)) { @@ -297,8 +325,9 @@ vdev_remote_wsk_send_recv(PWSK_SOCKET sock, TRUE, TRUE, TRUE); irp->MdlAddress = mdl; - st = wsk_sync(&ev, irp, - d->WskSend(sock, &wb, WSK_FLAG_NODELAY, irp)); + st = wsk_sync_timeout(&ev, irp, + d->WskSend(sock, &wb, WSK_FLAG_NODELAY, irp), + VDEV_REMOTE_WSK_TIMEOUT_MS); IoFreeMdl(mdl); IoFreeIrp(irp); if (!NT_SUCCESS(st)) { @@ -323,8 +352,9 @@ vdev_remote_wsk_send_recv(PWSK_SOCKET sock, IoSetCompletionRoutine(irp, wsk_sync_comp, &ev, TRUE, TRUE, TRUE); irp->MdlAddress = mdl; - st = wsk_sync(&ev, irp, - d->WskReceive(sock, &wb, WSK_FLAG_WAITALL, irp)); + st = wsk_sync_timeout(&ev, irp, + d->WskReceive(sock, &wb, WSK_FLAG_WAITALL, irp), + VDEV_REMOTE_WSK_TIMEOUT_MS); IoFreeMdl(mdl); IoFreeIrp(irp); if (!NT_SUCCESS(st)) { @@ -357,8 +387,9 @@ vdev_remote_wsk_send_recv(PWSK_SOCKET sock, TRUE, TRUE, TRUE); irp->MdlAddress = mdl; - st = wsk_sync(&ev, irp, - d->WskReceive(sock, &wb, WSK_FLAG_WAITALL, irp)); + st = wsk_sync_timeout(&ev, irp, + d->WskReceive(sock, &wb, WSK_FLAG_WAITALL, irp), + VDEV_REMOTE_WSK_TIMEOUT_MS); IoFreeMdl(mdl); IoFreeIrp(irp); if (!NT_SUCCESS(st)) { @@ -373,12 +404,103 @@ vdev_remote_wsk_send_recv(PWSK_SOCKET sock, /* ---- Public OS interface ---- */ +/* + * Reconnect to the remote daemon with exponential backoff. + * + * Must be called with vr_lock held. On success vr_os_priv and + * vr_connected are updated and the backoff state is reset. + * On failure the backoff timer is advanced. + * + * Returns: 0 on success, errno on failure. + */ +static int +vdev_remote_os_reconnect(vdev_remote_t *vr) +{ + PWSK_SOCKET sock = NULL; + int err; + + ASSERT(MUTEX_HELD(&vr->vr_lock)); + + /* + * If we already have a live socket (e.g. racing reconnect from + * another path), just return success. + */ + if (vr->vr_connected && vr->vr_os_priv != NULL) + return (0); + + /* Enforce backoff: don't retry before vr_reconnect_until. */ + if (vr->vr_reconnect_backoff > 0) { + hrtime_t now = gethrtime(); + if (now < vr->vr_reconnect_until) { + dprintf("vdev_remote: reconnect backoff %ums " + "(%lld ms remaining)\n", + vr->vr_reconnect_backoff, + (long long)((vr->vr_reconnect_until - now) + / (NANOSEC / MILLISEC))); + return (SET_ERROR(EAGAIN)); + } + } + + /* Clean up any stale socket before reconnecting. */ + if (vr->vr_os_priv != NULL) { + vdev_remote_wsk_close((PWSK_SOCKET)vr->vr_os_priv, B_TRUE); + vr->vr_os_priv = NULL; + } + vr->vr_connected = B_FALSE; + + err = vdev_remote_wsk_init(); + if (err) + goto fail; + + err = vdev_remote_wsk_connect(vr, &sock); + if (err) + goto fail; + + vr->vr_os_priv = (void *)sock; + vr->vr_connected = B_TRUE; + + /* Reset backoff on successful reconnect. */ + vr->vr_reconnect_backoff = 0; + vr->vr_reconnect_until = 0; + + dprintf("vdev_remote: reconnected to %s:%u\n", + vr->vr_host, vr->vr_port); + return (0); + +fail: + /* Advance exponential backoff. */ + if (vr->vr_reconnect_backoff == 0) + vr->vr_reconnect_backoff = VDEV_REMOTE_RECONNECT_BACKOFF_MIN_MS; + else + vr->vr_reconnect_backoff = MIN( + vr->vr_reconnect_backoff * 2, + VDEV_REMOTE_RECONNECT_BACKOFF_MAX_MS); + + vr->vr_reconnect_until = gethrtime() + + (hrtime_t)vr->vr_reconnect_backoff * (NANOSEC / MILLISEC); + + dprintf("vdev_remote: reconnect %s:%u failed (err %d), " + "backoff %ums\n", vr->vr_host, vr->vr_port, + err, vr->vr_reconnect_backoff); + return (err); +} + int vdev_remote_os_connect(vdev_remote_t *vr) { int err; PWSK_SOCKET sock = NULL; + vr->vr_reconnect_backoff = 0; + vr->vr_reconnect_until = 0; + + /* Clean up any stale socket before connecting. */ + if (vr->vr_os_priv != NULL) { + vdev_remote_wsk_close((PWSK_SOCKET)vr->vr_os_priv, B_TRUE); + vr->vr_os_priv = NULL; + } + vr->vr_connected = B_FALSE; + err = vdev_remote_wsk_init(); if (err) return (err); @@ -394,39 +516,84 @@ vdev_remote_os_disconnect(vdev_remote_t *vr) { PWSK_SOCKET sock = (PWSK_SOCKET)vr->vr_os_priv; if (sock) { - vdev_remote_wsk_close(sock); + vdev_remote_wsk_close(sock, B_FALSE); vr->vr_os_priv = NULL; } + vr->vr_reconnect_backoff = 0; + vr->vr_reconnect_until = 0; } -int -vdev_remote_os_io(vdev_remote_t *vr, uint32_t cmd, +/* + * Perform a single RPC round-trip over the current socket. + * On transport-level failure the dead socket is closed and vr_connected + * is cleared so a subsequent call will attempt reconnect. + * + * Returns 0 on success, errno on failure (EIO = transport error, + * EAGAIN = backoff timer hasn't expired yet). + */ +static int +vdev_remote_os_io_locked(vdev_remote_t *vr, uint32_t cmd, uint64_t offset, void *data, uint32_t size) { vdev_remote_rpc_hdr_t hdr; PWSK_SOCKET sock; int err; - ASSERT(vr); - mutex_enter(&vr->vr_lock); - sock = (PWSK_SOCKET)vr->vr_os_priv; - if (!sock) { mutex_exit(&vr->vr_lock); return (SET_ERROR(ENXIO)); } + ASSERT(MUTEX_HELD(&vr->vr_lock)); - if (cmd == VDEV_REMOTE_CMD_TRIM) { - memset(&hdr, 0, sizeof(hdr)); - hdr.vr_cmd = cmd; - hdr.vr_offset = offset; - hdr.vr_size = size; - err = vdev_remote_wsk_send_recv(sock, &hdr, NULL, 0, cmd); - mutex_exit(&vr->vr_lock); - return (err); + /* Try to (re)connect if needed. */ + if (!vr->vr_connected || vr->vr_os_priv == NULL) { + err = vdev_remote_os_reconnect(vr); + if (err) + return (err); } + sock = (PWSK_SOCKET)vr->vr_os_priv; + ASSERT(sock != NULL); + memset(&hdr, 0, sizeof(hdr)); hdr.vr_cmd = cmd; hdr.vr_offset = offset; hdr.vr_size = size; + err = vdev_remote_wsk_send_recv(sock, &hdr, data, size, cmd); + + /* + * If the transport failed the socket is dead; close it now so the + * next caller will attempt reconnect (subject to backoff). + */ + if (err == EIO) { + vdev_remote_wsk_close(sock, B_TRUE); + vr->vr_os_priv = NULL; + vr->vr_connected = B_FALSE; + dprintf("vdev_remote: socket %s:%u marked dead " + "(cmd 0x%x, err %d)\n", + vr->vr_host, vr->vr_port, cmd, err); + } + + return (err); +} + +int +vdev_remote_os_io(vdev_remote_t *vr, uint32_t cmd, + uint64_t offset, void *data, uint32_t size) +{ + int err; + + ASSERT(vr); + mutex_enter(&vr->vr_lock); + + /* + * TRIM does not carry a data payload but otherwise follows the + * same send-recv pattern. + */ + if (cmd == VDEV_REMOTE_CMD_TRIM) { + err = vdev_remote_os_io_locked(vr, cmd, offset, NULL, 0); + mutex_exit(&vr->vr_lock); + return (err); + } + + err = vdev_remote_os_io_locked(vr, cmd, offset, data, size); mutex_exit(&vr->vr_lock); return (err); } @@ -441,12 +608,29 @@ vdev_remote_os_info(vdev_remote_t *vr, ASSERT(vr); mutex_enter(&vr->vr_lock); + + /* Try to (re)connect if needed. */ + if (!vr->vr_connected || vr->vr_os_priv == NULL) { + err = vdev_remote_os_reconnect(vr); + if (err) { + mutex_exit(&vr->vr_lock); + return (err); + } + } + sock = (PWSK_SOCKET)vr->vr_os_priv; - if (!sock) { mutex_exit(&vr->vr_lock); return (SET_ERROR(ENXIO)); } + ASSERT(sock != NULL); memset(&hdr, 0, sizeof(hdr)); err = vdev_remote_wsk_send_recv(sock, &hdr, NULL, 0, VDEV_REMOTE_CMD_INFO); + + if (err == EIO) { + vdev_remote_wsk_close(sock, B_TRUE); + vr->vr_os_priv = NULL; + vr->vr_connected = B_FALSE; + } + mutex_exit(&vr->vr_lock); if (err == 0) { *size = hdr.vr_offset; diff --git a/module/zfs/vdev_remote.c b/module/zfs/vdev_remote.c index e747b561cefa..9bcd2f271bc5 100644 --- a/module/zfs/vdev_remote.c +++ b/module/zfs/vdev_remote.c @@ -169,6 +169,7 @@ vdev_remote_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_remote_t *vr; + int error; /* * Remote devices are always non-rotational. @@ -188,10 +189,24 @@ vdev_remote_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, return (SET_ERROR(EINVAL)); } - /* Reopen: just re-read device info */ + /* Reopen: reconnect if needed, then re-read device info */ if (vd->vdev_tsd != NULL) { ASSERT(vd->vdev_reopening); vr = vd->vdev_tsd; + + /* + * If the connection was lost (e.g. server restart, network + * blip) proactively re-establish the link so that + * vdev_remote_os_info below succeeds. + * + * vdev_remote_os_connect resets backoff and tears down any + * stale socket, so this is safe to call unconditionally. + */ + if (!vr->vr_connected) { + error = vdev_remote_os_connect(vr); + if (error == 0) + vr->vr_connected = B_TRUE; + } goto skip_open; } @@ -199,7 +214,7 @@ vdev_remote_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, mutex_init(&vr->vr_lock, NULL, MUTEX_DEFAULT, NULL); /* Parse host:port from the URI */ - int error = vdev_remote_parse_uri(vd->vdev_path, + error = vdev_remote_parse_uri(vd->vdev_path, vr->vr_host, sizeof (vr->vr_host), &vr->vr_port); if (error != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; @@ -273,6 +288,9 @@ vdev_remote_close(vdev_t *vd) /* * Background I/O strategy: performs the actual RPC call. * Runs on the vdev_remote_taskq. + * + * The OS layer (vdev_remote_os_io) handles connection state, reconnect, + * and backoff internally, so we just call through unconditionally. */ static void vdev_remote_io_strategy(void *arg) @@ -283,7 +301,7 @@ vdev_remote_io_strategy(void *arg) void *buf = NULL; uint32_t size = (uint32_t)zio->io_size; - if (vr == NULL || !vr->vr_connected) { + if (vr == NULL) { zio->io_error = SET_ERROR(ENXIO); zio_delay_interrupt(zio); return; @@ -310,6 +328,8 @@ vdev_remote_io_strategy(void *arg) /* * Flush (sync) operation: tells remote daemon to fsync. + * + * The OS layer handles reconnect internally. */ static void vdev_remote_io_flush(void *arg) @@ -318,7 +338,7 @@ vdev_remote_io_flush(void *arg) vdev_t *vd = zio->io_vd; vdev_remote_t *vr = vd->vdev_tsd; - if (vr == NULL || !vr->vr_connected) { + if (vr == NULL) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; @@ -330,6 +350,8 @@ vdev_remote_io_flush(void *arg) /* * TRIM/UNMAP operation: tells remote daemon to deallocate. + * + * The OS layer handles reconnect internally. */ static void vdev_remote_io_trim(void *arg) @@ -338,7 +360,7 @@ vdev_remote_io_trim(void *arg) vdev_t *vd = zio->io_vd; vdev_remote_t *vr = vd->vdev_tsd; - if (vr == NULL || !vr->vr_connected) { + if (vr == NULL) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; From 0754e30346a61650dc027673b6115545cbb49835 Mon Sep 17 00:00:00 2001 From: wangy10 Date: Wed, 20 May 2026 17:17:26 +0800 Subject: [PATCH 5/5] optimize reconnect logis to support multiple importing same vdev remote in different machine, it works, sometimes crash, and when the two client writes to the mounted filesystem, the last commited one success as we knew. --- module/os/windows/zfs/vdev_remote_os.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/module/os/windows/zfs/vdev_remote_os.c b/module/os/windows/zfs/vdev_remote_os.c index 224249a0c650..662cabf303c0 100644 --- a/module/os/windows/zfs/vdev_remote_os.c +++ b/module/os/windows/zfs/vdev_remote_os.c @@ -411,6 +411,10 @@ vdev_remote_wsk_send_recv(PWSK_SOCKET sock, * vr_connected are updated and the backoff state is reset. * On failure the backoff timer is advanced. * + * The first reconnect attempt after a disconnect is always immediate + * (backoff is only enforced on retries), so that a freshly imported + * or mounted pool can recover its connection without artificial delay. + * * Returns: 0 on success, errno on failure. */ static int @@ -468,16 +472,25 @@ vdev_remote_os_reconnect(vdev_remote_t *vr) return (0); fail: - /* Advance exponential backoff. */ - if (vr->vr_reconnect_backoff == 0) + /* + * Advance exponential backoff, but always allow at least ONE + * fresh attempt: if backoff is still 0 this is the first + * failure since the last successful connect, so set the + * deadline to "now" meaning the next caller gets an immediate + * retry. Only after that retry also fails do we start + * enforcing delays. + */ + if (vr->vr_reconnect_backoff == 0) { vr->vr_reconnect_backoff = VDEV_REMOTE_RECONNECT_BACKOFF_MIN_MS; - else + vr->vr_reconnect_until = gethrtime(); /* now → immediate ok */ + } else { vr->vr_reconnect_backoff = MIN( vr->vr_reconnect_backoff * 2, VDEV_REMOTE_RECONNECT_BACKOFF_MAX_MS); - - vr->vr_reconnect_until = gethrtime() + - (hrtime_t)vr->vr_reconnect_backoff * (NANOSEC / MILLISEC); + vr->vr_reconnect_until = gethrtime() + + (hrtime_t)vr->vr_reconnect_backoff * + (NANOSEC / MILLISEC); + } dprintf("vdev_remote: reconnect %s:%u failed (err %d), " "backoff %ums\n", vr->vr_host, vr->vr_port,