diff --git a/.gitignore b/.gitignore index 7dd3d65272..d4dc7ecb5a 100644 --- a/.gitignore +++ b/.gitignore @@ -78,3 +78,6 @@ examples/*/public/ !examples/multiplayer-game-patterns/public/** !examples/multiplayer-game-patterns-vercel/public/ !examples/multiplayer-game-patterns-vercel/public/** + +# Native addon binaries +*.node diff --git a/CLAUDE.md b/CLAUDE.md index cbd229a4d0..dc270bb2e7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -89,6 +89,9 @@ docker-compose up -d git commit -m "chore(my-pkg): foo bar" ``` +- We use Graphite for stacked PRs. Diff against the parent branch (`gt ls` to see the stack), not `main`. +- To revert a file to the version before this branch's changes, checkout from the first child branch (below in the stack), not from `main` or the parent. Child branches contain the pre-this-branch state of files modified by branches further down the stack. + **Never push to `main` unless explicitly specified by the user.** ## Dependency Management @@ -101,13 +104,21 @@ git commit -m "chore(my-pkg): foo bar" - Prefer the Tokio-shaped APIs from `antiox` for concurrency needs. For example, use `antiox/sync/mpsc` for `tx` and `rx` channels, `antiox/task` for spawning tasks, and the matching sync and time modules as needed. - Treat `antiox` as the default choice for any TypeScript concurrency work because it mirrors Rust and Tokio APIs used elsewhere in the codebase. +### RivetKit Type Build Troubleshooting +- If `rivetkit` type or DTS builds fail with missing `@rivetkit/*` declarations, run `pnpm build -F rivetkit` from repo root (Turbo build path) before changing TypeScript `paths`. +- Do not add temporary `@rivetkit/*` path aliases in `rivetkit-typescript/packages/rivetkit/tsconfig.json` to work around stale or missing built declarations. + +### RivetKit Test Fixtures +- Keep RivetKit test fixtures scoped to the engine-only runtime. +- Prefer targeted integration tests under `rivetkit-typescript/packages/rivetkit/tests/` over shared multi-driver matrices. 
+ ### SQLite Package - Use `@rivetkit/sqlite` for SQLite WebAssembly support. - Do not use the legacy upstream package directly. `@rivetkit/sqlite` is the maintained fork used in this repository and is sourced from `rivet-dev/wa-sqlite`. - The native SQLite addon (`@rivetkit/sqlite-native`) statically links SQLite via `libsqlite3-sys` with the `bundled` feature. The bundled SQLite version must match the version used by `@rivetkit/sqlite` (WASM). When upgrading either, upgrade both. ### RivetKit Package Resolutions -The root `/package.json` contains `resolutions` that map RivetKit packages to their local workspace versions: +- The root `/package.json` contains `resolutions` that map RivetKit packages to local workspace versions: ```json { @@ -120,7 +131,7 @@ The root `/package.json` contains `resolutions` that map RivetKit packages to th } ``` -When adding RivetKit dependencies to examples in `/examples/`, use `*` as the version. The root resolutions will automatically resolve these to the local workspace packages: +- Use `*` as the dependency version when adding RivetKit packages to `/examples/`, because root resolutions map them to local workspace packages: ```json { @@ -131,7 +142,19 @@ When adding RivetKit dependencies to examples in `/examples/`, use `*` as the ve } ``` -If you need to add a new `@rivetkit/*` package that isn't already in the root resolutions, add it to the `resolutions` object in `/package.json` with `"workspace:*"` as the value. Internal packages like `@rivetkit/workflow-engine` should be re-exported from `rivetkit` subpaths (e.g., `rivetkit/workflow`) rather than added as direct dependencies. +- Add new internal `@rivetkit/*` packages to root `resolutions` with `"workspace:*"` if missing, and prefer re-exporting internal packages (for example `@rivetkit/workflow-engine`) from `rivetkit` subpaths like `rivetkit/workflow` instead of direct dependencies. 
+ +### Dynamic Import Pattern +- For runtime-only dependencies, use dynamic loading so bundlers do not eagerly include them. +- Build the module specifier from string parts (for example with `["pkg", "name"].join("-")` or `["@scope", "pkg"].join("/")`) instead of a single string literal. +- Prefer this pattern for modules like `@rivetkit/sqlite-vfs`, `sandboxed-node`, and `isolated-vm`. +- If loading by resolved file path, resolve first and then import via `pathToFileURL(...).href`. + +### Fail-By-Default Runtime Behavior +- Avoid silent no-ops for required runtime behavior. +- Do not use optional chaining for required lifecycle and bridge operations (for example sleep, destroy, alarm dispatch, ack, and websocket dispatch paths). +- If a capability is required, validate it and throw an explicit error with actionable context instead of returning early. +- Optional chaining is acceptable only for best-effort diagnostics and cleanup paths (for example logging hooks and dispose/release cleanup). ### Rust Dependencies @@ -151,7 +174,7 @@ If you need to add a new `@rivetkit/*` package that isn't already in the root re ### Docs (`website/src/content/docs/**/*.mdx`) -Required frontmatter fields: +- Required frontmatter fields: - `title` (string) - `description` (string) @@ -159,7 +182,7 @@ Required frontmatter fields: ### Blog + Changelog (`website/src/content/posts/**/page.mdx`) -Required frontmatter fields: +- Required frontmatter fields: - `title` (string) - `description` (string) @@ -167,13 +190,13 @@ Required frontmatter fields: - `published` (date string) - `category` (enum: `changelog`, `monthly-update`, `launch-week`, `technical`, `guide`, `frogs`) -Optional frontmatter fields: +- Optional frontmatter fields: - `keywords` (string array) ## Examples -All example READMEs in `/examples/` should follow the format defined in `.claude/resources/EXAMPLE_TEMPLATE.md`. 
+- All example READMEs in `/examples/` should follow the format defined in `.claude/resources/EXAMPLE_TEMPLATE.md`. ## Agent Working Directory @@ -185,11 +208,12 @@ All agent working files live in `.agent/` at the repo root. - **Notes**: `.agent/notes/` -- general notes and tracking. When the user asks to track something in a note, store it in `.agent/notes/` by default. When something is identified as "do later", add it to `.agent/todo/`. Design documents and interface specs go in `.agent/specs/`. +- When the user asks to update any `CLAUDE.md`, add one-line bullet points only, or add a new section containing one-line bullet points. ## Architecture ### Monorepo Structure -This is a Rust workspace-based monorepo for Rivet. Key packages and components: +- This is a Rust workspace-based monorepo for Rivet with the following key packages and components: - **Core Engine** (`packages/core/engine/`) - Main orchestration service that coordinates all operations - **Workflow Engine** (`packages/common/gasoline/`) - Handles complex multi-step operations with reliability and observability @@ -205,7 +229,7 @@ This is a Rust workspace-based monorepo for Rivet. Key packages and components: - Custom error system at `packages/common/error/` - Uses derive macros with struct-based error definitions -To use custom errors: +- Use this pattern for custom errors: ```rust use rivet_error::*; @@ -234,13 +258,13 @@ let error = AuthInvalidToken.build(); let error_with_meta = ApiRateLimited { limit: 100, reset_at: 1234567890 }.build(); ``` -Key points: +- Key points: - Use `#[derive(RivetError)]` on struct definitions - Use `#[error(group, code, description)]` or `#[error(group, code, description, formatted_message)]` attribute - Group errors by module/domain (e.g., "auth", "actor", "namespace") - Add `Serialize, Deserialize` derives for errors with metadata fields - Always return anyhow errors from failable functions - - For example: `fn foo() -> Result { /* ... 
*/ }` +- For example: `fn foo() -> Result { /* ... */ }` - Do not glob import (`::*`) from anyhow. Instead, import individual types and traits - Prefer anyhow's `.context()` over `anyhow!` macro @@ -261,7 +285,7 @@ Key points: **Inspector HTTP API** - When updating the WebSocket inspector (`rivetkit-typescript/packages/rivetkit/src/inspector/`), also update the HTTP inspector endpoints in `rivetkit-typescript/packages/rivetkit/src/actor/router.ts`. The HTTP API mirrors the WebSocket inspector for agent-based debugging. -- When adding or modifying inspector endpoints, also update the driver test at `rivetkit-typescript/packages/rivetkit/src/driver-test-suite/tests/actor-inspector.ts` to cover all inspector HTTP endpoints. +- When adding or modifying inspector endpoints, also update the relevant RivetKit tests in `rivetkit-typescript/packages/rivetkit/tests/` to cover all inspector HTTP endpoints. - When adding or modifying inspector endpoints, also update the documentation in `website/src/metadata/skill-base-rivetkit.md` and `website/src/content/docs/actors/debugging.mdx` to keep them in sync. **Database Usage** @@ -280,7 +304,7 @@ Key points: ## Naming Conventions -Data structures often include: +- Data structures often include: - `id` (uuid) - `name` (machine-readable name, must be valid DNS subdomain, convention is using kebab case) @@ -317,6 +341,7 @@ Data structures often include: - **Never use `vi.mock`, `jest.mock`, or module-level mocking.** Write tests against real infrastructure (Docker containers, real databases, real filesystems). For LLM calls, use `@copilotkit/llmock` to run a mock LLM server. For protocol-level test doubles (e.g., ACP adapters), write hand-written scripts that run as real processes. If you need callback tracking, `vi.fn()` for simple callbacks is acceptable. - When running tests, always pipe the test to a file in /tmp/ then grep it in a second step. You can grep test logs multiple times to search for different log lines. 
- For RivetKit TypeScript tests, run from `rivetkit-typescript/packages/rivetkit` and use `pnpm test ` with `-t` to narrow to specific suites. For example: `pnpm test driver-file-system -t ".*Actor KV.*"`. +- When RivetKit tests need a local engine instance, start the RocksDB engine in the background with `./scripts/run/engine-rocksdb.sh >/tmp/rivet-engine-startup.log 2>&1 &`. - For frontend testing, use the `agent-browser` skill to interact with and test web UIs in examples. This allows automated browser-based testing of frontend applications. - If you modify frontend UI, automatically use the Agent Browser CLI to take updated screenshots and post them to the PR with a short comment before wrapping up the task. @@ -329,7 +354,7 @@ Data structures often include: - When talking about "Rivet Actors" make sure to capitalize "Rivet Actor" as a proper noun and lowercase "actor" as a generic noun ### Documentation Sync -When making changes to the engine or RivetKit, ensure the corresponding documentation is updated: +- Ensure corresponding documentation is updated when making engine or RivetKit changes: - **Limits changes** (e.g., max message sizes, timeouts): Update `website/src/content/docs/actors/limits.mdx` - **Config changes** (e.g., new config options in `engine/packages/config/`): Update `website/src/content/docs/self-hosting/configuration.mdx` - **RivetKit config changes** (e.g., `rivetkit-typescript/packages/rivetkit/src/registry/config/index.ts` or `rivetkit-typescript/packages/rivetkit/src/actor/config.ts`): Update `website/src/content/docs/actors/limits.mdx` if they affect limits/timeouts @@ -339,6 +364,10 @@ When making changes to the engine or RivetKit, ensure the corresponding document - **Landing page changes**: When updating the landing page (`website/src/pages/index.astro` and its section components in `website/src/components/marketing/sections/`), update `README.md` to reflect the same headlines, features, benchmarks, and talking points where 
applicable. - **Sandbox provider changes**: When adding, removing, or modifying sandbox providers in `rivetkit-typescript/packages/rivetkit/src/sandbox/providers/`, update `website/src/content/docs/actors/sandbox.mdx` to keep provider documentation, option tables, and custom provider guidance in sync. +### CLAUDE.md conventions + +- When adding entries to any CLAUDE.md file, keep them concise. Ideally a single bullet point or minimal bullet points. Do not write paragraphs. + ### Comments - Write comments as normal, complete sentences. Avoid fragmented structures with parentheticals and dashes like `// Spawn engine (if configured) - regardless of start kind`. Instead, write `// Spawn the engine if configured`. Especially avoid dashes (hyphens are OK). @@ -352,8 +381,8 @@ When making changes to the engine or RivetKit, ensure the corresponding document #### Common Vercel Example Errors -After regenerating Vercel examples, you may see type check errors like: +- You may see type-check errors like the following after regenerating Vercel examples: ``` error TS2688: Cannot find type definition file for 'vite/client'. ``` -with warnings about `node_modules missing`. This happens because the regenerated examples need their dependencies reinstalled. Fix by running `pnpm install` before running type checks. +- You may also see `node_modules missing` warnings; fix this by running `pnpm install` before type checks because regenerated examples need dependencies reinstalled. 
diff --git a/Cargo.lock b/Cargo.lock index 11bd2269ac..88113125e0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3357,6 +3357,7 @@ dependencies = [ "serde_bare", "serde_json", "strum", + "test-snapshot-gen", "tokio", "tracing", "tracing-subscriber", @@ -4369,6 +4370,7 @@ dependencies = [ "rivet-util", "serde", "serde_json", + "subtle", "tokio", "tower-http", "tracing", @@ -6135,11 +6137,14 @@ dependencies = [ "epoxy", "epoxy-protocol", "gasoline", + "namespace", + "pegboard", "portpicker", "rivet-api-builder", "rivet-config", "rivet-pools", "rivet-test-deps", + "rivet-types", "rivet-util", "serde", "serde_json", diff --git a/README.md b/README.md index a473acc5aa..26f3b77c00 100644 --- a/README.md +++ b/README.md @@ -221,7 +221,6 @@ Works with Claude Code, Cursor, Windsurf, and other AI coding tools. - [Node.js & Bun](https://www.rivet.dev/docs/actors/quickstart/backend) - [React](https://www.rivet.dev/docs/actors/quickstart/react) - [Next.js](https://www.rivet.dev/docs/actors/quickstart/next-js) -- [Cloudflare Workers](https://www.rivet.dev/docs/actors/quickstart/cloudflare-workers) [View documentation →](https://www.rivet.dev/docs) @@ -237,7 +236,7 @@ Serverless, containers, or your own servers — Rivet Actors work with your exis **Frameworks**: [React](https://www.rivet.dev/docs/clients/react) • [Next.js](https://www.rivet.dev/docs/clients/next-js) • [Hono](https://github.com/rivet-dev/rivet/tree/main/examples/hono) • [Express](https://github.com/rivet-dev/rivet/tree/main/examples/express) • [Elysia](https://github.com/rivet-dev/rivet/tree/main/examples/elysia) • [tRPC](https://github.com/rivet-dev/rivet/tree/main/examples/trpc) -**Runtimes**: [Node.js](https://www.rivet.dev/docs/actors/quickstart/backend) • [Bun](https://www.rivet.dev/docs/actors/quickstart/backend) • [Deno](https://github.com/rivet-dev/rivet/tree/main/examples/deno) • [Cloudflare Workers](https://www.rivet.dev/docs/actors/quickstart/cloudflare-workers) +**Runtimes**: 
[Node.js](https://www.rivet.dev/docs/actors/quickstart/backend) • [Bun](https://www.rivet.dev/docs/actors/quickstart/backend) • [Deno](https://github.com/rivet-dev/rivet/tree/main/examples/deno) **Tools**: [Vitest](https://www.rivet.dev/docs/actors/testing) • [Pino](https://www.rivet.dev/docs/general/logging) • [AI SDK](https://github.com/rivet-dev/rivet/tree/main/examples/ai-agent) • [OpenAPI](https://github.com/rivet-dev/rivet/tree/main/rivetkit-openapi) • [AsyncAPI](https://github.com/rivet-dev/rivet/tree/main/rivetkit-asyncapi) @@ -270,4 +269,4 @@ Serverless, containers, or your own servers — Rivet Actors work with your exis ## License -[Apache 2.0](LICENSE) \ No newline at end of file +[Apache 2.0](LICENSE) diff --git a/docs-internal/rivetkit-typescript/DYNAMIC_ACTORS_ARCHITECTURE.md b/docs-internal/rivetkit-typescript/DYNAMIC_ACTORS_ARCHITECTURE.md new file mode 100644 index 0000000000..556f8fb318 --- /dev/null +++ b/docs-internal/rivetkit-typescript/DYNAMIC_ACTORS_ARCHITECTURE.md @@ -0,0 +1,111 @@ +# Dynamic Actors Architecture + +## Overview + +Dynamic actors let a registry entry resolve actor source code at actor start time. + +Dynamic actors are represented by `dynamicActor({ load, auth?, options? })` +and still participate in normal registry routing and actor lifecycle. + +Dynamic actor parity is verified by running the same engine-focused integration +tests against two fixture registries: + +- `examples/sandbox/src/index.ts` for shared actor behavior +- dedicated static and dynamic registry fixtures in test coverage + +The shared actor fixtures keep behavior consistent between static and dynamic +execution. + +## Main Components + +- Host runtime manager: + `rivetkit-typescript/packages/rivetkit/src/dynamic/isolate-runtime.ts` + Creates and owns one `NodeProcess` isolate per dynamic actor instance. 
+- Isolate bootstrap runtime: + `rivetkit-typescript/packages/rivetkit/dynamic-isolate-runtime/src/index.cts` + Runs inside the isolate, parses registry config via + `RegistryConfigSchema.parse`, and exports envelope handlers. +- Runtime bridge: + `rivetkit-typescript/packages/rivetkit/src/dynamic/runtime-bridge.ts` + Shared envelope and callback payload types for host and isolate. +- Driver integration: + `drivers/file-system/global-state.ts` and `drivers/engine/actor-driver.ts` + Branch on definition type, construct dynamic runtime, and proxy fetch and websocket traffic. + +## Lifecycle + +1. Driver resolves actor definition from registry. +2. If definition is dynamic, driver creates `DynamicActorIsolateRuntime`. +3. Runtime calls loader and gets `{ source, sourceFormat?, nodeProcess? }`. +4. Runtime writes source into actor runtime dir: + - `sourceFormat: "esm-js"` -> `dynamic-source.mjs` (written unchanged) + - `sourceFormat: "commonjs-js"` -> `dynamic-source.cjs` (written unchanged) + - default `sourceFormat: "typescript"` -> transpiled to `dynamic-source.cjs` +5. Runtime writes isolate bootstrap entry into actor runtime dir. +6. Runtime builds a locked down sandbox driver and creates `NodeProcess`. +7. Runtime injects host bridge refs and bootstrap config into isolate globals. +8. Runtime loads bootstrap module and captures exported envelope refs. + +Before HTTP and WebSocket traffic is forwarded into the isolate, the host +runtime may run an optional dynamic auth hook. The auth hook receives dynamic +actor metadata, the incoming `Request`, and decoded connection params. Throwing +from auth rejects the request before actor dispatch. HTTP requests return +standard RivetKit error responses and WebSockets close with the derived +`group.code` reason. + +Dynamic actors also expose an internal `PUT /dynamic/reload` control endpoint. +Drivers intercept this request before isolate dispatch, mark the actor for +sleep, and return `200`. 
The next request wakes the actor through the normal +start path, which calls the dynamic loader again and picks up fresh source. + +Note: isolate bootstrap does not construct `Registry` at runtime. Constructing +`Registry` would auto-start runtime preparation on next tick in non-test mode +and pull default drivers that are not needed for dynamic actor execution. + +## Bridge Contract + +Host to isolate calls: + +- `dynamicFetchEnvelope` +- `dynamicOpenWebSocketEnvelope` +- `dynamicWebSocketSendEnvelope` +- `dynamicWebSocketCloseEnvelope` +- `dynamicDispatchAlarmEnvelope` +- `dynamicStopEnvelope` +- `dynamicGetHibernatingWebSocketsEnvelope` +- `dynamicDisposeEnvelope` + +Isolate to host callbacks: + +- KV: `kvBatchPut`, `kvBatchGet`, `kvBatchDelete`, `kvListPrefix` +- Lifecycle: `setAlarm`, `startSleep`, `startDestroy` +- Networking: `dispatch` for websocket events +- Runner ack path: `ackHibernatableWebSocketMessage` +- Inline client bridge: `clientCall` + +Binary payloads are normalized to `ArrayBuffer` at the host and isolate boundary. + +## Security Model + +- Each dynamic actor runs in its own sandboxed `NodeProcess`. +- Sandbox permissions deny network and child process access. +- Filesystem access is restricted to dynamic runtime root and read only `node_modules` paths. +- Environment is explicitly injected by host config for the isolate process. + +## Module Access Projection + +Dynamic actors use secure-exec `moduleAccess` projection to expose a +read-only `/root/node_modules` view into host dependencies (allow-listing +`rivetkit` and transitive packages). We no longer stage a temporary +`node_modules` tree for runtime bootstrap. + +## Driver Test Skip Gate + +The dynamic registry variant in driver tests has a narrow skip gate for two +cases only: + +- secure-exec dist is not available on the local machine +- nested dynamic harness mode is explicitly enabled for tests + +This gate is only to avoid invalid test harness setups. 
Static and dynamic +behavior parity remains the expected target for normal driver test execution. diff --git a/docs-internal/rivetkit-typescript/DYNAMIC_ACTOR_FAILED_START_RELOAD_SPEC.md b/docs-internal/rivetkit-typescript/DYNAMIC_ACTOR_FAILED_START_RELOAD_SPEC.md new file mode 100644 index 0000000000..2968951807 --- /dev/null +++ b/docs-internal/rivetkit-typescript/DYNAMIC_ACTOR_FAILED_START_RELOAD_SPEC.md @@ -0,0 +1,691 @@ +# Dynamic Actor Failed-Start Reload Spec + +## Status + +Draft + +## Summary + +Dynamic actors need a first-class failed-start path that still allows a +driver-level `reload` endpoint to recover the actor immediately. + +Today, reload is implemented as "sleep the running actor so the next request +loads fresh code." That works only when there is already a live dynamic actor +runtime. It does not handle the case where dynamic startup fails before the +runtime becomes runnable. + +This spec defines identical behavior for the file-system and engine drivers: + +- Failed dynamic startup leaves a host-side wrapper alive in memory. +- Normal requests receive a sanitized failed-start error. +- Failed starts use exponential backoff. +- Backoff is passive and must not create a background retry loop that keeps the + actor effectively awake forever. +- `reload` bypasses backoff, immediately attempts a fresh startup, and returns + the result. +- `reload` resets failure backoff state before attempting fresh startup. +- Reload on an already sleeping actor is a manager-side no-op so it does not + wake, load, and immediately sleep again. +- All non-obvious lifecycle and routing behavior must be commented in code. + +## Current Behavior + +### Normal Sleep + +Normal actor sleep removes the live in-memory dynamic runtime and host handler. +After sleep there is no live actor process in memory. + +- File-system removes the runtime and actor entry during sleep. +- Engine removes the runtime and handler during stop. 
+ +This means "sleeping" is not a long-lived in-memory actor state. It is a +lifecycle fact plus persisted actor metadata such as `sleepTs`. + +### Failed Start + +Dynamic startup currently fails out of `DynamicActorIsolateRuntime.start()` or +`ensureStarted()`. + +- File-system disposes partial runtime state and rethrows startup failure. +- Engine disposes partial runtime state, stores a transient startup error, and + stops the actor. + +There is no durable or explicit host-side failed-start state machine today. Any +future failed-start state introduced by this spec is intentionally ephemeral, +stored only in memory on the host-side wrapper, and cleared when that wrapper +is removed during normal sleep or stop cleanup. + +### Reload + +Reload is currently implemented as a pre-dispatch overlay route that sleeps a +running dynamic actor. + +- This works for a running dynamic actor. +- This does not recover a failed startup cleanly. +- This also risks an unnecessary double-load if reload is sent to an already + sleeping actor and the driver wakes it before intercepting reload. + +## Goals + +- Make failed-start behavior identical across file-system and engine drivers. +- Preserve the normal actor sleep lifecycle for actors that started + successfully. +- Keep failed-start state in memory only. +- Return the actor's real startup error code to clients. +- Return full error detail only in development. +- Keep full failure detail in logs in all environments. +- Reuse existing exponential backoff logic instead of inventing a new bespoke + retry algorithm. +- Make retry behavior configurable per dynamic actor. +- Add a configurable timeout for dynamic load and startup. +- Document all of this behavior in docs-internal. + +## Non-Goals + +- Persisting failed-start state across process restarts. Backoff state is + intentionally reset on process restart. This means actors retry from initial + backoff after a restart even if they previously reached max backoff. 
This is + an acceptable trade-off of keeping state in memory only. +- Changing normal static actor lifecycle behavior. +- Hiding all startup failure information from clients. Clients should still + receive a stable failure code. + +## Scope + +This spec requires parity for the file-system and engine drivers. + +The memory driver does not currently participate in the normal sleep lifecycle, +so it is out of scope unless explicitly added as a follow-up. + +## Terminology + +### Dynamic Runtime + +The isolate runtime that loads and executes dynamic actor code. + +### Host Wrapper + +The driver-side in-memory handler or entry that exists outside the dynamic +runtime. This wrapper can outlive a failed startup even when there is no live +dynamic runtime. + +### Failed Start + +Any failure in the dynamic startup pipeline before the actor becomes runnable, +including: + +- loader execution +- source normalization or materialization +- sandbox/bootstrap setup +- `runtime.start()` +- `runtime.ensureStarted()` + +## Required State Model + +Dynamic actors need an explicit host-side runtime state for reload and failure +handling. + +Recommended state shape: + +- `inactive` + The actor is not currently running. This includes the normal sleeping case. + The host wrapper may or may not exist in this state. If the wrapper was + removed by normal sleep cleanup, the actor is still logically inactive but + reload is handled at the manager/gateway level (see Reload While Inactive). +- `starting` + A startup attempt is in flight. +- `running` + Dynamic runtime is live and can serve requests. +- `failed_start` + The last startup attempt failed before the actor became runnable. + +Required metadata: + +- `lastStartErrorCode` — the `ActorError` subclass code (e.g., + `"dynamic_startup_failed"`, `"dynamic_load_timeout"`) +- `lastStartErrorMessage` — the error message string +- `lastStartErrorDetails` — full error details including stack trace. 
In + production, this field is stored but never serialized into client responses. + Only `lastStartErrorCode` and a sanitized message are returned to clients. +- `lastFailureAt` — timestamp of the last failure +- `retryAt` — timestamp when the next passive retry is allowed +- `retryAttempt` — number of consecutive failed attempts +- `reloadCount` — number of reload calls in the current rate-limit window +- `reloadWindowStart` — timestamp when the current rate-limit window began +- `generation` — monotonic integer counter, incremented synchronously before + each new startup attempt is dispatched. Used to reject stale async + completions. Note: this is distinct from the existing driver-level + `generation` field (UUID in file-system driver, number in engine driver), + which tracks actor identity across destroy/create cycles. This generation + tracks startup attempts within a single actor's in-memory lifetime. +- `startupPromise` — the shared promise for the current in-flight startup + attempt. Created via `promiseWithResolvers` when transitioning to `starting`. + All concurrent requests and reload calls join this promise instead of + starting parallel attempts. + +Rules: + +- This state is host-side and in-memory only. +- It must not be written into persisted actor storage by default. +- It must be cleared or replaced on successful startup. +- It must be cleared when the host-side wrapper is removed by normal sleep or + stop cleanup. +- It must be safe against stale async completion. When a startup attempt + completes, the handler must compare its captured generation against the + current generation. If they differ, the completion is discarded silently. +- Backoff must be represented as recorded metadata such as `retryAt`, not as a + background retry loop. +- `generation` is a per-actor, process-local monotonic counter. It must be + incremented synchronously (before any `await`) when initiating a new startup + attempt. 
This ensures that concurrent requests arriving during the transition
+  from `failed_start` to `starting` always join the new attempt rather than
+  racing to create their own.
+- Only one startup attempt may be in flight at a time. The `startupPromise`
+  field enforces this. When a startup is needed (from `inactive` or expired
+  `failed_start`), the implementation must:
+  1. Synchronously transition to `starting`.
+  2. Synchronously increment `generation`.
+  3. Synchronously create a new `promiseWithResolvers` and store it as
+     `startupPromise`.
+  4. Begin the async startup work.
+  5. Any concurrent request that arrives while in `starting` state awaits the
+     existing `startupPromise` rather than creating a new one.
+
+## Reload Authentication
+
+Reload must be authenticated. The implementation must use both the existing
+`DynamicActorAuth` hook and a new `canReload` callback.
+
+### Auth Flow
+
+1. The existing `auth` hook on `dynamicActor({ auth })` is called first with
+   the reload request context. If it throws, the reload is rejected with `403`.
+2. If `auth` passes, the `canReload` callback is called. If it returns `false`
+   or throws, the reload is rejected with `403`.
+
+### `canReload` Callback
+
+Add a `canReload` field to `DynamicActorConfigInput`:
+
+```typescript
+export interface DynamicActorConfigInput {
+  load: DynamicActorLoader;
+  auth?: DynamicActorAuth;
+  canReload?: (context: DynamicActorReloadContext) => boolean | Promise<boolean>;
+}
+
+export interface DynamicActorReloadContext {
+  actorId: string;
+  name: string;
+  key: unknown[];
+  request: Request;
+}
+```
+
+If `canReload` is not provided, reload defaults to allowed when `auth` passes
+(or when no `auth` is configured, which is only valid in development).
+
+In development mode without a configured `auth` or `canReload`, reload is
+allowed with a warning log, matching the existing inspector auth behavior.
+
+## Request Behavior
+
+### Normal Request While Running
+
+Dispatch normally.
+ +### Normal Request While Inactive + +Attempt startup immediately. + +If startup succeeds, handle the request normally. + +If startup fails, transition to `failed_start`, log the failure, record retry +metadata, and return the failed-start error. + +### Normal Request While Starting + +Await the existing `startupPromise` rather than starting a new attempt. When +the promise resolves, dispatch the request normally. When it rejects, return +the failed-start error. + +### Normal Request While Failed Start + +If backoff is still active, return the stored failed-start error immediately. + +If backoff has expired, transition synchronously to `starting`, increment +`generation`, create a new `startupPromise` via `promiseWithResolvers`, and +begin one fresh startup attempt. All concurrent requests arriving during this +startup join the same `startupPromise`. + +Retries must be passive. The implementation must not schedule autonomous retry +timers that keep a failed actor spinning in memory until the next attempt. The +wrapper may remain available to return failed-start responses and serve reload, +but startup retries only happen because of an incoming request or explicit +reload. + +### WebSocket Upgrade While Failed Start + +WebSocket upgrade requests during `failed_start` must be rejected before the +WebSocket handshake completes. The server must respond with the same HTTP error +status and body as a normal failed-start HTTP request. The WebSocket upgrade +must not be accepted and then immediately closed. + +If the actor is in `starting` state when a WebSocket upgrade arrives, the +upgrade awaits the `startupPromise`. If startup fails, the upgrade is rejected +with the failed-start HTTP error. If startup succeeds, the upgrade proceeds +normally. + +### WebSocket Connections During Reload + +When reload triggers a sleep on a running actor, open WebSocket connections are +closed as part of the normal sleep lifecycle. 
The close code must be `1012` +(Service Restart) with a reason string of `"dynamic.reload"`. This tells +clients that the closure is intentional and reconnection is appropriate. + +## Reload Behavior + +Reload must be handled at the manager or host wrapper layer before request +dispatch into dynamic actor code. + +Reload must pass authentication before any state changes occur (see Reload +Authentication above). + +### Reload While Running + +Use the existing sleep-based reload behavior: + +1. Stop the running actor through the normal sleep lifecycle. +2. Return success when the actor is inactive. +3. The next normal request starts the actor with fresh code. + +Note: this means reload does not verify that the new code loads successfully. +The reload caller receives `200` confirming the old code was stopped. Any +startup failure surfaces on the next request that wakes the actor. + +### Reload While Inactive + +Return `200` without waking the actor. + +This is required to prevent the double-load path where reload wakes a sleeping +actor, loads code once, then immediately sleeps it again. + +Note: a reload sent to a nonexistent or misspelled actor ID is rejected at the +engine gateway level with an appropriate error before it reaches the driver. +The driver-level reload handler only sees requests for actors that the gateway +has already resolved. + +### Reload While Starting + +Abort the current startup attempt and immediately begin a fresh one. + +The implementation must pass an `AbortController` signal through the startup +pipeline. When reload is called during `starting`: + +1. Abort the current startup's `AbortController`. This signals the in-flight + `DynamicActorIsolateRuntime.start()` to cancel (e.g., abort the loader + fetch, stop waiting for isolate bootstrap). +2. Synchronously increment `generation`. +3. Create a new `startupPromise` via `promiseWithResolvers`. +4. Create a new `AbortController` for the fresh attempt. +5. Begin the new startup attempt. +6. 
Any requests that were awaiting the old `startupPromise` receive a + rejection. They then observe the new `starting` state and join the new + `startupPromise`. + +The `AbortController` signal must be threaded through: + +- The user-provided `loader` callback (available as `context.signal`). +- `DynamicActorIsolateRuntime.start()` as a parameter. +- Any internal async operations within the startup pipeline that support + cancellation (e.g., `fetch` calls, file I/O). + +Operations that do not support cancellation (e.g., `isolated-vm` context +creation) will run to completion, but the stale generation check at completion +time will discard their result. + +### Reload While Failed Start + +Reload resets failed-start backoff state (`retryAt`, `retryAttempt`) and +immediately attempts a fresh startup following the same synchronous transition +to `starting` described above. + +If the fresh startup succeeds, return `200`. + +If the fresh startup fails, return the failed-start error immediately and keep +the actor in `failed_start`. + +### Reload Rate Limiting + +Reload bypasses backoff, but the driver must log a warning when reload is +forced more than `N` times in `Y` interval. + +Rate limiting uses a simple bucket system: + +- `reloadCount` tracks the number of reload calls in the current window. +- `reloadWindowStart` tracks when the current window began. +- When a reload is received, if `now - reloadWindowStart > Y`, reset the + bucket: set `reloadCount = 1` and `reloadWindowStart = now`. +- Otherwise, increment `reloadCount`. +- If `reloadCount > N`, log a warning with the actor ID and the count. + +Default values: `N = 10`, `Y = 60_000` (60 seconds). + +The first implementation only needs warning-level logging, not enforcement. + +## Retry Configuration + +Retry behavior must be configurable by the user on dynamic actors. 
+ +### Configuration Interface + +```typescript +export interface DynamicActorConfigInput { + load: DynamicActorLoader; + auth?: DynamicActorAuth; + canReload?: (context: DynamicActorReloadContext) => boolean | Promise<boolean>; + options?: DynamicActorOptions; +} + +export interface DynamicActorOptions extends GlobalActorOptionsInput { + startup?: DynamicStartupOptions; +} + +export interface DynamicStartupOptions { + /** Timeout for the full startup pipeline in ms. Default: 15_000. */ + timeoutMs?: number; + + /** Initial backoff delay in ms after a failed startup. Default: 1_000. */ + retryInitialDelayMs?: number; + + /** Maximum backoff delay in ms. Default: 30_000. */ + retryMaxDelayMs?: number; + + /** Backoff multiplier. Default: 2. */ + retryMultiplier?: number; + + /** Whether to add jitter to backoff delays. Default: true. */ + retryJitter?: boolean; + + /** + * Maximum number of consecutive failed startup attempts before the host + * wrapper is torn down. After this limit, the actor transitions to a + * terminal failed state and the wrapper is removed from memory. Subsequent + * requests will trigger a fresh startup attempt with no prior backoff + * context, as if the actor had never been loaded. + * + * Default: 20. + * Set to 0 for unlimited retries (wrapper stays alive indefinitely). + */ + maxAttempts?: number; +} +``` + +### Backoff Implementation + +Reuse the `p-retry` exponential backoff algorithm that is already used in +`engine-client/metadata.ts` and `client/actor-conn.ts`. The +implementation does not need to use `p-retry` directly (since retries are +passive, not loop-driven), but must compute backoff delays using the same +formula: `min(maxDelay, initialDelay * multiplier^attempt)` with optional +jitter. + +### Max Attempts + +When `retryAttempt` exceeds `maxAttempts`, the host wrapper is torn down. The +actor transitions to `inactive` with no in-memory state. 
The next request for +this actor triggers a completely fresh startup attempt with `retryAttempt = 0`, +as if the actor had never been loaded. This prevents unbounded memory +accumulation from permanently broken actors while still allowing recovery. + +Reload must clear the active retry delay and failure attempt count before +attempting a fresh startup. + +## Error Surfacing + +Failed-start errors use the existing `ActorError` subclass hierarchy. The +error code comes from the `ActorError` subclass (e.g., the `code` field), and +the error details come from the underlying cause (e.g., the secure-exec +process output, the loader exception message, the isolate bootstrap error). + +The following stable error codes must be defined as `ActorError` subclasses for +dynamic startup failures: + +- `DynamicStartupFailed` — general startup failure (catch-all for unclassified + errors from the loader, sandbox, or bootstrap). +- `DynamicLoadTimeout` — the startup pipeline exceeded the configured timeout. + +These codes are always returned to clients. What gets sanitized is the +error details, not the code. + +Client-facing rules: + +- The `ActorError` code (e.g., `"dynamic_startup_failed"`, + `"dynamic_load_timeout"`) is always returned to clients in both production + and development. +- In production, the message is sanitized to a generic string (e.g., "Dynamic + actor startup failed. Check server logs for details."). The + `lastStartErrorDetails` field is not included in the response. +- In development, the full error message and details (including stack traces + and loader output) are included in the response, matching how internal errors + are currently exposed. +- Full details must always be emitted to logs in all environments. + +This implies the failed-start state must retain enough structured error data to +reconstruct a sanitized or full response without re-running startup. 
+ +## Load Timeout + +The startup pipeline must be wrapped in a configurable timeout. + +### Scope + +The timeout starts when `DynamicActorIsolateRuntime.start()` is called and +ends when `ensureStarted()` resolves. This covers: + +- The user-provided `loader` callback. +- Source normalization and materialization. +- Dynamic module loading (`secure-exec`, `isolated-vm`). +- Sandbox filesystem setup. +- Bootstrap script execution. +- The isolate-side `ensureStarted()` call (actor `onStart` lifecycle hook). + +Note: first-cold-start overhead (loading `secure-exec` and `isolated-vm` +modules for the first time) is included in this timeout. The default of 15 +seconds is chosen to accommodate cold starts. If cold-start overhead is a +concern, the user can increase the timeout via `startup.timeoutMs`. + +### Implementation + +The timeout is implemented via the same `AbortController` used for reload +cancellation. When the timeout fires: + +1. The `AbortController` is aborted with a `DynamicLoadTimeout` error. +2. The startup pipeline observes the abort signal and cancels where possible. +3. The actor transitions to `failed_start` with `lastStartErrorCode` set to + `"dynamic_load_timeout"`. +4. The timeout failure participates in backoff identically to any other startup + failure. + +### Configuration + +The timeout is configured under `dynamicActor({ options: { startup: { timeoutMs } } })`. + +Default: `15_000` (15 seconds). + +## Dynamic Actor Status Endpoint + +A new `GET /dynamic/status` endpoint must be added to expose the host-side +runtime state for observability and debugging. 
+ +### Response Shape + +```typescript +interface DynamicActorStatusResponse { + state: "inactive" | "starting" | "running" | "failed_start"; + generation: number; + + // Present when state is "failed_start" + lastStartErrorCode?: string; + lastStartErrorMessage?: string; // sanitized in production + lastStartErrorDetails?: string; // only in development + lastFailureAt?: number; + retryAt?: number; + retryAttempt?: number; +} +``` + +### Authentication + +The status endpoint uses the same authentication as the inspector endpoints: +Bearer token via `config.inspector.token()`, with timing-safe comparison. In +development mode without a configured token, access is allowed with a warning. + +### Client-Side Support + +Add a `status()` method to `ActorHandleRaw`: + +```typescript +class ActorHandleRaw { + async status(): Promise<DynamicActorStatusResponse> { + // GET /dynamic/status + } +} +``` + +This method is only meaningful for dynamic actors. Calling it on a static actor +returns `{ state: "running", generation: 0 }`. + +## Sleep Interaction with Failed Start + +When a sleep signal arrives while an actor is in `failed_start`, the failed- +start metadata is cleared and the host wrapper is removed. The actor +transitions to `inactive` with no in-memory state. This is the same as the +`maxAttempts` exhaustion behavior: the next request triggers a completely fresh +startup attempt. + +This is intentional. A sleep on a failed actor is equivalent to a full reset. +If the underlying issue has been fixed, the next request will succeed. If not, +the actor will re-enter `failed_start` with fresh backoff starting from +attempt 0. 
+ +## Documentation Requirements + +The implementation must update docs-internal to describe: + +- the dynamic actor startup state model +- what `failed_start` means +- how normal requests behave during `failed_start` +- how backoff works +- that backoff is passive and does not create an autonomous retry loop +- how `reload` behaves for `running`, `inactive`, `starting`, and + `failed_start` +- that `reload` resets backoff before retrying startup +- why reload on inactive actors is a no-op +- how errors are sanitized in production and expanded in development +- the dynamic load timeout and where it is configured +- the retry configuration and where it is configured +- reload authentication via `auth` and `canReload` +- the `GET /dynamic/status` endpoint +- WebSocket close behavior during reload (`1012`, `"dynamic.reload"`) +- the `maxAttempts` limit and what happens when it is exceeded + +Minimum docs change: + +- expand `docs-internal/rivetkit-typescript/DYNAMIC_ACTORS_ARCHITECTURE.md` + with a dedicated failed-start and reload lifecycle section + +The implementation is not complete until the docs-internal update ships in the +same change. + +## Comment Requirements + +All non-obvious logic introduced by this change must be commented in code. + +Examples that require comments: + +- why failed-start state is kept in the host wrapper instead of persisted actor + state +- why reload on inactive actors is intercepted as a no-op +- how generation invalidation prevents stale startup completions from winning +- why reload bypasses backoff +- why backoff is passive instead of being driven by background timers +- why production errors are sanitized while development errors include details +- why `startupPromise` is created synchronously before the async startup work +- why WebSocket upgrades are rejected before handshake during failed start + +Comments should explain intent and invariants, not implementation history. + +## Implementation Outline + +1. 
Define `DynamicStartupOptions` interface and add `startup` key to the + existing `DynamicActorOptions` type. +2. Define `DynamicStartupFailed` and `DynamicLoadTimeout` as `ActorError` + subclasses in `actor/errors.ts`. +3. Add `canReload` to `DynamicActorConfigInput` and `DynamicActorReloadContext` + type. +4. Introduce a host-side dynamic runtime status model shared by file-system and + engine driver code, using the state enum and metadata fields defined above. +5. Implement startup coalescing via `promiseWithResolvers`: synchronous state + transition to `starting`, synchronous generation increment, shared promise + for all concurrent waiters. +6. Thread `AbortController` through `DynamicActorIsolateRuntime.start()` and + the user-provided `loader` callback (as `context.signal`). +7. Implement load timeout using the `AbortController` signal with a + `setTimeout` that aborts after `startup.timeoutMs`. +8. Move or extend overlay routing so reload on inactive actors can be handled + before waking actor code. +9. Implement reload authentication: call `auth` then `canReload` before + processing reload. +10. Implement reload-while-starting: abort current `AbortController`, increment + generation, create new `startupPromise`, begin fresh attempt. +11. Preserve existing sleep-based reload only for actors that are already + running. +12. Implement passive failed-start backoff metadata (using the `p-retry` + backoff formula) without background retry timers. +13. Implement `maxAttempts` exhaustion: tear down wrapper and transition to + `inactive` when exceeded. +14. Implement failed-start error replay with sanitized production output and + detailed development output. +15. Add `GET /dynamic/status` endpoint with inspector-style auth. +16. Add `status()` method to `ActorHandleRaw` on the client. +17. Reject WebSocket upgrades during `failed_start` before handshake. Close + WebSockets during reload with code `1012` and reason `"dynamic.reload"`. +18. 
Implement reload rate-limit bucket (`reloadCount` / `reloadWindowStart`). +19. Update docs-internal architecture docs. +20. Add comments for every non-obvious lifecycle transition and overlay routing + rule. + +## Test Requirements + +All failed-start tests must be added to the shared engine-focused integration +suite so the runtime path uses one common set of cases. This enforces the +parity requirement. + +Add or update tests for: + +- normal request retries startup after backoff expires +- normal request during active backoff returns stored failed-start error +- no background retry loop runs while actor is in failed-start backoff +- reload bypasses backoff and immediately retries startup +- reload resets backoff metadata before retrying +- reload on failed-start actor returns success or error from the immediate + startup attempt +- reload on inactive actor is a no-op and does not cause double-load +- concurrent requests coalesce onto one startup via shared `startupPromise` +- stale startup generation cannot overwrite newer reload-triggered generation +- production response is sanitized (no details, has code) +- development response includes full detail +- dynamic load timeout returns `"dynamic_load_timeout"` error code +- retry options change backoff behavior as configured by the user +- `maxAttempts` exhaustion tears down the wrapper +- request after `maxAttempts` exhaustion triggers fresh startup from attempt 0 +- reload authentication rejects unauthenticated callers with `403` +- `canReload` returning `false` rejects reload with `403` +- WebSocket upgrade during `failed_start` is rejected before handshake +- WebSocket connections receive close code `1012` during reload +- `GET /dynamic/status` returns correct state and metadata +- `GET /dynamic/status` respects inspector auth +- reload-while-starting aborts old attempt and starts new generation +- AbortController signal is delivered to the loader callback +- docs-internal file is updated alongside the 
implementation diff --git a/docs-internal/rivetkit-typescript/DYNAMIC_ACTOR_SQLITE_PROXY_SPEC.md b/docs-internal/rivetkit-typescript/DYNAMIC_ACTOR_SQLITE_PROXY_SPEC.md new file mode 100644 index 0000000000..435f1c784d --- /dev/null +++ b/docs-internal/rivetkit-typescript/DYNAMIC_ACTOR_SQLITE_PROXY_SPEC.md @@ -0,0 +1,275 @@ +# Dynamic Actor SQLite Proxy Spec + +## Problem + +Dynamic actors run in sandboxed `secure-exec` / `isolated-vm` processes. The current SQLite path requires `@rivetkit/sqlite` WASM to load inside the isolate, which isn't set up and is the wrong direction — we plan to add a native SQLite extension on the host side. Dynamic actors need a way to use `db()` and `db()` from `rivetkit/db` and `rivetkit/db/drizzle` without running WASM in the isolate. + +## Approach + +Run SQLite on the **host side** and bridge a thin `execute(sql, params) → rows` RPC from isolate → host. The `ActorDriver` already has `overrideRawDatabaseClient()` and `overrideDrizzleDatabaseClient()` hooks designed for this exact purpose. The `DatabaseProvider.createClient()` already checks for overrides before falling back to KV-backed construction. + +``` +Isolate Host +────── ──── +db.execute(sql, args) ──bridge──► host SQLite (per-actor) + ◄────────── { rows, columns } +``` + +One bridge call per query instead of per KV page. + +## Architecture + +### Host side (manager process) + +Each actor gets a dedicated SQLite database file managed by the host. For the file-system driver, this is already done for KV via `#actorKvDatabases` in `FileSystemGlobalState`. The actor's **application database** is a separate SQLite file alongside the KV database. + +The host exposes two bridge callbacks to the isolate: + +1. **`sqliteExec(actorId, sql, params) → string`** — Executes a SQL statement. Returns JSON-encoded `{ rows: unknown[][], columns: string[] }`. Handles both read and write queries. Params are JSON-serialized across the boundary. + +2. 
**`sqliteBatch(actorId, statements) → string`** — Executes multiple SQL statements in a single bridge call, wrapped in a transaction. Each statement is `{ sql: string, params: unknown[] }`. Returns JSON-encoded array of `{ rows, columns }` per statement. This is critical for migrations and reduces bridge round-trips. + +### Isolate side (dynamic actor process) + +The isolate-side `actorDriver` (defined in `host-runtime.ts` line 1767) gains: + +- `overrideRawDatabaseClient(actorId)` — Returns a `RawDatabaseClient` whose `exec()` method calls through the bridge to `sqliteExec`. +- `overrideDrizzleDatabaseClient(actorId)` — Returns a drizzle `sqlite-proxy` instance whose async callback calls through the bridge to `sqliteExec`. + +Because the overrides are set, `DatabaseProvider.createClient()` in both `db/mod.ts` and `db/drizzle/mod.ts` will use them instead of trying to construct a KV-backed WASM SQLite. No `createSqliteVfs()` is needed in the dynamic actor driver. + +## Detailed Changes + +### 1. Bridge contract (`src/dynamic/runtime-bridge.ts`) + +Add new bridge global keys: + +```typescript +export const DYNAMIC_HOST_BRIDGE_GLOBAL_KEYS = { + // ... existing keys ... + sqliteExec: "__rivetkitDynamicHostSqliteExec", + sqliteBatch: "__rivetkitDynamicHostSqliteBatch", +} as const; +``` + +### 2. Host-side SQLite pool (`src/drivers/file-system/global-state.ts`) + +Add a **per-actor application database** map alongside the existing KV database map: + +```typescript +#actorAppDatabases = new Map<string, SqliteRuntimeDatabase>(); +``` + +Add methods: + +```typescript +#getOrCreateActorAppDatabase(actorId: string): SqliteRuntimeDatabase +// Opens/creates a SQLite database file at: <dataDir>/app-databases/<actorId>.db +// Separate from the KV database. Enables WAL mode for concurrency. + +#closeActorAppDatabase(actorId: string): void +// Called during actor teardown, alongside #closeActorKvDatabase. 
+ +sqliteExec(actorId: string, sql: string, params: unknown[]): { rows: unknown[][], columns: string[] } +// Runs a single statement against the actor's app database. +// Uses the same SqliteRuntime (bun:sqlite / better-sqlite3) already loaded. +// Synchronous — native SQLite is sync, the bridge async wrapper handles the rest. + +sqliteBatch(actorId: string, statements: { sql: string, params: unknown[] }[]): { rows: unknown[][], columns: string[] }[] +// Wraps all statements in BEGIN/COMMIT. Returns results per statement. +``` + +Cleanup: extend `#destroyActorData` and actor teardown to also close and delete app databases. + +### 3. Host bridge wiring — `isolated-vm` path (`src/dynamic/isolate-runtime.ts`) + +In `#setIsolateBridge()` (around line 880), add refs for the new bridge callbacks: + +```typescript +const sqliteExecRef = makeRef( + async (actorId: string, sql: string, paramsJson: string): Promise<{ copy(): string }> => { + const params = JSON.parse(paramsJson); + const result = this.#config.globalState.sqliteExec(actorId, sql, params); + return makeExternalCopy(JSON.stringify(result)); + }, +); + +const sqliteBatchRef = makeRef( + async (actorId: string, statementsJson: string): Promise<{ copy(): string }> => { + const statements = JSON.parse(statementsJson); + const results = this.#config.globalState.sqliteBatch(actorId, statements); + return makeExternalCopy(JSON.stringify(results)); + }, +); + +await context.global.set(DYNAMIC_HOST_BRIDGE_GLOBAL_KEYS.sqliteExec, sqliteExecRef); +await context.global.set(DYNAMIC_HOST_BRIDGE_GLOBAL_KEYS.sqliteBatch, sqliteBatchRef); +``` + +### 4. 
Host bridge wiring — `secure-exec` path (`src/dynamic/host-runtime.ts`) + +In `#setIsolateBridge()` (around line 586), add the same refs using the same base64/JSON bridge pattern already used for KV: + +```typescript +const sqliteExecRef = makeRef( + async (actorId: string, sql: string, paramsJson: string): Promise => { + const params = JSON.parse(paramsJson); + const result = this.#config.globalState.sqliteExec(actorId, sql, params); + return JSON.stringify(result); + }, +); +// ... same for sqliteBatch + +await context.global.set("__dynamicHostSqliteExec", sqliteExecRef); +await context.global.set("__dynamicHostSqliteBatch", sqliteBatchRef); +``` + +And on the isolate-side `actorDriver` object (line 1767), add: + +```typescript +const actorDriver = { + // ... existing methods ... + + async overrideRawDatabaseClient(actorIdValue) { + return { + exec: async (query, ...args) => { + const resultJson = await bridgeCall( + globalThis.__dynamicHostSqliteExec, + [actorIdValue, query, JSON.stringify(args)] + ); + const { rows, columns } = JSON.parse(resultJson); + return rows.map((row) => { + const obj = {}; + for (let i = 0; i < columns.length; i++) { + obj[columns[i]] = row[i]; + } + return obj; + }); + }, + }; + }, + + async overrideDrizzleDatabaseClient(actorIdValue) { + // Return undefined — let the raw override handle it. + // Drizzle provider will fall back to using the raw override path. + return undefined; + }, +}; +``` + +### 5. Drizzle support + +The drizzle `DatabaseProvider` in `db/drizzle/mod.ts` currently does NOT check for overrides — it always constructs a KV-backed WASM database. This needs to change. 
+ +Add an override check at the top of `createClient`: + +```typescript +createClient: async (ctx) => { + // Check for drizzle override first + if (ctx.overrideDrizzleDatabaseClient) { + const override = await ctx.overrideDrizzleDatabaseClient(); + if (override) { + // Wrap with RawAccess execute/close methods and return + return Object.assign(override, { + execute: async (query, ...args) => { /* delegate to override */ }, + close: async () => {}, + }); + } + } + + // Check for raw override — build drizzle sqlite-proxy on top of it + if (ctx.overrideRawDatabaseClient) { + const rawOverride = await ctx.overrideRawDatabaseClient(); + if (rawOverride) { + const callback = async (sql, params, method) => { + const rows = await rawOverride.exec(sql, ...params); + if (method === "run") return { rows: [] }; + if (method === "get") return { rows: rows[0] ? Object.values(rows[0]) : undefined }; + return { rows: rows.map(r => Object.values(r)) }; + }; + const client = proxyDrizzle(callback, config); + return Object.assign(client, { + execute: async (query, ...args) => rawOverride.exec(query, ...args), + close: async () => {}, + }); + } + } + + // Existing KV-backed path... +} +``` + +This lets dynamic actors use `db()` from `rivetkit/db/drizzle` with migrations working through the bridge. The host runs the actual SQL; the isolate just sends strings. + +### 6. Migrations + +Drizzle inline migrations (`runInlineMigrations`) currently operate on the `@rivetkit/sqlite` `Database` WASM instance directly. For the proxy path, migrations need to run through the same `execute()` bridge. + +Option A (simpler): The raw override's `exec()` already supports multi-statement SQL via the host's `db.exec()`. Migrations can use `execute()` directly. The `sqliteBatch` bridge method handles transactional migration application. + +Option B: Add a dedicated `sqliteMigrate(actorId, migrationSql[])` bridge call that runs all migrations in a single transaction on the host. 
Cleaner but more surface area. + +**Recommendation**: Option A. The `execute()` path is sufficient. The drizzle provider's `onMigrate` can call `client.execute(migrationSql)` for each pending migration, same as it does today but through the bridge. + +### 7. Engine driver (`src/drivers/engine/actor-driver.ts`) + +The engine driver manages dynamic actors the same way. It needs the same `sqliteExec` / `sqliteBatch` bridge wiring, backed by whatever storage the engine provides for actor application databases. + +For now, this can be deferred — the engine driver can continue using the KV-backed path for static actors and throw a clear error for dynamic actors that try to use `db()` until the engine-side SQLite proxy is implemented. + +## Data model + +Each dynamic actor gets TWO SQLite databases on the host: + +| Database | Purpose | Path | Managed by | +|----------|---------|------|------------| +| KV database | Actor KV state (`kvBatchPut`/`kvBatchGet`) | `/databases/.db` | Existing `#actorKvDatabases` | +| App database | User-defined schema via `db()` / drizzle | `/app-databases/.db` | New `#actorAppDatabases` | + +On actor destroy, both databases are deleted. On actor sleep, both databases are closed (and reopened on wake). + +## Serialization format + +All data crosses the bridge as JSON strings: + +- **Params**: `JSON.stringify(args)` — supports `null`, `number`, `string`, `boolean`. Binary (`Uint8Array`) params are base64-encoded. +- **Results**: `JSON.stringify({ rows: unknown[][], columns: string[] })` — column-oriented format, same as `@rivetkit/sqlite`'s `query()` return shape. +- **Batch**: Array of the above per statement. + +## Error handling + +- SQL errors on the host throw through the bridge. The isolate receives the error message and stack trace as a rejected promise. +- If the actor's app database doesn't exist yet, `sqliteExec` creates it on first use (same lazy-open pattern as KV databases). +- Invalid SQL, constraint violations, etc. 
surface as normal SQLite errors to the actor code. + +## Testing + +Add an engine-focused integration test that: + +1. Creates a dynamic actor that uses `db()` (raw) with a simple schema +2. Runs migrations, inserts rows, queries them back +3. Verifies data persists across actor sleep/wake cycles +4. Creates a dynamic actor that uses `db()` from `rivetkit/db/drizzle` with schema + migrations +5. Verifies drizzle queries work through the proxy + +Add corresponding fixture actors in the shared sandbox-style test fixtures. + +## Files to modify + +| File | Change | +|------|--------| +| `src/dynamic/runtime-bridge.ts` | Add `sqliteExec`, `sqliteBatch` bridge keys | +| `src/drivers/file-system/global-state.ts` | Add `#actorAppDatabases`, `sqliteExec()`, `sqliteBatch()`, cleanup | +| `src/dynamic/isolate-runtime.ts` | Wire `sqliteExec`/`sqliteBatch` refs in `#setIsolateBridge()` | +| `src/dynamic/host-runtime.ts` | Wire bridge refs + add `overrideRawDatabaseClient` to isolate-side `actorDriver` | +| `src/db/drizzle/mod.ts` | Add override check at top of `createClient` | +| `tests/` | New engine-focused integration test for dynamic SQLite proxy | +| shared test fixtures | New fixture actors using `db()` in dynamic actors | +| `docs-internal/rivetkit-typescript/DYNAMIC_ACTORS_ARCHITECTURE.md` | Document SQLite proxy bridge | + +## Non-goals + +- Running WASM SQLite inside the isolate. +- Implementing this for the engine driver (deferred until engine-side app database support exists). +- Shared/cross-actor databases. +- Direct filesystem access from the isolate. 
diff --git a/docs-internal/rivetkit-typescript/DYNAMIC_ACTOR_TYPESCRIPT_SOURCE.md b/docs-internal/rivetkit-typescript/DYNAMIC_ACTOR_TYPESCRIPT_SOURCE.md new file mode 100644 index 0000000000..7f401f2e46 --- /dev/null +++ b/docs-internal/rivetkit-typescript/DYNAMIC_ACTOR_TYPESCRIPT_SOURCE.md @@ -0,0 +1,214 @@ +# Dynamic Actor TypeScript Source Compilation + +## Overview + +Add TypeScript source support to dynamic actors via `@secure-exec/typescript`, allowing loaders to return `.ts` source directly instead of requiring pre-transpiled JavaScript. + +## Current State + +- `DynamicActorLoadResult.sourceFormat` accepts `"commonjs-js" | "esm-js"` only +- Loaders must pre-transpile TypeScript before returning source +- secure-exec is loaded dynamically at runtime (not a direct dependency) from `secure-exec` or the legacy `sandboxed-node` package specifier +- The codebase currently resolves secure-exec from a pre-release commit hash (`pkg.pr.new/rivet-dev/secure-exec@7659aba`) in the example, and from local dist paths or npm in the runtime + +## Dependency Update + +Update secure-exec from the pre-release commit hash to the published `0.1.0` release: + +- `secure-exec@0.1.0` — core runtime (published 2026-03-18) +- `@secure-exec/typescript@0.1.0` — TypeScript compiler tools (published 2026-03-18, depends on `secure-exec@0.1.0` and `typescript@^5.9.3`) + +The `@secure-exec/typescript` package provides `createTypeScriptTools()` which runs the TypeScript compiler inside a secure-exec isolate, returning compiled JS and diagnostics. This means type-checking and transpilation happen in a sandboxed environment with memory/CPU limits, matching the existing security model. 
+ +Update locations: +- `examples/ai-generated-actor/package.json` — replace commit hash URL with `secure-exec@0.1.0` +- Any local dist path fallbacks in `isolate-runtime.ts` that reference old directory structures + +## New Source Formats + +Extend `DynamicSourceFormat` in `runtime-bridge.ts`: + +```ts +export type DynamicSourceFormat = + | "commonjs-js" + | "esm-js" + | "esm-ts" // ESM TypeScript, compiled to esm-js before execution + | "commonjs-ts"; // CJS TypeScript, compiled to commonjs-js before execution +``` + +## API: `compileActorSource` + +Exported from `rivetkit/dynamic`. This is a helper that the loader calls explicitly — compilation does not happen implicitly in the runtime. + +### Signature + +```ts +interface CompileActorSourceOptions { + /** TypeScript source text. */ + source: string; + + /** Filename hint for diagnostics (default: "actor.ts"). */ + filename?: string; + + /** Output module format (default: "esm"). */ + format?: "esm" | "commonjs"; + + /** Run the full type checker (default: false). Strip-only when false. */ + typecheck?: boolean; + + /** Additional tsconfig compilerOptions overrides. */ + compilerOptions?: Record; + + /** Memory limit for the compiler isolate in MB (default: 512). */ + memoryLimit?: number; + + /** CPU time limit for the compiler isolate in ms. */ + cpuTimeLimitMs?: number; +} + +interface CompileActorSourceResult { + /** Compiled JavaScript output. Undefined if compilation failed. */ + js?: string; + + /** Source map text, if generated. */ + sourceMap?: string; + + /** Whether compilation succeeded without errors. */ + success: boolean; + + /** TypeScript diagnostics (errors, warnings, suggestions). 
*/ + diagnostics: TypeScriptDiagnostic[]; +} + +interface TypeScriptDiagnostic { + code: number; + category: "error" | "warning" | "suggestion" | "message"; + message: string; + line?: number; + column?: number; +} + +function compileActorSource( + options: CompileActorSourceOptions, +): Promise<CompileActorSourceResult>; +``` + +### Usage in a Loader + +```ts +import { dynamicActor, compileActorSource } from "rivetkit/dynamic"; + +const myActor = dynamicActor({ + load: async (ctx) => { + const tsSource = await fetchActorSource(ctx.name); + + const compiled = await compileActorSource({ + source: tsSource, + typecheck: true, + }); + + if (!compiled.success) { + const errors = compiled.diagnostics + .filter(d => d.category === "error") + .map(d => `${d.line}:${d.column} ${d.message}`) + .join("\n"); + throw new Error(`Actor TypeScript compilation failed:\n${errors}`); + } + + return { + source: compiled.js!, + sourceFormat: "esm-js", + }; + }, +}); +``` + +### Usage Without Type Checking (Fast Path) + +```ts +const compiled = await compileActorSource({ + source: tsSource, + typecheck: false, // strip types only, much faster +}); +``` + +## Implementation Plan + +### 1. Update secure-exec dependency + +- Replace pre-release URLs with `secure-exec@0.1.0` in examples +- Add `@secure-exec/typescript` as an optional peer dependency of rivetkit (dynamically loaded like secure-exec itself) + +### 2. Add `compileActorSource` to `rivetkit/dynamic` + +New file: `src/dynamic/compile.ts` + +Implementation: +1. Dynamically load `@secure-exec/typescript` (same pattern as secure-exec itself — build specifier from parts to avoid bundler eager inclusion) +2. Dynamically load `secure-exec` to get `SystemDriver` and `NodeRuntimeDriverFactory` +3. Call `createTypeScriptTools()` with the secure-exec drivers +4. Call `compileSource()` with the user's source text and compiler options +5. 
Map the `SourceCompileResult` to `CompileActorSourceResult` + +The key mapping from `@secure-exec/typescript` API to ours: + +| `@secure-exec/typescript` | `compileActorSource` | +|-----------------------------------|------------------------------------| +| `createTypeScriptTools()` | Called internally, cached per call | +| `tools.compileSource()` | Core operation | +| `tools.typecheckSource()` | Used when `typecheck: true` | +| `SourceCompileResult.outputText` | `CompileActorSourceResult.js` | +| `SourceCompileResult.diagnostics` | Passed through directly | + +When `typecheck: false`, use compiler options `{ noCheck: true }` (TS 5.9+ `--noCheck` flag) to strip types without running the checker. This is substantially faster. + +### 3. Add source format aliases (optional convenience) + +Extend `DynamicSourceFormat` with `"esm-ts"` and `"commonjs-ts"`. When the isolate runtime sees a TS format, it calls `compileActorSource` automatically before writing source to the sandbox filesystem. This is a convenience — loaders can always compile explicitly and return `"esm-js"`. + +### 4. Export from `rivetkit/dynamic` + +Add to `src/dynamic/mod.ts`: +```ts +export { compileActorSource } from "./compile"; +export type { + CompileActorSourceOptions, + CompileActorSourceResult, + TypeScriptDiagnostic, +} from "./compile"; +``` + +### 5. Tests + +- Unit test: `compileActorSource` with valid TS returns JS and `success: true` +- Unit test: `compileActorSource` with type errors returns diagnostics and `success: false` +- Unit test: `compileActorSource` with `typecheck: false` strips types without error on invalid types +- Driver test: dynamic actor with `sourceFormat: "esm-ts"` loads and responds to actions +- Driver test: dynamic actor reload with TS source + +## Design Decisions + +**Why a helper function, not automatic compilation in reload/load?** +- Type checking is expensive (spins up a compiler isolate). Loaders should opt in explicitly. 
+- Loaders may want to cache compiled output, skip type checking in production, or use different compiler options per actor. +- Keeps the runtime path simple — it always receives JS. + +**Why not `transpile` or `prepare`?** +- `compile` is the standard term in the TypeScript ecosystem for TS→JS transformation. +- `transpile` is technically more precise but less commonly used by TS developers. +- `prepare` is too vague. + +**Why run the compiler inside secure-exec?** +- `@secure-exec/typescript` already handles this — the compiler runs in an isolate with memory/CPU limits. +- User-provided source code never touches the host TypeScript installation. +- Consistent with the existing security model where all dynamic actor code runs sandboxed. + +## Files Changed + +| File | Change | +|------|--------| +| `src/dynamic/compile.ts` | New — `compileActorSource` implementation | +| `src/dynamic/mod.ts` | Export `compileActorSource` and types | +| `src/dynamic/runtime-bridge.ts` | Add `"esm-ts"` and `"commonjs-ts"` to `DynamicSourceFormat` | +| `src/dynamic/isolate-runtime.ts` | Handle TS formats by compiling before sandbox write | +| `examples/ai-generated-actor/package.json` | Update secure-exec to `0.1.0` | diff --git a/engine/CLAUDE.md b/engine/CLAUDE.md index 86a04d8d2f..19175eb9a4 100644 --- a/engine/CLAUDE.md +++ b/engine/CLAUDE.md @@ -33,6 +33,10 @@ When changing a versioned VBARE schema, follow the existing migration pattern. - When adding fields to epoxy workflow state structs, mark them `#[serde(default)]` so Gasoline can replay older serialized state. - Epoxy integration tests that spin up `tests/common::TestCtx` must call `shutdown()` before returning. +## Concurrent containers + +Never use `Mutex<HashMap<K, V>>` or `RwLock<HashMap<K, V>>`. Use `scc::HashMap` (preferred), `moka::Cache` (for TTL/bounded), or `DashMap`. Same for sets: use `scc::HashSet` instead of `Mutex<HashSet<T>>`. Note that `scc` async methods do not hold locks across `.await` points. Use `entry_async` for atomic read-then-write. 
+ ## Test snapshots Use `test-snapshot-gen` to generate and load RocksDB snapshots of the full UDB KV store for migration and integration tests. Scenarios produce per-replica RocksDB checkpoints stored under `engine/packages/test-snapshot-gen/snapshots/` (git LFS tracked). In tests, use `test_snapshot::SnapshotTestCtx::from_snapshot("scenario-name")` to boot a cluster from snapshot data. See `docs-internal/engine/TEST_SNAPSHOTS.md` for the full guide. diff --git a/engine/package.json b/engine/package.json deleted file mode 100644 index 47d65f8fb6..0000000000 --- a/engine/package.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "name": "@rivetkit/engine", - "version": "1.0.0", - "keywords": [], - "author": "", - "license": "ISC", - "packageManager": "pnpm@10.13.1" -} diff --git a/engine/packages/api-public/Cargo.toml b/engine/packages/api-public/Cargo.toml index 1d84a529b2..22f8f20b91 100644 --- a/engine/packages/api-public/Cargo.toml +++ b/engine/packages/api-public/Cargo.toml @@ -28,6 +28,7 @@ rivet-types.workspace = true rivet-util.workspace = true serde_json.workspace = true serde.workspace = true +subtle.workspace = true tokio.workspace = true tower-http.workspace = true tracing.workspace = true diff --git a/engine/packages/api-public/src/ctx.rs b/engine/packages/api-public/src/ctx.rs index be9d6a165e..94af879edb 100644 --- a/engine/packages/api-public/src/ctx.rs +++ b/engine/packages/api-public/src/ctx.rs @@ -1,3 +1,4 @@ +use anyhow::Result; use std::{ ops::Deref, sync::{ @@ -5,8 +6,7 @@ use std::{ atomic::{AtomicBool, Ordering}, }, }; - -use anyhow::Result; +use subtle::ConstantTimeEq; #[derive(Clone)] pub struct ApiCtx { @@ -31,11 +31,19 @@ impl ApiCtx { self.authentication_handled.store(true, Ordering::Relaxed); - if self.token.as_ref() == Some(auth.admin_token.read()) { - Ok(()) - } else { - Err(rivet_api_builder::ApiForbidden.build()) + let Some(token) = &self.token else { + return Err(rivet_api_builder::ApiForbidden.build()); + }; + + if token + .as_bytes() + 
.ct_ne(auth.admin_token.read().as_bytes()) + .into() + { + return Err(rivet_api_builder::ApiForbidden.build()); } + + Ok(()) } pub fn skip_auth(&self) { diff --git a/engine/packages/cache/src/driver.rs b/engine/packages/cache/src/driver.rs index 915dc8eb35..c99d54965a 100644 --- a/engine/packages/cache/src/driver.rs +++ b/engine/packages/cache/src/driver.rs @@ -1,5 +1,6 @@ use std::{ fmt::Debug, + sync::OnceLock, time::{Duration, Instant}, }; @@ -132,10 +133,10 @@ impl moka::Expiry for ValueExpiry { } } +static CACHE: OnceLock> = OnceLock::new(); + /// In-memory cache driver implementation using the moka crate -pub struct InMemoryDriver { - cache: Cache, -} +pub struct InMemoryDriver {} impl Debug for InMemoryDriver { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -146,11 +147,20 @@ impl Debug for InMemoryDriver { impl InMemoryDriver { pub fn new(max_capacity: u64) -> Self { // Create a cache with ValueExpiry implementation for custom expiration times - let cache = CacheBuilder::new(max_capacity) - .expire_after(ValueExpiry) - .build(); + CACHE.get_or_init(|| { + CacheBuilder::new(max_capacity) + .expire_after(ValueExpiry) + .eviction_listener(|key, _value, cause| { + tracing::debug!(?key, ?cause, "cache eviction"); + }) + .build() + }); + + Self {} + } - Self { cache } + fn cache(&self) -> &Cache { + CACHE.get().expect("should be initialized") } pub async fn get<'a>( @@ -163,7 +173,7 @@ impl InMemoryDriver { // Async block for metrics async { for key in keys { - result.push(self.cache.get(&**key).await.map(|x| x.value.clone())); + result.push(self.cache().get(&**key).await.map(|x| x.value.clone())); } } .instrument(tracing::info_span!("get")) @@ -193,7 +203,7 @@ impl InMemoryDriver { }; // Store in cache - expiry will be handled by ValueExpiry - self.cache.insert(key.into(), entry).await; + self.cache().insert(key.into(), entry).await; } } .instrument(tracing::info_span!("set")) @@ -212,7 +222,7 @@ impl InMemoryDriver { async { for key in 
keys { // Use remove instead of invalidate to ensure it's actually removed - self.cache.remove(&*key).await; + self.cache().remove(&*key).await; } } .instrument(tracing::info_span!("delete")) diff --git a/engine/packages/cache/src/inner.rs b/engine/packages/cache/src/inner.rs index cb65782dea..d34e0e32c6 100644 --- a/engine/packages/cache/src/inner.rs +++ b/engine/packages/cache/src/inner.rs @@ -1,16 +1,20 @@ -use std::{fmt::Debug, sync::Arc}; +use std::{ + fmt::Debug, + sync::{Arc, OnceLock}, +}; use tokio::sync::broadcast; use super::*; use crate::driver::{Driver, InMemoryDriver}; +static IN_FLIGHT: OnceLock>> = OnceLock::new(); + pub type Cache = Arc; /// Utility type used to hold information relating to caching. pub struct CacheInner { pub(crate) driver: Driver, - pub(crate) in_flight: scc::HashMap>, pub(crate) ups: Option, } @@ -36,11 +40,12 @@ impl CacheInner { #[tracing::instrument(skip(ups))] pub fn new_in_memory(max_capacity: u64, ups: Option) -> Cache { let driver = Driver::InMemory(InMemoryDriver::new(max_capacity)); - Arc::new(CacheInner { - driver, - in_flight: scc::HashMap::new(), - ups, - }) + + Arc::new(CacheInner { driver, ups }) + } + + pub(crate) fn in_flight(&self) -> &scc::HashMap> { + IN_FLIGHT.get_or_init(scc::HashMap::new) } } diff --git a/engine/packages/cache/src/req_config.rs b/engine/packages/cache/src/req_config.rs index 8baacd55cb..ccacc5f94d 100644 --- a/engine/packages/cache/src/req_config.rs +++ b/engine/packages/cache/src/req_config.rs @@ -47,7 +47,7 @@ impl RequestConfig { // MARK: Fetch impl RequestConfig { - #[tracing::instrument(err, skip(keys, getter, encoder, decoder))] + #[tracing::instrument(err, skip_all, fields(?base_key))] async fn fetch_all_convert( self, base_key: impl ToString + Debug, @@ -129,7 +129,7 @@ impl RequestConfig { // Determine which keys are currently being fetched and not for key in remaining_keys { let cache_key = self.cache.driver.process_key(&base_key, &key); - match 
self.cache.in_flight.entry_async(cache_key).await { + match self.cache.in_flight().entry_async(cache_key).await { scc::hash_map::Entry::Occupied(broadcast) => { waiting_keys.push((key, broadcast.subscribe())); } @@ -189,7 +189,13 @@ impl RequestConfig { succeeded_keys.into_iter().unzip(); let (cached_values_res, ctx3_res) = tokio::join!( - cache.driver.get(&base_key2, &succeeded_cache_keys), + async { + if succeeded_cache_keys.is_empty() { + Ok(Vec::new()) + } else { + cache.driver.get(&base_key2, &succeeded_cache_keys).await + } + }, async { if failed_keys.is_empty() { Ok(ctx3) @@ -276,7 +282,7 @@ impl RequestConfig { // Release leases for key in leased_keys { let cache_key = self.cache.driver.process_key(&base_key, &key); - self.cache.in_flight.remove_async(&cache_key).await; + self.cache.in_flight().remove_async(&cache_key).await; } } @@ -310,7 +316,7 @@ impl RequestConfig { } } - #[tracing::instrument(err, skip(keys))] + #[tracing::instrument(err, skip_all, fields(?base_key))] pub async fn purge( self, base_key: impl AsRef + Debug, @@ -363,7 +369,7 @@ impl RequestConfig { /// Purges keys from the local cache without publishing to NATS. /// This is used by the cache-purge service to avoid recursive publishing. 
- #[tracing::instrument(err, skip(keys))] + #[tracing::instrument(err, skip_all, fields(?base_key))] pub async fn purge_local( self, base_key: impl AsRef + Debug, @@ -398,7 +404,6 @@ impl RequestConfig { // MARK: JSON fetch impl RequestConfig { - #[tracing::instrument(err, skip(key, getter))] pub async fn fetch_one_json( self, base_key: impl ToString + Debug, @@ -428,7 +433,6 @@ impl RequestConfig { Ok(values.into_iter().next().map(|(_, v)| v)) } - #[tracing::instrument(err, skip(keys, getter))] pub async fn fetch_all_json( self, base_key: impl ToString + Debug, @@ -447,7 +451,6 @@ impl RequestConfig { .map(|x| x.into_iter().map(|(_, v)| v).collect::>()) } - #[tracing::instrument(err, skip(keys, getter))] pub async fn fetch_all_json_with_keys( self, base_key: impl ToString + Debug, diff --git a/engine/packages/epoxy/src/consts.rs b/engine/packages/epoxy/src/consts.rs index fbead8e92f..1fa489080e 100644 --- a/engine/packages/epoxy/src/consts.rs +++ b/engine/packages/epoxy/src/consts.rs @@ -8,4 +8,3 @@ pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); /// This keeps learner range reads bounded while still making steady progress through the /// immutable per-key commit history. 
pub const CHANGELOG_READ_COUNT: u64 = 5_000; - diff --git a/engine/packages/epoxy/src/http_routes.rs b/engine/packages/epoxy/src/http_routes.rs index 58dda80696..2cc9e24f06 100644 --- a/engine/packages/epoxy/src/http_routes.rs +++ b/engine/packages/epoxy/src/http_routes.rs @@ -16,7 +16,10 @@ pub fn mount_routes( ) -> axum::Router { router .route("/v{version}/epoxy/message", bin::post(message)) - .route("/v{version}/epoxy/changelog-read", bin::post(changelog_read)) + .route( + "/v{version}/epoxy/changelog-read", + bin::post(changelog_read), + ) } pub async fn message(ctx: ApiCtx, path: ProtocolPath, _query: (), body: Bytes) -> Result> { @@ -66,7 +69,9 @@ fn request_kind_label(kind: &protocol::RequestKind) -> &'static str { protocol::RequestKind::CommitRequest(_) => "commit", protocol::RequestKind::ChangelogReadRequest(_) => "changelog_read", protocol::RequestKind::HealthCheckRequest => "health_check", - protocol::RequestKind::CoordinatorUpdateReplicaStatusRequest(_) => "coordinator_update_replica_status", + protocol::RequestKind::CoordinatorUpdateReplicaStatusRequest(_) => { + "coordinator_update_replica_status" + } protocol::RequestKind::BeginLearningRequest(_) => "begin_learning", protocol::RequestKind::KvGetRequest(_) => "kv_get", protocol::RequestKind::KvPurgeCacheRequest(_) => "kv_purge_cache", diff --git a/engine/packages/epoxy/src/keys/keys.rs b/engine/packages/epoxy/src/keys/keys.rs index 35230892ab..7caa0c9598 100644 --- a/engine/packages/epoxy/src/keys/keys.rs +++ b/engine/packages/epoxy/src/keys/keys.rs @@ -336,8 +336,7 @@ impl TuplePack for ChangelogKey { impl<'de> TupleUnpack<'de> for ChangelogKey { fn unpack(input: &[u8], tuple_depth: TupleDepth) -> PackResult<(&[u8], Self)> { - let (input, (root, versionstamp)) = - <(usize, Versionstamp)>::unpack(input, tuple_depth)?; + let (input, (root, versionstamp)) = <(usize, Versionstamp)>::unpack(input, tuple_depth)?; if root != CHANGELOG { return Err(PackError::Message("expected CHANGELOG root".into())); } @@ 
-347,4 +346,3 @@ impl<'de> TupleUnpack<'de> for ChangelogKey { Ok((input, v)) } } - diff --git a/engine/packages/epoxy/src/metrics.rs b/engine/packages/epoxy/src/metrics.rs index d44f298f15..65fe312832 100644 --- a/engine/packages/epoxy/src/metrics.rs +++ b/engine/packages/epoxy/src/metrics.rs @@ -96,7 +96,9 @@ pub fn record_changelog_append() { } pub fn record_request(request_type: &str, result: &str, duration: std::time::Duration) { - REQUEST_TOTAL.with_label_values(&[request_type, result]).inc(); + REQUEST_TOTAL + .with_label_values(&[request_type, result]) + .inc(); REQUEST_DURATION .with_label_values(&[request_type]) .observe(duration.as_secs_f64()); diff --git a/engine/packages/epoxy/src/ops/kv/get_local.rs b/engine/packages/epoxy/src/ops/kv/get_local.rs index ce86d043f6..1c9ec16d17 100644 --- a/engine/packages/epoxy/src/ops/kv/get_local.rs +++ b/engine/packages/epoxy/src/ops/kv/get_local.rs @@ -19,14 +19,10 @@ pub struct Output { #[operation] pub async fn epoxy_kv_get_local(ctx: &OperationCtx, input: &Input) -> Result { - let committed_value = read_value::read_local_value( - ctx, - input.replica_id, - input.key.clone(), - false, - ) - .await? - .value; + let committed_value = + read_value::read_local_value(ctx, input.replica_id, input.key.clone(), false) + .await? 
+ .value; Ok(Output { value: committed_value.as_ref().map(|value| value.value.clone()), diff --git a/engine/packages/epoxy/src/ops/kv/get_optimistic.rs b/engine/packages/epoxy/src/ops/kv/get_optimistic.rs index 0607607267..fe0b9a67af 100644 --- a/engine/packages/epoxy/src/ops/kv/get_optimistic.rs +++ b/engine/packages/epoxy/src/ops/kv/get_optimistic.rs @@ -123,13 +123,7 @@ pub async fn epoxy_kv_get_optimistic(ctx: &OperationCtx, input: &Input) -> Resul }; if input.caching_behavior == protocol::CachingBehavior::Optimistic { - cache_fanout_value( - ctx, - input.replica_id, - input.key.clone(), - value.clone(), - ) - .await?; + cache_fanout_value(ctx, input.replica_id, input.key.clone(), value.clone()).await?; } return Ok(Output { @@ -162,7 +156,10 @@ async fn cache_fanout_value( // This covers the race where a commit lands between the fanout read and // the cache write. if let Some(committed_value) = tx - .read_opt(&committed_key, universaldb::utils::IsolationLevel::Serializable) + .read_opt( + &committed_key, + universaldb::utils::IsolationLevel::Serializable, + ) .await? { if committed_value.version >= value_to_cache.version { @@ -174,10 +171,7 @@ async fn cache_fanout_value( // prevents a slow fanout response from overwriting a fresher cache entry // written by a concurrent request. if let Some(existing_cache) = tx - .read_opt( - &cache_key, - universaldb::utils::IsolationLevel::Serializable, - ) + .read_opt(&cache_key, universaldb::utils::IsolationLevel::Serializable) .await? 
{ if existing_cache.version > value_to_cache.version { diff --git a/engine/packages/epoxy/src/ops/kv/read_value.rs b/engine/packages/epoxy/src/ops/kv/read_value.rs index f85678196a..3d36af6e0a 100644 --- a/engine/packages/epoxy/src/ops/kv/read_value.rs +++ b/engine/packages/epoxy/src/ops/kv/read_value.rs @@ -3,7 +3,9 @@ use epoxy_protocol::protocol::ReplicaId; use gas::prelude::*; use universaldb::utils::{FormalKey, IsolationLevel::Serializable}; -use crate::keys::{self, CommittedValue, KvOptimisticCacheKey, KvValueKey, LegacyCommittedValueKey}; +use crate::keys::{ + self, CommittedValue, KvOptimisticCacheKey, KvValueKey, LegacyCommittedValueKey, +}; #[derive(Debug)] pub(crate) struct LocalValueRead { @@ -19,10 +21,7 @@ pub(crate) struct LocalValueRead { /// 1. **V2 value** (`EPOXY_V2/replica/{id}/kv/{key}/value`). The current write path. /// 2. **Legacy committed value** (`EPOXY_V1/replica/{id}/kv/{key}/committed_value`). Written by /// the original EPaxos protocol. Deserialized as raw bytes with version 0 and mutable=false. -/// 3. **Legacy v2-format value** (`EPOXY_V1/replica/{id}/kv/{key}/value`). Written during the -/// intermediate v1-to-v2 transition where the key layout matched v2 but the subspace was -/// still v1. -/// 4. **Optimistic cache** (`EPOXY_V2/replica/{id}/kv/{key}/cache`). Only checked when +/// 3. **Optimistic cache** (`EPOXY_V2/replica/{id}/kv/{key}/cache`). Only checked when /// `include_cache` is true. Contains values fetched from remote replicas for the optimistic /// read path. 
/// @@ -36,24 +35,20 @@ pub(crate) async fn read_local_value( ) -> Result { let value_key = KvValueKey::new(key.clone()); let legacy_value_key = LegacyCommittedValueKey::new(key.clone()); - let legacy_v2_value_key = KvValueKey::new(key.clone()); let cache_key = KvOptimisticCacheKey::new(key); let subspace = keys::subspace(replica_id); let legacy_subspace = keys::legacy_subspace(replica_id); let packed_value_key = subspace.pack(&value_key); let packed_legacy_value_key = legacy_subspace.pack(&legacy_value_key); - let packed_legacy_v2_value_key = legacy_subspace.pack(&legacy_v2_value_key); let packed_cache_key = subspace.pack(&cache_key); ctx.udb()? .run(|tx| { let packed_value_key = packed_value_key.clone(); let packed_legacy_value_key = packed_legacy_value_key.clone(); - let packed_legacy_v2_value_key = packed_legacy_v2_value_key.clone(); let packed_cache_key = packed_cache_key.clone(); let value_key = value_key.clone(); let legacy_value_key = legacy_value_key.clone(); - let legacy_v2_value_key = legacy_v2_value_key.clone(); let cache_key = cache_key.clone(); async move { @@ -77,14 +72,6 @@ pub(crate) async fn read_local_value( }); } - // Legacy v2-format value (v1 subspace, v2 key layout) - if let Some(value) = tx.get(&packed_legacy_v2_value_key, Serializable).await? { - return Ok(LocalValueRead { - value: Some(legacy_v2_value_key.deserialize(&value)?), - cache_value: None, - }); - } - let cache_value = if include_cache { tx.get(&packed_cache_key, Serializable) .await? 
diff --git a/engine/packages/epoxy/src/ops/propose.rs b/engine/packages/epoxy/src/ops/propose.rs index 395c1f5b15..badf62aabe 100644 --- a/engine/packages/epoxy/src/ops/propose.rs +++ b/engine/packages/epoxy/src/ops/propose.rs @@ -9,8 +9,9 @@ use std::collections::{BTreeSet, HashSet}; use std::time::{Duration, Instant}; use crate::{ - http_client, metrics, + http_client, keys::CommittedValue, + metrics, replica::{ ballot::{self, Ballot, BallotSelection}, commit_kv::{self, CommitKvOutcome}, @@ -101,13 +102,11 @@ impl SetProposal { key, expect_one_of, new_value: Some(value), - }) if expect_one_of.len() == 1 && matches!(expect_one_of.first(), Some(None)) => { - Ok(Self { - key: key.clone(), - value: value.clone(), - mutable, - }) - } + }) if expect_one_of.len() == 1 && matches!(expect_one_of.first(), Some(None)) => Ok(Self { + key: key.clone(), + value: value.clone(), + mutable, + }), _ => bail!( "epoxy v2 only supports single-key set-if-absent proposals with a concrete value" ), @@ -314,14 +313,14 @@ pub async fn epoxy_propose(ctx: &OperationCtx, input: &Input) -> Result { run_slow_path( @@ -444,16 +443,7 @@ async fn run_accept_path( version, mutable, } = value; - commit_kv::commit_kv( - &tx, - replica_id, - key, - value, - ballot, - mutable, - version, - ) - .await + commit_kv::commit_kv(&tx, replica_id, key, value, ballot, mutable, version).await } }) .custom_instrument(tracing::info_span!("commit_kv_tx")) @@ -472,16 +462,15 @@ async fn run_accept_path( let ballot = ballot; let purge_cache = purge_cache && proposal.mutable; async move { - if let Err(err) = - broadcast_commits( - &ctx, - &config, - replica_id, - key.clone(), - chosen_value.clone(), - ballot, - ) - .await + if let Err(err) = broadcast_commits( + &ctx, + &config, + replica_id, + key.clone(), + chosen_value.clone(), + ballot, + ) + .await { tracing::warn!(?err, "commit broadcast failed after local commit"); } @@ -550,7 +539,8 @@ async fn run_prepare_phase( } PrepareRoundOutcome::Retry { next_ballot } 
=> { store_prepare_ballot(ctx, replica_id, key.clone(), next_ballot).await?; - let Some(retry_delay) = next_prepare_retry_delay(retry_count, &mut rand::thread_rng()) + let Some(retry_delay) = + next_prepare_retry_delay(retry_count, &mut rand::thread_rng()) else { tracing::warn!( %replica_id, @@ -592,31 +582,32 @@ async fn send_prepare_round( ballot: protocol::Ballot, ) -> Result { let target = utils::calculate_quorum(replica_ids.len(), utils::QuorumType::Slow); - let mut pending = futures_util::stream::iter(replica_ids.iter().copied().map(|to_replica_id| { - let key = key.clone(); - let proposed_value = proposed_value.clone(); - let ballot = ballot.clone(); - async move { - ( - to_replica_id, - tokio::time::timeout( - crate::consts::REQUEST_TIMEOUT, - send_prepare_request( - ctx, - config, - from_replica_id, - to_replica_id, - key, - proposed_value, - ballot, - ), + let mut pending = + futures_util::stream::iter(replica_ids.iter().copied().map(|to_replica_id| { + let key = key.clone(); + let proposed_value = proposed_value.clone(); + let ballot = ballot.clone(); + async move { + ( + to_replica_id, + tokio::time::timeout( + crate::consts::REQUEST_TIMEOUT, + send_prepare_request( + ctx, + config, + from_replica_id, + to_replica_id, + key, + proposed_value, + ballot, + ), + ) + .await, ) - .await, - ) - } - })) - .collect::>() - .await; + } + })) + .collect::>() + .await; let mut ok_responses = 0; let mut remaining = replica_ids.len(); @@ -655,7 +646,9 @@ async fn send_prepare_round( } (None, None) => {} _ => { - bail!("prepare response from replica {to_replica_id} returned partial accepted state"); + bail!( + "prepare response from replica {to_replica_id} returned partial accepted state" + ); } } @@ -720,31 +713,32 @@ async fn send_accept_round( accept_quorum: utils::QuorumType, ) -> Result { let target = utils::calculate_quorum(replica_ids.len(), accept_quorum); - let mut pending = futures_util::stream::iter(replica_ids.iter().copied().map(|to_replica_id| { - let 
key = key.clone(); - let value = value.clone(); - let ballot = ballot.clone(); - async move { - ( - to_replica_id, - tokio::time::timeout( - crate::consts::REQUEST_TIMEOUT, - send_accept_request( - ctx, - config, - from_replica_id, - to_replica_id, - key, - value, - ballot, - ), + let mut pending = + futures_util::stream::iter(replica_ids.iter().copied().map(|to_replica_id| { + let key = key.clone(); + let value = value.clone(); + let ballot = ballot.clone(); + async move { + ( + to_replica_id, + tokio::time::timeout( + crate::consts::REQUEST_TIMEOUT, + send_accept_request( + ctx, + config, + from_replica_id, + to_replica_id, + key, + value, + ballot, + ), + ) + .await, ) - .await, - ) - } - })) - .collect::>() - .await; + } + })) + .collect::>() + .await; let mut state = AcceptRoundState { target, @@ -754,9 +748,7 @@ async fn send_accept_round( while let Some((to_replica_id, response)) = pending.next().await { let observation = match response { - Ok(Ok(protocol::AcceptResponse::AcceptResponseOk(_))) => { - AcceptObservation::Ok - } + Ok(Ok(protocol::AcceptResponse::AcceptResponseOk(_))) => AcceptObservation::Ok, Ok(Ok(protocol::AcceptResponse::AcceptResponseAlreadyCommitted(committed))) => { AcceptObservation::AlreadyCommitted(committed.value) } diff --git a/engine/packages/epoxy/src/replica/ballot.rs b/engine/packages/epoxy/src/replica/ballot.rs index b3ec151e40..33e169178a 100644 --- a/engine/packages/epoxy/src/replica/ballot.rs +++ b/engine/packages/epoxy/src/replica/ballot.rs @@ -65,7 +65,9 @@ pub enum BallotSelection { value: CommittedValue, ballot: Ballot, }, - NeedsPrepare { ballot: Ballot }, + NeedsPrepare { + ballot: Ballot, + }, FreshBallot(Ballot), } @@ -88,12 +90,7 @@ pub async fn ballot_selection( let packed_legacy_v2_value_key = legacy_subspace.pack(&legacy_v2_value_key); let packed_ballot_key = subspace.pack(&ballot_key); - let ( - committed_value, - legacy_committed_value, - legacy_v2_committed_value, - current_ballot, - ) = tokio::try_join!( + let 
(committed_value, legacy_committed_value, legacy_v2_committed_value, current_ballot) = tokio::try_join!( async { let value = tx.get(&packed_value_key, Serializable).await?; if let Some(bytes) = value { @@ -136,9 +133,7 @@ pub async fn ballot_selection( mutable: false, }) }) - .or_else(|| { - legacy_v2_committed_value - }) + .or_else(|| legacy_v2_committed_value) { if !value.mutable || !mutable { return Ok(BallotSelection::AlreadyCommitted(value.value)); @@ -150,10 +145,7 @@ pub async fn ballot_selection( ballot_key, current_ballot.unwrap_or_else(|| Ballot::zero(replica_id)), )?; - return Ok(BallotSelection::AlreadyCommittedMutable { - value, - ballot, - }); + return Ok(BallotSelection::AlreadyCommittedMutable { value, ballot }); } let current_ballot = current_ballot.unwrap_or_else(|| Ballot::zero(replica_id)); diff --git a/engine/packages/epoxy/src/replica/changelog.rs b/engine/packages/epoxy/src/replica/changelog.rs index 060a4e7777..c86a654d77 100644 --- a/engine/packages/epoxy/src/replica/changelog.rs +++ b/engine/packages/epoxy/src/replica/changelog.rs @@ -5,13 +5,14 @@ use universaldb::{ KeySelector, RangeOption, Transaction, options::StreamingMode, tuple::Versionstamp, - utils::{ - FormalKey, IsolationLevel::Serializable, keys::CHANGELOG, - }, + utils::{FormalKey, IsolationLevel::Serializable, keys::CHANGELOG}, versionstamp::{generate_versionstamp, substitute_versionstamp}, }; -use crate::keys::{self, ChangelogKey, CommittedValue, KvAcceptedKey, KvBallotKey, KvOptimisticCacheKey, KvValueKey}; +use crate::keys::{ + self, ChangelogKey, CommittedValue, KvAcceptedKey, KvBallotKey, KvOptimisticCacheKey, + KvValueKey, +}; use crate::metrics; #[tracing::instrument(skip_all, fields(%replica_id, key = ?key))] @@ -52,15 +53,15 @@ pub async fn read( let replica_subspace = keys::subspace(replica_id); let changelog_subspace = replica_subspace.subspace(&(CHANGELOG,)); let mut range: RangeOption<'static> = (&changelog_subspace).into(); - range.limit = Some( - 
usize::try_from(req.count).context("changelog read count does not fit in usize")?, - ); + range.limit = + Some(usize::try_from(req.count).context("changelog read count does not fit in usize")?); range.mode = StreamingMode::WantAll; let mut last_versionstamp = req.after_versionstamp.clone().unwrap_or_default(); if let Some(after_versionstamp) = req.after_versionstamp { - let after_key = - replica_subspace.pack(&ChangelogKey::new(decode_versionstamp(&after_versionstamp)?)); + let after_key = replica_subspace.pack(&ChangelogKey::new(decode_versionstamp( + &after_versionstamp, + )?)); range.begin = KeySelector::first_greater_than(after_key); } @@ -144,4 +145,3 @@ fn decode_versionstamp(raw: &[u8]) -> Result { .context("expected 12-byte versionstamp cursor")?; Ok(Versionstamp::from(bytes)) } - diff --git a/engine/packages/epoxy/src/replica/commit_kv.rs b/engine/packages/epoxy/src/replica/commit_kv.rs index fe30533199..7bf98a66b3 100644 --- a/engine/packages/epoxy/src/replica/commit_kv.rs +++ b/engine/packages/epoxy/src/replica/commit_kv.rs @@ -1,9 +1,6 @@ use anyhow::Result; use epoxy_protocol::protocol; -use universaldb::{ - Transaction, - utils::IsolationLevel::Serializable, -}; +use universaldb::{Transaction, utils::IsolationLevel::Serializable}; use crate::{ keys::{self, CommittedValue, KvAcceptedKey, KvBallotKey, KvOptimisticCacheKey, KvValueKey}, diff --git a/engine/packages/epoxy/src/replica/messages/accept.rs b/engine/packages/epoxy/src/replica/messages/accept.rs index f0f59207fd..f388a31625 100644 --- a/engine/packages/epoxy/src/replica/messages/accept.rs +++ b/engine/packages/epoxy/src/replica/messages/accept.rs @@ -35,13 +35,11 @@ pub async fn accept( if let Some(committed_value) = committed_value { if !committed_value.mutable || !mutable || version <= committed_value.version { - return Ok( - protocol::AcceptResponse::AcceptResponseAlreadyCommitted( - protocol::AcceptResponseAlreadyCommitted { - value: committed_value.value, - }, - ), - ); + return 
Ok(protocol::AcceptResponse::AcceptResponseAlreadyCommitted( + protocol::AcceptResponseAlreadyCommitted { + value: committed_value.value, + }, + )); } } diff --git a/engine/packages/epoxy/src/replica/messages/commit.rs b/engine/packages/epoxy/src/replica/messages/commit.rs index fb90aad4e4..033b5a8274 100644 --- a/engine/packages/epoxy/src/replica/messages/commit.rs +++ b/engine/packages/epoxy/src/replica/messages/commit.rs @@ -27,9 +27,7 @@ pub async fn commit( protocol::CommitResponseAlreadyCommitted { value }, ) } - CommitKvOutcome::StaleBallot { .. } => { - protocol::CommitResponse::CommitResponseStaleCommit - } + CommitKvOutcome::StaleBallot { .. } => protocol::CommitResponse::CommitResponseStaleCommit, }; Ok(response) diff --git a/engine/packages/epoxy/src/replica/messages/mod.rs b/engine/packages/epoxy/src/replica/messages/mod.rs index 4aaa2cd76f..a49dff9014 100644 --- a/engine/packages/epoxy/src/replica/messages/mod.rs +++ b/engine/packages/epoxy/src/replica/messages/mod.rs @@ -1,7 +1,7 @@ -pub mod commit; pub mod accept; +pub mod commit; pub mod prepare; -pub use commit::commit; pub use accept::accept; +pub use commit::commit; pub use prepare::prepare; diff --git a/engine/packages/epoxy/src/types.rs b/engine/packages/epoxy/src/types.rs index e045405ac6..db3372f926 100644 --- a/engine/packages/epoxy/src/types.rs +++ b/engine/packages/epoxy/src/types.rs @@ -31,7 +31,11 @@ impl From for ClusterConfig { Self { coordinator_replica_id: config.coordinator_replica_id, epoch: config.epoch, - replicas: config.replicas.into_iter().map(ReplicaConfig::from).collect(), + replicas: config + .replicas + .into_iter() + .map(ReplicaConfig::from) + .collect(), } } } diff --git a/engine/packages/epoxy/src/utils.rs b/engine/packages/epoxy/src/utils.rs index 509ab51829..6c7ba3227e 100644 --- a/engine/packages/epoxy/src/utils.rs +++ b/engine/packages/epoxy/src/utils.rs @@ -107,13 +107,28 @@ mod tests { (7, 6, 4, 7, 1, 5, 3, 6, 1), ]; - for (n, fast, slow, all, any, fanout_fast, 
fanout_slow, fanout_all, fanout_any) in - expected + for (n, fast, slow, all, any, fanout_fast, fanout_slow, fanout_all, fanout_any) in expected { - assert_eq!(calculate_quorum(n, QuorumType::Fast), fast, "fast quorum for n={n}"); - assert_eq!(calculate_quorum(n, QuorumType::Slow), slow, "slow quorum for n={n}"); - assert_eq!(calculate_quorum(n, QuorumType::All), all, "all quorum for n={n}"); - assert_eq!(calculate_quorum(n, QuorumType::Any), any, "any quorum for n={n}"); + assert_eq!( + calculate_quorum(n, QuorumType::Fast), + fast, + "fast quorum for n={n}" + ); + assert_eq!( + calculate_quorum(n, QuorumType::Slow), + slow, + "slow quorum for n={n}" + ); + assert_eq!( + calculate_quorum(n, QuorumType::All), + all, + "all quorum for n={n}" + ); + assert_eq!( + calculate_quorum(n, QuorumType::Any), + any, + "any quorum for n={n}" + ); assert_eq!( calculate_fanout_quorum(n, QuorumType::Fast), fanout_fast, @@ -169,7 +184,10 @@ mod tests { ); if n >= 2 { - assert!(slow * 2 > n, "slow quorum must be a strict majority for n={n}"); + assert!( + slow * 2 > n, + "slow quorum must be a strict majority for n={n}" + ); assert!( (2 * fast) + slow > 2 * n, "fast quorum must satisfy the Fast Paxos intersection invariant for n={n}" diff --git a/engine/packages/epoxy/src/workflows/backfill.rs b/engine/packages/epoxy/src/workflows/backfill.rs index c15048630a..b0f86df8e8 100644 --- a/engine/packages/epoxy/src/workflows/backfill.rs +++ b/engine/packages/epoxy/src/workflows/backfill.rs @@ -7,7 +7,8 @@ use universaldb::{ KeySelector, RangeOption, options::StreamingMode, utils::{ - FormalKey, IsolationLevel::Serializable, + FormalKey, + IsolationLevel::Serializable, keys::{COMMITTED_VALUE, KV, VALUE}, }, }; @@ -76,7 +77,10 @@ pub struct BackfillChunkOutput { } #[activity(BackfillChunk)] -pub async fn backfill_chunk(ctx: &ActivityCtx, input: &BackfillChunkInput) -> Result { +pub async fn backfill_chunk( + ctx: &ActivityCtx, + input: &BackfillChunkInput, +) -> Result { let replica_id = 
ctx.config().epoxy_replica_id(); ctx.udb()? @@ -180,11 +184,9 @@ async fn migrate_legacy_key( legacy_value: Option>, legacy_committed_value: Option>, ) -> Result { - let Some(committed_value) = build_legacy_committed_value( - key.clone(), - legacy_value, - legacy_committed_value, - )? else { + let Some(committed_value) = + build_legacy_committed_value(key.clone(), legacy_value, legacy_committed_value)? + else { return Ok(false); }; diff --git a/engine/packages/epoxy/src/workflows/replica/setup.rs b/engine/packages/epoxy/src/workflows/replica/setup.rs index 88583e0e76..7b8df65a30 100644 --- a/engine/packages/epoxy/src/workflows/replica/setup.rs +++ b/engine/packages/epoxy/src/workflows/replica/setup.rs @@ -15,10 +15,7 @@ pub async fn setup_replica(ctx: &mut WorkflowCtx, _input: &super::Input) -> Resu } #[tracing::instrument(skip_all, fields(replica_id = %ctx.config().epoxy_replica_id()))] -pub async fn begin_learning( - ctx: &mut WorkflowCtx, - signal: &super::BeginLearning, -) -> Result<()> { +pub async fn begin_learning(ctx: &mut WorkflowCtx, signal: &super::BeginLearning) -> Result<()> { ctx.activity(StoreConfigInput { config: signal.config.clone(), }) @@ -111,9 +108,7 @@ pub async fn catch_up_replica(ctx: &ActivityCtx, input: &CatchUpReplicaInput) -> ctx.udb()? 
.run(|tx| { let entry = entry.clone(); - async move { - crate::replica::changelog::apply_entry(&*tx, replica_id, entry).await - } + async move { crate::replica::changelog::apply_entry(&*tx, replica_id, entry).await } }) .custom_instrument(tracing::info_span!("apply_changelog_entry_tx")) .await?; diff --git a/engine/packages/epoxy/tests/backfill.rs b/engine/packages/epoxy/tests/backfill.rs index 2a690a2eca..78b3b1ebd2 100644 --- a/engine/packages/epoxy/tests/backfill.rs +++ b/engine/packages/epoxy/tests/backfill.rs @@ -21,7 +21,9 @@ async fn backfill_migrates_legacy_values_into_v2_and_changelog() { .unwrap(); let workflow_id = ctx - .workflow(epoxy::workflows::backfill::Input { chunk_size: Some(1) }) + .workflow(epoxy::workflows::backfill::Input { + chunk_size: Some(1), + }) .tag("replica", replica_id) .dispatch() .await diff --git a/engine/packages/epoxy/tests/backfill_snapshot.rs b/engine/packages/epoxy/tests/backfill_snapshot.rs index 83d0ac8b7a..e438d54b2b 100644 --- a/engine/packages/epoxy/tests/backfill_snapshot.rs +++ b/engine/packages/epoxy/tests/backfill_snapshot.rs @@ -137,7 +137,9 @@ async fn v1_snapshot_dual_read_mutate_and_backfill() { // -- Phase 4: Run backfill and verify migration -- let workflow_id = ctx - .workflow(epoxy::workflows::backfill::Input { chunk_size: Some(10) }) + .workflow(epoxy::workflows::backfill::Input { + chunk_size: Some(10), + }) .tag("replica", replica_id) .dispatch() .await @@ -207,19 +209,17 @@ async fn v1_snapshot_dual_read_mutate_and_backfill() { // The backfilled keys are immutable (version 0, mutable=false). A same- // value re-proposal should be idempotent. A new-value proposal should // be rejected since the v2 committed value now exists. 
- let result = - propose_local(ctx, replica_id, b"actor:def456", b"stopped", false) - .await - .unwrap(); + let result = propose_local(ctx, replica_id, b"actor:def456", b"stopped", false) + .await + .unwrap(); assert!( matches!(result, ProposalResult::Committed), "replica {replica_id}: idempotent proposal on backfilled key should succeed: {result:?}", ); - let result = - propose_local(ctx, replica_id, b"actor:def456", b"changed", false) - .await - .unwrap(); + let result = propose_local(ctx, replica_id, b"actor:def456", b"changed", false) + .await + .unwrap(); assert!( matches!( result, diff --git a/engine/packages/epoxy/tests/common/mod.rs b/engine/packages/epoxy/tests/common/mod.rs index 1ad6c1fb7f..03fc31220c 100644 --- a/engine/packages/epoxy/tests/common/mod.rs +++ b/engine/packages/epoxy/tests/common/mod.rs @@ -279,15 +279,18 @@ impl TestCtx { for &other_replica_id in &replica_ids { let metadata = &self.replica_metadata[&other_replica_id]; let name = format!("dc-{}", other_replica_id); - datacenters.insert(name.clone(), rivet_config::config::topology::Datacenter { - name: format!("dc-{}", other_replica_id), - datacenter_label: other_replica_id as u16, - is_leader: other_replica_id == self.leader_id, - peer_url: Url::parse(&format!("http://127.0.0.1:{}", metadata.api_peer_port))?, - public_url: Url::parse(&format!("http://127.0.0.1:{}", metadata.guard_port))?, - proxy_url: None, - valid_hosts: None, - }); + datacenters.insert( + name.clone(), + rivet_config::config::topology::Datacenter { + name: format!("dc-{}", other_replica_id), + datacenter_label: other_replica_id as u16, + is_leader: other_replica_id == self.leader_id, + peer_url: Url::parse(&format!("http://127.0.0.1:{}", metadata.api_peer_port))?, + public_url: Url::parse(&format!("http://127.0.0.1:{}", metadata.guard_port))?, + proxy_url: None, + valid_hosts: None, + }, + ); } Ok(datacenters) diff --git a/engine/packages/epoxy/tests/common/utils.rs b/engine/packages/epoxy/tests/common/utils.rs index 
c0e942b5af..7c9fb5f36d 100644 --- a/engine/packages/epoxy/tests/common/utils.rs +++ b/engine/packages/epoxy/tests/common/utils.rs @@ -4,7 +4,9 @@ use epoxy::{ self, ChangelogKey, CommittedValue, KvAcceptedKey, KvAcceptedValue, KvBallotKey, KvOptimisticCacheKey, KvValueKey, LegacyCommittedValueKey, }, - ops::propose::{self, CheckAndSetCommand, Command, CommandKind, Proposal, ProposalResult, SetCommand}, + ops::propose::{ + self, CheckAndSetCommand, Command, CommandKind, Proposal, ProposalResult, SetCommand, + }, }; use epoxy_protocol::protocol::{self, ReplicaId}; use futures_util::TryStreamExt; @@ -12,10 +14,7 @@ use gas::prelude::TestCtx as WorkflowTestCtx; use universaldb::{ RangeOption, options::StreamingMode, - utils::{ - FormalKey, IsolationLevel::Serializable, - keys::CHANGELOG, - }, + utils::{FormalKey, IsolationLevel::Serializable, keys::CHANGELOG}, }; #[allow(dead_code)] @@ -74,7 +73,11 @@ pub async fn check_and_set_absent( } #[allow(dead_code)] -pub async fn set_mutable(ctx: &WorkflowTestCtx, key: &[u8], value: &[u8]) -> Result { +pub async fn set_mutable( + ctx: &WorkflowTestCtx, + key: &[u8], + value: &[u8], +) -> Result { let result = ctx .op(propose::Input { proposal: Proposal { @@ -176,7 +179,7 @@ pub async fn write_v2_committed_value( Ok(()) } }) - .await + .await } #[allow(dead_code)] @@ -191,7 +194,8 @@ pub async fn read_legacy_value( let key = key.clone(); async move { let tx = tx.with_subspace(keys::legacy_subspace(replica_id)); - tx.read_opt(&LegacyCommittedValueKey::new(key), Serializable).await + tx.read_opt(&LegacyCommittedValueKey::new(key), Serializable) + .await } }) .await @@ -254,7 +258,8 @@ pub async fn read_cache_committed_value( let key = key.clone(); async move { let tx = tx.with_subspace(keys::subspace(replica_id)); - tx.read_opt(&KvOptimisticCacheKey::new(key), Serializable).await + tx.read_opt(&KvOptimisticCacheKey::new(key), Serializable) + .await } }) .await diff --git a/engine/packages/epoxy/tests/consensus_regressions.rs 
b/engine/packages/epoxy/tests/consensus_regressions.rs index 65b3d17938..34f93f0a97 100644 --- a/engine/packages/epoxy/tests/consensus_regressions.rs +++ b/engine/packages/epoxy/tests/consensus_regressions.rs @@ -5,11 +5,9 @@ use common::{ THREE_REPLICAS, TestCtx, utils::{read_accepted_value, read_v2_value, set_if_absent, write_ballot}, }; -use epoxy::{ - protocol::{ - self, AcceptRequest, AcceptResponse, CommitRequest, CommitResponse, PrepareRequest, - PrepareResponse, Request, RequestKind, ResponseKind, - }, +use epoxy::protocol::{ + self, AcceptRequest, AcceptResponse, CommitRequest, CommitResponse, PrepareRequest, + PrepareResponse, Request, RequestKind, ResponseKind, }; use epoxy_protocol::PROTOCOL_VERSION; @@ -34,11 +32,17 @@ async fn slow_path_recovery_uses_majority_quorum_after_prepare() { .await .unwrap(); - test_ctx.stop_replica(THREE_REPLICAS[2], false).await.unwrap(); + test_ctx + .stop_replica(THREE_REPLICAS[2], false) + .await + .unwrap(); let ctx = test_ctx.get_ctx(replica_id); let result = set_if_absent(ctx, key, b"committed").await.unwrap(); - assert!(matches!(result, epoxy::ops::propose::ProposalResult::Committed)); + assert!(matches!( + result, + epoxy::ops::propose::ProposalResult::Committed + )); assert_eq!( read_v2_value(ctx, replica_id, key).await.unwrap(), Some(b"committed".to_vec()), @@ -67,8 +71,8 @@ async fn equal_ballot_accepts_do_not_overwrite_accepted_state() { b"value-1", ballot.clone(), ) - .await - .unwrap(); + .await + .unwrap(); assert!(matches!( first_response, AcceptResponse::AcceptResponseOk(_) @@ -89,8 +93,8 @@ async fn equal_ballot_accepts_do_not_overwrite_accepted_state() { b"value-2", ballot.clone(), ) - .await - .unwrap(); + .await + .unwrap(); assert!(matches!( second_response, AcceptResponse::AcceptResponseHigherBallot(_) diff --git a/engine/packages/epoxy/tests/kv.rs b/engine/packages/epoxy/tests/kv.rs index 35fbdfc6f1..e0db4c4dcb 100644 --- a/engine/packages/epoxy/tests/kv.rs +++ b/engine/packages/epoxy/tests/kv.rs @@ 
-49,14 +49,18 @@ async fn test_kv_operations() { let cas_key = b"immutable-cas-key"; - let first_result = check_and_set_absent(ctx, cas_key, b"created").await.unwrap(); + let first_result = check_and_set_absent(ctx, cas_key, b"created") + .await + .unwrap(); assert!(matches!(first_result, ProposalResult::Committed)); assert_eq!( get_local(ctx, replica_id, cas_key).await.unwrap(), Some(b"created".to_vec()), ); - let same_value_result = check_and_set_absent(ctx, cas_key, b"created").await.unwrap(); + let same_value_result = check_and_set_absent(ctx, cas_key, b"created") + .await + .unwrap(); assert!(matches!(same_value_result, ProposalResult::Committed)); let different_value_result = check_and_set_absent(ctx, cas_key, b"other").await.unwrap(); @@ -105,26 +109,29 @@ async fn test_kv_operations() { )); let changelog_entries = read_changelog_entries(ctx, replica_id).await.unwrap(); - assert!(changelog_entries.contains(&epoxy::protocol::ChangelogEntry { - key: key.to_vec(), - value: b"value1".to_vec(), - version: 1, - mutable: true, - })); - assert!(changelog_entries.contains(&epoxy::protocol::ChangelogEntry { - key: key.to_vec(), - value: b"value2".to_vec(), - version: 2, - mutable: true, - })); + assert!( + changelog_entries.contains(&epoxy::protocol::ChangelogEntry { + key: key.to_vec(), + value: b"value1".to_vec(), + version: 1, + mutable: true, + }) + ); + assert!( + changelog_entries.contains(&epoxy::protocol::ChangelogEntry { + key: key.to_vec(), + value: b"value2".to_vec(), + version: 2, + mutable: true, + }) + ); for _ in 0..20 { let mut replicated = true; for replica_id in THREE_REPLICAS { if read_v2_value(test_ctx.get_ctx(*replica_id), *replica_id, key) .await - .unwrap() - != Some(b"value2".to_vec()) + .unwrap() != Some(b"value2".to_vec()) { replicated = false; break; diff --git a/engine/packages/epoxy/tests/kv_get_optimistic.rs b/engine/packages/epoxy/tests/kv_get_optimistic.rs index d19dc95676..05f9e25b84 100644 --- 
a/engine/packages/epoxy/tests/kv_get_optimistic.rs +++ b/engine/packages/epoxy/tests/kv_get_optimistic.rs @@ -49,7 +49,10 @@ async fn test_kv_get_optimistic_paths() { let result = set_if_absent(ctx, key, value).await.unwrap(); assert!(matches!(result, ProposalResult::Committed)); - assert_eq!(optimistic_get(ctx, replica_id, key).await, Some(value.to_vec())); + assert_eq!( + optimistic_get(ctx, replica_id, key).await, + Some(value.to_vec()) + ); test_ctx.shutdown().await.unwrap(); } @@ -110,7 +113,10 @@ async fn test_kv_get_optimistic_paths() { Some(remote_value.to_vec()), ); - test_ctx.stop_replica(writer_replica_id, false).await.unwrap(); + test_ctx + .stop_replica(writer_replica_id, false) + .await + .unwrap(); assert_eq!( optimistic_get( test_ctx.get_ctx(reader_replica_id), @@ -257,11 +263,9 @@ async fn test_kv_get_optimistic_paths() { if read_cache_value(follower_ctx, follower_replica_id, key) .await .unwrap() - .is_none() - && read_v2_value(follower_ctx, follower_replica_id, key) - .await - .unwrap() - == Some(b"value2".to_vec()) + .is_none() && read_v2_value(follower_ctx, follower_replica_id, key) + .await + .unwrap() == Some(b"value2".to_vec()) { test_ctx.shutdown().await.unwrap(); return; diff --git a/engine/packages/epoxy/tests/migration.rs b/engine/packages/epoxy/tests/migration.rs index 45b145659f..b5783bc43f 100644 --- a/engine/packages/epoxy/tests/migration.rs +++ b/engine/packages/epoxy/tests/migration.rs @@ -7,13 +7,11 @@ use common::{ write_legacy_value, }, }; -use epoxy::{ - ops::propose::{CommandError, ProposalResult}, -}; +use epoxy::ops::propose::{CommandError, ProposalResult}; #[tokio::test(flavor = "multi_thread")] async fn dual_read_fallback_reads_legacy_subspaces_without_migrating() { -let mut test_ctx = TestCtx::new_with(&[1_u64]).await.unwrap(); + let mut test_ctx = TestCtx::new_with(&[1_u64]).await.unwrap(); let replica_id = test_ctx.leader_id; let ctx = test_ctx.get_ctx(replica_id); let blocked_key = b"legacy-committed-key"; @@ -25,9 
+23,14 @@ let mut test_ctx = TestCtx::new_with(&[1_u64]).await.unwrap(); get_local(ctx, replica_id, blocked_key).await.unwrap(), Some(blocked_value.to_vec()), ); - assert_eq!(read_v2_value(ctx, replica_id, blocked_key).await.unwrap(), None); + assert_eq!( + read_v2_value(ctx, replica_id, blocked_key).await.unwrap(), + None + ); - let blocked_result = set_if_absent(ctx, blocked_key, b"new-v2-value").await.unwrap(); + let blocked_result = set_if_absent(ctx, blocked_key, b"new-v2-value") + .await + .unwrap(); assert!(matches!( blocked_result, ProposalResult::CommandError(CommandError::ExpectedValueDoesNotMatch { @@ -41,9 +44,7 @@ let mut test_ctx = TestCtx::new_with(&[1_u64]).await.unwrap(); Some(blocked_value.to_vec()), ); assert_eq!( - read_v2_value(ctx, replica_id, blocked_key) - .await - .unwrap(), + read_v2_value(ctx, replica_id, blocked_key).await.unwrap(), None, ); @@ -56,7 +57,10 @@ let mut test_ctx = TestCtx::new_with(&[1_u64]).await.unwrap(); get_local(ctx, replica_id, migrated_key).await.unwrap(), Some(migrated_value.to_vec()), ); - assert_eq!(read_v2_value(ctx, replica_id, migrated_key).await.unwrap(), None); + assert_eq!( + read_v2_value(ctx, replica_id, migrated_key).await.unwrap(), + None + ); let fresh_key = b"fresh-v2-key"; let fresh_value = b"fresh-v2-value"; diff --git a/engine/packages/epoxy/tests/proposal.rs b/engine/packages/epoxy/tests/proposal.rs index c49f72b641..50e8c31ffc 100644 --- a/engine/packages/epoxy/tests/proposal.rs +++ b/engine/packages/epoxy/tests/proposal.rs @@ -4,10 +4,7 @@ use common::{ THREE_REPLICAS, TestCtx, utils::{get_local, read_ballot, set_if_absent, set_mutable, write_ballot}, }; -use epoxy::{ - metrics, - ops::propose::ProposalResult, -}; +use epoxy::{metrics, ops::propose::ProposalResult}; use epoxy_protocol::protocol; static TEST_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(()); @@ -52,15 +49,13 @@ async fn proposal_uses_fast_and_contention_paths() { assert_eq!( metrics::PROPOSAL_TOTAL 
.with_label_values(&["committed"]) - .get() - - committed_before, + .get() - committed_before, 1, ); assert_eq!( metrics::PROPOSAL_TOTAL .with_label_values(&["slow_path"]) - .get() - - slow_result_before, + .get() - slow_result_before, 0, ); @@ -106,8 +101,7 @@ async fn proposal_uses_fast_and_contention_paths() { assert_eq!( metrics::PROPOSAL_TOTAL .with_label_values(&["slow_path"]) - .get() - - slow_result_before, + .get() - slow_result_before, 1, ); let key = b"mutable-fast-path"; @@ -132,8 +126,7 @@ async fn proposal_uses_fast_and_contention_paths() { for replica_id in THREE_REPLICAS { if get_local(test_ctx.get_ctx(*replica_id), *replica_id, key) .await - .unwrap() - != Some(b"value2".to_vec()) + .unwrap() != Some(b"value2".to_vec()) { replicated = false; break; diff --git a/engine/packages/epoxy/tests/reconfigure.rs b/engine/packages/epoxy/tests/reconfigure.rs index fd1c3ec51b..44cd7188f2 100644 --- a/engine/packages/epoxy/tests/reconfigure.rs +++ b/engine/packages/epoxy/tests/reconfigure.rs @@ -190,7 +190,8 @@ async fn verify_changelog_catch_up( replica_id: ReplicaId, expected_keys: &[(Vec, Vec)], ) -> Result<()> { - let changelog_entries = read_changelog_entries(test_ctx.get_ctx(replica_id), replica_id).await?; + let changelog_entries = + read_changelog_entries(test_ctx.get_ctx(replica_id), replica_id).await?; assert_eq!( changelog_entries.len(), expected_keys.len(), diff --git a/engine/packages/guard-core/src/proxy_service.rs b/engine/packages/guard-core/src/proxy_service.rs index 1bb4ed21f0..885002d042 100644 --- a/engine/packages/guard-core/src/proxy_service.rs +++ b/engine/packages/guard-core/src/proxy_service.rs @@ -750,10 +750,10 @@ impl ProxyService { .body(Full::new(req_body.clone())) .map_err(|err| errors::RequestBuildError(err.to_string()).build())?; - // Send the request with timeout - let res = timeout(timeout_duration, self.state.client.request(proxied_req)) - .await - .map_err(|_| { + // Send the request with timeout + let res = 
timeout(timeout_duration, self.state.client.request(proxied_req)) + .await + .map_err(|_| { errors::RequestTimeout { timeout_seconds: timeout_duration.as_secs(), } diff --git a/engine/packages/guard-core/src/server.rs b/engine/packages/guard-core/src/server.rs index db9e3e1083..16cc5ef756 100644 --- a/engine/packages/guard-core/src/server.rs +++ b/engine/packages/guard-core/src/server.rs @@ -96,9 +96,7 @@ pub async fn run_server( metrics::TCP_CONNECTION_PENDING.inc(); metrics::TCP_CONNECTION_TOTAL.inc(); - if tcp_nodelay - && let Err(err) = tcp_stream.set_nodelay(true) - { + if tcp_nodelay && let Err(err) = tcp_stream.set_nodelay(true) { tracing::debug!(?err, "failed to enable tcp nodelay"); } diff --git a/engine/packages/guard/src/cache/mod.rs b/engine/packages/guard/src/cache/mod.rs index 24efec175d..6bcc19d744 100644 --- a/engine/packages/guard/src/cache/mod.rs +++ b/engine/packages/guard/src/cache/mod.rs @@ -9,10 +9,7 @@ use rivet_guard_core::{CacheKeyFn, request_context::RequestContext}; pub mod pegboard_gateway; -use crate::routing::{ - SEC_WEBSOCKET_PROTOCOL, WS_PROTOCOL_TARGET, X_RIVET_TARGET, - actor_path::{self, QueryActorPathInfo}, -}; +use crate::routing::{SEC_WEBSOCKET_PROTOCOL, WS_PROTOCOL_TARGET, X_RIVET_TARGET, actor_path}; /// Creates the main cache key function that handles all incoming requests #[tracing::instrument(skip_all)] @@ -39,7 +36,10 @@ pub fn create_cache_key_function() -> CacheKeyFn { // is excluded because it does not affect which actor the // request routes to. tracing::debug!("using query-path cache key for actor"); - return Ok(query_path_cache_key(&query_path_info, req_ctx)); + return Ok(pegboard_gateway::build_cache_key_query_based( + &query_path_info, + req_ctx, + )); } } } @@ -87,45 +87,3 @@ fn host_path_method_cache_key(req_ctx: &RequestContext) -> u64 { req_ctx.method().as_str().hash(&mut hasher); hasher.finish() } - -/// Build a cache key from only the routing-relevant fields of a query gateway -/// path. 
Token is intentionally excluded so requests with different tokens but -/// the same query resolve to the same cached route. -fn query_path_cache_key(info: &QueryActorPathInfo, req_ctx: &RequestContext) -> u64 { - use crate::routing::actor_path::QueryActorQuery; - - let mut hasher = DefaultHasher::new(); - match &info.query { - QueryActorQuery::Get { - namespace, - name, - key, - } => { - "get".hash(&mut hasher); - namespace.hash(&mut hasher); - name.hash(&mut hasher); - key.hash(&mut hasher); - } - QueryActorQuery::GetOrCreate { - namespace, - name, - runner_name, - key, - input, - region, - crash_policy, - } => { - "getOrCreate".hash(&mut hasher); - namespace.hash(&mut hasher); - name.hash(&mut hasher); - runner_name.hash(&mut hasher); - key.hash(&mut hasher); - input.hash(&mut hasher); - region.hash(&mut hasher); - crash_policy.hash(&mut hasher); - } - } - info.stripped_path.hash(&mut hasher); - req_ctx.method().as_str().hash(&mut hasher); - hasher.finish() -} diff --git a/engine/packages/guard/src/cache/pegboard_gateway.rs b/engine/packages/guard/src/cache/pegboard_gateway.rs index 9816e477a0..5492f1c58a 100644 --- a/engine/packages/guard/src/cache/pegboard_gateway.rs +++ b/engine/packages/guard/src/cache/pegboard_gateway.rs @@ -8,7 +8,8 @@ use gas::prelude::*; use rivet_guard_core::request_context::RequestContext; use crate::routing::{ - SEC_WEBSOCKET_PROTOCOL, WS_PROTOCOL_ACTOR, actor_path::ActorPathInfo, + SEC_WEBSOCKET_PROTOCOL, WS_PROTOCOL_ACTOR, + actor_path::{ActorPathInfo, QueryActorPathInfo}, pegboard_gateway::X_RIVET_ACTOR, }; @@ -99,3 +100,45 @@ pub fn build_cache_key_target_based(req_ctx: &RequestContext, target: &str) -> R Ok(hash) } + +/// Build a cache key from only the routing-relevant fields of a query gateway +/// path. Token is intentionally excluded so requests with different tokens but +/// the same query resolve to the same cached route. 
+pub fn build_cache_key_query_based(info: &QueryActorPathInfo, req_ctx: &RequestContext) -> u64 { + use crate::routing::actor_path::QueryActorQuery; + + let mut hasher = DefaultHasher::new(); + match &info.query { + QueryActorQuery::Get { + namespace, + name, + key, + } => { + "get".hash(&mut hasher); + namespace.hash(&mut hasher); + name.hash(&mut hasher); + key.hash(&mut hasher); + } + QueryActorQuery::GetOrCreate { + namespace, + name, + runner_name, + key, + input, + region, + crash_policy, + } => { + "getOrCreate".hash(&mut hasher); + namespace.hash(&mut hasher); + name.hash(&mut hasher); + runner_name.hash(&mut hasher); + key.hash(&mut hasher); + input.hash(&mut hasher); + region.hash(&mut hasher); + crash_policy.hash(&mut hasher); + } + } + info.stripped_path.hash(&mut hasher); + req_ctx.method().as_str().hash(&mut hasher); + hasher.finish() +} diff --git a/engine/packages/guard/src/errors.rs b/engine/packages/guard/src/errors.rs index 365034e1f9..45912a8922 100644 --- a/engine/packages/guard/src/errors.rs +++ b/engine/packages/guard/src/errors.rs @@ -165,4 +165,3 @@ pub struct QueryInvalidCborInput { pub struct QueryInvalidPercentEncoding { pub name: String, } - diff --git a/engine/packages/guard/src/routing/actor_path.rs b/engine/packages/guard/src/routing/actor_path.rs index 94e6b55b30..7b41fa3e5b 100644 --- a/engine/packages/guard/src/routing/actor_path.rs +++ b/engine/packages/guard/src/routing/actor_path.rs @@ -115,7 +115,11 @@ pub fn parse_actor_path(path: &str) -> Result> { Some(q) => format!("?{q}"), None => String::new(), }; - Ok(parse_direct_actor_path(base_path, &segments, &raw_query_string)) + Ok(parse_direct_actor_path( + base_path, + &segments, + &raw_query_string, + )) } fn parse_direct_actor_path( @@ -143,10 +147,7 @@ fn parse_direct_actor_path( let token = urlencoding::decode(raw_token).ok()?.into_owned(); (actor_id, Some(token)) } else { - ( - urlencoding::decode(actor_segment).ok()?.into_owned(), - None, - ) + 
(urlencoding::decode(actor_segment).ok()?.into_owned(), None) }; let remaining_path = build_remaining_path(base_path, raw_query_string, 2); @@ -252,13 +253,11 @@ fn build_actor_query(name: &str, rvt: RvtParams) -> Result { }) } "getOrCreate" => { - let runner_name = rvt.runner.ok_or_else(|| errors::QueryMissingRunnerName.build())?; + let runner_name = rvt + .runner + .ok_or_else(|| errors::QueryMissingRunnerName.build())?; - let input = rvt - .input - .as_deref() - .map(decode_query_input) - .transpose()?; + let input = rvt.input.as_deref().map(decode_query_input).transpose()?; let crash_policy = rvt .crash_policy diff --git a/engine/packages/guard/src/routing/envoy.rs b/engine/packages/guard/src/routing/envoy.rs index 5ee4cedf47..f2f6ae6ea7 100644 --- a/engine/packages/guard/src/routing/envoy.rs +++ b/engine/packages/guard/src/routing/envoy.rs @@ -2,6 +2,7 @@ use anyhow::Result; use gas::prelude::*; use rivet_guard_core::{RoutingOutput, request_context::RequestContext}; use std::sync::Arc; +use subtle::ConstantTimeEq; use super::{SEC_WEBSOCKET_PROTOCOL, WS_PROTOCOL_TOKEN, X_RIVET_TOKEN, validate_regional_host}; @@ -81,7 +82,11 @@ async fn route_envoy_internal( }; // Validate token - if token != auth.admin_token.read() { + if token + .as_bytes() + .ct_ne(auth.admin_token.read().as_bytes()) + .into() + { return Err(rivet_api_builder::ApiForbidden.build()); } diff --git a/engine/packages/guard/src/routing/kv_channel.rs b/engine/packages/guard/src/routing/kv_channel.rs index b0bdd46549..7f56971b0a 100644 --- a/engine/packages/guard/src/routing/kv_channel.rs +++ b/engine/packages/guard/src/routing/kv_channel.rs @@ -43,7 +43,11 @@ pub async fn route_request_path_based( .build() })?; - if token.as_bytes().ct_ne(auth.admin_token.read().as_bytes()).into() { + if token + .as_bytes() + .ct_ne(auth.admin_token.read().as_bytes()) + .into() + { return Err(rivet_api_builder::ApiForbidden.build()); } diff --git a/engine/packages/guard/src/routing/mod.rs 
b/engine/packages/guard/src/routing/mod.rs index bf227c9d1a..f8a26a7aca 100644 --- a/engine/packages/guard/src/routing/mod.rs +++ b/engine/packages/guard/src/routing/mod.rs @@ -7,8 +7,8 @@ use rivet_guard_core::{RoutingFn, request_context::RequestContext}; use crate::{errors, metrics, shared_state::SharedState}; -mod api_public; pub mod actor_path; +mod api_public; mod envoy; mod kv_channel; pub mod pegboard_gateway; @@ -27,9 +27,9 @@ pub(crate) const WS_PROTOCOL_TOKEN: &str = "rivet_token."; #[tracing::instrument(skip_all)] pub fn create_routing_function(ctx: &StandaloneCtx, shared_state: SharedState) -> RoutingFn { let ctx = ctx.clone(); - let kv_channel_handler = Arc::new( - pegboard_kv_channel::PegboardKvChannelCustomServe::new(ctx.clone()), - ); + let kv_channel_handler = Arc::new(pegboard_kv_channel::PegboardKvChannelCustomServe::new( + ctx.clone(), + )); Arc::new(move |req_ctx| { let ctx = ctx.with_ray(req_ctx.ray_id(), req_ctx.req_id()).unwrap(); let shared_state = shared_state.clone(); @@ -58,21 +58,12 @@ pub fn create_routing_function(ctx: &StandaloneCtx, shared_state: SharedState) - // MARK: Path-based routing // Route actor - if let Some(actor_path_info) = actor_path::parse_actor_path(req_ctx.path())? { - tracing::debug!(?actor_path_info, "routing using path-based actor routing"); - - if let Some(routing_output) = pegboard_gateway::route_request_path_based( - &ctx, - &shared_state, - req_ctx, - &actor_path_info, - ) - .await? - { - metrics::ROUTE_TOTAL.with_label_values(&["gateway"]).inc(); + if let Some(routing_output) = + pegboard_gateway::route_request_path_based(&ctx, &shared_state, req_ctx).await? 
+ { + metrics::ROUTE_TOTAL.with_label_values(&["gateway"]).inc(); - return Ok(routing_output); - } + return Ok(routing_output); } // Route runner @@ -94,8 +85,7 @@ pub fn create_routing_function(ctx: &StandaloneCtx, shared_state: SharedState) - // Route KV channel if let Some(routing_output) = - kv_channel::route_request_path_based(&ctx, req_ctx, &kv_channel_handler) - .await? + kv_channel::route_request_path_based(&ctx, req_ctx, &kv_channel_handler).await? { metrics::ROUTE_TOTAL .with_label_values(&["kv_channel"]) @@ -186,10 +176,7 @@ pub fn create_routing_function(ctx: &StandaloneCtx, shared_state: SharedState) - /// Validates that the request hostname is valid for the current datacenter. /// Returns an error if the host does not match a valid regional host. -pub(crate) fn validate_regional_host( - ctx: &StandaloneCtx, - req_ctx: &RequestContext, -) -> Result<()> { +pub(crate) fn validate_regional_host(ctx: &StandaloneCtx, req_ctx: &RequestContext) -> Result<()> { let current_dc = ctx.config().topology().current_dc()?; if !current_dc.is_valid_regional_host(req_ctx.hostname()) { tracing::warn!( diff --git a/engine/packages/guard/src/routing/pegboard_gateway/mod.rs b/engine/packages/guard/src/routing/pegboard_gateway/mod.rs index 6d444dd810..1924fb79cf 100644 --- a/engine/packages/guard/src/routing/pegboard_gateway/mod.rs +++ b/engine/packages/guard/src/routing/pegboard_gateway/mod.rs @@ -11,7 +11,7 @@ use super::{ SEC_WEBSOCKET_PROTOCOL, WS_PROTOCOL_ACTOR, WS_PROTOCOL_TOKEN, X_RIVET_TOKEN, actor_path::ParsedActorPath, }; -use crate::{errors, shared_state::SharedState}; +use crate::{errors, routing::actor_path::parse_actor_path, shared_state::SharedState}; use resolve_actor_query::resolve_query_actor_id; const ACTOR_FORCE_WAKE_PENDING_TIMEOUT: i64 = util::duration::seconds(60); @@ -30,9 +30,14 @@ pub async fn route_request_path_based( ctx: &StandaloneCtx, shared_state: &SharedState, req_ctx: &RequestContext, - actor_path: &ParsedActorPath, ) -> Result> { - let 
resolved_route = resolve_path_based_route(ctx, req_ctx, actor_path).await?; + let Some(actor_path) = parse_actor_path(req_ctx.path())? else { + return Ok(None); + }; + + tracing::debug!(?actor_path, "routing using path-based actor routing"); + + let resolved_route = resolve_path_based_route(ctx, req_ctx, &actor_path).await?; route_request_inner( ctx, @@ -43,6 +48,7 @@ pub async fn route_request_path_based( resolved_route.token.as_deref(), ) .await + .map(Some) } /// Route requests to actor services based on headers @@ -121,7 +127,9 @@ pub async fn route_request( // Find actor to route to let actor_id = Id::parse(&actor_id_str).context("invalid x-rivet-actor header")?; - route_request_inner(ctx, shared_state, req_ctx, actor_id, req_ctx.path(), token).await + route_request_inner(ctx, shared_state, req_ctx, actor_id, req_ctx.path(), token) + .await + .map(Some) } #[derive(Debug)] @@ -197,7 +205,7 @@ async fn route_request_inner( actor_id: Id, stripped_path: &str, _token: Option<&str>, -) -> Result> { +) -> Result { // NOTE: Token validation implemented in EE // Route to peer dc where the actor lives @@ -209,7 +217,7 @@ async fn route_request_inner( .dc_for_label(actor_id.label()) .context("dc with the given label not found")?; - return Ok(Some(RoutingOutput::Route(RouteConfig { + return Ok(RoutingOutput::Route(RouteConfig { targets: vec![RouteTarget { host: peer_dc .proxy_url_host() @@ -220,7 +228,7 @@ async fn route_request_inner( .context("bad peer dc proxy url port")?, path: req_ctx.path().to_owned(), }], - }))); + })); } // Create subs before checking if actor exists/is not destroyed @@ -278,7 +286,6 @@ async fn route_request_inner( destroy_sub2, ) .await - .map(Some) } 1 => { handle_actor_v1( @@ -298,7 +305,6 @@ async fn route_request_inner( destroy_sub2, ) .await - .map(Some) } _ => bail!("unknown actor version"), } @@ -317,7 +323,8 @@ async fn handle_actor_v2( ) -> Result { // Wake actor if sleeping if actor.sleeping { - tracing::debug!(?actor_id, "actor 
sleeping, waking"); + // Route attempts wake sleeping actors up front so the ready wait below has a chance to complete. + tracing::debug!(?actor_id, "actor sleeping, sending wake signal"); ctx.signal(pegboard::workflows::actor2::Wake {}) .to_workflow_id(actor.workflow_id) @@ -328,8 +335,6 @@ async fn handle_actor_v2( let envoy_key = if let (Some(envoy_key), true) = (actor.envoy_key, actor.connectable) { envoy_key } else { - tracing::debug!(?actor_id, "waiting for actor to become ready"); - let mut wake_retries = 0; // Create pool error check future @@ -347,7 +352,9 @@ async fn handle_actor_v2( res = stopped_sub.next() => { res?; - if wake_retries < 8 { + // Actors may stop again before they finish waking up, so resend a limited number of + // wake signals while we keep waiting for the ready event. + if wake_retries < 16 { tracing::debug!(?actor_id, ?wake_retries, "actor stopped while we were waiting for it to become ready, attempting rewake"); wake_retries += 1; @@ -360,7 +367,7 @@ async fn handle_actor_v2( if res.is_none() { tracing::warn!( ?actor_id, - "actor workflow not found for rewake" + "actor workflow not found while sending another wake signal" ); return Err(pegboard::errors::Actor::NotFound.build()); } diff --git a/engine/packages/guard/src/routing/pegboard_gateway/resolve_actor_query.rs b/engine/packages/guard/src/routing/pegboard_gateway/resolve_actor_query.rs index da4d1740ae..88b0216656 100644 --- a/engine/packages/guard/src/routing/pegboard_gateway/resolve_actor_query.rs +++ b/engine/packages/guard/src/routing/pegboard_gateway/resolve_actor_query.rs @@ -112,14 +112,9 @@ async fn resolve_query_get_or_create_actor_id( return Ok(actor_id); } - let target_dc_label = resolve_query_target_dc_label( - ctx, - namespace_id, - namespace_name, - runner_name, - region, - ) - .await?; + let target_dc_label = + resolve_query_target_dc_label(ctx, namespace_id, namespace_name, runner_name, region) + .await?; let encoded_input = input.map(|input| 
STANDARD.encode(input)); if target_dc_label == ctx.config().dc_label() { @@ -157,14 +152,16 @@ async fn resolve_query_get_or_create_actor_id( Some(&rivet_api_types::actors::get_or_create::GetOrCreateQuery { namespace: namespace_name.to_string(), }), - Some(&rivet_api_types::actors::get_or_create::GetOrCreateRequest { - datacenter: None, - name: name.to_string(), - key: serialized_key, - input: encoded_input, - runner_name_selector: runner_name.to_string(), - crash_policy, - }), + Some( + &rivet_api_types::actors::get_or_create::GetOrCreateRequest { + datacenter: None, + name: name.to_string(), + key: serialized_key, + input: encoded_input, + runner_name_selector: runner_name.to_string(), + crash_policy, + }, + ), ) .await?; Ok(response.actor.actor_id) @@ -190,10 +187,12 @@ async fn resolve_query_target_dc_label( } let res = ctx - .op(pegboard::ops::runner::list_runner_config_enabled_dcs::Input { - namespace_id, - runner_name: runner_name_selector.to_string(), - }) + .op( + pegboard::ops::runner::list_runner_config_enabled_dcs::Input { + namespace_id, + runner_name: runner_name_selector.to_string(), + }, + ) .await?; if let Some(dc_label) = res.dc_labels.into_iter().next() { diff --git a/engine/packages/guard/src/routing/runner.rs b/engine/packages/guard/src/routing/runner.rs index 071715d214..c67551ddd7 100644 --- a/engine/packages/guard/src/routing/runner.rs +++ b/engine/packages/guard/src/routing/runner.rs @@ -83,7 +83,11 @@ async fn route_runner_internal( }; // Validate token - if token.as_bytes().ct_ne(auth.admin_token.read().as_bytes()).into() { + if token + .as_bytes() + .ct_ne(auth.admin_token.read().as_bytes()) + .into() + { return Err(rivet_api_builder::ApiForbidden.build()); } diff --git a/engine/packages/guard/tests/parse_actor_path.rs b/engine/packages/guard/tests/parse_actor_path.rs index b0486170d9..eab5d81144 100644 --- a/engine/packages/guard/tests/parse_actor_path.rs +++ b/engine/packages/guard/tests/parse_actor_path.rs @@ -47,8 +47,8 @@ fn 
parses_query_actor_get_paths() { #[test] fn parses_query_actor_get_or_create_paths_with_input_and_region() { let input_bytes = vec![ - 0xa2, 0x65, b'c', b'o', b'u', b'n', b't', 0x02, 0x67, b'e', b'n', b'a', b'b', b'l', - b'e', b'd', 0xf5, + 0xa2, 0x65, b'c', b'o', b'u', b'n', b't', 0x02, 0x67, b'e', b'n', b'a', b'b', b'l', b'e', + b'd', 0xf5, ]; let input = encode_cbor_base64url(&input_bytes); let path = format!( @@ -245,7 +245,8 @@ fn preserves_percent_encoding_in_actor_query_params() { #[test] fn preserves_plus_in_actor_query_params() { // Actor params should preserve + literally, not re-encode to %2B or decode to space. - let path = "/gateway/lobby/api?rvt-namespace=default&rvt-method=get&search=hello+world&tag=c%2B%2B"; + let path = + "/gateway/lobby/api?rvt-namespace=default&rvt-method=get&search=hello+world&tag=c%2B%2B"; let result = parse_actor_path(path).unwrap().unwrap(); match result { @@ -282,7 +283,8 @@ fn handles_interleaved_rvt_and_actor_params() { fn decodes_plus_as_space_in_rvt_values() { // rvt-* values should decode + as space (form-urlencoded), while actor // params preserve + literally. 
- let path = "/gateway/lobby/api?rvt-namespace=my+ns&rvt-method=get&rvt-key=hello+world&q=search+term"; + let path = + "/gateway/lobby/api?rvt-namespace=my+ns&rvt-method=get&rvt-key=hello+world&q=search+term"; let result = parse_actor_path(path).unwrap().unwrap(); match result { @@ -332,10 +334,7 @@ fn rejects_missing_method() { let err = parse_actor_path("/gateway/lobby?rvt-namespace=default") .unwrap_err() .to_string(); - assert!( - err.contains("method"), - "expected method error, got: {err}" - ); + assert!(err.contains("method"), "expected method error, got: {err}"); } #[test] @@ -351,11 +350,10 @@ fn rejects_invalid_query_method() { #[test] fn rejects_unknown_query_params() { - let err = parse_actor_path( - "/gateway/lobby?rvt-namespace=default&rvt-method=get&rvt-unknown=value", - ) - .unwrap_err() - .to_string(); + let err = + parse_actor_path("/gateway/lobby?rvt-namespace=default&rvt-method=get&rvt-unknown=value") + .unwrap_err() + .to_string(); assert!( err.contains("unknown field"), "expected unknown field error, got: {err}" @@ -406,10 +404,9 @@ fn rejects_invalid_cbor_input() { #[test] fn rejects_raw_at_token_syntax_in_query_paths() { - let err = - parse_actor_path("/gateway/lobby@token/connect?rvt-namespace=default&rvt-method=get") - .unwrap_err() - .to_string(); + let err = parse_actor_path("/gateway/lobby@token/connect?rvt-namespace=default&rvt-method=get") + .unwrap_err() + .to_string(); assert!(err.contains("query gateway paths must not use @token syntax")); } @@ -454,11 +451,10 @@ fn rejects_crash_policy_for_get_queries() { #[test] fn rejects_runner_for_get_queries() { - let err = parse_actor_path( - "/gateway/lobby?rvt-namespace=default&rvt-method=get&rvt-runner=default", - ) - .unwrap_err() - .to_string(); + let err = + parse_actor_path("/gateway/lobby?rvt-namespace=default&rvt-method=get&rvt-runner=default") + .unwrap_err() + .to_string(); assert!(err.contains( "query gateway method=get does not allow rvt-input, rvt-region, rvt-crash-policy, or 
rvt-runner params" )); @@ -466,13 +462,10 @@ fn rejects_runner_for_get_queries() { #[test] fn rejects_missing_runner_for_get_or_create_queries() { - let err = - parse_actor_path("/gateway/lobby?rvt-namespace=default&rvt-method=getOrCreate") - .unwrap_err() - .to_string(); - assert!(err.contains( - "query gateway method=getOrCreate requires rvt-runner param" - )); + let err = parse_actor_path("/gateway/lobby?rvt-namespace=default&rvt-method=getOrCreate") + .unwrap_err() + .to_string(); + assert!(err.contains("query gateway method=getOrCreate requires rvt-runner param")); } #[test] diff --git a/engine/packages/pegboard-envoy/src/conn.rs b/engine/packages/pegboard-envoy/src/conn.rs index 9f47c426c9..768e5e9ce2 100644 --- a/engine/packages/pegboard-envoy/src/conn.rs +++ b/engine/packages/pegboard-envoy/src/conn.rs @@ -147,7 +147,7 @@ pub async fn handle_init( // Read existing data let (create_ts_entry, old_last_ping_ts_entry, version_entry) = tokio::try_join!( tx.read_opt(&create_ts_key, Serializable), - tx.read_opt(&create_ts_key, Serializable), + tx.read_opt(&last_ping_ts_key, Serializable), tx.read_opt(&version_key, Serializable), )?; @@ -228,6 +228,7 @@ pub async fn handle_init( ); tx.add_conflict_key(&old_lb_key, ConflictRangeType::Read)?; + tx.delete(&old_lb_key); } // Insert into LB @@ -258,6 +259,18 @@ pub async fn handle_init( } } + // Update the pool's protocol version. This is required for serverful pools because normally + // the pool's protocol version is updated via the metadata_poller wf but that only runs for + // serverless pools. 
+ tx.write( + &pegboard::keys::runner_config::ProtocolVersionKey::new( + namespace_id, + pool_name.clone(), + ), + protocol_version, + )?; + + // Write envoy metadata if let Some(metadata) = &init.metadata { let metadata = MetadataKeyData { metadata: @@ -354,36 +367,28 @@ pub async fn handle_init( // Send missed commands if !missed_commands.is_empty() { let db = ctx.udb()?; - let msg = - { - for cmd_wrapper in &mut missed_commands { - if let protocol::Command::CommandStartActor(ref mut start) = - cmd_wrapper.inner - { - let actor_id = cmd_wrapper - .checkpoint - .actor_id - .parse::() - .context( - "failed to parse actor_id from missed envoy command", - )?; - let preloaded = - pegboard::actor_kv::preload::fetch_preloaded_kv( - &db, - pb, - actor_id, - conn.namespace_id, - &start.config.name, - ) - .await?; - start.preloaded_kv = preloaded; - } + let msg = { + for cmd_wrapper in &mut missed_commands { + if let protocol::Command::CommandStartActor(ref mut start) = cmd_wrapper.inner { + let actor_id = cmd_wrapper + .checkpoint + .actor_id + .parse::() + .context("failed to parse actor_id from missed envoy command")?; + let preloaded = pegboard::actor_kv::preload::fetch_preloaded_kv( + &db, + pb, + actor_id, + conn.namespace_id, + &start.config.name, + ) + .await?; + start.preloaded_kv = preloaded; } + } - versioned::ToEnvoy::wrap_latest(protocol::ToEnvoy::ToEnvoyCommands( - missed_commands, - )) - }; + versioned::ToEnvoy::wrap_latest(protocol::ToEnvoy::ToEnvoyCommands(missed_commands)) + }; let msg_serialized = msg.serialize(conn.protocol_version)?; conn.ws_handle .send(Message::Binary(msg_serialized.into())) diff --git a/engine/packages/pegboard-envoy/src/tunnel_to_ws_task.rs b/engine/packages/pegboard-envoy/src/tunnel_to_ws_task.rs index 32f230fdff..0fdb4c1716 100644 --- a/engine/packages/pegboard-envoy/src/tunnel_to_ws_task.rs +++ b/engine/packages/pegboard-envoy/src/tunnel_to_ws_task.rs @@ -126,15 +126,11 @@ async fn handle_message( 
protocol::ToEnvoyConn::ToEnvoyCommands(mut command_wrappers) => { // TODO: Parallelize for command_wrapper in &mut command_wrappers { - if let protocol::Command::CommandStartActor(start) = - &mut command_wrapper.inner - { + if let protocol::Command::CommandStartActor(start) = &mut command_wrapper.inner { let actor_id = Id::parse(&command_wrapper.checkpoint.actor_id)?; let actor_name = start.config.name.clone(); let ids = ctx - .op(pegboard::ops::actor::hibernating_request::list::Input { - actor_id, - }) + .op(pegboard::ops::actor::hibernating_request::list::Input { actor_id }) .await?; // Dynamically populate hibernating request ids @@ -148,15 +144,14 @@ async fn handle_message( if start.preloaded_kv.is_none() { let db = ctx.udb()?; - start.preloaded_kv = - pegboard::actor_kv::preload::fetch_preloaded_kv( - &db, - ctx.config().pegboard(), - actor_id, - conn.namespace_id, - &actor_name, - ) - .await?; + start.preloaded_kv = pegboard::actor_kv::preload::fetch_preloaded_kv( + &db, + ctx.config().pegboard(), + actor_id, + conn.namespace_id, + &actor_name, + ) + .await?; } } } diff --git a/engine/packages/pegboard-gateway/src/lib.rs b/engine/packages/pegboard-gateway/src/lib.rs index a340811ca2..bbf3435702 100644 --- a/engine/packages/pegboard-gateway/src/lib.rs +++ b/engine/packages/pegboard-gateway/src/lib.rs @@ -313,8 +313,8 @@ impl PegboardGateway { ); // If we are reconnecting after hibernation, don't send an open message - let can_hibernate = if after_hibernation { - true + let (can_hibernate, buffered_runner_messages) = if after_hibernation { + (true, Vec::new()) } else { // Send WebSocket open message let open_message = protocol::mk2::ToClientTunnelMessageKind::ToClientWebSocketOpen( @@ -333,22 +333,28 @@ impl PegboardGateway { // Wait for WebSocket open acknowledgment let fut = async { + // The runner can emit frames or acks before the open ack arrives, so hold them until the WS pump starts. 
+ let mut buffered_runner_messages = Vec::new(); + loop { tokio::select! { - res = msg_rx.recv() => { - if let Some(msg) = res { - match msg { - protocol::mk2::ToServerTunnelMessageKind::ToServerWebSocketOpen(msg) => { - return anyhow::Ok(msg); - } + res = msg_rx.recv() => { + if let Some(msg) = res { + match msg { + protocol::mk2::ToServerTunnelMessageKind::ToServerWebSocketOpen(msg) => { + tracing::trace!(can_hibernate = msg.can_hibernate, "received websocket open acknowledgement from runner"); + return anyhow::Ok((msg, buffered_runner_messages)); + } protocol::mk2::ToServerTunnelMessageKind::ToServerWebSocketClose(close) => { tracing::warn!(?close, "websocket closed before opening"); return Err(WebSocketServiceUnavailable.build()); } - _ => { - tracing::warn!( - "received unexpected message while waiting for websocket open" + other => { + tracing::debug!( + ?other, + "buffering runner websocket message while waiting for websocket open" ); + buffered_runner_messages.push(other); } } } else { @@ -360,6 +366,26 @@ impl PegboardGateway { } } _ = stopped_sub.next() => { + let actor_state = self + .ctx + .op(pegboard::ops::actor::get_for_gateway::Input { + actor_id: self.actor_id, + }) + .await?; + + let stale_stopped_message = actor_state.as_ref().is_some_and(|actor| { + actor.connectable && actor.runner_id == Some(self.runner_id) + }); + // A stale Stopped event from the previous lifecycle can race with reconnect/open. 
+ if stale_stopped_message { + tracing::debug!( + actor_id = ?self.actor_id, + runner_id = ?self.runner_id, + "ignoring stale actor stopped message while waiting for websocket open" + ); + continue; + } + tracing::debug!("actor stopped while waiting for websocket open"); return Err(WebSocketServiceUnavailable.build()); } @@ -379,19 +405,20 @@ impl PegboardGateway { .pegboard() .gateway_websocket_open_timeout_ms(), ); - let open_msg = tokio::time::timeout(websocket_open_timeout, fut) - .await - .map_err(|_| { - tracing::warn!("timed out waiting for websocket open from runner"); + let (open_msg, buffered_runner_messages) = + tokio::time::timeout(websocket_open_timeout, fut) + .await + .map_err(|_| { + tracing::warn!("timed out waiting for websocket open from runner"); - WebSocketServiceUnavailable.build() - })??; + WebSocketServiceUnavailable.build() + })??; self.shared_state .toggle_hibernation(request_id, open_msg.can_hibernate) .await?; - open_msg.can_hibernate + (open_msg.can_hibernate, buffered_runner_messages) }; let ingress_bytes = Arc::new(AtomicU64::new(0)); @@ -411,11 +438,15 @@ impl PegboardGateway { let (metrics_abort_tx, metrics_abort_rx) = watch::channel(()); let tunnel_to_ws = tokio::spawn(tunnel_to_ws_task::task( + self.ctx.clone(), self.shared_state.clone(), client_ws, request_id, + self.actor_id, + self.runner_id, stopped_sub, msg_rx, + buffered_runner_messages, drop_rx, can_hibernate, egress_bytes.clone(), diff --git a/engine/packages/pegboard-gateway/src/shared_state.rs b/engine/packages/pegboard-gateway/src/shared_state.rs index f2ca560d93..5d75bca16c 100644 --- a/engine/packages/pegboard-gateway/src/shared_state.rs +++ b/engine/packages/pegboard-gateway/src/shared_state.rs @@ -135,6 +135,7 @@ impl SharedState { protocol_version: u16, request_id: protocol::mk2::RequestId, ) -> InFlightRequestHandle { + let receiver_subject_for_log = receiver_subject.clone(); let (msg_tx, msg_rx) = mpsc::channel(128); let (drop_tx, drop_rx) = watch::channel(None); 
@@ -156,6 +157,7 @@ impl SharedState { } // If the entry already exists it means we transition from hibernating to active Entry::Occupied(mut entry) => { + // Preserve per-request message bookkeeping when a hibernated request reconnects. entry.receiver_subject = receiver_subject; entry.msg_tx = msg_tx; entry.drop_tx = drop_tx; @@ -171,6 +173,14 @@ impl SharedState { } }; + tracing::debug!( + request_id = %protocol::util::id_to_string(&request_id), + new, + receiver_subject = receiver_subject_for_log, + protocol_version, + "started in-flight request" + ); + InFlightRequestHandle { msg_rx, drop_rx, @@ -354,6 +364,12 @@ impl SharedState { } Ok(protocol::mk2::ToGateway::ToServerTunnelMessage(msg)) => { let message_id = msg.message_id; + tracing::trace!( + request_id = %protocol::util::id_to_string(&message_id.request_id), + message_index = message_id.message_index, + kind = ?msg.message_kind, + "pegboard-gateway received tunnel message from runner" + ); let Some(in_flight) = self .in_flight_requests diff --git a/engine/packages/pegboard-gateway/src/tunnel_to_ws_task.rs b/engine/packages/pegboard-gateway/src/tunnel_to_ws_task.rs index e22ce4bdfd..838dde4502 100644 --- a/engine/packages/pegboard-gateway/src/tunnel_to_ws_task.rs +++ b/engine/packages/pegboard-gateway/src/tunnel_to_ws_task.rs @@ -1,3 +1,4 @@ +use std::collections::VecDeque; use std::sync::{ Arc, atomic::{AtomicU64, Ordering}, @@ -17,60 +18,54 @@ use super::LifecycleResult; use crate::shared_state::{MsgGcReason, SharedState}; pub async fn task( + ctx: StandaloneCtx, shared_state: SharedState, client_ws: WebSocketHandle, request_id: protocol::RequestId, + actor_id: Id, + runner_id: Id, mut stopped_sub: message::SubscriptionHandle, mut msg_rx: mpsc::Receiver, + pending_msgs: Vec, mut drop_rx: watch::Receiver>, can_hibernate: bool, egress_bytes: Arc, mut tunnel_to_ws_abort_rx: watch::Receiver<()>, ) -> Result { + // Drain any runner messages buffered during the open handshake before consuming new ones. 
+ let mut pending_msgs = VecDeque::from(pending_msgs); + loop { + if let Some(msg) = pending_msgs.pop_front() { + if let Some(result) = handle_runner_message( + &shared_state, + &client_ws, + request_id, + msg, + can_hibernate, + &egress_bytes, + ) + .await? + { + return Ok(result); + } + continue; + } + tokio::select! { res = msg_rx.recv() => { if let Some(msg) = res { - match msg { - protocol::mk2::ToServerTunnelMessageKind::ToServerWebSocketMessage(ws_msg) => { - tracing::trace!( - request_id=%protocol::util::id_to_string(&request_id), - data_len=ws_msg.data.len(), - binary=ws_msg.binary, - "forwarding websocket message to client" - ); - let msg = if ws_msg.binary { - Message::Binary(ws_msg.data.into()) - } else { - Message::Text( - String::from_utf8_lossy(&ws_msg.data).into_owned().into(), - ) - }; - - egress_bytes.fetch_add(msg.len() as u64, Ordering::AcqRel); - client_ws.send(msg).await?; - } - protocol::mk2::ToServerTunnelMessageKind::ToServerWebSocketMessageAck(ack) => { - tracing::debug!( - request_id=%protocol::util::id_to_string(&request_id), - ack_index=?ack.index, - "received WebSocketMessageAck from runner" - ); - shared_state - .ack_pending_websocket_messages(request_id, ack.index) - .await?; - } - protocol::mk2::ToServerTunnelMessageKind::ToServerWebSocketClose(close) => { - tracing::debug!(?close, "server closed websocket"); - - if can_hibernate && close.hibernate { - return Err(WebSocketServiceHibernate.build()); - } else { - // Successful closure - return Ok(LifecycleResult::ServerClose(close)); - } - } - _ => {} + if let Some(result) = handle_runner_message( + &shared_state, + &client_ws, + request_id, + msg, + can_hibernate, + &egress_bytes, + ) + .await? 
+ { + return Ok(result); } } else { tracing::debug!("tunnel sub closed"); @@ -78,6 +73,23 @@ pub async fn task( } } _ = stopped_sub.next() => { + let actor_state = ctx + .op(pegboard::ops::actor::get_for_gateway::Input { actor_id }) + .await?; + + let stale_stopped_message = actor_state.as_ref().is_some_and(|actor| { + actor.connectable && actor.runner_id == Some(runner_id) + }); + // The actor may already be reconnected on this runner, so ignore old stop events in that case. + if stale_stopped_message { + tracing::debug!( + ?actor_id, + ?runner_id, + "ignoring stale actor stopped message during websocket handler loop" + ); + continue; + } + tracing::debug!("actor stopped during websocket handler loop"); if can_hibernate { @@ -97,3 +109,53 @@ pub async fn task( } } } + +async fn handle_runner_message( + shared_state: &SharedState, + client_ws: &WebSocketHandle, + request_id: protocol::RequestId, + msg: protocol::mk2::ToServerTunnelMessageKind, + can_hibernate: bool, + egress_bytes: &Arc, +) -> Result> { + match msg { + protocol::mk2::ToServerTunnelMessageKind::ToServerWebSocketMessage(ws_msg) => { + tracing::trace!( + request_id=%protocol::util::id_to_string(&request_id), + data_len=ws_msg.data.len(), + binary=ws_msg.binary, + "forwarding websocket message to client" + ); + let msg = if ws_msg.binary { + Message::Binary(ws_msg.data.into()) + } else { + Message::Text(String::from_utf8_lossy(&ws_msg.data).into_owned().into()) + }; + + egress_bytes.fetch_add(msg.len() as u64, Ordering::AcqRel); + client_ws.send(msg).await?; + Ok(None) + } + protocol::mk2::ToServerTunnelMessageKind::ToServerWebSocketMessageAck(ack) => { + tracing::trace!( + request_id=%protocol::util::id_to_string(&request_id), + ack_index=?ack.index, + "received WebSocketMessageAck from runner" + ); + shared_state + .ack_pending_websocket_messages(request_id, ack.index) + .await?; + Ok(None) + } + protocol::mk2::ToServerTunnelMessageKind::ToServerWebSocketClose(close) => { + tracing::debug!(?close, 
"server closed websocket"); + + if can_hibernate && close.hibernate { + Err(WebSocketServiceHibernate.build()) + } else { + Ok(Some(LifecycleResult::ServerClose(close))) + } + } + _ => Ok(None), + } +} diff --git a/engine/packages/pegboard-kv-channel/src/lib.rs b/engine/packages/pegboard-kv-channel/src/lib.rs index 79a45fcf2a..3a13b319c2 100644 --- a/engine/packages/pegboard-kv-channel/src/lib.rs +++ b/engine/packages/pegboard-kv-channel/src/lib.rs @@ -7,8 +7,8 @@ mod metrics; use std::collections::{HashMap, HashSet}; -use std::sync::atomic::{AtomicI64, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicI64, Ordering}; use std::time::{Duration, Instant}; use anyhow::{Context, Result}; @@ -21,12 +21,10 @@ use hyper::{Response, StatusCode}; use hyper_tungstenite::tungstenite::Message; use pegboard::actor_kv; use rivet_guard_core::{ - ResponseBody, WebSocketHandle, custom_serve::CustomServeTrait, - request_context::RequestContext, + ResponseBody, WebSocketHandle, custom_serve::CustomServeTrait, request_context::RequestContext, }; use tokio::sync::{Mutex, mpsc, watch}; use tokio_tungstenite::tungstenite::protocol::frame::CloseFrame; -use uuid::Uuid; pub use rivet_kv_channel_protocol as protocol; @@ -41,27 +39,13 @@ const KEY_WRAPPER_OVERHEAD: usize = 2; /// Prevents a malicious client from exhausting memory via unbounded actor_channels. const MAX_ACTORS_PER_CONNECTION: usize = 1000; -/// Shared state across all KV channel connections. -pub struct KvChannelState { - /// Maps actor_id string to the connection_id holding the single-writer lock and a reference - /// to that connection's open_actors set. The Arc reference allows lock eviction to remove the - /// actor from the old connection's set without acquiring the global lock on the KV hot path. 
- actor_locks: Mutex>>)>>, -} - pub struct PegboardKvChannelCustomServe { ctx: StandaloneCtx, - state: Arc, } impl PegboardKvChannelCustomServe { pub fn new(ctx: StandaloneCtx) -> Self { - Self { - ctx, - state: Arc::new(KvChannelState { - actor_locks: Mutex::new(HashMap::new()), - }), - } + Self { ctx } } } @@ -90,19 +74,15 @@ impl CustomServeTrait for PegboardKvChannelCustomServe { _after_hibernation: bool, ) -> Result> { let ctx = self.ctx.with_ray(req_ctx.ray_id(), req_ctx.req_id())?; - let state = self.state.clone(); // Parse URL params. let url = url::Url::parse(&format!("ws://placeholder{}", req_ctx.path())) .context("failed to parse WebSocket URL")?; - let params: HashMap = url - .query_pairs() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect(); // Validate protocol version. - let protocol_version: u32 = params - .get("protocol_version") + let protocol_version: u32 = url + .query_pairs() + .find_map(|(n, v)| (n == "protocol_version").then_some(v)) .context("missing protocol_version query param")? .parse() .context("invalid protocol_version")?; @@ -113,10 +93,11 @@ impl CustomServeTrait for PegboardKvChannelCustomServe { ); // Resolve namespace. - let namespace_name = params - .get("namespace") + let namespace_name = url + .query_pairs() + .find_map(|(n, v)| (n == "namespace").then_some(v)) .context("missing namespace query param")? - .clone(); + .to_string(); let namespace = ctx .op(namespace::ops::resolve_for_name_global::Input { name: namespace_name.clone(), @@ -126,44 +107,24 @@ impl CustomServeTrait for PegboardKvChannelCustomServe { .ok_or_else(|| namespace::errors::Namespace::NotFound.build()) .with_context(|| format!("namespace not found: {namespace_name}"))?; - // Assign connection ID. Uses UUID to eliminate any possibility of ID collision. 
- let conn_id = Uuid::new_v4(); let namespace_id = namespace.namespace_id; - tracing::info!(%conn_id, %namespace_id, "kv channel connection established"); + tracing::info!(%namespace_id, "kv channel connection established"); - // Track actors opened by this connection for cleanup on disconnect. + // Track actors opened by this connection. let open_actors: Arc>> = Arc::new(Mutex::new(HashSet::new())); let last_pong_ts = Arc::new(AtomicI64::new(util::timestamp::now())); - // Run the connection loop. Any error triggers cleanup below. let result = run_connection( ctx.clone(), - state.clone(), ws_handle, - conn_id, namespace_id, - open_actors.clone(), + open_actors, last_pong_ts, ) .await; - // Release all locks held by this connection. Only remove entries where the lock is still - // held by this conn_id, since another connection may have evicted it via ActorOpenRequest. - { - let open = open_actors.lock().await; - let mut locks = state.actor_locks.lock().await; - for actor_id in open.iter() { - if let Some((lock_conn, _)) = locks.get(actor_id) { - if *lock_conn == conn_id { - locks.remove(actor_id); - tracing::debug!(%conn_id, %actor_id, "released actor lock on disconnect"); - } - } - } - } - - tracing::info!(%conn_id, "kv channel connection closed"); + tracing::info!("kv channel connection closed"); result.map(|_| None) } @@ -173,9 +134,7 @@ impl CustomServeTrait for PegboardKvChannelCustomServe { async fn run_connection( ctx: StandaloneCtx, - state: Arc, ws_handle: WebSocketHandle, - conn_id: Uuid, namespace_id: Id, open_actors: Arc>>, last_pong_ts: Arc, @@ -203,9 +162,7 @@ async fn run_connection( // Run message loop. 
let msg_result = message_loop( &ctx, - &state, &ws_handle, - conn_id, namespace_id, &open_actors, &last_pong_ts, @@ -253,9 +210,7 @@ async fn ping_task( async fn message_loop( ctx: &StandaloneCtx, - state: &Arc, ws_handle: &WebSocketHandle, - conn_id: Uuid, namespace_id: Id, open_actors: &Arc>>, last_pong_ts: &AtomicI64, @@ -270,7 +225,7 @@ async fn message_loop( // parallelism. Do not use tokio::spawn per request as that would break // optimistic pipelining and journal write ordering. // See docs-internal/engine/NATIVE_SQLITE_REVIEW_FINDINGS.md Finding 2. - let mut actor_channels: HashMap> = + let mut actor_channels: HashMap> = HashMap::new(); let mut actor_tasks = tokio::task::JoinSet::new(); @@ -300,9 +255,7 @@ async fn message_loop( Message::Binary(data) => { handle_binary_message( ctx, - state, ws_handle, - conn_id, namespace_id, open_actors, last_pong_ts, @@ -332,14 +285,12 @@ async fn message_loop( async fn handle_binary_message( ctx: &StandaloneCtx, - state: &Arc, ws_handle: &WebSocketHandle, - conn_id: Uuid, namespace_id: Id, open_actors: &Arc>>, last_pong_ts: &AtomicI64, data: &[u8], - actor_channels: &mut HashMap>, + actor_channels: &mut HashMap>, actor_tasks: &mut tokio::task::JoinSet<()>, ) -> Result<()> { let msg = match protocol::decode_to_server(data) { @@ -355,11 +306,11 @@ async fn handle_binary_message( }; match msg { - protocol::ToServer::ToServerPong(pong) => { + protocol::ToRivet::ToRivetPong(pong) => { last_pong_ts.store(util::timestamp::now(), Ordering::Relaxed); tracing::trace!(ts = pong.ts, "received pong"); } - protocol::ToServer::ToServerRequest(req) => { + protocol::ToRivet::ToRivetRequest(req) => { let is_close = matches!(req.data, protocol::RequestData::ActorCloseRequest); let actor_id = req.actor_id.clone(); let request_id = req.request_id; @@ -369,9 +320,7 @@ async fn handle_binary_message( let (tx, rx) = mpsc::channel(64); actor_tasks.spawn(actor_request_task( Clone::clone(ctx), - Clone::clone(state), Clone::clone(ws_handle), - 
conn_id, namespace_id, Clone::clone(open_actors), rx, @@ -401,10 +350,7 @@ async fn handle_binary_message( send_response( ws_handle, request_id, - error_response( - "internal_error", - "internal error", - ), + error_response("internal_error", "internal error"), ) .await; } @@ -427,16 +373,13 @@ async fn handle_binary_message( /// dropped (connection end) or after processing an ActorCloseRequest. async fn actor_request_task( ctx: StandaloneCtx, - state: Arc, ws_handle: WebSocketHandle, - conn_id: Uuid, namespace_id: Id, open_actors: Arc>>, - mut rx: mpsc::Receiver, + mut rx: mpsc::Receiver, ) { - // Cached actor resolution. Populated on first KV request, reused for all - // subsequent requests. Actor name is immutable so this never goes stale. - let mut cached_actor: Option<(Id, String)> = None; + // Cache keyed by actor id since a single connection multiplexes many actors. + let mut cached_actors: HashMap = HashMap::new(); while let Some(req) = rx.recv().await { let is_close = matches!(req.data, protocol::RequestData::ActorCloseRequest); @@ -445,30 +388,21 @@ async fn actor_request_task( // Open/close are lifecycle ops that don't need a resolved actor. protocol::RequestData::ActorOpenRequest | protocol::RequestData::ActorCloseRequest => { - handle_request(&ctx, &state, conn_id, namespace_id, &open_actors, &req).await + handle_request(&open_actors, &req).await } // KV ops: resolve once, cache, reuse. _ => { let is_open = open_actors.lock().await.contains(&req.actor_id); if !is_open { - let locks = state.actor_locks.lock().await; - if locks.contains_key(&req.actor_id) { - error_response( - "actor_locked", - "actor is locked by another connection", - ) - } else { - error_response( - "actor_not_open", - "actor is not opened on this connection", - ) - } + error_response( + "actor_not_open", + "actor is not opened on this connection", + ) } else { - // Lazy-resolve and cache. 
- if cached_actor.is_none() { + if !cached_actors.contains_key(&req.actor_id) { match resolve_actor(&ctx, &req.actor_id, namespace_id).await { Ok(v) => { - cached_actor = Some(v); + cached_actors.insert(req.actor_id.clone(), v); } Err(resp) => { // Don't cache failures. Next request will retry. @@ -480,7 +414,8 @@ async fn actor_request_task( } } } - let (parsed_id, actor_name) = cached_actor.as_ref().unwrap(); + let (parsed_id, actor_name) = + cached_actors.get(&req.actor_id).unwrap(); let recipient = actor_kv::Recipient { actor_id: *parsed_id, @@ -518,15 +453,9 @@ async fn actor_request_task( } /// Encode and send a response to the client. Logs warnings on failure. -async fn send_response( - ws_handle: &WebSocketHandle, - request_id: u32, - data: protocol::ResponseData, -) { - let response = protocol::ToClient::ToClientResponse(protocol::ToClientResponse { - request_id, - data, - }); +async fn send_response(ws_handle: &WebSocketHandle, request_id: u32, data: protocol::ResponseData) { + let response = + protocol::ToClient::ToClientResponse(protocol::ToClientResponse { request_id, data }); match protocol::encode_to_client(&response) { Ok(encoded) => { @@ -545,19 +474,15 @@ async fn send_response( /// Handles actor lifecycle requests (open/close). KV operations are handled /// directly in `actor_request_task` with cached actor resolution. 
async fn handle_request( - _ctx: &StandaloneCtx, - state: &KvChannelState, - conn_id: Uuid, - _namespace_id: Id, open_actors: &Arc>>, - req: &protocol::ToServerRequest, + req: &protocol::ToRivetRequest, ) -> protocol::ResponseData { match &req.data { protocol::RequestData::ActorOpenRequest => { - handle_actor_open(state, conn_id, open_actors, &req.actor_id).await + handle_actor_open(open_actors, &req.actor_id).await } protocol::RequestData::ActorCloseRequest => { - handle_actor_close(state, conn_id, open_actors, &req.actor_id).await + handle_actor_close(open_actors, &req.actor_id).await } _ => unreachable!("KV operations are handled in actor_request_task"), } @@ -566,8 +491,6 @@ async fn handle_request( // MARK: Actor open/close async fn handle_actor_open( - state: &KvChannelState, - conn_id: Uuid, open_actors: &Arc>>, actor_id: &str, ) -> protocol::ResponseData { @@ -577,54 +500,22 @@ async fn handle_actor_open( if current_count >= MAX_ACTORS_PER_CONNECTION { return error_response( "too_many_actors", - &format!( - "connection has too many open actors (max {MAX_ACTORS_PER_CONNECTION})" - ), - ); - } - } - - let mut locks = state.actor_locks.lock().await; - - // If the actor is locked by a different connection, unconditionally evict the old lock. - // This handles reconnection scenarios where the server hasn't detected the old connection's - // disconnect yet. The old connection's next KV request will fail the fast-path check - // (open_actors.contains) and return actor_not_open. - // See docs-internal/engine/NATIVE_SQLITE_REVIEW_FINDINGS.md Finding 4. 
- if let Some((existing_conn, old_open_actors)) = locks.get(actor_id) { - if *existing_conn != conn_id { - old_open_actors.lock().await.remove(actor_id); - tracing::info!( - %conn_id, - old_conn_id = %existing_conn, - %actor_id, - "evicted stale actor lock from old connection" + &format!("connection has too many open actors (max {MAX_ACTORS_PER_CONNECTION})"), ); } } - locks.insert(actor_id.to_string(), (conn_id, open_actors.clone())); open_actors.lock().await.insert(actor_id.to_string()); - tracing::debug!(%conn_id, %actor_id, "actor lock acquired"); + tracing::debug!(%actor_id, "actor opened"); protocol::ResponseData::ActorOpenResponse } async fn handle_actor_close( - state: &KvChannelState, - conn_id: Uuid, open_actors: &Arc>>, actor_id: &str, ) -> protocol::ResponseData { - let mut locks = state.actor_locks.lock().await; - - if let Some((lock_conn, _)) = locks.get(actor_id) { - if *lock_conn == conn_id { - locks.remove(actor_id); - open_actors.lock().await.remove(actor_id); - tracing::debug!(%conn_id, %actor_id, "actor lock released"); - } - } - + open_actors.lock().await.remove(actor_id); + tracing::debug!(%actor_id, "actor closed"); protocol::ResponseData::ActorCloseResponse } @@ -636,8 +527,12 @@ async fn handle_kv_get( body: &protocol::KvGetRequest, ) -> protocol::ResponseData { let start = Instant::now(); - metrics::KV_CHANNEL_REQUESTS_TOTAL.with_label_values(&["get"]).inc(); - metrics::KV_CHANNEL_REQUEST_KEYS.with_label_values(&["get"]).observe(body.keys.len() as f64); + metrics::KV_CHANNEL_REQUESTS_TOTAL + .with_label_values(&["get"]) + .inc(); + metrics::KV_CHANNEL_REQUEST_KEYS + .with_label_values(&["get"]) + .observe(body.keys.len() as f64); if let Err(resp) = validate_keys(&body.keys) { return resp; @@ -654,7 +549,9 @@ async fn handle_kv_get( } Err(err) => internal_error(&err), }; - metrics::KV_CHANNEL_REQUEST_DURATION.with_label_values(&["get"]).observe(start.elapsed().as_secs_f64()); + metrics::KV_CHANNEL_REQUEST_DURATION + 
.with_label_values(&["get"]) + .observe(start.elapsed().as_secs_f64()); result } @@ -664,8 +561,12 @@ async fn handle_kv_put( body: &protocol::KvPutRequest, ) -> protocol::ResponseData { let start = Instant::now(); - metrics::KV_CHANNEL_REQUESTS_TOTAL.with_label_values(&["put"]).inc(); - metrics::KV_CHANNEL_REQUEST_KEYS.with_label_values(&["put"]).observe(body.keys.len() as f64); + metrics::KV_CHANNEL_REQUESTS_TOTAL + .with_label_values(&["put"]) + .inc(); + metrics::KV_CHANNEL_REQUEST_KEYS + .with_label_values(&["put"]) + .observe(body.keys.len() as f64); // Validate keys/values length match. if body.keys.len() != body.values.len() { @@ -687,7 +588,10 @@ async fn handle_kv_put( if key.len() + KEY_WRAPPER_OVERHEAD > MAX_KEY_SIZE { return error_response( "key_too_large", - &format!("key is too long (max {} bytes)", MAX_KEY_SIZE - KEY_WRAPPER_OVERHEAD), + &format!( + "key is too long (max {} bytes)", + MAX_KEY_SIZE - KEY_WRAPPER_OVERHEAD + ), ); } } @@ -700,7 +604,11 @@ async fn handle_kv_put( } } - let payload_size: usize = body.keys.iter().map(|k| k.len() + KEY_WRAPPER_OVERHEAD).sum::() + let payload_size: usize = body + .keys + .iter() + .map(|k| k.len() + KEY_WRAPPER_OVERHEAD) + .sum::() + body.values.iter().map(|v| v.len()).sum::(); if payload_size > MAX_PUT_PAYLOAD_SIZE { return error_response( @@ -717,7 +625,8 @@ async fn handle_kv_put( Err(err) => return internal_error(&err), }; - let result = match actor_kv::put(&*udb, recipient, body.keys.clone(), body.values.clone()).await { + let result = match actor_kv::put(&*udb, recipient, body.keys.clone(), body.values.clone()).await + { Ok(()) => protocol::ResponseData::KvPutResponse, Err(err) => { let rivet_err = rivet_error::RivetError::extract(&err); @@ -728,7 +637,9 @@ async fn handle_kv_put( } } }; - metrics::KV_CHANNEL_REQUEST_DURATION.with_label_values(&["put"]).observe(start.elapsed().as_secs_f64()); + metrics::KV_CHANNEL_REQUEST_DURATION + .with_label_values(&["put"]) + 
.observe(start.elapsed().as_secs_f64()); result } @@ -738,8 +649,12 @@ async fn handle_kv_delete( body: &protocol::KvDeleteRequest, ) -> protocol::ResponseData { let start = Instant::now(); - metrics::KV_CHANNEL_REQUESTS_TOTAL.with_label_values(&["delete"]).inc(); - metrics::KV_CHANNEL_REQUEST_KEYS.with_label_values(&["delete"]).observe(body.keys.len() as f64); + metrics::KV_CHANNEL_REQUESTS_TOTAL + .with_label_values(&["delete"]) + .inc(); + metrics::KV_CHANNEL_REQUEST_KEYS + .with_label_values(&["delete"]) + .observe(body.keys.len() as f64); if let Err(resp) = validate_keys(&body.keys) { return resp; @@ -754,7 +669,9 @@ async fn handle_kv_delete( Ok(()) => protocol::ResponseData::KvDeleteResponse, Err(err) => internal_error(&err), }; - metrics::KV_CHANNEL_REQUEST_DURATION.with_label_values(&["delete"]).observe(start.elapsed().as_secs_f64()); + metrics::KV_CHANNEL_REQUEST_DURATION + .with_label_values(&["delete"]) + .observe(start.elapsed().as_secs_f64()); result } @@ -764,17 +681,25 @@ async fn handle_kv_delete_range( body: &protocol::KvDeleteRangeRequest, ) -> protocol::ResponseData { let start = Instant::now(); - metrics::KV_CHANNEL_REQUESTS_TOTAL.with_label_values(&["delete_range"]).inc(); + metrics::KV_CHANNEL_REQUESTS_TOTAL + .with_label_values(&["delete_range"]) + .inc(); if body.start.len() + KEY_WRAPPER_OVERHEAD > MAX_KEY_SIZE { return error_response( "key_too_large", - &format!("start key is too long (max {} bytes)", MAX_KEY_SIZE - KEY_WRAPPER_OVERHEAD), + &format!( + "start key is too long (max {} bytes)", + MAX_KEY_SIZE - KEY_WRAPPER_OVERHEAD + ), ); } if body.end.len() + KEY_WRAPPER_OVERHEAD > MAX_KEY_SIZE { return error_response( "key_too_large", - &format!("end key is too long (max {} bytes)", MAX_KEY_SIZE - KEY_WRAPPER_OVERHEAD), + &format!( + "end key is too long (max {} bytes)", + MAX_KEY_SIZE - KEY_WRAPPER_OVERHEAD + ), ); } @@ -783,11 +708,20 @@ async fn handle_kv_delete_range( Err(err) => return internal_error(&err), }; - let result = match 
actor_kv::delete_range(&*udb, recipient, body.start.clone(), body.end.clone()).await { + let result = match actor_kv::delete_range( + &*udb, + recipient, + body.start.clone(), + body.end.clone(), + ) + .await + { Ok(()) => protocol::ResponseData::KvDeleteResponse, Err(err) => internal_error(&err), }; - metrics::KV_CHANNEL_REQUEST_DURATION.with_label_values(&["delete_range"]).observe(start.elapsed().as_secs_f64()); + metrics::KV_CHANNEL_REQUEST_DURATION + .with_label_values(&["delete_range"]) + .observe(start.elapsed().as_secs_f64()); result } @@ -795,28 +729,25 @@ async fn handle_kv_delete_range( /// Look up an actor by ID and return the parsed ID and actor name. /// -/// Defense-in-depth: verifies the actor belongs to the authenticated namespace. -/// The admin_token is a global credential, so this is not strictly necessary -/// today, but prevents cross-namespace access if a less-privileged auth -/// mechanism is introduced in the future. +/// Verifies the actor belongs to the authenticated namespace. async fn resolve_actor( ctx: &StandaloneCtx, actor_id: &str, expected_namespace_id: Id, ) -> std::result::Result<(Id, String), protocol::ResponseData> { - let parsed_id = Id::parse(actor_id).map_err(|err| { - error_response( - "actor_not_found", - &format!("invalid actor id: {err}"), - ) - })?; + let parsed_id = Id::parse(actor_id) + .map_err(|err| error_response("actor_not_found", &format!("invalid actor id: {err}")))?; let actor = ctx - .op(pegboard::ops::actor::get_for_runner::Input { - actor_id: parsed_id, + .op(pegboard::ops::actor::get::Input { + actor_ids: vec![parsed_id], + fetch_error: false, }) .await - .map_err(|err| internal_error(&err))?; + .map_err(|err| internal_error(&err))? 
+ .actors + .into_iter() + .next(); match actor { Some(actor) => { @@ -847,7 +778,10 @@ fn validate_keys(keys: &[protocol::KvKey]) -> std::result::Result<(), protocol:: if key.len() + KEY_WRAPPER_OVERHEAD > MAX_KEY_SIZE { return Err(error_response( "key_too_large", - &format!("key is too long (max {} bytes)", MAX_KEY_SIZE - KEY_WRAPPER_OVERHEAD), + &format!( + "key is too long (max {} bytes)", + MAX_KEY_SIZE - KEY_WRAPPER_OVERHEAD + ), )); } } diff --git a/engine/packages/pegboard-outbound/src/lib.rs b/engine/packages/pegboard-outbound/src/lib.rs index d45a1726ec..d2dfb2ae6e 100644 --- a/engine/packages/pegboard-outbound/src/lib.rs +++ b/engine/packages/pegboard-outbound/src/lib.rs @@ -166,20 +166,23 @@ async fn handle(ctx: &StandaloneCtx, packet: protocol::ToOutbound) -> Result<()> tracing::debug!(?namespace_id, %pool_name, ?actor_id, ?generation, "received outbound request"); - // Check pool - let (pool_res, namespace_res) = tokio::try_join!( + let db = ctx.udb()?; + let (namespace_res, pool_res, preloaded_kv) = tokio::try_join!( + ctx.op(namespace::ops::get_global::Input { + namespace_ids: vec![namespace_id], + }), ctx.op(pegboard::ops::runner_config::get::Input { runners: vec![(namespace_id, pool_name.clone())], bypass_cache: false, }), - ctx.op(namespace::ops::get_global::Input { - namespace_ids: vec![namespace_id], - }), + pegboard::actor_kv::preload::fetch_preloaded_kv( + &db, + ctx.config().pegboard(), + actor_id, + namespace_id, + &actor_config.name, + ), )?; - let Some(pool) = pool_res.into_iter().next() else { - tracing::debug!("pool does not exist, ending outbound handler"); - return Ok(()); - }; let Some(namespace) = namespace_res.into_iter().next() else { tracing::error!("namespace not found, ending outbound handler"); report_error( @@ -191,16 +194,24 @@ async fn handle(ctx: &StandaloneCtx, packet: protocol::ToOutbound) -> Result<()> .await; return Ok(()); }; + let Some(pool) = pool_res.into_iter().next() else { + tracing::debug!("pool does not exist, 
ending outbound handler"); + return Ok(()); + }; - let udb = ctx.udb()?; - let preloaded_kv = pegboard::actor_kv::preload::fetch_preloaded_kv( - &udb, - ctx.config().pegboard(), - actor_id, - namespace_id, - &actor_config.name, - ) - .await?; + let RunnerConfigKind::Serverless { + url, + headers, + request_lifespan, + .. + } = pool.config.kind + else { + tracing::warn!( + ?actor_id, + "config no longer serverless, ignoring outbound allocation" + ); + return Ok(()); + }; let payload = versioned::ToEnvoy::wrap_latest(protocol::ToEnvoy::ToEnvoyCommands(vec![ protocol::CommandWrapper { @@ -216,20 +227,6 @@ async fn handle(ctx: &StandaloneCtx, packet: protocol::ToOutbound) -> Result<()> ])) .serialize_with_embedded_version(pool.protocol_version.unwrap_or(PROTOCOL_VERSION))?; - let RunnerConfigKind::Serverless { - url, - headers, - request_lifespan, - .. - } = pool.config.kind - else { - tracing::warn!( - ?actor_id, - "config no longer serverless, ignoring outbound allocation" - ); - return Ok(()); - }; - // Send ack to actor wf before starting an outbound req ctx.signal(pegboard::workflows::actor2::Allocated { generation }) .to_workflow::() @@ -252,6 +249,10 @@ async fn handle(ctx: &StandaloneCtx, packet: protocol::ToOutbound) -> Result<()> &url, headers, request_lifespan, + ctx.config() + .auth + .as_ref() + .map(|a| a.admin_token.read().as_str()), ) .await; @@ -274,15 +275,13 @@ async fn serverless_outbound_req( url: &str, headers: HashMap, request_lifespan: u32, + token: Option<&str>, ) -> Result<()> { let current_dc = ctx.config().topology().current_dc()?; let mut term_signal = TermSignal::get(); - let token = if let Some(auth) = &ctx.config().auth { - Some(( - X_RIVET_TOKEN, - HeaderValue::try_from(auth.admin_token.read())?, - )) + let token = if let Some(token) = token { + Some((X_RIVET_TOKEN, HeaderValue::try_from(token)?)) } else { None }; diff --git a/engine/packages/pegboard-runner/src/lib.rs b/engine/packages/pegboard-runner/src/lib.rs index 
db97f070fd..fcd7a7f649 100644 --- a/engine/packages/pegboard-runner/src/lib.rs +++ b/engine/packages/pegboard-runner/src/lib.rs @@ -26,6 +26,7 @@ mod ws_to_tunnel_task; enum LifecycleResult { Closed, Aborted, + Evicted, } pub struct PegboardRunnerWsCustomServe { @@ -222,34 +223,40 @@ impl CustomServeTrait for PegboardRunnerWsCustomServe { ); // Determine single result from all tasks - let lifecycle_res = match (tunnel_to_ws_res, ws_to_tunnel_res, ping_res) { + let mut lifecycle_res = match (tunnel_to_ws_res, ws_to_tunnel_res, ping_res) { // Prefer error (Err(err), _, _) => Err(err), (_, Err(err), _) => Err(err), (_, _, Err(err)) => Err(err), - // Prefer non aborted result if both succeed + // Prefer non aborted result (Ok(res), Ok(LifecycleResult::Aborted), _) => Ok(res), (Ok(LifecycleResult::Aborted), Ok(res), _) => Ok(res), // Unlikely case (res, _, _) => res, }; - // Make runner immediately ineligible when it disconnects - let update_alloc_res = self - .ctx - .op(pegboard::ops::runner::update_alloc_idx::Input { - runners: vec![pegboard::ops::runner::update_alloc_idx::Runner { - runner_id: conn.runner_id, - action: Action::ClearIdx, - }], - }) - .await; - if let Err(err) = update_alloc_res { - tracing::error!( - runner_id=?conn.runner_id, - ?err, - "critical: failed to evict runner from allocation index during disconnect" - ); + if let Ok(LifecycleResult::Evicted) = &lifecycle_res { + lifecycle_res = Err(errors::WsError::Eviction.build()); + } + // Clear alloc idx if not evicted + else { + // Make runner immediately ineligible when it disconnects + let update_alloc_res = self + .ctx + .op(pegboard::ops::runner::update_alloc_idx::Input { + runners: vec![pegboard::ops::runner::update_alloc_idx::Runner { + runner_id: conn.runner_id, + action: Action::ClearIdx, + }], + }) + .await; + if let Err(err) = update_alloc_res { + tracing::error!( + runner_id=?conn.runner_id, + ?err, + "failed to evict runner from allocation index during disconnect" + ); + } } 
tracing::debug!(%topic, "runner websocket closed"); diff --git a/engine/packages/pegboard-runner/src/metrics.rs b/engine/packages/pegboard-runner/src/metrics.rs index 931d0c9077..c5edab2c14 100644 --- a/engine/packages/pegboard-runner/src/metrics.rs +++ b/engine/packages/pegboard-runner/src/metrics.rs @@ -31,13 +31,13 @@ lazy_static::lazy_static! { ).unwrap(); pub static ref EVENT_MULTIPLEXER_COUNT: IntGauge = register_int_gauge_with_registry!( - "pegboard_event_multiplexer_count", + "pegboard_runner_event_multiplexer_count", "Number of active actor event multiplexers.", *REGISTRY ).unwrap(); pub static ref INGESTED_EVENTS_TOTAL: IntCounter = register_int_counter_with_registry!( - "pegboard_ingested_events_total", + "pegboard_runner_ingested_events_total", "Count of actor events.", *REGISTRY ).unwrap(); diff --git a/engine/packages/pegboard-runner/src/tunnel_to_ws_task.rs b/engine/packages/pegboard-runner/src/tunnel_to_ws_task.rs index 1c8f2168ba..6763e27519 100644 --- a/engine/packages/pegboard-runner/src/tunnel_to_ws_task.rs +++ b/engine/packages/pegboard-runner/src/tunnel_to_ws_task.rs @@ -66,7 +66,7 @@ async fn recv_msg( ]) .inc(); - return Err(errors::WsError::Eviction.build()); + return Ok(Err(LifecycleResult::Evicted)); } _ = tunnel_to_ws_abort_rx.changed() => { tracing::debug!("task aborted"); diff --git a/engine/packages/pegboard-runner/src/ws_to_tunnel_task.rs b/engine/packages/pegboard-runner/src/ws_to_tunnel_task.rs index 861bf440ee..def1a329b6 100644 --- a/engine/packages/pegboard-runner/src/ws_to_tunnel_task.rs +++ b/engine/packages/pegboard-runner/src/ws_to_tunnel_task.rs @@ -6,8 +6,8 @@ use gas::prelude::*; use hyper_tungstenite::tungstenite::Message; use pegboard::actor_kv; use pegboard::pubsub_subjects::GatewayReceiverSubject; -use rivet_guard_core::websocket_handle::WebSocketReceiver; use rivet_envoy_protocol as ep; +use rivet_guard_core::websocket_handle::WebSocketReceiver; use rivet_runner_protocol::{self as protocol, PROTOCOL_MK2_VERSION, 
versioned}; use std::sync::{Arc, atomic::Ordering}; use tokio::sync::{Mutex, MutexGuard, watch}; @@ -106,7 +106,7 @@ async fn recv_msg( ]) .inc(); - return Err(errors::WsError::Eviction.build()); + return Ok(Err(LifecycleResult::Evicted)); } _ = ws_to_tunnel_abort_rx.changed() => { tracing::debug!("task aborted"); @@ -242,9 +242,9 @@ async fn handle_message_mk2( values, metadata: metadata .into_iter() - .map(|x| protocol::mk2::KvMetadata { - version: x.version, - update_ts: x.update_ts, + .map(|m| protocol::mk2::KvMetadata { + version: m.version, + update_ts: m.update_ts, }) .collect(), }, @@ -277,16 +277,16 @@ async fn handle_message_mk2( protocol::mk2::KvListQuery::KvListAllQuery => { ep::KvListQuery::KvListAllQuery } - protocol::mk2::KvListQuery::KvListRangeQuery(q) => { + protocol::mk2::KvListQuery::KvListRangeQuery(x) => { ep::KvListQuery::KvListRangeQuery(ep::KvListRangeQuery { - start: q.start, - end: q.end, - exclusive: q.exclusive, + start: x.start, + end: x.end, + exclusive: x.exclusive, }) } - protocol::mk2::KvListQuery::KvListPrefixQuery(q) => { + protocol::mk2::KvListQuery::KvListPrefixQuery(x) => { ep::KvListQuery::KvListPrefixQuery(ep::KvListPrefixQuery { - key: q.key, + key: x.key, }) } }, @@ -310,9 +310,9 @@ async fn handle_message_mk2( values, metadata: metadata .into_iter() - .map(|x| protocol::mk2::KvMetadata { - version: x.version, - update_ts: x.update_ts, + .map(|m| protocol::mk2::KvMetadata { + version: m.version, + update_ts: m.update_ts, }) .collect(), }, @@ -636,10 +636,10 @@ async fn handle_message_mk1(ctx: &StandaloneCtx, conn: &Conn, msg: Bytes) -> Res } protocol::KvListQuery::KvListRangeQuery(q) => { ep::KvListQuery::KvListRangeQuery(ep::KvListRangeQuery { - start: q.start, - end: q.end, - exclusive: q.exclusive, - }) + start: q.start, + end: q.end, + exclusive: q.exclusive, + }) } protocol::KvListQuery::KvListPrefixQuery(q) => { ep::KvListQuery::KvListPrefixQuery(ep::KvListPrefixQuery { @@ -841,6 +841,14 @@ async fn 
handle_tunnel_message_mk2( ctx: &StandaloneCtx, msg: protocol::mk2::ToServerTunnelMessage, ) -> Result<()> { + // Log the gateway-issued message id so gateway and runner traces can be matched during WS races. + tracing::trace!( + request_id = %protocol::util::id_to_string(&msg.message_id.request_id), + message_index = msg.message_id.message_index, + kind = ?msg.message_kind, + "pegboard-runner received tunnel message from runner" + ); + // Extract inner data length before consuming msg let inner_data_len = tunnel_message_inner_data_len_mk2(&msg.message_kind); diff --git a/engine/packages/pegboard/Cargo.toml b/engine/packages/pegboard/Cargo.toml index f069963674..d04bf7f095 100644 --- a/engine/packages/pegboard/Cargo.toml +++ b/engine/packages/pegboard/Cargo.toml @@ -47,6 +47,7 @@ vbare.workspace = true [dev-dependencies] portpicker.workspace = true +test-snapshot-gen.workspace = true rivet-config.workspace = true rivet-test-deps.workspace = true tokio.workspace = true diff --git a/engine/packages/pegboard/src/actor_kv/mod.rs b/engine/packages/pegboard/src/actor_kv/mod.rs index d4689e4268..683f8ae6f4 100644 --- a/engine/packages/pegboard/src/actor_kv/mod.rs +++ b/engine/packages/pegboard/src/actor_kv/mod.rs @@ -49,101 +49,108 @@ pub async fn get( keys: Vec, ) -> Result<(Vec, Vec, Vec)> { let start = std::time::Instant::now(); -metrics::ACTOR_KV_KEYS_PER_OP.with_label_values(&["get"]).observe(keys.len() as f64); + metrics::ACTOR_KV_KEYS_PER_OP + .with_label_values(&["get"]) + .observe(keys.len() as f64); validate_keys(&keys)?; - let result = db.run(|tx| { - let keys = keys.clone(); - async move { - let tx = tx.with_subspace(keys::actor_kv::subspace(recipient.actor_id)); - - let mut stream = futures_util::stream::iter(keys) - .map(|key| { - let key_subspace = keys::actor_kv::subspace(recipient.actor_id) - .subspace(&keys::actor_kv::KeyWrapper(key)); - - // Get all sub keys in the key subspace - tx.get_ranges_keyvalues( - universaldb::RangeOption { - mode: 
universaldb::options::StreamingMode::WantAll, - ..key_subspace.range().into() - }, - Serializable, - ) - }) - .flatten(); - - let mut keys = Vec::new(); - let mut values = Vec::new(); - let mut metadata = Vec::new(); - let mut total_size = 0; - let mut current_entry: Option = None; - - loop { - let Some(entry) = stream.try_next().await? else { - break; - }; - - total_size += entry.key().len() + entry.value().len(); - - let key = tx.unpack::(&entry.key())?.key; - - let current_entry = if let Some(inner) = &mut current_entry { - if inner.key != key { - let (key, value, meta) = - std::mem::replace(inner, EntryBuilder::new(key)).build()?; + let result = db + .run(|tx| { + let keys = keys.clone(); + async move { + let tx = tx.with_subspace(keys::actor_kv::subspace(recipient.actor_id)); + + let mut stream = futures_util::stream::iter(keys) + .map(|key| { + let key_subspace = keys::actor_kv::subspace(recipient.actor_id) + .subspace(&keys::actor_kv::KeyWrapper(key)); + + // Get all sub keys in the key subspace + tx.get_ranges_keyvalues( + universaldb::RangeOption { + mode: universaldb::options::StreamingMode::WantAll, + ..key_subspace.range().into() + }, + Serializable, + ) + }) + .flatten(); + + let mut keys = Vec::new(); + let mut values = Vec::new(); + let mut metadata = Vec::new(); + let mut total_size = 0; + let mut current_entry: Option = None; + + loop { + let Some(entry) = stream.try_next().await? 
else { + break; + }; + + total_size += entry.key().len() + entry.value().len(); + + let key = tx.unpack::(&entry.key())?.key; + + let current_entry = if let Some(inner) = &mut current_entry { + if inner.key != key { + let (key, value, meta) = + std::mem::replace(inner, EntryBuilder::new(key)).build()?; + + keys.push(key); + values.push(value); + metadata.push(meta); + } - keys.push(key); - values.push(value); - metadata.push(meta); + inner + } else { + current_entry = Some(EntryBuilder::new(key)); + + current_entry.as_mut().expect("must be set") + }; + + if let Ok(chunk_key) = + tx.unpack::(&entry.key()) + { + current_entry.append_chunk(chunk_key.chunk, entry.value()); + } else if let Ok(metadata_key) = + tx.unpack::(&entry.key()) + { + let value = metadata_key.deserialize(entry.value())?; + + current_entry.append_metadata(value); + } else { + bail!("unexpected sub key"); } + } - inner - } else { - current_entry = Some(EntryBuilder::new(key)); - - current_entry.as_mut().expect("must be set") - }; - - if let Ok(chunk_key) = tx.unpack::(&entry.key()) - { - current_entry.append_chunk(chunk_key.chunk, entry.value()); - } else if let Ok(metadata_key) = - tx.unpack::(&entry.key()) - { - let value = metadata_key.deserialize(entry.value())?; + if let Some(inner) = current_entry { + let (key, value, meta) = inner.build()?; - current_entry.append_metadata(value); - } else { - bail!("unexpected sub key"); + keys.push(key); + values.push(value); + metadata.push(meta); } - } - if let Some(inner) = current_entry { - let (key, value, meta) = inner.build()?; - - keys.push(key); - values.push(value); - metadata.push(meta); + // Total read bytes (rounded up to nearest chunk) + let total_size_chunked = (total_size as u64) + .div_ceil(util::metric::KV_BILLABLE_CHUNK) + * util::metric::KV_BILLABLE_CHUNK; + namespace::keys::metric::inc( + &tx.with_subspace(namespace::keys::subspace()), + recipient.namespace_id, + namespace::keys::metric::Metric::KvRead(recipient.name.clone()), + 
total_size_chunked.try_into().unwrap_or_default(), + ); + + Ok((keys, values, metadata)) } - - // Total read bytes (rounded up to nearest chunk) - let total_size_chunked = (total_size as u64).div_ceil(util::metric::KV_BILLABLE_CHUNK) - * util::metric::KV_BILLABLE_CHUNK; - namespace::keys::metric::inc( - &tx.with_subspace(namespace::keys::subspace()), - recipient.namespace_id, - namespace::keys::metric::Metric::KvRead(recipient.name.clone()), - total_size_chunked.try_into().unwrap_or_default(), - ); - - Ok((keys, values, metadata)) - } - }) - .custom_instrument(tracing::info_span!("kv_get_tx")) - .await - .map_err(Into::::into); - metrics::ACTOR_KV_OPERATION_DURATION.with_label_values(&["get"]).observe(start.elapsed().as_secs_f64()); + }) + .custom_instrument(tracing::info_span!("kv_get_tx")) + .await + .map_err(Into::::into); + metrics::ACTOR_KV_OPERATION_DURATION + .with_label_values(&["get"]) + .observe(start.elapsed().as_secs_f64()); result } @@ -268,79 +275,85 @@ pub async fn put( values: Vec, ) -> Result<()> { let start = std::time::Instant::now(); -metrics::ACTOR_KV_KEYS_PER_OP.with_label_values(&["put"]).observe(keys.len() as f64); + metrics::ACTOR_KV_KEYS_PER_OP + .with_label_values(&["put"]) + .observe(keys.len() as f64); let keys = &keys; let values = &values; - let result = db.run(|tx| { - async move { - let total_size = estimate_kv_size(&tx, recipient.actor_id).await? as usize; - - validate_entries(&keys, &values, total_size)?; - - let subspace = &keys::actor_kv::subspace(recipient.actor_id); - let tx = tx.with_subspace(subspace.clone()); - let now = util::timestamp::now(); - - // TODO: Include metadata size? 
- // Total written bytes (rounded up to nearest chunk) - let total_size = keys.iter().fold(0, |s, key| s + key.len()) - + values.iter().fold(0, |s, value| s + value.len()); - let total_size_chunked = (total_size as u64).div_ceil(util::metric::KV_BILLABLE_CHUNK) - * util::metric::KV_BILLABLE_CHUNK; - namespace::keys::metric::inc( - &tx.with_subspace(namespace::keys::subspace()), - recipient.namespace_id, - namespace::keys::metric::Metric::KvWrite(recipient.name.clone()), - total_size_chunked.try_into().unwrap_or_default(), - ); - - futures_util::stream::iter(0..keys.len()) - .map(|i| { - let tx = tx.clone(); - async move { - // TODO: Costly clone - let key = keys::actor_kv::KeyWrapper( - keys.get(i).context("index should exist")?.clone(), - ); - let value = values.get(i).context("index should exist")?; - // Clear previous key data before setting - tx.clear_subspace_range(&subspace.subspace(&key)); - - // Set metadata - tx.write( - &keys::actor_kv::EntryMetadataKey::new(key.clone()), - ep::KvMetadata { - version: VERSION.as_bytes().to_vec(), - update_ts: now, - }, - )?; - - // Set key data in chunks - for start in (0..value.len()).step_by(VALUE_CHUNK_SIZE) { - let idx = start / VALUE_CHUNK_SIZE; - let end = (start + VALUE_CHUNK_SIZE).min(value.len()); - - tx.set( - &subspace.pack(&keys::actor_kv::EntryValueChunkKey::new( - key.clone(), - idx, - )), - &value.get(start..end).context("bad slice")?, + let result = db + .run(|tx| { + async move { + let total_size = estimate_kv_size(&tx, recipient.actor_id).await? as usize; + + validate_entries(&keys, &values, total_size)?; + + let subspace = &keys::actor_kv::subspace(recipient.actor_id); + let tx = tx.with_subspace(subspace.clone()); + let now = util::timestamp::now(); + + // TODO: Include metadata size? 
+ // Total written bytes (rounded up to nearest chunk) + let total_size = keys.iter().fold(0, |s, key| s + key.len()) + + values.iter().fold(0, |s, value| s + value.len()); + let total_size_chunked = (total_size as u64) + .div_ceil(util::metric::KV_BILLABLE_CHUNK) + * util::metric::KV_BILLABLE_CHUNK; + namespace::keys::metric::inc( + &tx.with_subspace(namespace::keys::subspace()), + recipient.namespace_id, + namespace::keys::metric::Metric::KvWrite(recipient.name.clone()), + total_size_chunked.try_into().unwrap_or_default(), + ); + + futures_util::stream::iter(0..keys.len()) + .map(|i| { + let tx = tx.clone(); + async move { + // TODO: Costly clone + let key = keys::actor_kv::KeyWrapper( + keys.get(i).context("index should exist")?.clone(), ); + let value = values.get(i).context("index should exist")?; + // Clear previous key data before setting + tx.clear_subspace_range(&subspace.subspace(&key)); + + // Set metadata + tx.write( + &keys::actor_kv::EntryMetadataKey::new(key.clone()), + ep::KvMetadata { + version: VERSION.as_bytes().to_vec(), + update_ts: now, + }, + )?; + + // Set key data in chunks + for start in (0..value.len()).step_by(VALUE_CHUNK_SIZE) { + let idx = start / VALUE_CHUNK_SIZE; + let end = (start + VALUE_CHUNK_SIZE).min(value.len()); + + tx.set( + &subspace.pack(&keys::actor_kv::EntryValueChunkKey::new( + key.clone(), + idx, + )), + &value.get(start..end).context("bad slice")?, + ); + } + + Ok(()) } - - Ok(()) - } - }) - .buffer_unordered(32) - .try_collect() - .await - } - }) - .custom_instrument(tracing::info_span!("kv_put_tx")) - .await - .map_err(Into::into); - metrics::ACTOR_KV_OPERATION_DURATION.with_label_values(&["put"]).observe(start.elapsed().as_secs_f64()); + }) + .buffer_unordered(32) + .try_collect() + .await + } + }) + .custom_instrument(tracing::info_span!("kv_put_tx")) + .await + .map_err(Into::into); + metrics::ACTOR_KV_OPERATION_DURATION + .with_label_values(&["put"]) + .observe(start.elapsed().as_secs_f64()); result } @@ -352,38 
+365,44 @@ pub async fn delete( keys: Vec, ) -> Result<()> { let start = std::time::Instant::now(); -metrics::ACTOR_KV_KEYS_PER_OP.with_label_values(&["delete"]).observe(keys.len() as f64); + metrics::ACTOR_KV_KEYS_PER_OP + .with_label_values(&["delete"]) + .observe(keys.len() as f64); validate_keys(&keys)?; let keys = &keys; - let result = db.run(|tx| { - async move { - // Total written bytes (rounded up to nearest chunk) - let total_size = keys.iter().fold(0, |s, key| s + key.len()); - let total_size_chunked = (total_size as u64).div_ceil(util::metric::KV_BILLABLE_CHUNK) - * util::metric::KV_BILLABLE_CHUNK; - namespace::keys::metric::inc( - &tx.with_subspace(namespace::keys::subspace()), - recipient.namespace_id, - namespace::keys::metric::Metric::KvWrite(recipient.name.clone()), - total_size_chunked.try_into().unwrap_or_default(), - ); + let result = db + .run(|tx| { + async move { + // Total written bytes (rounded up to nearest chunk) + let total_size = keys.iter().fold(0, |s, key| s + key.len()); + let total_size_chunked = (total_size as u64) + .div_ceil(util::metric::KV_BILLABLE_CHUNK) + * util::metric::KV_BILLABLE_CHUNK; + namespace::keys::metric::inc( + &tx.with_subspace(namespace::keys::subspace()), + recipient.namespace_id, + namespace::keys::metric::Metric::KvWrite(recipient.name.clone()), + total_size_chunked.try_into().unwrap_or_default(), + ); + + for key in keys { + // TODO: Costly clone + let key_subspace = keys::actor_kv::subspace(recipient.actor_id) + .subspace(&keys::actor_kv::KeyWrapper(key.clone())); - for key in keys { - // TODO: Costly clone - let key_subspace = keys::actor_kv::subspace(recipient.actor_id) - .subspace(&keys::actor_kv::KeyWrapper(key.clone())); + tx.clear_subspace_range(&key_subspace); + } - tx.clear_subspace_range(&key_subspace); + Ok(()) } - - Ok(()) - } - }) - .custom_instrument(tracing::info_span!("kv_delete_tx")) - .await - .map_err(Into::into); - 
metrics::ACTOR_KV_OPERATION_DURATION.with_label_values(&["delete"]).observe(start.elapsed().as_secs_f64()); + }) + .custom_instrument(tracing::info_span!("kv_delete_tx")) + .await + .map_err(Into::into); + metrics::ACTOR_KV_OPERATION_DURATION + .with_label_values(&["delete"]) + .observe(start.elapsed().as_secs_f64()); result } @@ -396,45 +415,51 @@ pub async fn delete_range( end: ep::KvKey, ) -> Result<()> { let timer = std::time::Instant::now(); -validate_range(&start, &end)?; + validate_range(&start, &end)?; if start >= end { - metrics::ACTOR_KV_OPERATION_DURATION.with_label_values(&["delete_range"]).observe(timer.elapsed().as_secs_f64()); + metrics::ACTOR_KV_OPERATION_DURATION + .with_label_values(&["delete_range"]) + .observe(timer.elapsed().as_secs_f64()); return Ok(()); } - let result = db.run(|tx| { - let start = start.clone(); - let end = end.clone(); - async move { - // Total written bytes (rounded up to nearest chunk) - let total_size = start.len() + end.len(); - let total_size_chunked = (total_size as u64).div_ceil(util::metric::KV_BILLABLE_CHUNK) - * util::metric::KV_BILLABLE_CHUNK; - namespace::keys::metric::inc( - &tx.with_subspace(namespace::keys::subspace()), - recipient.namespace_id, - namespace::keys::metric::Metric::KvWrite(recipient.name.clone()), - total_size_chunked.try_into().unwrap_or_default(), - ); - - let subspace = keys::actor_kv::subspace(recipient.actor_id); - let begin = subspace - .subspace(&keys::actor_kv::KeyWrapper(start)) - .range() - .0; - let end = subspace - .subspace(&keys::actor_kv::KeyWrapper(end)) - .range() - .0; - tx.clear_range(&begin, &end); + let result = db + .run(|tx| { + let start = start.clone(); + let end = end.clone(); + async move { + // Total written bytes (rounded up to nearest chunk) + let total_size = start.len() + end.len(); + let total_size_chunked = (total_size as u64) + .div_ceil(util::metric::KV_BILLABLE_CHUNK) + * util::metric::KV_BILLABLE_CHUNK; + namespace::keys::metric::inc( + 
&tx.with_subspace(namespace::keys::subspace()), + recipient.namespace_id, + namespace::keys::metric::Metric::KvWrite(recipient.name.clone()), + total_size_chunked.try_into().unwrap_or_default(), + ); + + let subspace = keys::actor_kv::subspace(recipient.actor_id); + let begin = subspace + .subspace(&keys::actor_kv::KeyWrapper(start)) + .range() + .0; + let end = subspace + .subspace(&keys::actor_kv::KeyWrapper(end)) + .range() + .0; + tx.clear_range(&begin, &end); - Ok(()) - } - }) - .custom_instrument(tracing::info_span!("kv_delete_range_tx")) - .await - .map_err(Into::into); - metrics::ACTOR_KV_OPERATION_DURATION.with_label_values(&["delete_range"]).observe(timer.elapsed().as_secs_f64()); + Ok(()) + } + }) + .custom_instrument(tracing::info_span!("kv_delete_range_tx")) + .await + .map_err(Into::into); + metrics::ACTOR_KV_OPERATION_DURATION + .with_label_values(&["delete_range"]) + .observe(timer.elapsed().as_secs_f64()); result } diff --git a/engine/packages/pegboard/src/actor_kv/preload.rs b/engine/packages/pegboard/src/actor_kv/preload.rs index 3b7f4cd824..f22e28d27c 100644 --- a/engine/packages/pegboard/src/actor_kv/preload.rs +++ b/engine/packages/pegboard/src/actor_kv/preload.rs @@ -99,8 +99,7 @@ pub(crate) async fn batch_preload( // Mark this key as scanned regardless of whether it exists in FDB. requested_get_keys.push(key.clone()); - let key_subspace = - subspace.subspace(&keys::actor_kv::KeyWrapper(key.clone())); + let key_subspace = subspace.subspace(&keys::actor_kv::KeyWrapper(key.clone())); let mut stream = tx.get_ranges_keyvalues( universaldb::RangeOption { mode: universaldb::options::StreamingMode::WantAll, @@ -113,9 +112,9 @@ pub(crate) async fn batch_preload( while let Some(fdb_kv) = stream.try_next().await? { if builder.is_none() { - let parsed_key = - tx.unpack::(&fdb_kv.key())? - .key; + let parsed_key = tx + .unpack::(&fdb_kv.key())? 
+ .key; builder = Some(EntryBuilder::new(parsed_key)); } @@ -185,14 +184,14 @@ pub(crate) async fn batch_preload( let mut exceeded = false; while let Some(fdb_kv) = stream.try_next().await? { - let key = - tx.unpack::(&fdb_kv.key())?.key; + let key = tx + .unpack::(&fdb_kv.key())? + .key; let curr = if let Some(inner) = &mut current_entry { if inner.key != key { // Finalize the previous entry. - let prev = - std::mem::replace(inner, EntryBuilder::new(key)); + let prev = std::mem::replace(inner, EntryBuilder::new(key)); let (k, v, m) = prev.build()?; let size = entry_size(&k, &v, &m); @@ -301,10 +300,10 @@ pub async fn fetch_preloaded_kv( let metadata = db .run(|tx| { let tx = tx.with_subspace(keys::subspace()); - let name_key = - keys::ns::ActorNameKey::new(namespace_id, actor_name.to_string()); + let name_key = keys::ns::ActorNameKey::new(namespace_id, actor_name.to_string()); async move { tx.read_opt(&name_key, Snapshot).await } }) + .instrument(tracing::info_span!("read_actor_metadata_tx")) .await?; let metadata_map = metadata @@ -318,7 +317,8 @@ pub async fn fetch_preloaded_kv( return Ok(None); }; - if config.preload_max_total_bytes() == 0 { + let preload_max_total_bytes = config.preload_max_total_bytes(); + if preload_max_total_bytes == 0 { return Ok(None); }; @@ -337,7 +337,7 @@ pub async fn fetch_preloaded_kv( actor_id, preload_config.keys, prefix_requests, - config.preload_max_total_bytes(), + preload_max_total_bytes, ) .await?; diff --git a/engine/packages/pegboard/src/errors.rs b/engine/packages/pegboard/src/errors.rs index f59b03a2b6..4856a1cbdc 100644 --- a/engine/packages/pegboard/src/errors.rs +++ b/engine/packages/pegboard/src/errors.rs @@ -74,7 +74,10 @@ pub enum Actor { "Not enough space left in storage.", "Not enough space left in storage ({remaining} bytes remaining, current payload is {payload_size} bytes)." 
)] - KvStorageQuotaExceeded { remaining: usize, payload_size: usize }, + KvStorageQuotaExceeded { + remaining: usize, + payload_size: usize, + }, } #[derive(RivetError, Debug, Clone, Deserialize, Serialize)] diff --git a/engine/packages/pegboard/src/ops/actor/get_for_runner.rs b/engine/packages/pegboard/src/ops/actor/get_for_runner.rs index b34639ced2..4bc81fef81 100644 --- a/engine/packages/pegboard/src/ops/actor/get_for_runner.rs +++ b/engine/packages/pegboard/src/ops/actor/get_for_runner.rs @@ -41,11 +41,19 @@ pub async fn pegboard_actor_get_for_runner( tx.exists(&connectable_key, Serializable), )?; - let (Some(workflow_id), Some(namespace_id), Some(runner_id)) = (workflow_id, namespace_id, runner_id_entry) else { + let (Some(workflow_id), Some(namespace_id), Some(runner_id)) = + (workflow_id, namespace_id, runner_id_entry) + else { return Ok(None); }; - Ok(Some((workflow_id, name_entry, namespace_id, runner_id, is_connectable))) + Ok(Some(( + workflow_id, + name_entry, + namespace_id, + runner_id, + is_connectable, + ))) }) .custom_instrument(tracing::info_span!("actor_get_for_runner_tx")) .await?; diff --git a/engine/packages/pegboard/src/ops/envoy/update_ping.rs b/engine/packages/pegboard/src/ops/envoy/update_ping.rs index b0c2466bb8..949ad540f1 100644 --- a/engine/packages/pegboard/src/ops/envoy/update_ping.rs +++ b/engine/packages/pegboard/src/ops/envoy/update_ping.rs @@ -68,10 +68,8 @@ pub async fn pegboard_envoy_update_ping(ctx: &OperationCtx, input: &Input) -> Re input.envoy_key.clone(), ); - // Add read conflict - tx.add_conflict_key(&old_lb_key, ConflictRangeType::Read)?; - // Clear old key + tx.add_conflict_key(&old_lb_key, ConflictRangeType::Read)?; tx.delete(&old_lb_key); tx.write( diff --git a/engine/packages/pegboard/src/workflows/actor/keys.rs b/engine/packages/pegboard/src/workflows/actor/keys.rs index 22d89f70cf..e7e8a629be 100644 --- a/engine/packages/pegboard/src/workflows/actor/keys.rs +++ 
b/engine/packages/pegboard/src/workflows/actor/keys.rs @@ -1,5 +1,7 @@ use epoxy::{ - ops::propose::{CheckAndSetCommand, Command, CommandError, CommandKind, Proposal, ProposalResult}, + ops::propose::{ + CheckAndSetCommand, Command, CommandError, CommandKind, Proposal, ProposalResult, + }, protocol::ReplicaId, }; use futures_util::TryStreamExt; diff --git a/engine/packages/pegboard/src/workflows/actor/mod.rs b/engine/packages/pegboard/src/workflows/actor/mod.rs index 0dbf5d7750..a07cc552f4 100644 --- a/engine/packages/pegboard/src/workflows/actor/mod.rs +++ b/engine/packages/pegboard/src/workflows/actor/mod.rs @@ -624,6 +624,15 @@ pub async fn pegboard_actor(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> )).await?; } Main::Wake(sig) => { + // Wake requests can be retried externally, so log the current state before deciding whether it is actionable. + tracing::debug!( + actor_id = ?input.actor_id, + sleeping = state.sleeping, + runner_id = ?state.runner_id, + will_wake = state.will_wake, + "received wake signal" + ); + // Clear alarm if let Some(alarm_ts) = state.alarm_ts { let now = ctx.v(3).activity(GetTsInput {}).await?; @@ -678,7 +687,8 @@ pub async fn pegboard_actor(ctx: &mut WorkflowCtx, input: &Input) -> Result<()> ); } } else { - tracing::debug!( + // Repeated wake probes are expected while gateway-side retries settle, so keep this low-noise. 
+ tracing::trace!( actor_id=?input.actor_id, "cannot wake actor that is not sleeping", ); diff --git a/engine/packages/pegboard/src/workflows/actor2/keys.rs b/engine/packages/pegboard/src/workflows/actor2/keys.rs index 27006ef961..36f7a2f817 100644 --- a/engine/packages/pegboard/src/workflows/actor2/keys.rs +++ b/engine/packages/pegboard/src/workflows/actor2/keys.rs @@ -1,5 +1,7 @@ use epoxy::{ - ops::propose::{CheckAndSetCommand, Command, CommandError, CommandKind, Proposal, ProposalResult}, + ops::propose::{ + CheckAndSetCommand, Command, CommandError, CommandKind, Proposal, ProposalResult, + }, protocol::ReplicaId, }; use futures_util::TryStreamExt; diff --git a/engine/packages/pegboard/src/workflows/actor2/mod.rs b/engine/packages/pegboard/src/workflows/actor2/mod.rs index 3279bbda7b..c7038ff918 100644 --- a/engine/packages/pegboard/src/workflows/actor2/mod.rs +++ b/engine/packages/pegboard/src/workflows/actor2/mod.rs @@ -720,7 +720,7 @@ async fn process_signal( &input, state, metrics_workflow_id, - runtime::StoppedVariant::Normal { + runtime::StoppedVariant::Stopped { code: code.clone(), message: message.clone(), }, diff --git a/engine/packages/pegboard/src/workflows/actor2/runtime.rs b/engine/packages/pegboard/src/workflows/actor2/runtime.rs index 5986573e15..e8c5bc7103 100644 --- a/engine/packages/pegboard/src/workflows/actor2/runtime.rs +++ b/engine/packages/pegboard/src/workflows/actor2/runtime.rs @@ -441,7 +441,7 @@ pub async fn reschedule_actor( #[derive(Debug)] pub enum StoppedVariant { FailedAllocation, - Normal { + Stopped { code: protocol::StopCode, message: Option, }, @@ -468,11 +468,11 @@ pub async fn handle_stopped( // Save error to state match &variant { StoppedVariant::FailedAllocation => {} - StoppedVariant::Normal { + StoppedVariant::Stopped { code: protocol::StopCode::Ok, .. 
} => {} - StoppedVariant::Normal { + StoppedVariant::Stopped { code: protocol::StopCode::Error, message, } => { @@ -527,122 +527,106 @@ pub async fn handle_stopped( .await?; } - let (try_reallocate, going_away) = match state.transition { - Transition::SleepIntent { - rewake_after_stop, .. - } => (rewake_after_stop, false), - Transition::GoingAway { .. } => (true, true), - Transition::Destroying { .. } => return Ok(StoppedResult::Destroy), - _ => (true, false), - }; - - // Always immediately reallocate if going away - let stopped_res = if going_away { - let allocate_res = ctx.activity(AllocateInput {}).await?; - - if let Some(allocation) = allocate_res.allocation { - state.generation += 1; - - ctx.activity(SendOutboundInput { - generation: state.generation, - input: input.input.clone(), - allocation, - }) - .await?; - - // Transition to allocating - state.transition = Transition::Allocating { - destroy_after_start: false, - lost_timeout_ts: allocate_res.now - + ctx.config().pegboard().actor_allocation_threshold(), - }; - } else { - // Transition to retry backoff - state.transition = Transition::Reallocating { - since_ts: allocate_res.now, - }; - } + enum Decision { + Reallocate, + Backoff, + Sleep, + Destroy, + } - StoppedResult::Continue - } else if try_reallocate { + let decision = match (&state.transition, input.crash_policy, variant) { + ( + Transition::SleepIntent { + rewake_after_stop: true, + .. + }, + _, + _, + ) => Decision::Reallocate, + ( + Transition::SleepIntent { + rewake_after_stop: false, + .. + }, + _, + _, + ) => Decision::Reallocate, + (Transition::GoingAway { .. }, _, _) => Decision::Reallocate, + (Transition::Destroying { .. }, _, _) => Decision::Destroy, + (_, _, StoppedVariant::FailedAllocation) => Decision::Backoff, // An actor stopping with `StopCode::Ok` indicates a graceful exit - let graceful_exit = matches!( - variant, - StoppedVariant::Normal { + ( + _, + _, + StoppedVariant::Stopped { code: protocol::StopCode::Ok, .. 
- } - ); + }, + ) => Decision::Destroy, + (_, CrashPolicy::Restart, _) => Decision::Reallocate, + (_, CrashPolicy::Sleep, _) => Decision::Sleep, + (_, CrashPolicy::Destroy, _) => Decision::Destroy, + }; - match (input.crash_policy, graceful_exit) { - (CrashPolicy::Restart, false) => { - let allocate_res = ctx.activity(AllocateInput {}).await?; - - if let Some(allocation) = allocate_res.allocation { - state.generation += 1; - - ctx.activity(SendOutboundInput { - generation: state.generation, - input: input.input.clone(), - allocation, - }) - .await?; - - // Transition to allocating - state.transition = Transition::Allocating { - destroy_after_start: false, - lost_timeout_ts: allocate_res.now - + ctx.config().pegboard().actor_allocation_threshold(), - }; - } else { - // Transition to retry backoff - state.transition = Transition::Reallocating { - since_ts: allocate_res.now, - }; - } + let stopped_res = match decision { + Decision::Reallocate => { + let allocate_res = ctx.activity(AllocateInput {}).await?; - StoppedResult::Continue - } - (CrashPolicy::Sleep, false) => { - tracing::debug!(actor_id=?input.actor_id, "actor sleeping due to ungraceful exit"); + if let Some(allocation) = allocate_res.allocation { + state.generation += 1; - // Clear alarm - if let Some(alarm_ts) = state.alarm_ts { - let now = ctx.activity(GetTsInput {}).await?; + ctx.activity(SendOutboundInput { + generation: state.generation, + input: input.input.clone(), + allocation, + }) + .await?; - if now >= alarm_ts { - state.alarm_ts = None; - } - } + // Transition to allocating + state.transition = Transition::Allocating { + destroy_after_start: false, + lost_timeout_ts: allocate_res.now + + ctx.config().pegboard().actor_allocation_threshold(), + }; + } else { + // Transition to retry backoff + state.transition = Transition::Reallocating { + since_ts: allocate_res.now, + }; + } - // Transition to sleeping - state.transition = Transition::Sleeping; + StoppedResult::Continue + } + Decision::Backoff => 
{ + let now = ctx.activity(GetTsInput {}).await?; - StoppedResult::Continue - } - _ => { - let now = ctx.activity(GetTsInput {}).await?; + state.transition = Transition::Reallocating { since_ts: now }; - // Don't destroy on failed allocation, retry instead - if let StoppedVariant::FailedAllocation { .. } = &variant { - // Transition to retry backoff - state.transition = Transition::Reallocating { since_ts: now }; + StoppedResult::Continue + } + Decision::Sleep => { + // Clear alarm + if let Some(alarm_ts) = state.alarm_ts { + let now = ctx.activity(GetTsInput {}).await?; - StoppedResult::Continue - } else { - StoppedResult::Destroy + if now >= alarm_ts { + state.alarm_ts = None; } } - } - } else { - // Transition to sleeping - state.transition = Transition::Sleeping; - StoppedResult::Continue + // Transition to sleeping + state.transition = Transition::Sleeping; + + StoppedResult::Continue + } + Decision::Destroy => StoppedResult::Destroy, }; - if let Transition::Sleeping = state.transition { - ctx.activity(SetSleepingInput {}).await?; + match state.transition { + Transition::Sleeping | Transition::Reallocating { .. } => { + ctx.activity(SetSleepingInput {}).await?; + } + _ => {} } ctx.msg(Stopped {}) @@ -966,66 +950,25 @@ pub async fn insert_and_send_commands( state.envoy_last_command_idx += input.commands.len() as i64; - // Fetch preloaded KV at send time for any start commands. Preloaded KV is - // never persisted in the command queue or workflow history. 
- let preloaded_kv = { - let has_start_cmd = input - .commands - .iter() - .any(|command| matches!(command, protocol::Command::CommandStartActor(_))); - if has_start_cmd { - let db = ctx.udb()?; - crate::actor_kv::preload::fetch_preloaded_kv( - &db, - ctx.config().pegboard(), - actor_id, - namespace_id, - &input - .commands - .iter() - .find_map(|command| match command { - protocol::Command::CommandStartActor(start) => { - Some(start.config.name.clone()) - } - _ => None, - }) - .unwrap_or_default(), - ) - .await? - } else { - None - } - }; - let receiver_subject = crate::pubsub_subjects::EnvoyReceiverSubject::new( state.namespace_id, input.envoy_key.clone(), ) .to_string(); - let mut preloaded_kv = preloaded_kv; let message_serialized = versioned::ToEnvoyConn::wrap_latest(protocol::ToEnvoyConn::ToEnvoyCommands( input .commands .iter() .enumerate() - .map(|(i, command)| { - let mut command = command.clone(); - if let protocol::Command::CommandStartActor(ref mut start) = - command - { - start.preloaded_kv = preloaded_kv.take(); - } - - protocol::CommandWrapper { - checkpoint: protocol::ActorCheckpoint { - actor_id: state.actor_id.to_string(), - generation: input.generation, - index: old_last_command_idx + i as i64 + 1, - }, - inner: command, - } + .map(|(i, command)| protocol::CommandWrapper { + checkpoint: protocol::ActorCheckpoint { + actor_id: state.actor_id.to_string(), + generation: input.generation, + index: old_last_command_idx + i as i64 + 1, + }, + inner: command.clone(), }) .collect(), )) diff --git a/engine/packages/pegboard/tests/actor_v1_pre_migration.rs b/engine/packages/pegboard/tests/actor_v1_pre_migration.rs new file mode 100644 index 0000000000..ca7768c3b3 --- /dev/null +++ b/engine/packages/pegboard/tests/actor_v1_pre_migration.rs @@ -0,0 +1,77 @@ +use std::time::Duration; + +use gas::prelude::*; +use pegboard::workflows::actor::AllocationOverride; +use test_snapshot::SnapshotTestCtx; +use universaldb::prelude::*; + +#[tokio::test] +async fn 
actor_v1_pre_migration() { + let test_ctx = SnapshotTestCtx::from_snapshot_with_coordinator("pb-actor-v1-pre-migration") + .await + .unwrap(); + let ctx = test_ctx.get_ctx(test_ctx.leader_id); + + let existing_namespace = ctx + .op(namespace::ops::resolve_for_name_local::Input { + name: "default".to_string(), + }) + .await + .unwrap() + .expect("default ns should exist"); + + let actors_res = ctx + .op(pegboard::ops::actor::list_for_ns::Input { + namespace_id: existing_namespace.namespace_id, + name: "test".to_string(), + key: None, + include_destroyed: true, + created_before: None, + limit: 1, + fetch_error: false, + }) + .await + .unwrap(); + let actor = actors_res + .actors + .into_iter() + .next() + .expect("actor should exist"); + + ctx.signal(pegboard::workflows::actor::Wake { + allocation_override: AllocationOverride::default(), + }) + .to_workflow::() + .tag("actor_id", actor.actor_id) + .send() + .await + .unwrap(); + + tokio::time::sleep(Duration::from_secs(3)).await; + + // Get workflow id + let workflow_id = ctx + .udb() + .unwrap() + .run(|tx| async move { + let tx = tx.with_subspace(pegboard::keys::subspace()); + + tx.read( + &pegboard::keys::actor::WorkflowIdKey::new(actor.actor_id), + Serializable, + ) + .await + }) + .await + .unwrap(); + + let wf = ctx + .get_workflows(vec![workflow_id]) + .await + .unwrap() + .into_iter() + .next() + .expect("workflow should exist"); + + assert!(!wf.is_dead()); +} diff --git a/engine/packages/pegboard/tests/kv_list_edge_cases.rs b/engine/packages/pegboard/tests/kv_list_edge_cases.rs index d18b0b5001..d333a56544 100644 --- a/engine/packages/pegboard/tests/kv_list_edge_cases.rs +++ b/engine/packages/pegboard/tests/kv_list_edge_cases.rs @@ -1,7 +1,7 @@ use anyhow::Result; use gas::prelude::*; use pegboard::actor_kv as kv; -use rivet_runner_protocol::mk2 as rp; +use rivet_envoy_protocol as ep; #[tokio::test] async fn test_list_edge_cases() -> Result<()> { @@ -12,15 +12,20 @@ async fn test_list_edge_cases() -> 
Result<()> { let test_id = Uuid::new_v4(); let dc_label = 1; - let datacenters = vec![rivet_config::config::topology::Datacenter { - name: "test-dc".to_string(), - datacenter_label: dc_label, - is_leader: true, - peer_url: url::Url::parse("http://127.0.0.1:8080")?, - public_url: url::Url::parse("http://127.0.0.1:8081")?, - proxy_url: None, - valid_hosts: None, - }]; + let datacenters = [( + "test-dc".to_string(), + rivet_config::config::topology::Datacenter { + name: "test-dc".to_string(), + datacenter_label: dc_label, + is_leader: true, + peer_url: url::Url::parse("http://127.0.0.1:8080")?, + public_url: url::Url::parse("http://127.0.0.1:8081")?, + proxy_url: None, + valid_hosts: None, + }, + )] + .into_iter() + .collect(); let api_peer_port = portpicker::pick_unused_port().expect("failed to pick api peer port"); let guard_port = portpicker::pick_unused_port().expect("failed to pick guard port"); @@ -45,7 +50,7 @@ async fn test_list_edge_cases() -> Result<()> { // Test 1: List when empty tracing::info!("test 1: list when empty"); let (empty_keys, _, _) = - kv::list(db, &recipient, rp::KvListQuery::KvListAllQuery, false, None).await?; + kv::list(db, &recipient, ep::KvListQuery::KvListAllQuery, false, None).await?; assert_eq!(empty_keys.len(), 0, "should return empty list"); // Test 2: Prefix that matches nothing @@ -61,7 +66,7 @@ async fn test_list_edge_cases() -> Result<()> { let (no_match, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { + ep::KvListQuery::KvListPrefixQuery(ep::KvListPrefixQuery { key: b"xyz".to_vec(), }), false, @@ -79,7 +84,7 @@ async fn test_list_edge_cases() -> Result<()> { let (backwards_range, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListRangeQuery(rp::KvListRangeQuery { + ep::KvListQuery::KvListRangeQuery(ep::KvListRangeQuery { start: b"z".to_vec(), end: b"a".to_vec(), exclusive: false, @@ -99,7 +104,7 @@ async fn test_list_edge_cases() -> Result<()> { let (same_inclusive, _, _) = 
kv::list( db, &recipient, - rp::KvListQuery::KvListRangeQuery(rp::KvListRangeQuery { + ep::KvListQuery::KvListRangeQuery(ep::KvListRangeQuery { start: b"foo".to_vec(), end: b"foo".to_vec(), exclusive: false, @@ -117,7 +122,7 @@ async fn test_list_edge_cases() -> Result<()> { let (same_exclusive, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListRangeQuery(rp::KvListRangeQuery { + ep::KvListQuery::KvListRangeQuery(ep::KvListRangeQuery { start: b"foo".to_vec(), end: b"foo".to_vec(), exclusive: true, @@ -153,7 +158,7 @@ async fn test_list_edge_cases() -> Result<()> { let (null_prefix, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { + ep::KvListQuery::KvListPrefixQuery(ep::KvListPrefixQuery { key: vec![b'a', 0x00], }), false, @@ -198,7 +203,7 @@ async fn test_list_edge_cases() -> Result<()> { let (empty_prefix, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { key: vec![] }), + ep::KvListQuery::KvListPrefixQuery(ep::KvListPrefixQuery { key: vec![] }), false, None, ) @@ -214,7 +219,7 @@ async fn test_list_edge_cases() -> Result<()> { let (long_prefix, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { + ep::KvListQuery::KvListPrefixQuery(ep::KvListPrefixQuery { key: b"abcdefghijk".to_vec(), }), false, @@ -248,7 +253,7 @@ async fn test_list_edge_cases() -> Result<()> { let (prefix_match, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { + ep::KvListQuery::KvListPrefixQuery(ep::KvListPrefixQuery { key: b"key".to_vec(), }), false, @@ -271,7 +276,7 @@ async fn test_list_edge_cases() -> Result<()> { let (byte_range, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListRangeQuery(rp::KvListRangeQuery { + ep::KvListQuery::KvListRangeQuery(ep::KvListRangeQuery { start: b"key\x00".to_vec(), end: b"key\x02".to_vec(), exclusive: false, @@ -297,7 +302,7 @@ async fn test_list_edge_cases() 
-> Result<()> { let (zero_limit, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListAllQuery, + ep::KvListQuery::KvListAllQuery, false, Some(0), ) @@ -309,7 +314,7 @@ async fn test_list_edge_cases() -> Result<()> { let (one_limit, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListAllQuery, + ep::KvListQuery::KvListAllQuery, false, Some(1), ) @@ -321,7 +326,7 @@ async fn test_list_edge_cases() -> Result<()> { let (large_limit, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListAllQuery, + ep::KvListQuery::KvListAllQuery, false, Some(1000), ) @@ -347,7 +352,7 @@ async fn test_list_edge_cases() -> Result<()> { let (reverse_limited, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListAllQuery, + ep::KvListQuery::KvListAllQuery, true, Some(2), ) @@ -366,7 +371,7 @@ async fn test_list_edge_cases() -> Result<()> { let (prefix_reverse, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { key: vec![] }), + ep::KvListQuery::KvListPrefixQuery(ep::KvListPrefixQuery { key: vec![] }), true, None, ) diff --git a/engine/packages/pegboard/tests/kv_operations.rs b/engine/packages/pegboard/tests/kv_operations.rs index 631f245429..141d09c6ea 100644 --- a/engine/packages/pegboard/tests/kv_operations.rs +++ b/engine/packages/pegboard/tests/kv_operations.rs @@ -1,7 +1,7 @@ use anyhow::Result; use gas::prelude::*; use pegboard::actor_kv as kv; -use rivet_runner_protocol::mk2 as rp; +use rivet_envoy_protocol as ep; #[tokio::test] async fn test_kv_operations() -> Result<()> { @@ -13,15 +13,20 @@ async fn test_kv_operations() -> Result<()> { let test_id = Uuid::new_v4(); let dc_label = 1; - let datacenters = vec![rivet_config::config::topology::Datacenter { - name: "test-dc".to_string(), - datacenter_label: dc_label, - is_leader: true, - peer_url: url::Url::parse("http://127.0.0.1:8080")?, - public_url: url::Url::parse("http://127.0.0.1:8081")?, - proxy_url: None, - valid_hosts: None, - }]; + let datacenters = [( + 
"test-dc".to_string(), + rivet_config::config::topology::Datacenter { + name: "test-dc".to_string(), + datacenter_label: dc_label, + is_leader: true, + peer_url: url::Url::parse("http://127.0.0.1:8080")?, + public_url: url::Url::parse("http://127.0.0.1:8081")?, + proxy_url: None, + valid_hosts: None, + }, + )] + .into_iter() + .collect(); let api_peer_port = portpicker::pick_unused_port().expect("failed to pick api peer port"); let guard_port = portpicker::pick_unused_port().expect("failed to pick guard port"); @@ -98,7 +103,7 @@ async fn test_kv_operations() -> Result<()> { // Test 3: List all keys tracing::info!("test 3: listing all keys"); let (list_keys, list_values, list_metadata) = - kv::list(db, &recipient, rp::KvListQuery::KvListAllQuery, false, None).await?; + kv::list(db, &recipient, ep::KvListQuery::KvListAllQuery, false, None).await?; assert_eq!(list_keys.len(), 5, "should list 5 keys"); assert_eq!(list_values.len(), 5, "should list 5 values"); @@ -110,7 +115,7 @@ async fn test_kv_operations() -> Result<()> { let (limited_keys, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListAllQuery, + ep::KvListQuery::KvListAllQuery, false, Some(2), ) @@ -122,9 +127,9 @@ async fn test_kv_operations() -> Result<()> { // Test 5: List with reverse tracing::info!("test 5: listing in reverse"); let (forward_keys, _, _) = - kv::list(db, &recipient, rp::KvListQuery::KvListAllQuery, false, None).await?; + kv::list(db, &recipient, ep::KvListQuery::KvListAllQuery, false, None).await?; let (reverse_keys, _, _) = - kv::list(db, &recipient, rp::KvListQuery::KvListAllQuery, true, None).await?; + kv::list(db, &recipient, ep::KvListQuery::KvListAllQuery, true, None).await?; assert_eq!(forward_keys.len(), reverse_keys.len()); // Keys should be in opposite order @@ -161,7 +166,7 @@ async fn test_kv_operations() -> Result<()> { let (users_keys, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { + 
ep::KvListQuery::KvListPrefixQuery(ep::KvListPrefixQuery { key: b"users:".to_vec(), }), false, @@ -177,7 +182,7 @@ async fn test_kv_operations() -> Result<()> { let (posts_keys, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListPrefixQuery(rp::KvListPrefixQuery { + ep::KvListQuery::KvListPrefixQuery(ep::KvListPrefixQuery { key: b"posts:".to_vec(), }), false, @@ -199,7 +204,7 @@ async fn test_kv_operations() -> Result<()> { let (range_keys, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListRangeQuery(rp::KvListRangeQuery { + ep::KvListQuery::KvListRangeQuery(ep::KvListRangeQuery { start: b"key1".to_vec(), end: b"key2".to_vec(), exclusive: false, @@ -223,7 +228,7 @@ async fn test_kv_operations() -> Result<()> { let (exclusive_range_keys, _, _) = kv::list( db, &recipient, - rp::KvListQuery::KvListRangeQuery(rp::KvListRangeQuery { + ep::KvListQuery::KvListRangeQuery(ep::KvListRangeQuery { start: b"key1".to_vec(), end: b"key2".to_vec(), exclusive: true, @@ -245,7 +250,7 @@ async fn test_kv_operations() -> Result<()> { // Verify keys are deleted let (remaining_keys, _, _) = - kv::list(db, &recipient, rp::KvListQuery::KvListAllQuery, false, None).await?; + kv::list(db, &recipient, ep::KvListQuery::KvListAllQuery, false, None).await?; assert_eq!(remaining_keys.len(), 3, "should have 3 keys remaining"); assert!(!remaining_keys.contains(&b"key1".to_vec())); assert!(!remaining_keys.contains(&b"key2".to_vec())); @@ -256,7 +261,7 @@ async fn test_kv_operations() -> Result<()> { kv::delete_range(db, &recipient, b"key3".to_vec(), b"key5".to_vec()).await?; let (post_range_keys, _, _) = - kv::list(db, &recipient, rp::KvListQuery::KvListAllQuery, false, None).await?; + kv::list(db, &recipient, ep::KvListQuery::KvListAllQuery, false, None).await?; assert_eq!(post_range_keys.len(), 1, "should have 1 key remaining"); assert_eq!( post_range_keys[0], @@ -271,7 +276,7 @@ async fn test_kv_operations() -> Result<()> { // Verify all keys are deleted let (all_keys, _, _) = - 
kv::list(db, &recipient, rp::KvListQuery::KvListAllQuery, false, None).await?; + kv::list(db, &recipient, ep::KvListQuery::KvListAllQuery, false, None).await?; assert_eq!(all_keys.len(), 0, "should have no keys remaining"); tracing::info!("successfully deleted all keys"); diff --git a/engine/packages/test-snapshot-gen/Cargo.toml b/engine/packages/test-snapshot-gen/Cargo.toml index 8850ce67e0..1fdbb18a17 100644 --- a/engine/packages/test-snapshot-gen/Cargo.toml +++ b/engine/packages/test-snapshot-gen/Cargo.toml @@ -18,17 +18,20 @@ anyhow.workspace = true async-trait.workspace = true axum.workspace = true clap.workspace = true -epoxy.workspace = true epoxy-protocol.workspace = true +epoxy.workspace = true gas.workspace = true +namespace.workspace = true +pegboard.workspace = true portpicker.workspace = true rivet-api-builder.workspace = true rivet-config.workspace = true rivet-pools.workspace = true rivet-test-deps.workspace = true +rivet-types.workspace = true rivet-util.workspace = true -serde.workspace = true serde_json.workspace = true +serde.workspace = true tokio.workspace = true tracing.workspace = true universaldb.workspace = true diff --git a/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/metadata.json b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/metadata.json new file mode 100644 index 0000000000..3c31d13aac --- /dev/null +++ b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/metadata.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e219c7a03b9c9722dc72666b9385a5af9fe58c93c9e2d2bab064084dde4fb4f0 +size 79 diff --git a/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-1/000008.log b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-1/000008.log new file mode 100644 index 0000000000..e69de29bb2 diff --git a/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-1/000009.sst 
b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-1/000009.sst new file mode 100644 index 0000000000..d22a2edba4 --- /dev/null +++ b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-1/000009.sst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34758e8b0294c9c264b56a7b38283a47aa4f66d7d857b2348983992ef7d8898a +size 10371 diff --git a/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-1/CURRENT b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-1/CURRENT new file mode 100644 index 0000000000..f8d5048625 --- /dev/null +++ b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-1/CURRENT @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c283f6e81028b9eb0760d918ee4bc0aa256ed3b926393c1734c760c4bd724fd +size 16 diff --git a/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-1/MANIFEST-000005 b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-1/MANIFEST-000005 new file mode 100644 index 0000000000..1a7f8b79cb --- /dev/null +++ b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-1/MANIFEST-000005 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f1ba367165061fdc717905d50320b368f4161fcc4daa510d46a9e0ca614297e +size 300 diff --git a/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-1/OPTIONS-000007 b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-1/OPTIONS-000007 new file mode 100644 index 0000000000..108379c184 --- /dev/null +++ b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-1/OPTIONS-000007 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e335d1f28391ae45a3b6b5acc70245581d7137ca22d213bddbcca518195aec8 +size 7749 diff --git 
a/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-2/000008.log b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-2/000008.log new file mode 100644 index 0000000000..e69de29bb2 diff --git a/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-2/000009.sst b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-2/000009.sst new file mode 100644 index 0000000000..3c57c2f524 --- /dev/null +++ b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-2/000009.sst @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:632cfb41b8b61ceb5682d11f1b2146697dfadb2a7e00f9ebfd5f776887aaeabe +size 2914 diff --git a/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-2/CURRENT b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-2/CURRENT new file mode 100644 index 0000000000..f8d5048625 --- /dev/null +++ b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-2/CURRENT @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c283f6e81028b9eb0760d918ee4bc0aa256ed3b926393c1734c760c4bd724fd +size 16 diff --git a/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-2/MANIFEST-000005 b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-2/MANIFEST-000005 new file mode 100644 index 0000000000..6156fc863b --- /dev/null +++ b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-2/MANIFEST-000005 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd5a89f06f320efc61e2f6a0846fc04c4d8aa5f836e4f542fb503d3c28e6e6f +size 263 diff --git a/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-2/OPTIONS-000007 b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-2/OPTIONS-000007 new file mode 100644 index 
0000000000..9a63c24680 --- /dev/null +++ b/engine/packages/test-snapshot-gen/snapshots/pb-actor-v1-pre-migration/replica-2/OPTIONS-000007 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa1c4b471e8d7f80a1785ced50c1f422ddb237ccdc6ed354812a9cc435c6ad15 +size 7750 diff --git a/engine/packages/test-snapshot-gen/src/lib.rs b/engine/packages/test-snapshot-gen/src/lib.rs index f8aa454989..7a70263b75 100644 --- a/engine/packages/test-snapshot-gen/src/lib.rs +++ b/engine/packages/test-snapshot-gen/src/lib.rs @@ -21,10 +21,7 @@ pub fn snapshot_dir(scenario: &str) -> PathBuf { /// /// Returns a map of `replica_id -> temp_path` where each temp_path is a /// copy of that replica's RocksDB data directory. -pub fn load_snapshot( - scenario: &str, - test_id: uuid::Uuid, -) -> Result> { +pub fn load_snapshot(scenario: &str, test_id: uuid::Uuid) -> Result> { let dir = snapshot_dir(scenario); load_snapshot_from(&dir, test_id) } @@ -128,8 +125,7 @@ impl SnapshotTestCtx { for &replica_id in &replica_ids { let api_peer_port = portpicker::pick_unused_port().context("failed to pick API peer port")?; - let guard_port = - portpicker::pick_unused_port().context("failed to pick guard port")?; + let guard_port = portpicker::pick_unused_port().context("failed to pick guard port")?; ctx.replica_metadata.insert( replica_id, ReplicaMetadata { @@ -224,7 +220,9 @@ impl SnapshotTestCtx { ) .await?; - let reg = epoxy::registry()?; + let reg = epoxy::registry()? + .merge(namespace::registry()?)? 
+ .merge(pegboard::registry()?)?; let test_ctx = WorkflowTestCtx::new_with_deps(reg, test_deps).await?; let api_handle = setup_api_server( @@ -269,14 +267,8 @@ impl SnapshotTestCtx { name: format!("dc-{}", id), datacenter_label: id as u16, is_leader: id == self.leader_id, - peer_url: Url::parse(&format!( - "http://127.0.0.1:{}", - metadata.api_peer_port - ))?, - public_url: Url::parse(&format!( - "http://127.0.0.1:{}", - metadata.guard_port - ))?, + peer_url: Url::parse(&format!("http://127.0.0.1:{}", metadata.api_peer_port))?, + public_url: Url::parse(&format!("http://127.0.0.1:{}", metadata.guard_port))?, proxy_url: None, valid_hosts: None, }, diff --git a/engine/packages/test-snapshot-gen/src/main.rs b/engine/packages/test-snapshot-gen/src/main.rs index 6c913f8fec..4341b88cf4 100644 --- a/engine/packages/test-snapshot-gen/src/main.rs +++ b/engine/packages/test-snapshot-gen/src/main.rs @@ -104,11 +104,9 @@ async fn run_scenario(scenario: &dyn scenarios::Scenario) -> Result<()> { // Clean previous snapshot if it exists. if scenario_dir.exists() { - std::fs::remove_dir_all(&scenario_dir) - .context("failed to remove old snapshot")?; + std::fs::remove_dir_all(&scenario_dir).context("failed to remove old snapshot")?; } - std::fs::create_dir_all(&scenario_dir) - .context("failed to create snapshot directory")?; + std::fs::create_dir_all(&scenario_dir).context("failed to create snapshot directory")?; // Build the cluster. 
let replica_ids: Vec = (1..=scenario.replica_count() as u64).collect(); diff --git a/engine/packages/test-snapshot-gen/src/scenarios/mod.rs b/engine/packages/test-snapshot-gen/src/scenarios/mod.rs index 47d7f4a8b5..5c7a74708d 100644 --- a/engine/packages/test-snapshot-gen/src/scenarios/mod.rs +++ b/engine/packages/test-snapshot-gen/src/scenarios/mod.rs @@ -4,6 +4,7 @@ use async_trait::async_trait; use crate::test_cluster::TestCluster; mod epoxy_keys; +mod pb_actor_v1_pre_migration; #[async_trait(?Send)] pub trait Scenario { @@ -19,5 +20,8 @@ pub trait Scenario { } pub fn all() -> Vec> { - vec![Box::new(epoxy_keys::EpoxyKeys)] + vec![ + Box::new(epoxy_keys::EpoxyKeys), + Box::new(pb_actor_v1_pre_migration::PbActorV1PreMigration), + ] } diff --git a/engine/packages/test-snapshot-gen/src/scenarios/pb_actor_v1_pre_migration.rs b/engine/packages/test-snapshot-gen/src/scenarios/pb_actor_v1_pre_migration.rs new file mode 100644 index 0000000000..6f512b42f9 --- /dev/null +++ b/engine/packages/test-snapshot-gen/src/scenarios/pb_actor_v1_pre_migration.rs @@ -0,0 +1,56 @@ +use std::time::Duration; + +use anyhow::{Context, Result}; +use async_trait::async_trait; +use gas::prelude::*; +use rivet_types::actors::CrashPolicy; + +use crate::test_cluster::TestCluster; + +use super::Scenario; + +/// Scenario that creates a sleeping actor v1 before envoys were introduced. When loaded after envoys +/// introduced the wf should not die when awoken. +pub struct PbActorV1PreMigration; + +#[async_trait(?Send)] +impl Scenario for PbActorV1PreMigration { + fn name(&self) -> &'static str { + "pb-actor-v1-pre-migration" + } + + fn replica_count(&self) -> usize { + 2 + } + + async fn populate(&self, cluster: &TestCluster) -> Result<()> { + let ctx = cluster.get_ctx(cluster.leader_id()); + + let existing_namespace = ctx + .op(namespace::ops::resolve_for_name_local::Input { + name: "default".to_string(), + }) + .await? 
+ .context("default ns should exist")?; + + let actor_id = Id::new_v1(ctx.config().dc_label()); + + ctx.op(pegboard::ops::actor::create::Input { + actor_id, + namespace_id: existing_namespace.namespace_id, + name: "test".to_string(), + key: None, + runner_name_selector: "default".to_string(), + input: None, + crash_policy: CrashPolicy::Sleep, + forward_request: false, + datacenter_name: None, + }) + .await?; + + // Wait for wf to sleep + tokio::time::sleep(Duration::from_secs(5)).await; + + Ok(()) + } +} diff --git a/engine/packages/test-snapshot-gen/src/test_cluster.rs b/engine/packages/test-snapshot-gen/src/test_cluster.rs index 40e267eced..861de75cb5 100644 --- a/engine/packages/test-snapshot-gen/src/test_cluster.rs +++ b/engine/packages/test-snapshot-gen/src/test_cluster.rs @@ -41,8 +41,7 @@ impl TestCluster { for &replica_id in replica_ids { let api_peer_port = portpicker::pick_unused_port().context("failed to pick API peer port")?; - let guard_port = - portpicker::pick_unused_port().context("failed to pick guard port")?; + let guard_port = portpicker::pick_unused_port().context("failed to pick guard port")?; cluster.replica_metadata.insert( replica_id, ReplicaMetadata { @@ -92,13 +91,6 @@ impl TestCluster { .wf_ctx } - pub fn replica_ids(&self) -> Vec { - let mut ids = self.replica_metadata.keys().copied().collect::>(); - ids.sort_unstable(); - ids - } - - #[allow(dead_code)] pub fn leader_id(&self) -> ReplicaId { self.leader_id } @@ -134,7 +126,9 @@ impl TestCluster { ) .await?; - let reg = epoxy::registry()?; + let reg = epoxy::registry()? + .merge(namespace::registry()?)? 
+ .merge(pegboard::registry()?)?; let test_ctx = WorkflowTestCtx::new_with_deps(reg, test_deps).await?; let api_handle = setup_api_server( @@ -179,14 +173,8 @@ impl TestCluster { name: format!("dc-{}", id), datacenter_label: id as u16, is_leader: id == self.leader_id, - peer_url: Url::parse(&format!( - "http://127.0.0.1:{}", - metadata.api_peer_port - ))?, - public_url: Url::parse(&format!( - "http://127.0.0.1:{}", - metadata.guard_port - ))?, + peer_url: Url::parse(&format!("http://127.0.0.1:{}", metadata.api_peer_port))?, + public_url: Url::parse(&format!("http://127.0.0.1:{}", metadata.guard_port))?, proxy_url: None, valid_hosts: None, }, diff --git a/engine/sdks/rust/kv-channel-protocol/src/lib.rs b/engine/sdks/rust/kv-channel-protocol/src/lib.rs index 7502faf079..8851f713aa 100644 --- a/engine/sdks/rust/kv-channel-protocol/src/lib.rs +++ b/engine/sdks/rust/kv-channel-protocol/src/lib.rs @@ -5,13 +5,13 @@ pub use generated::v1::*; pub const PROTOCOL_VERSION: u32 = 1; -/// Serialize a ToServer message to BARE bytes. -pub fn encode_to_server(msg: &ToServer) -> Result, serde_bare::error::Error> { +/// Serialize a ToRivet message to BARE bytes. +pub fn encode_to_server(msg: &ToRivet) -> Result, serde_bare::error::Error> { serde_bare::to_vec(msg) } -/// Deserialize a ToServer message from BARE bytes. -pub fn decode_to_server(bytes: &[u8]) -> Result { +/// Deserialize a ToRivet message from BARE bytes. 
+pub fn decode_to_server(bytes: &[u8]) -> Result { serde_bare::from_slice(bytes) } @@ -33,7 +33,7 @@ mod tests { #[test] fn round_trip_to_server_request_actor_open() { - let msg = ToServer::ToServerRequest(ToServerRequest { + let msg = ToRivet::ToRivetRequest(ToRivetRequest { request_id: 1, actor_id: "abc".into(), data: RequestData::ActorOpenRequest, @@ -45,7 +45,7 @@ mod tests { #[test] fn round_trip_to_server_request_kv_get() { - let msg = ToServer::ToServerRequest(ToServerRequest { + let msg = ToRivet::ToRivetRequest(ToRivetRequest { request_id: 3, actor_id: "actor1".into(), data: RequestData::KvGetRequest(KvGetRequest { @@ -91,7 +91,7 @@ mod tests { #[test] fn bytes_to_server_request_actor_open() { - let msg = ToServer::ToServerRequest(ToServerRequest { + let msg = ToRivet::ToRivetRequest(ToRivetRequest { request_id: 1, actor_id: "abc".into(), data: RequestData::ActorOpenRequest, @@ -105,7 +105,7 @@ mod tests { #[test] fn bytes_to_server_pong() { - let msg = ToServer::ToServerPong(ToServerPong { ts: 1234567890 }); + let msg = ToRivet::ToRivetPong(ToRivetPong { ts: 1234567890 }); let bytes = encode_to_server(&msg).unwrap(); assert_eq!( bytes, @@ -132,7 +132,10 @@ mod tests { let bytes = encode_to_client(&msg).unwrap(); assert_eq!( bytes, - [0x00, 0x2A, 0x00, 0x00, 0x00, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x03, 0x03, 0x04, 0x05] + [ + 0x00, 0x2A, 0x00, 0x00, 0x00, 0x03, 0x01, 0x02, 0x01, 0x02, 0x01, 0x03, 0x03, 0x04, + 0x05 + ] ); } } diff --git a/engine/sdks/schemas/kv-channel-protocol/v1.bare b/engine/sdks/schemas/kv-channel-protocol/v1.bare index 1a393301d7..1130a406da 100644 --- a/engine/sdks/schemas/kv-channel-protocol/v1.bare +++ b/engine/sdks/schemas/kv-channel-protocol/v1.bare @@ -19,7 +19,7 @@ type Id str # the server sends an error response for the open and rejects # subsequent KV requests for that actor with "actor_locked". -# actorId is on ToServerRequest, not on open/close. The outer +# actorId is on ToRivetRequest, not on open/close. 
The outer # actorId is the single source of truth for routing. type ActorOpenRequest void @@ -105,21 +105,21 @@ type ResponseData union { KvDeleteResponse } -# MARK: To Server +# MARK: To Rivet -type ToServerRequest struct { +type ToRivetRequest struct { requestId: u32 actorId: Id data: RequestData } -type ToServerPong struct { +type ToRivetPong struct { ts: i64 } -type ToServer union { - ToServerRequest | - ToServerPong +type ToRivet union { + ToRivetRequest | + ToRivetPong } # MARK: To Client diff --git a/engine/sdks/typescript/api-full/package.json b/engine/sdks/typescript/api-full/package.json index b9024853a8..5ee5dd9caa 100644 --- a/engine/sdks/typescript/api-full/package.json +++ b/engine/sdks/typescript/api-full/package.json @@ -1,7 +1,7 @@ { "name": "@rivetkit/engine-api-full", "version": "2.2.1", - "repository": "https://github.com/rivet-gg/rivet/tree/main/sdks/typescript", + "repository": "https://github.com/rivet-dev/rivet/tree/main/engine/sdks/typescript", "files": [ "dist", "types", diff --git a/engine/sdks/typescript/envoy-client/src/config.ts b/engine/sdks/typescript/envoy-client/src/config.ts index 37720ed78f..bb656c3f06 100644 --- a/engine/sdks/typescript/envoy-client/src/config.ts +++ b/engine/sdks/typescript/envoy-client/src/config.ts @@ -12,6 +12,11 @@ export interface EnvoyConfig { poolName: string; prepopulateActorNames: Record }>; metadata?: Record; + /** + * When startEnvoy is called, create a new envoy every time instead of using a single global envoy + * instance for the entire runtime. 
+ */ + notGlobal?: boolean; /** * Debug option to inject artificial latency (in ms) into WebSocket @@ -161,5 +166,5 @@ export interface EnvoyConfig { generation: number, reason: protocol.StopActorReason, ) => Promise; - onShutdown: (reason: ShutdownReason) => void; + onShutdown: () => void; } diff --git a/engine/sdks/typescript/envoy-client/src/handle.ts b/engine/sdks/typescript/envoy-client/src/handle.ts index 8a0e783c2f..3dc703a9fa 100644 --- a/engine/sdks/typescript/envoy-client/src/handle.ts +++ b/engine/sdks/typescript/envoy-client/src/handle.ts @@ -96,5 +96,5 @@ export interface EnvoyHandle { clientMessageIndex: number, ): void; - startServerless(payload: ArrayBuffer): void; + startServerlessActor(payload: ArrayBuffer): void; } diff --git a/engine/sdks/typescript/envoy-client/src/tasks/envoy/events.ts b/engine/sdks/typescript/envoy-client/src/tasks/envoy/events.ts index 22ee15783b..17247dbdf1 100644 --- a/engine/sdks/typescript/envoy-client/src/tasks/envoy/events.ts +++ b/engine/sdks/typescript/envoy-client/src/tasks/envoy/events.ts @@ -6,9 +6,7 @@ import { wsSend } from "../connection.js"; export function handleSendEvents( ctx: EnvoyContext, events: protocol.EventWrapper[], -): boolean { - let stop = false; - +) { // Record in history per actor for (const event of events) { const entry = getActorEntry( @@ -24,9 +22,6 @@ export function handleSendEvents( if (event.inner.tag === "EventActorStateUpdate") { if (event.inner.val.state.tag === "ActorStateStopped") { entry.handle.close(); - - // Serverless envoys only handle one actor which means if it stops, the envoy should stop too - if (ctx.serverless) stop = true; } } } @@ -37,8 +32,6 @@ export function handleSendEvents( tag: "ToRivetEvents", val: events, }); - - return stop; } export function handleAckEvents( diff --git a/engine/sdks/typescript/envoy-client/src/tasks/envoy/index.ts b/engine/sdks/typescript/envoy-client/src/tasks/envoy/index.ts index 9daefd8137..1b48c2503f 100644 --- 
a/engine/sdks/typescript/envoy-client/src/tasks/envoy/index.ts +++ b/engine/sdks/typescript/envoy-client/src/tasks/envoy/index.ts @@ -32,9 +32,11 @@ import { sleep, spawn, watch, WatchReceiver, WatchSender } from "antiox"; import { BufferMap, EnvoyShutdownError } from "@/utils.js"; import { stringifyToEnvoy } from "@/stringify.js"; +let GLOBAL_ENVOY: EnvoyHandle | undefined = undefined; +let GLOBAL_SHARED_CTX: SharedContext | undefined = undefined; + export interface EnvoyContext { shared: SharedContext; - serverless: boolean; shuttingDown: boolean; actors: Map>; kvRequests: Map; @@ -95,6 +97,15 @@ export async function startEnvoy(config: EnvoyConfig): Promise { // Must manually wait for envoy to start. export function startEnvoySync(config: EnvoyConfig): EnvoyHandle { + if (!config.notGlobal && GLOBAL_ENVOY && GLOBAL_SHARED_CTX) { + // Copy the token when called multiple times. This is done for serverless envoys where the token + // normally expires around the same time as the /start request expires. The envoy persists longer + // than the /start request so it needs an up to date token. 
+ GLOBAL_SHARED_CTX.config.token = config.token; + + return GLOBAL_ENVOY; + } + const [envoyTx, envoyRx] = unboundedChannel(); const [startTx, startRx] = watch(void 0); const actors: Map> = new Map(); @@ -107,11 +118,12 @@ export function startEnvoySync(config: EnvoyConfig): EnvoyHandle { handle: null as any, }; - const connHandle = startConnection(shared); + if (!config.notGlobal) GLOBAL_SHARED_CTX = shared; + + startConnection(shared); const ctx: EnvoyContext = { shared, - serverless: false, shuttingDown: false, actors, kvRequests: new Map(), @@ -124,6 +136,16 @@ export function startEnvoySync(config: EnvoyConfig): EnvoyHandle { const handle = createHandle(ctx, startRx); shared.handle = handle; + if (!config.notGlobal) GLOBAL_ENVOY = handle; + + // Register signal handlers + const onSignal = () => { + log(ctx.shared)?.info({ msg: "received stop signal, starting envoy shutdown" }); + handle.shutdown(false); + }; + process.once("SIGINT", onSignal); + process.once("SIGTERM", onSignal); + log(ctx.shared)?.info({ msg: "starting envoy" }); spawn(async () => { @@ -136,24 +158,20 @@ export function startEnvoySync(config: EnvoyConfig): EnvoyHandle { }, KV_CLEANUP_INTERVAL_MS); let lostTimeout: NodeJS.Timeout | undefined = undefined; - let serverlessShutdown = false; for await (const msg of envoyRx) { if (msg.type === "conn-message") { - await handleConnMessage(ctx, startTx, lostTimeout, msg.message); + lostTimeout = handleConnMessage( + ctx, + startTx, + lostTimeout, + msg.message, + ); } else if (msg.type === "conn-close") { - handleConnClose(ctx, lostTimeout); + lostTimeout = handleConnClose(ctx, lostTimeout); if (msg.evict) break; } else if (msg.type === "send-events") { - const stop = handleSendEvents(ctx, msg.events); - - if (stop) { - serverlessShutdown = true; - log(ctx.shared)?.info({ - msg: "serverless actor stopped, stopping envoy" - }); - break; - } + handleSendEvents(ctx, msg.events); } else if (msg.type === "kv-request") { handleKvRequest(ctx, msg); } else if 
(msg.type === "buffer-tunnel-msg") { @@ -168,7 +186,12 @@ export function startEnvoySync(config: EnvoyConfig): EnvoyHandle { } // Cleanup + if (lostTimeout) { + clearTimeout(lostTimeout); + } ctx.shared.wsTx?.send({ type: "close", code: 1000, reason: "envoy.shutdown" }); + connHandle.abort(); + await connHandle.catch(() => undefined); clearInterval(ackInterval); clearInterval(kvCleanupInterval); @@ -188,13 +211,13 @@ export function startEnvoySync(config: EnvoyConfig): EnvoyHandle { msg: "envoy stopped", }); - ctx.shared.config.onShutdown(serverlessShutdown ? "serverless-early-exit" : "normal"); - }); + if (!ctx.shared.config.notGlobal) { + GLOBAL_ENVOY = undefined; + GLOBAL_SHARED_CTX = undefined; + } - // Queue start actor - if (shared.config.serverlessStartPayload) { - handle.startServerless(shared.config.serverlessStartPayload); - } + ctx.shared.config.onShutdown(); + }); return handle; } @@ -204,7 +227,7 @@ function handleConnMessage( startTx: WatchSender, lostTimeout: NodeJS.Timeout | undefined, message: ToEnvoyFromConnMessage, -) { +): NodeJS.Timeout | undefined { if (message.tag === "ToEnvoyInit") { ctx.shared.protocolMetadata = message.val.metadata; log(ctx.shared)?.info({ @@ -212,7 +235,10 @@ function handleConnMessage( protocolMetadata: message.val.metadata, }); - clearTimeout(lostTimeout); + if (lostTimeout) { + clearTimeout(lostTimeout); + lostTimeout = undefined; + } resendUnacknowledgedEvents(ctx); processUnsentKvRequests(ctx); resendBufferedTunnelMessages(ctx); @@ -229,9 +255,14 @@ function handleConnMessage( } else { unreachable(message); } + + return lostTimeout; } -function handleConnClose(ctx: EnvoyContext, lostTimeout: NodeJS.Timeout | undefined) { +function handleConnClose( + ctx: EnvoyContext, + lostTimeout: NodeJS.Timeout | undefined, +): NodeJS.Timeout | undefined { if (!lostTimeout) { let lostThreshold = ctx.shared.protocolMetadata ? 
Number(ctx.shared.protocolMetadata.envoyLostThreshold) : 10000; log(ctx.shared)?.debug({ @@ -268,6 +299,8 @@ function handleConnClose(ctx: EnvoyContext, lostTimeout: NodeJS.Timeout | undefi lostThreshold, ); } + + return lostTimeout; } function handleShutdown(ctx: EnvoyContext) { @@ -346,7 +379,14 @@ function createHandle( return { shutdown(immediate: boolean) { - ctx.shared.envoyTx.send({ type: "shutdown" }); + if (immediate) { + log(ctx.shared)?.debug({ + msg: "envoy received immediate shutdown", + }); + ctx.shared.envoyTx.send({ type: "stop" }); + } else { + ctx.shared.envoyTx.send({ type: "shutdown" }); + } }, getProtocolMetadata(): protocol.ProtocolMetadata | undefined { @@ -576,10 +616,7 @@ function createHandle( sendHibernatableWebSocketMessageAck(ctx, gatewayId, requestId, clientMessageIndex); }, - startServerless(payload: ArrayBuffer) { - if (ctx.serverless) throw new Error("Already started serverless actor"); - ctx.serverless = true; - + startServerlessActor(payload: ArrayBuffer) { let version = new DataView(payload).getUint16(0, true); if (version != protocol.VERSION) @@ -588,9 +625,9 @@ function createHandle( // Skip first 2 bytes (version) const message = protocol.decodeToEnvoy(new Uint8Array(payload, 2)); - if (message.tag !== "ToEnvoyCommands") throw new Error("invalid serverless body"); - if (message.val.length !== 1) throw new Error("invalid serverless body"); - if (message.val[0].inner.tag !== "CommandStartActor") throw new Error("invalid serverless body"); + if (message.tag !== "ToEnvoyCommands") throw new Error("invalid serverless payload"); + if (message.val.length !== 1) throw new Error("invalid serverless payload"); + if (message.val[0].inner.tag !== "CommandStartActor") throw new Error("invalid serverless payload"); // Wait for envoy to start before adding message startedPromise.then(() => { diff --git a/engine/sdks/typescript/kv-channel-protocol/src/index.ts b/engine/sdks/typescript/kv-channel-protocol/src/index.ts index 
229cde4a2f..fad75ec166 100644 --- a/engine/sdks/typescript/kv-channel-protocol/src/index.ts +++ b/engine/sdks/typescript/kv-channel-protocol/src/index.ts @@ -24,7 +24,7 @@ export function writeId(bc: bare.ByteCursor, x: Id): void { } /** - * actorId is on ToServerRequest, not on open/close. The outer + * actorId is on ToRivetRequest, not on open/close. The outer * actorId is the single source of truth for routing. */ export type ActorOpenRequest = null @@ -332,13 +332,13 @@ export function writeResponseData(bc: bare.ByteCursor, x: ResponseData): void { } } -export type ToServerRequest = { +export type ToRivetRequest = { readonly requestId: u32 readonly actorId: Id readonly data: RequestData } -export function readToServerRequest(bc: bare.ByteCursor): ToServerRequest { +export function readToRivetRequest(bc: bare.ByteCursor): ToRivetRequest { return { requestId: bare.readU32(bc), actorId: readId(bc), @@ -346,38 +346,38 @@ export function readToServerRequest(bc: bare.ByteCursor): ToServerRequest { } } -export function writeToServerRequest(bc: bare.ByteCursor, x: ToServerRequest): void { +export function writeToRivetRequest(bc: bare.ByteCursor, x: ToRivetRequest): void { bare.writeU32(bc, x.requestId) writeId(bc, x.actorId) writeRequestData(bc, x.data) } -export type ToServerPong = { +export type ToRivetPong = { readonly ts: i64 } -export function readToServerPong(bc: bare.ByteCursor): ToServerPong { +export function readToRivetPong(bc: bare.ByteCursor): ToRivetPong { return { ts: bare.readI64(bc), } } -export function writeToServerPong(bc: bare.ByteCursor, x: ToServerPong): void { +export function writeToRivetPong(bc: bare.ByteCursor, x: ToRivetPong): void { bare.writeI64(bc, x.ts) } -export type ToServer = - | { readonly tag: "ToServerRequest"; readonly val: ToServerRequest } - | { readonly tag: "ToServerPong"; readonly val: ToServerPong } +export type ToRivet = + | { readonly tag: "ToRivetRequest"; readonly val: ToRivetRequest } + | { readonly tag: "ToRivetPong"; 
readonly val: ToRivetPong } -export function readToServer(bc: bare.ByteCursor): ToServer { +export function readToRivet(bc: bare.ByteCursor): ToRivet { const offset = bc.offset const tag = bare.readU8(bc) switch (tag) { case 0: - return { tag: "ToServerRequest", val: readToServerRequest(bc) } + return { tag: "ToRivetRequest", val: readToRivetRequest(bc) } case 1: - return { tag: "ToServerPong", val: readToServerPong(bc) } + return { tag: "ToRivetPong", val: readToRivetPong(bc) } default: { bc.offset = offset throw new bare.BareError(offset, "invalid tag") @@ -385,34 +385,34 @@ export function readToServer(bc: bare.ByteCursor): ToServer { } } -export function writeToServer(bc: bare.ByteCursor, x: ToServer): void { +export function writeToRivet(bc: bare.ByteCursor, x: ToRivet): void { switch (x.tag) { - case "ToServerRequest": { + case "ToRivetRequest": { bare.writeU8(bc, 0) - writeToServerRequest(bc, x.val) + writeToRivetRequest(bc, x.val) break } - case "ToServerPong": { + case "ToRivetPong": { bare.writeU8(bc, 1) - writeToServerPong(bc, x.val) + writeToRivetPong(bc, x.val) break } } } -export function encodeToServer(x: ToServer, config?: Partial): Uint8Array { +export function encodeToRivet(x: ToRivet, config?: Partial): Uint8Array { const fullConfig = config != null ? 
bare.Config(config) : DEFAULT_CONFIG const bc = new bare.ByteCursor( new Uint8Array(fullConfig.initialBufferLength), fullConfig, ) - writeToServer(bc, x) + writeToRivet(bc, x) return new Uint8Array(bc.view.buffer, bc.view.byteOffset, bc.offset) } -export function decodeToServer(bytes: Uint8Array): ToServer { +export function decodeToRivet(bytes: Uint8Array): ToRivet { const bc = new bare.ByteCursor(bytes, DEFAULT_CONFIG) - const result = readToServer(bc) + const result = readToRivet(bc) if (bc.offset < bc.view.byteLength) { throw new bare.BareError(bc.offset, "remaining bytes") } diff --git a/engine/sdks/typescript/runner/src/actor.ts b/engine/sdks/typescript/runner/src/actor.ts index 6a1f12455a..852e31892e 100644 --- a/engine/sdks/typescript/runner/src/actor.ts +++ b/engine/sdks/typescript/runner/src/actor.ts @@ -108,6 +108,23 @@ export class RunnerActor { }); } + resetPendingRequestMessageIndex( + gatewayId: protocol.GatewayId, + requestId: protocol.RequestId, + clientMessageIndex: number, + ) { + const existing = this.getPendingRequest(gatewayId, requestId); + if (!existing) { + this.createPendingRequest(gatewayId, requestId, clientMessageIndex); + return; + } + + existing.clientMessageIndex = clientMessageIndex; + existing.actorId = this.actorId; + existing.gatewayId = gatewayId; + existing.requestId = requestId; + } + createPendingRequestWithStreamController( gatewayId: protocol.GatewayId, requestId: protocol.RequestId, diff --git a/engine/sdks/typescript/runner/src/mod.ts b/engine/sdks/typescript/runner/src/mod.ts index 724f8ecd49..6fec2b9b93 100644 --- a/engine/sdks/typescript/runner/src/mod.ts +++ b/engine/sdks/typescript/runner/src/mod.ts @@ -851,72 +851,103 @@ export class Runner { }); ws.addEventListener("message", async (ev) => { - let buf: Uint8Array; - if (ev.data instanceof Blob) { - buf = new Uint8Array(await ev.data.arrayBuffer()); - } else if (Buffer.isBuffer(ev.data)) { - buf = new Uint8Array(ev.data); - } else { - throw new Error(`expected 
binary data, got ${typeof ev.data}`); - } + try { + if (ws !== this.#pegboardWebSocket) { + this.log?.debug({ + msg: "ignoring runner message from stale websocket", + }); + return; + } - await this.#injectLatency(); + let buf: Uint8Array; + if (ev.data instanceof Blob) { + buf = new Uint8Array(await ev.data.arrayBuffer()); + } else if (Buffer.isBuffer(ev.data)) { + buf = new Uint8Array(ev.data); + } else { + throw new Error(`expected binary data, got ${typeof ev.data}`); + } - // Parse message - const message = protocol.decodeToClient(buf); - this.log?.debug({ - msg: "received runner message", - data: stringifyToClient(message), - }); + await this.#injectLatency(); - // Handle message - if (message.tag === "ToClientInit") { - const init = message.val; + // Parse message + const message = protocol.decodeToClient(buf); + this.log?.debug({ + msg: "received runner message", + data: stringifyToClient(message), + }); - if (this.runnerId !== init.runnerId) { - this.runnerId = init.runnerId; + // Handle message + if (message.tag === "ToClientInit") { + const init = message.val; - // Clear actors if runner id changed - this.#stopAllActors(); - } + if (this.runnerId !== init.runnerId) { + this.runnerId = init.runnerId; - this.#protocolMetadata = init.metadata; + // Clear actors if runner id changed + this.#stopAllActors(); + } - this.log?.info({ - msg: "received init", - protocolMetadata: this.#protocolMetadata, - }); + this.#protocolMetadata = init.metadata; - // Resend pending events - this.#processUnsentKvRequests(); - this.#resendUnacknowledgedEvents(); - this.#tunnel?.resendBufferedEvents(); - - this.#config.onConnected(); - } else if (message.tag === "ToClientCommands") { - const commands = message.val; - this.#handleCommands(commands); - } else if (message.tag === "ToClientAckEvents") { - this.#handleAckEvents(message.val); - } else if (message.tag === "ToClientKvResponse") { - const kvResponse = message.val; - this.#handleKvResponse(kvResponse); - } else if 
(message.tag === "ToClientTunnelMessage") { - this.#tunnel?.handleTunnelMessage(message.val).catch((err) => { - this.log?.error({ - msg: "error handling tunnel message", - error: stringifyError(err), + this.log?.info({ + msg: "received init", + protocolMetadata: this.#protocolMetadata, }); + + // Resend pending events + this.#processUnsentKvRequests(); + this.#resendUnacknowledgedEvents(); + this.#tunnel?.resendBufferedEvents(); + + this.#config.onConnected(); + } else if (message.tag === "ToClientCommands") { + const commands = message.val; + this.#handleCommands(commands); + } else if (message.tag === "ToClientAckEvents") { + this.#handleAckEvents(message.val); + } else if (message.tag === "ToClientKvResponse") { + const kvResponse = message.val; + this.#handleKvResponse(kvResponse); + } else if (message.tag === "ToClientTunnelMessage") { + this.#tunnel?.handleTunnelMessage(message.val).catch((err) => { + this.log?.error({ + msg: "error handling tunnel message", + error: stringifyError(err), + }); + }); + } else if (message.tag === "ToClientPing") { + this.__sendToServer({ + tag: "ToServerPong", + val: { + ts: message.val.ts, + }, + }); + } else { + unreachable(message); + } + } catch (error) { + if (this.#shutdown || ws.readyState !== ws.OPEN) { + this.log?.debug({ + msg: "ignoring runner websocket message during shutdown", + error: stringifyError(error), + readyState: ws.readyState, + }); + return; + } + + this.log?.error({ + msg: "failed to decode runner websocket message", + error: stringifyError(error), }); - } else if (message.tag === "ToClientPing") { - this.__sendToServer({ - tag: "ToServerPong", - val: { - ts: message.val.ts, - }, - }); - } else { - unreachable(message); + try { + ws.close(1011, "runner.invalid_frame"); + } catch (closeError) { + this.log?.debug({ + msg: "failed closing runner websocket after decode error", + error: stringifyError(closeError), + }); + } } }); @@ -934,6 +965,9 @@ export class Runner { }); ws.addEventListener("close", 
async (ev) => { + if (this.#pegboardWebSocket === ws) { + this.#pegboardWebSocket = undefined; + } if (!this.#shutdown) { const closeError = parseWebSocketCloseReason(ev.reason); if ( diff --git a/engine/sdks/typescript/runner/src/tunnel.ts b/engine/sdks/typescript/runner/src/tunnel.ts index ad2b650e74..7a503476b8 100644 --- a/engine/sdks/typescript/runner/src/tunnel.ts +++ b/engine/sdks/typescript/runner/src/tunnel.ts @@ -178,6 +178,15 @@ export class Tunnel { meta.headers, ); + // Restored websocket handlers can synchronously send messages + // during onRestore, so the pending request tracking must exist + // before the websocket is handed to user code. + actor.resetPendingRequestMessageIndex( + meta.gatewayId, + meta.requestId, + meta.clientMessageIndex, + ); + // This will call `runner.config.websocket` under the hood to // attach the event listeners to the WebSocket. // Track this operation to ensure it completes @@ -195,16 +204,6 @@ export class Tunnel { false, ) .then(() => { - // Create a PendingRequest entry to track the message index - const actor = this.#runner.getActor(actorId); - if (actor) { - actor.createPendingRequest( - gatewayId, - requestId, - meta.clientMessageIndex, - ); - } - this.log?.info({ msg: "connection successfully restored", actorId, @@ -637,13 +636,14 @@ export class Tunnel { case "ToClientRequestAbort": await this.#handleRequestAbort(gatewayId, requestId); break; - case "ToClientWebSocketOpen": - await this.#handleWebSocketOpen( - gatewayId, - requestId, - message.messageKind.val, - ); - break; + case "ToClientWebSocketOpen": + await this.#handleWebSocketOpen( + gatewayId, + requestId, + message.messageId.messageIndex, + message.messageKind.val, + ); + break; case "ToClientWebSocketMessage": { await this.#handleWebSocketMessage( gatewayId, @@ -912,6 +912,7 @@ export class Tunnel { async #handleWebSocketOpen( gatewayId: GatewayId, requestId: RequestId, + serverMessageIndex: number, open: protocol.ToClientWebSocketOpen, ) { // NOTE: 
This method is safe to be async since we will not receive any @@ -951,13 +952,18 @@ export class Tunnel { // WebSockets from retransmits. const existingAdapter = actor.getWebSocket(gatewayId, requestId); if (existingAdapter) { - this.log?.warn({ - msg: "closing existing websocket for duplicate open event for the same request id", - requestId: requestIdStr, + const replayAction = + existingAdapter._handleOpenReplay(serverMessageIndex); + if (replayAction === "reset") { + actor.resetPendingRequestMessageIndex(gatewayId, requestId, 0); + } + this.#sendMessage(gatewayId, requestId, { + tag: "ToServerWebSocketOpen", + val: { + canHibernate: existingAdapter[HIBERNATABLE_SYMBOL], + }, }); - // Close without sending a message through the tunnel since the server - // already knows about the new connection - existingAdapter._closeWithoutCallback(1000, "ws.duplicate_open"); + return; } // Create WebSocket @@ -979,15 +985,15 @@ export class Tunnel { // hood to add the event listeners for open, etc. If this handler // throws, then the WebSocket will be closed before sending the // open event. 
- const adapter = await this.#createWebSocket( - actor.actorId, - gatewayId, - requestId, - requestIdStr, - 0, - canHibernate, - false, - request, + const adapter = await this.#createWebSocket( + actor.actorId, + gatewayId, + requestId, + requestIdStr, + serverMessageIndex, + canHibernate, + false, + request, open.path, Object.fromEntries(open.headers), false, diff --git a/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts b/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts index a40f0830d9..e2b74e982d 100644 --- a/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts +++ b/engine/sdks/typescript/runner/src/websocket-tunnel-adapter.ts @@ -5,6 +5,8 @@ import { MAX_PAYLOAD_SIZE, wrappingAddU16, wrappingLteU16, wrappingSubU16 } from export const HIBERNATABLE_SYMBOL = Symbol("hibernatable"); +export type OpenReplayAction = "ignored" | "advanced" | "reset"; + export class WebSocketTunnelAdapter { #readyState: 0 | 1 | 2 | 3 = 0; #binaryType: "nodebuffer" | "arraybuffer" | "blob" = "nodebuffer"; @@ -104,6 +106,24 @@ export class WebSocketTunnelAdapter { this.#ws.dispatchEvent({ type: "open", rivetRequestId: requestId, target: this.#ws }); } + _handleOpenReplay(serverMessageIndex: number): OpenReplayAction { + if (!this.#hibernatable) { + return "ignored"; + } + if ( + serverMessageIndex === 0 && + this.#serverMessageIndex !== 0 + ) { + this.#serverMessageIndex = 0; + return "reset"; + } + if (wrappingLteU16(serverMessageIndex, this.#serverMessageIndex)) { + return "ignored"; + } + this.#serverMessageIndex = serverMessageIndex; + return "advanced"; + } + // Called by Tunnel when message is received _handleMessage( requestId: ArrayBuffer, diff --git a/engine/sdks/typescript/test-envoy/src/index.ts b/engine/sdks/typescript/test-envoy/src/index.ts index af612cfb3a..e98c3714a4 100644 --- a/engine/sdks/typescript/test-envoy/src/index.ts +++ b/engine/sdks/typescript/test-envoy/src/index.ts @@ -182,23 +182,21 @@ app.post("/api/rivet/start", async 
(c) => { let payload = await c.req.arrayBuffer(); return streamSSE(c, async (stream) => { - const stopped = Promise.withResolvers(); + c.req.raw.signal.addEventListener("abort", () => { + getLogger().debug("SSE aborted"); + }); + const envoy = startEnvoySync({ ...config, - serverlessStartPayload: payload, - onShutdown(reason: ShutdownReason) { - stopped.resolve(reason); - } }); - c.req.raw.signal.addEventListener("abort", () => { - getLogger().debug("SSE aborted, shutting down runner"); - envoy!.shutdown(true); - }); + envoy.startServerlessActor(payload); + + while (true) { + if (stream.closed || stream.aborted) break; - let reason = await stopped.promise; - if (reason === "serverless-early-exit") { - stream.writeSSE({ event: "stopping", data: "" }); + await stream.writeSSE({ event: "ping", data: "" }); + await stream.sleep(1000); } }); }); diff --git a/examples/CLAUDE.md b/examples/CLAUDE.md index 95ded3203a..07e5e379cf 100644 --- a/examples/CLAUDE.md +++ b/examples/CLAUDE.md @@ -1,10 +1,10 @@ # examples/CLAUDE.md -Guidelines for creating and maintaining examples in this repository. +- Follow these guidelines when creating and maintaining examples in this repository. ## README Format -All example READMEs must follow the template defined in `.claude/resources/EXAMPLE_TEMPLATE.md`. Key requirements: +- All example READMEs must follow `.claude/resources/EXAMPLE_TEMPLATE.md` and meet the key requirements below. 
- Use exact section headings: `## Getting Started`, `## Features`, `## Implementation`, `## Resources`, `## License` - Include `## Prerequisites` only for non-obvious dependencies (API keys, external services) - Focus features on RivetKit concepts demonstrated, not just app functionality @@ -14,17 +14,17 @@ All example READMEs must follow the template defined in `.claude/resources/EXAMP ### Directory Layout -Examples with frontend: +- Use this layout for examples with frontend (using `vite-plugin-srvx`): ``` example-name/ ├── src/ -│ └── index.ts # Actor definitions, registry setup, and registry.start() +│ ├── actors.ts # Actor definitions and registry setup +│ └── server.ts # Server entry point ├── frontend/ │ ├── App.tsx # Main React component │ └── main.tsx # React entry point ├── tests/ │ └── *.test.ts # Vitest tests -├── public/ # Vite build output (gitignored) ├── index.html # HTML entry point (for Vite) ├── package.json ├── tsconfig.json @@ -34,11 +34,30 @@ example-name/ └── README.md ``` -Backend-only examples: +- Use this layout for examples with separate frontend and backend dev servers: ``` example-name/ ├── src/ -│ └── index.ts # Actor definitions, registry setup, and registry.start() +│ ├── actors.ts # Actor definitions and registry setup +│ └── server.ts # Server entry point +├── frontend/ +│ ├── App.tsx +│ └── main.tsx +├── package.json +├── tsconfig.json +├── tsup.config.ts # For backend bundling +├── vite.config.ts +├── vitest.config.ts # Only if tests exist +├── turbo.json +└── README.md +``` + +- Use this layout for backend-only examples: +``` +example-name/ +├── src/ +│ ├── actors.ts # Actor definitions and registry setup +│ └── server.ts # Server entry point ├── package.json ├── tsconfig.json ├── turbo.json @@ -47,7 +66,8 @@ example-name/ ### Naming Conventions -- Actor definitions and server setup go in `src/index.ts` (single entry point) +- Actor definitions go in `src/actors.ts` +- Server entry point is always `src/server.ts` - Frontend 
entry is `frontend/main.tsx` with main component in `frontend/App.tsx` - Test files use `.test.ts` extension in `tests/` directory @@ -55,27 +75,44 @@ example-name/ ### Required Scripts -For examples with frontend: +- Use these scripts for examples with frontend (using `vite-plugin-srvx`): +```json +{ + "scripts": { + "dev": "vite", + "check-types": "tsc --noEmit", + "test": "vitest run", + "build": "vite build && vite build --mode server", + "start": "srvx --static=public/ dist/server.js" + } +} +``` + +- Use these scripts for examples with separate frontend and backend dev servers: ```json { "scripts": { - "dev": "concurrently -n server,vite \"tsx --watch src/index.ts\" \"vite\"", - "dev:server": "tsx --watch src/index.ts", + "dev:backend": "srvx --import tsx src/server.ts", + "dev:frontend": "vite", + "dev": "concurrently \"npm run dev:backend\" \"npm run dev:frontend\"", "check-types": "tsc --noEmit", "test": "vitest run", - "build": "vite build", - "start": "tsx src/index.ts" + "build:frontend": "vite build", + "build:backend": "tsup", + "build": "npm run build:backend && npm run build:frontend", + "start": "srvx --static=../frontend/dist dist/server.js" } } ``` -For backend-only examples: +- Use these scripts for backend-only examples: ```json { "scripts": { - "dev": "tsx --watch src/index.ts", + "dev": "npx srvx --import tsx src/server.ts", + "start": "npx srvx --import tsx src/server.ts", "check-types": "tsc --noEmit", - "start": "tsx src/index.ts" + "build": "tsup" } } ``` @@ -104,12 +141,18 @@ For backend-only examples: - Use `"rivetkit": "*"` for the main RivetKit package - Use `"@rivetkit/react": "*"` for React integration - Common dev dependencies: - - `tsx` for running TypeScript in development and production - - `typescript` for type checking - - `vite` and `@vitejs/plugin-react` for frontend (only for examples with frontend) - - `concurrently` for parallel dev servers (only for examples with frontend) - - `vitest` for testing -- `@hono/node-server` 
and `@hono/node-ws` are bundled in rivetkit and do not need to be added as direct dependencies +- `tsx` for running TypeScript in development +- `typescript` for type checking +- `vite` and `@vitejs/plugin-react` for frontend +- `vite-plugin-srvx` for unified dev server (when using vite-plugin-srvx pattern) +- `vitest` for testing +- `tsup` for bundling (only for separate frontend/backend examples) +- `concurrently` for parallel dev servers (only for separate frontend/backend examples) +- Common production dependencies: +- `hono` for the server framework (required for Vercel detection) +- `srvx` for serving in production (used by `start` script) +- `@hono/node-server` for Node.js HTTP server adapter +- `@hono/node-ws` for Node.js WebSocket support ## Configuration Files @@ -134,33 +177,61 @@ For backend-only examples: } ``` -Notes: +- Notes: - Include `"dom"` in lib for frontend examples - Include `"vite/client"` in types when using Vite - Omit `"frontend/**/*"` and `"tests/**/*"` from include if they don't exist - `allowImportingTsExtensions` and `rewriteRelativeImportExtensions` enable ESM-compliant `.ts` imports -### vite.config.ts +### tsup.config.ts + +- Use `tsup.config.ts` only for examples with separate frontend and backend dev servers (not using `vite-plugin-srvx`). 
+ +```typescript +import { defineConfig } from "tsup"; + +export default defineConfig({ + entry: { + server: "src/server.ts", + }, + format: ["esm"], + outDir: "dist", + bundle: true, + splitting: false, + shims: true, +}); +``` -Only needed for examples with a frontend: +### vite.config.ts +- Use this `vite.config.ts` for examples using `vite-plugin-srvx` (unified dev): ```typescript import { defineConfig } from "vite"; import react from "@vitejs/plugin-react"; +import srvx from "vite-plugin-srvx"; + +export default defineConfig({ + plugins: [react(), ...srvx({ entry: "src/server.ts" })], +}); +``` + +- Use this `vite.config.ts` for examples with separate dev servers: +```typescript +import react from "@vitejs/plugin-react"; +import { defineConfig } from "vite"; export default defineConfig({ plugins: [react()], - publicDir: false, + root: "frontend", build: { - outDir: "public", + outDir: "dist", emptyOutDir: true, }, server: { - clearScreen: false, + host: "0.0.0.0", + port: 5173, proxy: { - "/actors": { target: "http://localhost:6420", ws: true }, - "/metadata": { target: "http://localhost:6420" }, - "/health": { target: "http://localhost:6420" }, + "/api/rivet/": "http://localhost:3000", }, }, }); @@ -183,7 +254,7 @@ export default defineConfig({ ### vercel.json -Vercel auto-detects Vite when it sees a `vite.config.ts` and ignores Hono. We must explicitly set the framework to Hono: +- Vercel auto-detects Vite when it sees `vite.config.ts` and ignores Hono, so explicitly set the framework to Hono: ```json { @@ -193,7 +264,7 @@ Vercel auto-detects Vite when it sees a `vite.config.ts` and ignores Hono. 
We mu ### turbo.json -All examples should extend the root turbo config: +- Extend the root turbo config in all examples: ```json { "$schema": "https://turbo.build/schema.json", @@ -206,14 +277,13 @@ All examples should extend the root turbo config: ``` .actorcore node_modules -public ``` ## Source Code Patterns ### Actor File Structure -Actor definitions (`export const myActor = actor({...})`) must appear at the top of the file, before any helper functions. Helper functions, type definitions used only by helpers, and utilities go after the actor definition. This keeps the actor's public API front-and-center. +- Put actor definitions (`export const myActor = actor({...})`) at the top of the file before helper functions, and place helper-only types and utilities after the actor definition. ```typescript // Good @@ -233,11 +303,9 @@ function helperFunction(...) { ... } export const myActor = actor({...}); ``` -Shared types/interfaces used by both the actor definition and helpers (e.g. `State`, `PlayerEntry`) should go above the actor since the actor definition depends on them. +- Put shared types and interfaces used by both actor definitions and helpers (for example `State` and `PlayerEntry`) above the actor definition. -### Entry Point (src/index.ts) - -The entry point defines actors, sets up the registry, and starts the server. The registry must be exported for client type inference. +### Actor Definitions (src/actors.ts) ```typescript import { actor, setup } from "rivetkit"; @@ -266,20 +334,45 @@ export const chatRoom = actor({ export const registry = setup({ use: { chatRoom }, }); +``` + +### Server Entry Point (src/server.ts) + +- Explicitly import from `"hono"` so Vercel can detect the framework. 
+ +- Include at least: + +```typescript +import { Hono } from "hono"; +import { registry } from "./actors.ts"; -// Start the server on port 6420 -registry.start(); +const app = new Hono(); +app.all("/api/rivet/*", (c) => registry.handler(c.req.raw)); +export default app; ``` -### React Frontend (frontend/App.tsx) +- Use this pattern with additional routes: + +```typescript +import { Hono } from "hono"; +import { registry } from "./actors.ts"; + +const app = new Hono(); -RivetKit runs on port 6420 by default. Pass the endpoint explicitly to the client. +app.get("/api/foo", (c) => c.text("bar")); + +app.all("/api/rivet/*", (c) => registry.handler(c.req.raw)); + +export default app; +``` + +### React Frontend (frontend/App.tsx) ```typescript import { createRivetKit } from "@rivetkit/react"; -import type { registry } from "../src/index.ts"; +import type { registry } from "../src/actors.ts"; -const { useActor } = createRivetKit("http://localhost:6420"); +const { useActor } = createRivetKit(`${location.origin}/api/rivet`); export function App() { const actor = useActor({ @@ -314,7 +407,7 @@ createRoot(root).render( ```typescript import { setupTest } from "rivetkit/test"; import { expect, test } from "vitest"; -import { registry } from "../src/index.ts"; +import { registry } from "../src/actors.ts"; test("Description of test", async (ctx) => { const { client } = await setupTest(ctx, registry); @@ -328,7 +421,7 @@ test("Description of test", async (ctx) => { ## HTML Entry Point -For Vite-based examples: +- Use this HTML pattern for Vite-based examples: ```html @@ -349,7 +442,7 @@ For Vite-based examples: ## ESM Import Requirements -All imports must be ESM-compliant with explicit `.ts` extensions for relative imports: +- Keep all imports ESM-compliant with explicit `.ts` extensions for relative imports: ```typescript // Correct @@ -361,7 +454,7 @@ import { registry } from "./actors"; import { someUtil } from "../utils/helper"; ``` -This is enforced by the tsconfig options 
`allowImportingTsExtensions` and `rewriteRelativeImportExtensions`. +- This is enforced by `allowImportingTsExtensions` and `rewriteRelativeImportExtensions` in `tsconfig`. ## Best Practices @@ -375,7 +468,7 @@ This is enforced by the tsconfig options `allowImportingTsExtensions` and `rewri ## Vercel Examples -Vercel-optimized versions of examples are automatically generated using the script at `scripts/vercel-examples/generate-vercel-examples.ts`. These examples use the `hono/vercel` adapter and are configured specifically for Vercel serverless deployment. +- Generate Vercel-optimized example variants with `scripts/vercel-examples/generate-vercel-examples.ts`; these variants use `hono/vercel` and Vercel-focused serverless config. ### Generation Script @@ -395,13 +488,13 @@ npx tsx scripts/vercel-examples/generate-vercel-examples.ts --dry-run ### Naming Convention -Vercel examples are placed at `examples/{original-name}-vercel/`. For example: +- Place generated Vercel examples at `examples/{original-name}-vercel/`, for example: - `hello-world` → `hello-world-vercel` - `chat-room` → `chat-room-vercel` ### Directory Layout -Vercel examples with frontend: +- Use this layout for Vercel examples with frontend: ``` example-name-vercel/ ├── api/ @@ -421,7 +514,7 @@ example-name-vercel/ └── README.md # With Vercel-specific note and deploy button ``` -Vercel examples without frontend (API-only): +- Use this layout for Vercel examples without frontend (API-only): ``` example-name-vercel/ ├── api/ @@ -440,7 +533,7 @@ example-name-vercel/ #### api/index.ts -The API entry point uses the Hono Vercel adapter (built into the `hono` package): +- Use the Hono Vercel adapter (built into `hono`) in the API entry point: ```typescript import app from "../src/server.ts"; @@ -450,7 +543,7 @@ export default app; #### vercel.json -For examples with frontend: +- Use this `vercel.json` for examples with frontend: ```json { "framework": "vite", @@ -460,7 +553,7 @@ For examples with frontend: } 
``` -For API-only examples: +- Use this `vercel.json` for API-only examples: ```json { "rewrites": [ @@ -471,7 +564,7 @@ For API-only examples: #### package.json -Key differences from origin examples: +- Apply these key differences from origin examples: - Removes `srvx` and `vite-plugin-srvx` - Uses `vercel dev` for development - Simplified build scripts @@ -479,11 +572,11 @@ Key differences from origin examples: #### README.md -Each Vercel example README includes: +- Include the following in each Vercel example README: - A note explaining it's the Vercel-optimized version with a link back to the origin - A "Deploy with Vercel" button for one-click deployment -Example header: +- Use this example header: ```markdown > **Note:** This is the Vercel-optimized version of the [hello-world](../hello-world) example. > It uses the `hono/vercel` adapter and is configured for Vercel deployment. @@ -493,11 +586,11 @@ Example header: ### Skipped Examples -The following example types are not converted to Vercel: +- Do not convert these example types to Vercel: - **Next.js examples** (`*-next-js`): Next.js has its own Vercel integration - **Cloudflare examples** (`*-cloudflare*`): Different runtime environment - **Deno examples**: Different runtime environment -- **Examples without `src/index.ts`**: Cannot be converted +- **Examples without `src/server.ts`**: Cannot be converted ### Workflow @@ -508,7 +601,7 @@ The following example types are not converted to Vercel: ## Frontend Style Guide -Examples should follow these design conventions: +- Follow these design conventions in examples: **Color Palette (Dark Theme)** - Primary accent: `#ff4f00` (orange) for interactive elements and highlights @@ -564,7 +657,7 @@ Examples should follow these design conventions: **Component Patterns** -*Buttons* +- Buttons: - Primary: `#ff4f00` background, white text - Secondary: `#2c2c2e` background, white text - Ghost: transparent background, `#ff4f00` text @@ -572,7 +665,7 @@ Examples should 
follow these design conventions: - Success: `#30d158` background, white text - Disabled: 50% opacity, `cursor: not-allowed` -*Form Inputs* +- Form Inputs: - Background: `#2c2c2e` - Border: 1px solid `#3a3a3c` - Border radius: 8px @@ -580,21 +673,21 @@ Examples should follow these design conventions: - Focus: border-color `#ff4f00`, box-shadow `0 0 0 3px rgba(255, 79, 0, 0.2)` - Placeholder text: `#6e6e73` -*Cards/Containers* +- Cards and containers: - Background: `#1c1c1e` - Border: 1px solid `#2c2c2e` - Border radius: 8px - Padding: 20px - Box shadow: `0 1px 3px rgba(0, 0, 0, 0.3)` - Header style (when applicable): - - Background: `#2c2c2e` - - Padding: 16px 20px - - Font size: 18px, weight 600 - - Border bottom: 1px solid `#2c2c2e` - - Border radius: 8px 8px 0 0 (top corners only) - - Negative margin to align with card edges: `-20px -20px 20px -20px` - -*Modals/Overlays* +- Background: `#2c2c2e` +- Padding: 16px 20px +- Font size: 18px, weight 600 +- Border bottom: 1px solid `#2c2c2e` +- Border radius: 8px 8px 0 0 (top corners only) +- Negative margin to align with card edges: `-20px -20px 20px -20px` + +- Modals and overlays: - Backdrop: `rgba(0, 0, 0, 0.75)` - Modal background: `#1c1c1e` - Border radius: 8px @@ -602,19 +695,19 @@ Examples should follow these design conventions: - Padding: 24px - Close button: top-right, 8px from edges -*Lists* +- Lists: - Item padding: 12px 16px - Dividers: 1px solid `#2c2c2e` - Hover background: `#2c2c2e` - Selected/active background: `rgba(255, 79, 0, 0.15)` -*Badges/Tags* +- Badges and tags: - Padding: 4px 8px - Border radius: 6px - Font size: 12px - Font weight: 500 -*Tabs* +- Tabs: - Container: `border-bottom: 1px solid #2c2c2e`, flex-wrap for overflow - Tab: `padding: 12px 16px`, no background, `border-radius: 0` - Tab border: `border-bottom: 2px solid transparent`, `margin-bottom: -1px` @@ -625,33 +718,32 @@ Examples should follow these design conventions: **UI States** -*Loading States* +- Loading states: - Spinner: 
20px for inline, 32px for page-level - Skeleton placeholders: `#2c2c2e` background with subtle pulse animation - Loading text: "Loading..." in muted color - Button loading: show spinner, disable interaction, keep button width stable -*Empty States* +- Empty states: - Center content vertically and horizontally - Icon: 48px, muted color (`#6e6e73`) - Heading: 18px, primary text color - Description: 14px, muted color - Optional action button below description -*Error States* +- Error states: - Inline errors: `#ff3b30` text below input, 12px font size - Error banners: `#ff3b30` left border (4px), `rgba(255, 59, 48, 0.1)` background - Form validation: highlight input border in `#ff3b30` - Error icon: Lucide `AlertCircle` or `XCircle` -*Disabled States* +- Disabled states: - Opacity: 50% - Cursor: `not-allowed` - No hover/focus effects - Preserve layout (don't collapse or hide) -*Success States* +- Success states: - Color: `#30d158` - Icon: Lucide `CheckCircle` or `Check` - Toast/banner: `rgba(48, 209, 88, 0.1)` background with green left border - diff --git a/examples/actor-actions/vite.config.ts b/examples/actor-actions/vite.config.ts index bd1498f19b..9b7f9fd17e 100644 --- a/examples/actor-actions/vite.config.ts +++ b/examples/actor-actions/vite.config.ts @@ -12,7 +12,7 @@ export default defineConfig({ // Disable screen clearing so concurrently output stays readable clearScreen: false, proxy: { - // Forward manager API and WebSocket requests to the backend + // Forward actor API and WebSocket requests to the backend "/actors": { target: "http://localhost:6420", ws: true }, "/metadata": { target: "http://localhost:6420" }, "/health": { target: "http://localhost:6420" }, diff --git a/examples/ai-generated-actor/.gitignore b/examples/ai-generated-actor/.gitignore new file mode 100644 index 0000000000..dc6f607390 --- /dev/null +++ b/examples/ai-generated-actor/.gitignore @@ -0,0 +1,2 @@ +.actorcore +node_modules diff --git a/examples/ai-generated-actor/README.md 
b/examples/ai-generated-actor/README.md new file mode 100644 index 0000000000..83dde98af5 --- /dev/null +++ b/examples/ai-generated-actor/README.md @@ -0,0 +1,45 @@ +# AI-Generated Actor + +Use an AI chat to generate and iterate on Rivet Actor code, then deploy and test it live. + +## Getting Started + +```sh +git clone https://github.com/rivet-dev/rivet.git +cd rivet/examples/ai-generated-actor +pnpm install +pnpm dev +``` + +## Prerequisites + +- OpenAI API key set as `OPENAI_API_KEY` +- Install dependencies so the `secure-exec` package is present. This example uses the `secure-exec` package from `pkg.pr.new`. +- If you need to override the runtime package location, set `RIVETKIT_DYNAMIC_SECURE_EXEC_SPECIFIER` to a resolvable `secure-exec` entry file URL. + +## Features + +- AI-driven code generation using GPT-4o via the Vercel AI SDK with streaming responses +- Dynamic actor loading via `dynamicActor` from `rivetkit/dynamic` +- Per-key isolation where each actor key has its own AI agent, generated code, and dynamic actor instance +- Generic actor interface to call arbitrary actions on the generated actor +- Three-column layout: chat, generated code, and actor testing interface + +## Implementation + +The project uses two actors defined in [`src/actors.ts`](https://github.com/rivet-dev/rivet/tree/main/examples/ai-generated-actor/src/actors.ts): + +- `codeAgent` maintains chat history and generated code in actor state. It processes messages via a queue and streams AI responses using the Vercel AI SDK, extracting code blocks from the response to update the current actor source. +- `dynamicRunner` is a dynamic actor that loads its source code from the `codeAgent` with the matching key, executing the AI-generated code in a sandboxed isolate. + +The server in [`src/server.ts`](https://github.com/rivet-dev/rivet/tree/main/examples/ai-generated-actor/src/server.ts) exposes proxy endpoints for calling actions on the dynamic actor by name. 
+ +The frontend in [`frontend/App.tsx`](https://github.com/rivet-dev/rivet/tree/main/examples/ai-generated-actor/frontend/App.tsx) provides a three-column interface for chatting with the AI, viewing generated code, and testing the deployed actor. + +## Resources + +Read more about [dynamic actors](https://rivet.dev/docs/actors/ai-and-user-generated-actors), [queues](https://rivet.dev/docs/actors/queues), [events](https://rivet.dev/docs/actors/events), and [state](https://rivet.dev/docs/actors/state). + +## License + +MIT diff --git a/examples/ai-generated-actor/frontend/App.tsx b/examples/ai-generated-actor/frontend/App.tsx new file mode 100644 index 0000000000..952b1b6dd5 --- /dev/null +++ b/examples/ai-generated-actor/frontend/App.tsx @@ -0,0 +1,479 @@ +import { createRivetKit } from "@rivetkit/react"; +import { createClient } from "rivetkit/client"; +import Editor from "@monaco-editor/react"; +import { useEffect, useRef, useState } from "react"; +import type { + ChatMessage, + CodeAgentState, + CodeUpdateEvent, + ResponseEvent, + registry, +} from "../src/actors/index.ts"; + +const rivetEndpoint = `${location.origin}/api/rivet`; + +const { useActor } = createRivetKit(rivetEndpoint); + +// Raw client for dynamicRunner (actions are unknown at compile time) +const client = createClient({ + endpoint: rivetEndpoint, + encoding: "json", +}); + +const REASONING_OPTIONS = [ + { value: "none", label: "None" }, + { value: "medium", label: "Medium" }, + { value: "high", label: "High" }, + { value: "extra_high", label: "Extra High" }, +] as const; + +// Chat column: interacts with the codeAgent actor +function ChatColumn({ + actorKey, + code, + onApiKeyStatus, + onCodeUpdate, +}: { + actorKey: string; + code: string; + onApiKeyStatus: (missing: boolean) => void; + onCodeUpdate: (code: string, revision: number) => void; +}) { + const agent = useActor({ + name: "codeAgent", + key: [actorKey], + }); + const [messages, setMessages] = useState([]); + const [status, setStatus] = 
useState("idle"); + const [error, setError] = useState(null); + const [input, setInput] = useState(""); + const [reasoning, setReasoning] = useState("none"); + const timelineRef = useRef(null); + + useEffect(() => { + if (!agent.connection) return; + agent.connection.getState().then((state: CodeAgentState) => { + setMessages(state.messages); + setStatus(state.status); + onApiKeyStatus(!state.hasApiKey); + onCodeUpdate(state.code, state.codeRevision); + }); + }, [agent.connection]); + + // Scroll to bottom when messages change + useEffect(() => { + if (timelineRef.current) { + timelineRef.current.scrollTop = timelineRef.current.scrollHeight; + } + }, [messages]); + + agent.useEvent("response", (payload: ResponseEvent) => { + if (payload.error) { + setError(payload.error); + } else if (payload.done) { + setError(null); + } + setMessages((prev) => { + const exists = prev.some((msg) => msg.id === payload.messageId); + if (!exists) { + // Assistant message placeholder from backend; add it. + return [ + ...prev, + { + id: payload.messageId, + role: "assistant" as const, + content: payload.content, + createdAt: Date.now(), + }, + ]; + } + return prev.map((msg) => + msg.id === payload.messageId + ? { ...msg, content: payload.content } + : msg, + ); + }); + }); + + agent.useEvent("codeUpdated", (payload: CodeUpdateEvent) => { + onCodeUpdate(payload.code, payload.revision); + }); + + agent.useEvent("statusChanged", (nextStatus: string) => { + setStatus(nextStatus); + }); + + const sendMessage = async () => { + if (!agent.connection) return; + const trimmed = input.trim(); + if (!trimmed) return; + + setError(null); + // Optimistic add + const userMsg: ChatMessage = { + id: `pending-${Date.now()}`, + role: "user", + content: trimmed, + createdAt: Date.now(), + }; + setMessages((prev) => [...prev, userMsg]); + setInput(""); + + // Send the current editor code along with the message so the AI can + // modify existing code rather than generating from scratch. 
+ await agent.connection.send("chat", { + text: trimmed, + currentCode: code, + reasoning, + }); + }; + + const handleKeyDown = (event: React.KeyboardEvent) => { + if (event.key === "Enter" && !event.shiftKey) { + event.preventDefault(); + sendMessage(); + } + }; + + return ( +
+
+ Chat +
+ GPT-4o + + +
+
+ {error && ( +
setError(null)}> + {error} +
+ )} +
+ {messages.length === 0 ? ( +

+ Describe the actor you want to build. +

+ ) : ( + messages.map((msg) => ( +
+
+ {msg.role === "user" ? "You" : "AI"} +
+
+ {msg.content || ( + + Thinking... + + )} +
+
+ )) + )} +
+
+