feat(qwp): WebSocket ingest sender + egress query client + store-and-forward #73
**Open** — kafka1991 wants to merge 42 commits into `main` from `qwip_victor`

## Commits (42)
- 0697175 support QWIP and sf for .net client (kafka1991)
- 57e103b code review and add cancelrow (kafka1991)
- 30f1266 Merge branch 'main' into qwip_victor (kafka1991)
- 5db8e98 fix linux test failed (kafka1991)
- 98bdf17 code review and fix fsn race condition (kafka1991)
- b7abb47 code review and race condition (kafka1991)
- a9c5b9e add bit writer tests (kafka1991)
- c56ffbb flake tests (kafka1991)
- fd615a9 code review (kafka1991)
- b1e2b69 code review (kafka1991)
- c9bd822 code review and fix tests (kafka1991)
- 3e70218 code review (kafka1991)
- 334f521 code review (kafka1991)
- c4c1994 add more tests (kafka1991)
- 4724fbc update benchmark (kafka1991)
- a0a7f10 update benchmark docs (kafka1991)
- 3f3c9eb same seal file name (kafka1991)
- 8eb4241 add egress (kafka1991)
- 9809362 fix integration tests (kafka1991)
- 442b108 code review (kafka1991)
- 84d1185 code review (kafka1991)
- 204c98f code review (kafka1991)
- c3bd641 add qwip_victor branch review + actionable plan (nwoolmer)
- 1c8ee2b 1. ingress support configure multi servers 2. egress support random s… (kafka1991)
- cc07a95 optimise failover (kafka1991)
- 22b6f0c Merge pull request #75 from questdb/egress (kafka1991)
- baa6c6d code review (kafka1991)
- e15f54c code review and ci failed (kafka1991)
- 3a81f6a ci add timeout (kafka1991)
- 78f2738 ci fix (kafka1991)
- 3b135e8 code review (kafka1991)
- 072aae4 code review (kafka1991)
- 2729f94 code review (kafka1991)
- cf0510d code review and fix bugs (kafka1991)
- 360c01d code review (kafka1991)
- 9e4d4da code review (kafka1991)
- a3f5301 add initial_connect_retry argument (kafka1991)
- 2d5f824 fix race condition (kafka1991)
- 1f6efad fix failed tests (kafka1991)
- 6b91ead final code review (kafka1991)
- 315fb9a code review and remove obselete tests (kafka1991)
- 48f6cdb make tests stable (kafka1991)
# .NET WebSocket / QWP — Performance Benchmarks

## Environment

- **Server**: QuestDB master build with `/write/v4` enabled, on `127.0.0.1:9000`
- **Host**: Apple M4 Pro, 14 logical / 14 physical cores, macOS 15.2
- **Runtime**: .NET 10.0.7, Arm64 RyuJIT AdvSIMD
- **BenchmarkDotNet**: v0.13.12, in-process toolchain
## TL;DR

- ✅ **Ingest throughput** (`BenchInsertsWs`): WS beats HTTP by **3–6×** across narrow / wide / multi-table at `in_flight_window=128`. All throughput / alloc targets pass with margin.
- ✅ **Ingest latency** (`BenchLatencyWs`, sync mode): WS is faster than HTTP at every batch size. Single-row WS p95 = **170 μs** vs HTTP **237 μs**; the 10k-row batch shows a **3.85× advantage**.
- ✅ **Egress reads** (`BenchQueryWs`): WS reads are **3.5–6.1×** faster than HTTP `/exec` from 10k to 1M rows; peak **37 M rows/sec** on a 2-column 1M-row query. Per-batch decoder allocation is cut **1170×** by column-scratch pooling.
- ✅ **SF overhead** (`BenchSfThroughput`): SF runs at **0.83–1.43×** the non-SF time at the same IFW. SF is faster than non-SF at IFW=1 (the single-frame ACK wait masks the disk-append cost) and flat at 1.36–1.43× for IFW≥8 — within the ≤ 1.45× target.
## Methodology

- All benches run in-process against a real QuestDB instance on local loopback. `QDB_BENCH_ENDPOINT=127.0.0.1:9000` selects the live server; without it, the bench falls back to an in-process `DummyQwpServer` + `DummyHttpServer`.
- Throughput / SF benches use a fast job (2 warmup + 3 iterations, `InvocationCount=1`) — a small sample, so error bars are wide. Use min / median / p95 for relative ordering rather than the BDN 99.9% CI.
- The latency bench uses 1000 iterations with `InvocationCount=1` (every iteration is one full round-trip), so the percentile columns reflect 1000 samples.
- All senders are created in `[GlobalSetup]`, so per-invocation cost is row encoding + frame I/O + ACK wait, not slot/mmap/engine spin-up.
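Since the report leans on min / median / p95 rather than confidence intervals, here is a small illustrative sketch of how those statistics fall out of raw per-iteration samples (the sample values are invented for the example; the real benches use BenchmarkDotNet's own columns):

```python
import math

# Invented latency samples in microseconds - NOT real bench output.
samples_us = [70, 150, 110, 237, 132, 98, 171, 145, 120, 160]

def percentile(sorted_vals, p):
    """Nearest-rank percentile: smallest value covering p% of samples."""
    k = max(0, math.ceil(p / 100 * len(sorted_vals)) - 1)
    return sorted_vals[k]

s = sorted(samples_us)
stats = {
    "min": s[0],
    "median": percentile(s, 50),
    "p95": percentile(s, 95),
    "p100": s[-1],
}
print(stats)
```

With only 3 iterations, p95 collapses onto the max and the 99.9% CI is meaningless, which is why relative ordering by min / median is the honest comparison.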
## 1. `BenchInsertsWs` — Sustained throughput

**Workload**: long-lived sender; send N rows, then `SendAsync()`. Three row shapes (Narrow / Wide / MultiTable) with HTTP baselines and per-category Ratio columns.

### Headline @ `InFlightWindow=128`, `Rows=100000`

| Category | AFR | Mean WS | Mean HTTP | WS rows/sec | HTTP rows/sec | WS Ratio | Alloc WS / HTTP |
|---|---|---|---|---|---|---|---|
| **Narrow** | 1000 | 8.11 ms | 43.93 ms | 12.3 M | 2.28 M | **0.18 (5.4×)** | 13 / 19 KB (0.67×) |
| **Narrow** | 10000 | 8.37 ms | 33.87 ms | 11.9 M | 2.95 M | **0.25 (4.1×)** | 13 / 19 KB (0.68×) |
| **Wide** | 1000 | 28.08 ms | 123.24 ms | 3.56 M | 0.81 M | **0.23 (4.4×)** | 43 / 83 KB (0.52×) |
| **Wide** | 10000 | 28.40 ms | 116.58 ms | 3.52 M | 0.86 M | **0.24 (4.1×)** | 43 / 83 KB (0.52×) |
| **MultiTable** | 1000 | 5.73 ms | 36.04 ms | 17.5 M | 2.78 M | **0.16 (6.3×)** | 8.6 / 15 KB (0.57×) |
| **MultiTable** | 10000 | 5.70 ms | 22.45 ms | 17.6 M | 4.45 M | **0.25 (4.0×)** | 8.6 / 15 KB (0.58×) |

### Observations

- **WS dominates HTTP across every row shape and AutoFlushRows setting** — minimum advantage 4.0×, peak 6.3× (MultiTable @ AFR=1000).
- **Memory is uniformly lower for WS** — 0.52× to 0.68× of HTTP allocations.
- **Wide rows hit a ceiling around 3.5 M rows/sec** — payload-bound; WS still preserves a 4× ratio over HTTP because protocol overhead matters more for wide rows.
- **MultiTable peaks at 17.6 M rows/sec** — WS multiplexes 5 tables over a single connection without per-flush handshake cost.
- **AutoFlushRows trade-off**: AFR=1000 gives the biggest WS advantage (more flushes amplify HTTP per-request overhead). AFR=10000 narrows the gap but raises absolute throughput modestly. The production sweet spot is AFR=1000–10000, depending on latency tolerance.
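The derived columns in the headline table follow directly from the mean times; a quick sanity check of the Narrow @ AFR=1000 row:

```python
# Re-derive the table's derived columns from the measured means:
# rows/sec = rows / mean time, WS Ratio = mean_ws / mean_http.
rows = 100_000
mean_ws_ms, mean_http_ms = 8.11, 43.93

ws_rows_per_sec = rows / (mean_ws_ms / 1000)      # ~12.3 M rows/sec
http_rows_per_sec = rows / (mean_http_ms / 1000)  # ~2.28 M rows/sec
ws_ratio = mean_ws_ms / mean_http_ms              # ~0.18, i.e. ~5.4x faster

print(f"WS {ws_rows_per_sec / 1e6:.1f} M rows/sec, "
      f"HTTP {http_rows_per_sec / 1e6:.2f} M rows/sec, "
      f"ratio {ws_ratio:.2f} ({1 / ws_ratio:.1f}x)")
```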
## 2. `BenchLatencyWs` — Round-trip latency, sync mode (`in_flight_window=1`)

**Workload**: persistent sender; send `RowsPerBatch` rows, then `SendAsync()` and await. 1000 iterations per case.

| RowsPerBatch | Method | Median | p95 | p100 (max) | Min | Ratio | Allocated |
|---|---|---|---|---|---|---|---|
| **1** | Http_Roundtrip | 150 μs | 237 μs | 283 μs | 70 μs | 1.00 | 8920 B |
| **1** | **Ws_SyncRoundtrip** | **110 μs** | **170 μs** | **194 μs** | 63 μs | **0.78** | **0** |
| **100** | Http_Roundtrip | 134 μs | 199 μs | 221 μs | 76 μs | 1.00 | 21504 B |
| **100** | **Ws_SyncRoundtrip** | **103 μs** | **151 μs** | **175 μs** | 65 μs | **0.81** | **11608 B** |
| **10000** | Http_Roundtrip | 2.40 ms | 2.52 ms | 2.61 ms | 2.15 ms | 1.00 | 1.21 MB |
| **10000** | **Ws_SyncRoundtrip** | **610 μs** | **693 μs** | **736 μs** | 526 μs | **0.26** | **566 KB** |

### Observations

- **Single-row p95**: WS 170 μs vs HTTP 237 μs — 28% faster.
- **Zero-allocation single-row roundtrip on WS** — reused encoder buffers + persistent connection. HTTP allocates 8.9 KB per single-row roundtrip.
- **10000-row batch**: **3.85× faster** (610 μs vs 2.40 ms) at 47% of HTTP's allocation. This is the realistic batched-streaming case, where WS pipelining inside the batch is decisive.
- **Variance is consistent** — Min / Median / p95 / p100 all show the same WS-vs-HTTP ordering across all batch sizes.

Run at `IterationCount=100_000` if you need a strict p99.
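The shape of this bench is worth spelling out: with `in_flight_window=1` every send blocks until its frame is ACKed, so one iteration equals exactly one round-trip. A minimal sketch of that measurement loop, with a stand-in for the sender (the real .NET API is not reproduced here):

```python
import time

def send_and_await(rows_per_batch):
    """Stand-in for encode + frame I/O + ACK wait; just sleeps ~100 us."""
    time.sleep(0.0001)

def measure(iterations, rows_per_batch):
    # InvocationCount=1 semantics: each timed invocation is one round-trip,
    # so the percentile columns reflect `iterations` independent samples.
    samples_us = []
    for _ in range(iterations):
        t0 = time.perf_counter()
        send_and_await(rows_per_batch)
        samples_us.append((time.perf_counter() - t0) * 1e6)
    return sorted(samples_us)

samples = measure(iterations=50, rows_per_batch=1)
median = samples[len(samples) // 2]
print(f"median ~{median:.0f} us over {len(samples)} samples")
```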
## 3. `BenchQueryWs` — Egress (read) throughput

**Workload**: persistent `QwpQueryClient` running `SELECT * FROM table LIMIT N` against pre-seeded tables (1 M rows each). The HTTP `/exec` baseline parses the JSON `dataset[][]` and counts rows, so both methods do equivalent extraction work. 20 iterations × 5 warmup; error margin < 2% at the 1 M-row regime.

| Category | RowCount | Mean WS | Mean HTTP | WS rows/sec | WS Ratio |
|---|---|---|---|---|---|
| **Narrow** (2 cols) | 10 k | 815 μs | 2.89 ms | 12.27 M | **0.28 (3.5×)** |
| **Narrow** | 100 k | 3.77 ms | 17.7 ms | 26.54 M | **0.21 (4.7×)** |
| **Narrow** | 1 M | 27.0 ms | 164 ms | **37.08 M** | **0.16 (6.1×)** |
| **Wide** (15 cols) | 10 k | 2.48 ms | 6.56 ms | 4.03 M | **0.38 (2.6×)** |
| **Wide** | 100 k | 16.9 ms | 59.3 ms | 5.91 M | **0.29 (3.5×)** |
| **Wide** | 1 M | 159 ms | 576 ms | 6.30 M | **0.28 (3.6×)** |

- **2.6–6.1× faster than HTTP `/exec`** across both shapes; peak **37 M rows/sec** at Narrow 1 M.
- **Decoder allocations dropped 1170×** after column-scratch pooling (Wide 100 k: 10 MB → 8.6 KB / batch). `ValueBytes` / `StringHeap` / `NonNullIndex` / `StringOffsets` survive `Reset` and grow-and-reuse across batches; only varchar/symbol heap deltas allocate.
- Wide rows cap at **~6 M rows/sec** — payload-bound, like the ingest side.
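The 1170× allocation cut comes from the grow-and-reuse scratch pattern: buffers survive `Reset` and only reallocate when a batch needs more capacity, so steady-state batches allocate nothing. A language-neutral sketch of that pattern (class and field names here are illustrative, not the actual decoder API):

```python
class ColumnScratch:
    """Illustrative grow-and-reuse scratch buffer, not the real decoder."""

    def __init__(self):
        self.value_bytes = bytearray(0)
        self.used = 0

    def reset(self):
        # Forget contents but keep capacity for the next batch.
        self.used = 0

    def ensure(self, needed):
        if len(self.value_bytes) < needed:
            # Grow geometrically so repeated small growths don't churn.
            self.value_bytes = bytearray(max(needed, 2 * len(self.value_bytes)))
        self.used = needed
        return self.value_bytes

scratch = ColumnScratch()
buf1 = scratch.ensure(1024)   # first batch: allocates
scratch.reset()
buf2 = scratch.ensure(512)    # smaller second batch: reuses the same buffer
print(buf1 is buf2)           # True - no new allocation in steady state
```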
## 4. `BenchSfThroughput` — Store-and-forward overhead

**Workload**: long-lived senders (one with `sf_dir`, one without). Each iteration sends N rows, calls `SendAsync()`, then `Ping()` to wait for the cumulative ACK — symmetric across both branches.

| InFlightWindow | Rows | Mean WS_NoSf | Mean WS_WithSf | SF Ratio | Alloc Ratio |
|---|---|---|---|---|---|
| 1 | 10000 | 2.86 ms | 2.70 ms | **0.94** | 1.13 |
| 1 | 100000 | 36.94 ms | 28.46 ms | **0.83** | 1.13 |
| 8 | 10000 | 1.12 ms | 1.54 ms | **1.38** | 1.14 |
| 8 | 100000 | 10.29 ms | 13.85 ms | **1.36** | 1.14 |
| 32 | 10000 | 1.12 ms | 1.59 ms | **1.43** | 1.14 |
| 32 | 100000 | 9.80 ms | 13.06 ms | **1.36** | 1.14 |
| 128 | 10000 | 1.05 ms | 1.44 ms | **1.37** | 1.14 |
| 128 | 100000 | 8.99 ms | 12.41 ms | **1.38** | 1.14 |

### Observations

- **At IFW=1, SF is faster than non-SF** (0.83–0.94×). Both branches block per-frame on the ACK, so the SF mmap append hides under the round-trip wait.
- **At IFW≥8 the ratio flattens to 1.36–1.43×**, regardless of IFW or Rows. This is a constant per-frame architectural cost — disk append + cursor-engine signaling + segment-ring bookkeeping. It does not scale with IFW: the cursor-engine pumps don't serialize on the in-flight window.
- **Allocation overhead is a uniform 1.13–1.14×** of non-SF — segment-ring envelopes amortize once the sender is long-lived; only steady-state per-frame structures churn.

In short: SF's flat ~1.4× tax at IFW≥8 buys crash safety, and at IFW=128 with long-lived senders the absolute cost stays well under a millisecond per 10k-row iteration.
## Caveats

1. **N=3 iterations** for InsertsWs / SfThroughput → 99.9% CIs are wider than the means. Use min / median / p95 for relative ordering; gate verdicts use min / median for conservatism.
2. **Local loopback only** — the TCP/WebSocket handshake on `127.0.0.1` is much faster than network round-trips. Real-network numbers will be higher in absolute terms; relative ratios should hold.
3. **Single-host** — server and client share CPU and memory; cross-process cache contention may slightly inflate latency. The 14-core M4 Pro keeps contention minimal.
4. **SF bench is happy-path ingest only** — the product justification for SF (reconnect + replay through a server outage) is not exercised here. A transient-failure benchmark is needed separately.
## Acceptance summary

| Pillar | Status |
|---|---|
| WS narrow ingest throughput ≥ 1.5× HTTP @ IFW=128 | ✅ 4.05×–5.42× (margin: 2.7–3.6×) |
| WS wide ingest throughput ≥ 1.2× HTTP @ IFW=128 | ✅ 4.10×–4.39× (margin: 3.4–3.7×) |
| WS sync single-row p100 ≤ 1.5× HTTP | ✅ 0.69× (WS faster than HTTP) |
| WS async 10000-row p100 ≤ HTTP 10000-row p100 | ✅ 0.28× (WS 3.6× faster) |
| **WS egress reads ≥ HTTP `/exec` baseline** | ✅ 2.6×–6.1× across narrow / wide × 10k / 100k / 1M |
| **SF overhead ≤ 45%** | ✅ 0.83–1.43× (passes at every IFW; flat curve at IFW≥8) |
| GC alloc / 1k rows ≤ 2× HTTP | ✅ 0.52×–0.68× of HTTP across all shapes |
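The ingest-throughput rows above can be re-derived from the `BenchInsertsWs` headline table: speedup = mean HTTP / mean WS, and margin = speedup / gate target. A quick check of the narrow and wide gates:

```python
# Re-derive the acceptance speedups and margins from the BenchInsertsWs
# headline means (ms). Keys are descriptive labels, not bench case names.
cases = {
    # label: (mean_ws_ms, mean_http_ms, gate_target)
    "narrow @ AFR=1000":  (8.11, 43.93, 1.5),
    "narrow @ AFR=10000": (8.37, 33.87, 1.5),
    "wide @ AFR=1000":    (28.08, 123.24, 1.2),
    "wide @ AFR=10000":   (28.40, 116.58, 1.2),
}
for label, (ws, http, target) in cases.items():
    speedup = http / ws          # e.g. 5.42x for narrow @ AFR=1000
    margin = speedup / target    # how far above the gate the result sits
    print(f"{label}: {speedup:.2f}x (margin {margin:.1f}x over {target}x gate)")
```

This reproduces the 4.05×–5.42× narrow and 4.10×–4.39× wide ranges, and the 2.7–3.6× / 3.4–3.7× margins, directly from the published means.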
## Reproduction

```fish
# Ingest throughput / SF
QDB_BENCH_ENDPOINT=127.0.0.1:9000 \
dotnet run -c Release --project src/net-questdb-client-benchmarks --framework net10.0 -- \
    --filter '*BenchInsertsWs*' '*BenchSfThroughput*'

# Egress reads (live QuestDB master)
QDB_BENCH_ENDPOINT=127.0.0.1:9000 \
dotnet run -c Release --project src/net-questdb-client-benchmarks --framework net10.0 -- \
    --filter '*BenchQueryWs*'

# Latency, strict (100k samples for RowsPerBatch=1)
QDB_BENCH_ENDPOINT=127.0.0.1:9000 \
dotnet run -c Release --project src/net-questdb-client-benchmarks --framework net10.0 -- \
    --filter '*BenchLatencyWs*RowsPerBatch:*1*'

# Latency quick all-batches
QDB_BENCH_ENDPOINT=127.0.0.1:9000 \
dotnet run -c Release --project src/net-questdb-client-benchmarks --framework net10.0 -- \
    --filter '*BenchLatencyWs*' --iterationCount 1000 --warmupCount 5
```