Skip to content

Commit b287a30

Browse files
authored
Merge pull request #84 from hotdata-dev/feat/skill-split-and-workflows
Split search and analytics skills; improve workflows and release finish
2 parents 4901e0b + 00803f2 commit b287a30

12 files changed

Lines changed: 586 additions & 262 deletions

File tree

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ pre-release-hook = ["git-cliff", "-o", "CHANGELOG.md", "--tag", "v{{version}}" ]
6060
publish = false
6161
pre-release-replacements = [
6262
{ file = "skills/hotdata/SKILL.md", search = "^version: .+", replace = "version: {{version}}", exactly = 1 },
63+
{ file = "skills/hotdata-search/SKILL.md", search = "^version: .+", replace = "version: {{version}}", exactly = 1 },
64+
{ file = "skills/hotdata-analytics/SKILL.md", search = "^version: .+", replace = "version: {{version}}", exactly = 1 },
6365
{ file = "skills/hotdata-geospatial/SKILL.md", search = "^version: .+", replace = "version: {{version}}", exactly = 1 },
6466
{ file = "README.md", search = "version-[0-9.]+-blue", replace = "version-{{version}}-blue", exactly = 1 },
6567
]

scripts/release.sh

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
# release.sh — two-phase release wrapper around cargo-release
33
#
44
# Usage:
5-
# scripts/release.sh prepare <version> # steps 0-2: branch, bump, push PR
6-
# scripts/release.sh finish # step 4: tag, publish, trigger dist
5+
# scripts/release.sh prepare <version> # branch, bump, changelog PR
6+
# scripts/release.sh finish # tag only (main is branch-protected)
77

88
set -euo pipefail
99

@@ -13,7 +13,7 @@ VERSION="${2:-}"
1313
usage() {
1414
echo "Usage:"
1515
echo " scripts/release.sh prepare <version> # create release branch and open PR"
16-
echo " scripts/release.sh finish # tag and publish from main"
16+
echo " scripts/release.sh finish # push v<version> tag from main (no main push)"
1717
exit 1
1818
}
1919

@@ -24,6 +24,16 @@ require_clean_tree() {
2424
fi
2525
}
2626

27+
read_crate_version() {
28+
local ver
29+
ver="$(grep -E '^version = ' Cargo.toml | head -1 | sed -E 's/^version = "([^"]+)".*/\1/')"
30+
if [ -z "$ver" ]; then
31+
echo "error: could not read version from Cargo.toml" >&2
32+
exit 1
33+
fi
34+
printf '%s' "$ver"
35+
}
36+
2737
case "$COMMAND" in
2838
prepare)
2939
if [ -z "$VERSION" ]; then
@@ -35,17 +45,21 @@ case "$COMMAND" in
3545

3646
require_clean_tree
3747

38-
# step 0: create release branch
3948
echo "→ Creating branch $BRANCH"
4049
git checkout -b "$BRANCH"
4150

42-
# step 2: bump versions, commit, push branch
4351
echo ""
4452
echo "→ Running cargo release (no publish, no tag)..."
45-
# git-cliff (pre-release hook) is often installed via cargo install
4653
export PATH="${HOME}/.cargo/bin:${PATH}"
4754
cargo release --no-publish --no-tag --no-confirm --allow-branch="$BRANCH" --execute "$VERSION"
4855

56+
if [ -f scripts/validate-changelog.py ]; then
57+
echo ""
58+
echo "→ Validating CHANGELOG.md against origin/main..."
59+
git fetch origin main 2>/dev/null || true
60+
python3 scripts/validate-changelog.py origin/main
61+
fi
62+
4963
echo ""
5064
echo "→ Opening pull request..."
5165
PR_URL=$(gh pr create \
@@ -77,15 +91,33 @@ case "$COMMAND" in
7791
fi
7892

7993
echo "→ Pulling latest main..."
80-
git pull
94+
git pull origin main
95+
96+
VERSION="$(read_crate_version)"
97+
TAG="v${VERSION}"
8198

8299
echo ""
83-
echo "→ Running cargo release (tagging release)..."
84-
export PATH="${HOME}/.cargo/bin:${PATH}"
85-
cargo release --no-confirm --execute
100+
echo "→ Release version from Cargo.toml: $VERSION (tag $TAG)"
101+
102+
if git rev-parse "$TAG" >/dev/null 2>&1; then
103+
echo "error: tag $TAG already exists locally. Delete it or pick a new version." >&2
104+
exit 1
105+
fi
106+
107+
if git ls-remote --exit-code --tags origin "refs/tags/${TAG}" >/dev/null 2>&1; then
108+
echo "error: tag $TAG already exists on origin." >&2
109+
exit 1
110+
fi
111+
112+
echo "→ Creating annotated tag $TAG (no commit to main)..."
113+
git tag -a "$TAG" -m "Release hotdata-cli version $VERSION"
114+
115+
echo "→ Pushing tag to origin..."
116+
git push origin "$TAG"
86117

87118
echo ""
88-
echo "✓ Release complete. Tag pushed and dist workflow triggered."
119+
echo "✓ Tag $TAG pushed. Dist/release workflow should run on GitHub."
120+
echo " (main was not pushed — version bump must already be merged via release PR.)"
89121
;;
90122

91123
*)

skills/hotdata-analytics/SKILL.md

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
---
2+
name: hotdata-analytics
3+
description: Use this skill when the user wants OLAP-style SQL analytics in Hotdata — aggregations, GROUP BY, JOINs, reporting, exploratory queries, query run history, stored results, or materialized follow-up tables (Chain via datasets or managed databases). Activate for "analyze", "aggregate", "rollup", "pivot", "report", "metrics", "GROUP BY", "query history", "past queries", "query runs", "stored results", "materialize", "chain", "intermediate table", or sorted indexes for filters/range scans. Do not load for BM25/vector search or geospatial SQL — use hotdata-search or hotdata-geospatial. Requires the core hotdata skill for connections, tables, datasets, and auth.
4+
version: 0.2.3
5+
---
6+
7+
# Hotdata Analytics Skill
8+
9+
**OLAP-style analytics** in Hotdata: PostgreSQL-dialect SQL, query execution, run history, stored results, **Chain** materializations, and **sorted** indexes for filters and joins.
10+
11+
**Prerequisites:** Authentication, workspace selection, and catalog discovery via the **`hotdata`** skill (`connections`, `tables`, `datasets`, `databases`).
12+
13+
**Related skills:** **`hotdata-search`** (BM25, vector, retrieval indexes), **`hotdata-geospatial`** (spatial SQL).
14+
15+
---
16+
17+
## Execute SQL
18+
19+
```bash
20+
hotdata query "<sql>" [--workspace-id <workspace_id>] [--connection <connection_id>] [--output table|json|csv]
21+
hotdata query status <query_run_id> [--output table|json|csv]
22+
```
23+
24+
- **PostgreSQL dialect.** Quote mixed-case identifiers: `"CustomerName"`.
25+
- Use **`hotdata tables list`** for schema discovery — not `information_schema` via `query`.
26+
- Fully qualified names: `<connection>.<schema>.<table>`, `datasets.<schema>.<table>`, `<database>.<schema>.<table>`.
27+
- Long-running queries may return `query_run_id` → poll with **`query status`** (exit `2` = still running). Do not re-run identical heavy SQL while polling.
28+
- For **workspace-wide** joins and naming, load **context:DATAMODEL** when listed (`hotdata context list`, then `show DATAMODEL`) — see **`hotdata`** skill.
29+
30+
### OLAP patterns
31+
32+
Typical analytics SQL (all via `hotdata query`):
33+
34+
- **Aggregations:** `COUNT`, `SUM`, `AVG`, `MIN`, `MAX` with `GROUP BY`
35+
- **Joins:** `INNER` / `LEFT JOIN` across `<connection>.<schema>.<table>` names
36+
- **Filtering:** `WHERE` on partition-friendly columns (consider **sorted** indexes below)
37+
- **Ordering:** `ORDER BY` on metrics or dimensions
38+
- **Bounded exploration:** always `LIMIT` while iterating; widen once validated
39+
40+
Column names from CSV uploads may be case-sensitive — use double quotes when not all-lowercase.
41+
42+
---
43+
44+
## Query run history
45+
46+
Uses the **active workspace only** (no `--workspace-id`; set with `hotdata workspaces set`).
47+
48+
```bash
49+
hotdata queries list [--limit <int>] [--cursor <token>] [--status <csv>] [--output table|json|yaml]
50+
hotdata queries <query_run_id> [--output table|json|yaml]
51+
```
52+
53+
- `list` — status, duration, row count, SQL preview (default limit 20). Filter: `--status running,failed`.
54+
- `<query_run_id>` — full metadata, formatted SQL, `result_id` when present.
55+
- Use history to find recurring `WHERE` / `JOIN` / `GROUP BY` patterns before adding indexes (search skill) or chains.
56+
57+
---
58+
59+
## Stored results
60+
61+
```bash
62+
hotdata results list [--workspace-id <workspace_id>] [--limit <int>] [--offset <int>] [--output table|json|yaml]
63+
hotdata results <result_id> [--workspace-id <workspace_id>] [--output table|json|csv]
64+
```
65+
66+
- Prefer **`results <id>`** over re-running identical heavy queries.
67+
- Query footers may include `[result-id: rslt...]`; also available from `queries <query_run_id>`.
68+
69+
---
70+
71+
## Chain (materialized follow-ups)
72+
73+
**Pattern:** run SQL → materialize a smaller table → query the materialized name.
74+
75+
1. **Base query**
76+
77+
```bash
78+
hotdata query "SELECT ..."
79+
hotdata query status <query_run_id> # if async
80+
```
81+
82+
2. **Materialize** (pick one)
83+
84+
```bash
85+
hotdata datasets create --label "chain slice" --sql "SELECT ..." [--table-name chain_slice]
86+
hotdata datasets create --label "from saved" --query-id <query_id> [--table-name ...]
87+
```
88+
89+
Or managed parquet:
90+
91+
```bash
92+
hotdata databases create --name analytics --table slice
93+
hotdata databases tables load analytics slice --file ./slice.parquet
94+
```
95+
96+
3. **Chain query** — use printed **`full_name`** or `datasets list` **FULL NAME** column:
97+
98+
```bash
99+
hotdata query "SELECT * FROM datasets.main.chain_slice WHERE ..."
100+
hotdata query "SELECT * FROM analytics.public.slice WHERE ..."
101+
```
102+
103+
Document stable chains in **context:DATAMODEL → Derived tables (Chain)**.
104+
105+
Full procedure: [references/WORKFLOWS.md](references/WORKFLOWS.md).
106+
107+
---
108+
109+
## Sorted indexes (filters and range scans)
110+
111+
For equality, range, and sort-heavy OLAP — not full-text or vector (see **`hotdata-search`**):
112+
113+
```bash
114+
hotdata indexes create --connection-id <id> --schema <schema> --table <table> \
115+
--name idx_orders_created --columns created_at --type sorted [--async]
116+
```
117+
118+
List and delete use the same `hotdata indexes` commands as in the search skill; only **`--type sorted`** is the analytics focus here.
119+
120+
---
121+
122+
## Sandboxes and chains
123+
124+
Sandbox datasets use **`datasets.<sandbox_id>.<table>`**, not `datasets.main`. Run queries with an active sandbox set in config, or inside `hotdata sandbox <id> run hotdata query "..."`. See **`hotdata`** skill **Sandboxes**.
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
# Analytics workflows
2+
3+
OLAP-style SQL, **History** (query runs and stored results), and **Chain** (materialized follow-ups). Requires **`hotdata`** for auth, workspaces, and catalog commands.
4+
5+
**Related:** **`hotdata-search`** for BM25/vector indexes and `hotdata search`; **`hotdata`** [WORKFLOWS.md](../../hotdata/references/WORKFLOWS.md) for datasets vs managed databases.
6+
7+
---
8+
9+
## History
10+
11+
**Goal:** Find prior work: query runs (execution history) and stored result rows.
12+
13+
### Query runs
14+
15+
Uses the **active workspace only** — no `--workspace-id` on `queries`. Set default workspace with `hotdata workspaces set` first.
16+
17+
```bash
18+
hotdata queries list [--limit N] [--cursor <token>] [--status <csv>]
19+
hotdata queries <query_run_id>
20+
```
21+
22+
- `list` — status, creation time, duration, row count, truncated SQL preview (default limit 20).
23+
- `--status` — filter comma-separated values, e.g. `--status running,failed`.
24+
- `<query_run_id>` — full metadata (timings, `result_id`, snapshot, hashes) and formatted SQL.
25+
- If a run has a `result_id`, fetch rows with `hotdata results <result_id>` below.
26+
27+
Use history to spot recurring `WHERE`, `JOIN`, `GROUP BY`, or search-style SQL before adding indexes (**`hotdata-search`**) or new Chain tables.
28+
29+
### Stored results
30+
31+
```bash
32+
hotdata results list [--workspace-id <workspace_id>] [--limit N] [--offset N]
33+
hotdata results <result_id> [--workspace-id <workspace_id>] [--output table|json|csv]
34+
```
35+
36+
- Query footers may include `[result-id: rslt...]` — record it for later.
37+
- Pick up `result_id` from `queries <query_run_id>` when present.
38+
- **Prefer `hotdata results <result_id>` over re-running identical heavy SQL.** Re-runs waste resources and may return different data.
39+
40+
Results are paginated; the CLI hints the next `--offset` when more rows exist.
41+
42+
---
43+
44+
## Chain
45+
46+
**Goal:** Follow-up analysis on a **bounded** intermediate without rescanning huge base tables.
47+
48+
**Pattern:** run SQL → materialize → query the materialized **qualified name**.
49+
50+
### 1. Base query
51+
52+
```bash
53+
hotdata query "SELECT ..."
54+
```
55+
56+
- Quote mixed-case columns with double quotes (PostgreSQL dialect).
57+
- If the CLI returns a `query_run_id`, poll instead of re-running:
58+
59+
```bash
60+
hotdata query status <query_run_id>
61+
```
62+
63+
Exit codes: `0` succeeded, `1` failed, `2` still running.
64+
65+
### 2. Materialize
66+
67+
Land a smaller table — pick one:
68+
69+
**Datasets** (CSV/JSON/URL/SQL snapshot → `datasets.<schema>.<table>`):
70+
71+
```bash
72+
hotdata datasets create --label "chain revenue slice" --sql "SELECT ..." [--table-name chain_revenue_slice]
73+
hotdata datasets create --label "from saved" --query-id <query_id> [--table-name ...]
74+
```
75+
76+
**Managed database** (parquet → `<database>.<schema>.<table>`):
77+
78+
```bash
79+
hotdata databases create --name chain_db --table revenue_slice
80+
hotdata databases tables load chain_db revenue_slice --file ./revenue_slice.parquet
81+
```
82+
83+
Note the printed **`full_name`** (e.g. `datasets.main.chain_revenue_slice` or `chain_db.public.revenue_slice`). For datasets, **`FULL NAME`** from `datasets list` is authoritative.
84+
85+
### 3. Chain query
86+
87+
Query using that name — do not hardcode `datasets.main` if the schema segment is a sandbox id:
88+
89+
```bash
90+
hotdata datasets list
91+
hotdata query "SELECT * FROM datasets.main.chain_revenue_slice WHERE ..."
92+
# Sandbox example (use actual full_name from create or list):
93+
# hotdata query "SELECT * FROM datasets.s_ufmblmvq.chain_revenue_slice WHERE ..."
94+
# Managed database:
95+
# hotdata query "SELECT * FROM chain_db.public.revenue_slice WHERE ..."
96+
```
97+
98+
### Sandbox context
99+
100+
For **sandbox-scoped** chain tables:
101+
102+
- Qualified name is **`datasets.<sandbox_id>.<table>`**, not `datasets.main`.
103+
- Run queries with **active sandbox** in config (`hotdata sandbox set`) **or** inside **`hotdata sandbox <sandbox_id> run hotdata query "…"`**.
104+
- Without sandbox context, you may get **access denied** on sandbox-only tables.
105+
106+
### Naming and documentation
107+
108+
- Prefer predictable `--table-name` values: `chain_<topic>_<YYYYMMDD>`.
109+
- Record long-lived chains in **context:DATAMODEL → Derived tables (Chain)** with the **full** SQL name you use (`datasets.…` or `database.schema.table`).
110+
- Promote join/grain findings to **context:DATAMODEL** when they should outlive the sandbox (**`hotdata`** skill).
111+
112+
### Guardrails
113+
114+
- Materialize when the base scan is large and the follow-up runs many times.
115+
- Keep Chain tables focused; avoid wide `SELECT *` materializations when a narrow projection suffices.
116+
- For upload format choice (datasets vs databases), see **`hotdata`** WORKFLOWS — [Datasets vs managed databases](../../hotdata/references/WORKFLOWS.md#datasets-vs-managed-databases).

skills/hotdata-geospatial/SKILL.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ version: 0.2.3
88

99
Use this skill when working with geospatial data in Hotdata. Hotdata supports a subset of PostGIS-style functions using **PostgreSQL dialect SQL**. This reference is dataset-agnostic — apply it to any table with geometry columns.
1010

11+
**Related skills:** **`hotdata`** (core CLI), **`hotdata-search`** (BM25/vector), **`hotdata-analytics`** (OLAP SQL).
12+
1113
---
1214

1315
## Geometry Columns

0 commit comments

Comments
 (0)