diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index a71e77a..2f7c6eb 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -11,7 +11,7 @@
       "name": "docent",
       "source": "./plugins/docent",
       "description": "Docent AI analysis tools for Claude Code",
-      "version": "0.1.8",
+      "version": "0.1.9",
       "author": {
         "name": "TransluceAI"
       },
diff --git a/.github/workflows/plugin-sanity.yml b/.github/workflows/plugin-sanity.yml
new file mode 100644
index 0000000..156a377
--- /dev/null
+++ b/.github/workflows/plugin-sanity.yml
@@ -0,0 +1,130 @@
+name: Plugin sanity
+
+on:
+  push:
+  pull_request:
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  sanity:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Validate Claude Code plugin package
+        run: |
+          python - <<'PY'
+          """Validate the Docent Claude Code plugin package.
+
+          Checks that the marketplace entry, plugin manifest, required skill
+          files and MCP server config agree, and that no local credential or
+          config files are shipped. Exits non-zero on the first failure.
+          """
+          import json
+          import re
+          from pathlib import Path
+          from typing import NoReturn
+
+          # Single source of truth for the pinned MCP package requirement.
+          REQUIRED_PACKAGE = "docent-python>=0.1.73"
+
+          root = Path.cwd()
+
+          def fail(message: str) -> NoReturn:
+              """Abort the check; SystemExit(str) prints the message and exits 1."""
+              raise SystemExit(message)
+
+          def load_json(path: Path) -> dict:
+              """Return the JSON object stored at path, failing on anything else."""
+              try:
+                  data = json.loads(path.read_text(encoding="utf-8"))
+              except (OSError, ValueError) as exc:
+                  # ValueError covers JSONDecodeError and UnicodeDecodeError.
+                  fail(f"{path} is not valid JSON: {exc}")
+              if not isinstance(data, dict):
+                  fail(f"{path} must contain a JSON object")
+              return data
+
+          marketplace = load_json(root / ".claude-plugin" / "marketplace.json")
+          entries = marketplace.get("plugins")
+          if not isinstance(entries, list):
+              fail("marketplace plugins must be a list")
+
+          # Ignore non-object entries rather than crashing on entry.get().
+          docent_entries = [
+              entry
+              for entry in entries
+              if isinstance(entry, dict) and entry.get("name") == "docent"
+          ]
+          if len(docent_entries) != 1:
+              fail("marketplace must contain exactly one docent plugin entry")
+
+          entry = docent_entries[0]
+          source = entry.get("source")
+          # A missing/empty source would resolve to the repo root and pass is_dir().
+          if not isinstance(source, str) or not source.strip():
+              fail("marketplace docent entry must define a non-empty source")
+          plugin_dir = root / source
+          if not plugin_dir.is_dir():
+              fail(f"marketplace source does not exist: {plugin_dir}")
+
+          manifest = load_json(plugin_dir / ".claude-plugin" / "plugin.json")
+          if manifest.get("name") != "docent":
+              fail("plugin manifest name must be docent")
+
+          version = manifest.get("version")
+          if not isinstance(version, str) or not re.fullmatch(r"\d+\.\d+\.\d+", version):
+              fail("plugin manifest version must be plain major.minor.patch")
+          if entry.get("version") != version:
+              fail("marketplace docent version must match plugin manifest version")
+
+          required_files = [
+              ".claude-plugin/plugin.json",
+              ".mcp.json",
+              "skills/docent/SKILL.md",
+              "skills/docent/analysis.md",
+              "skills/docent/dql-reference.md",
+              "skills/docent/ingestion-reference.md",
+              "skills/docent/ingestion.md",
+              "skills/docent/readings-reference.md",
+              "skills/docent/report.md",
+          ]
+          for rel_path in required_files:
+              path = plugin_dir / rel_path
+              if not path.is_file():
+                  fail(f"required plugin file is missing: {rel_path}")
+              if path.suffix == ".md" and not path.read_text(encoding="utf-8").strip():
+                  fail(f"markdown file is empty: {rel_path}")
+
+          mcp = load_json(plugin_dir / ".mcp.json")
+          servers = mcp.get("mcpServers")
+          server = servers.get("docent") if isinstance(servers, dict) else None
+          if not isinstance(server, dict):
+              fail(".mcp.json must define mcpServers.docent")
+          if server.get("type") != "stdio" or server.get("command") != "uv":
+              fail("docent MCP server must run as uv stdio")
+          args = server.get("args")
+          if not isinstance(args, list) or "--from" not in args:
+              fail("docent MCP server args must include --from")
+          from_index = args.index("--from")
+          # "--from" as the final argument has no package spec after it.
+          if from_index + 1 >= len(args):
+              fail("docent MCP server args must name a package after --from")
+          package = args[from_index + 1]
+          if package != REQUIRED_PACKAGE:
+              fail(f"docent MCP server must require {REQUIRED_PACKAGE}")
+
+          forbidden_names = {".mcp.local.json", "docent.env"}
+          for path in plugin_dir.rglob("*"):
+              if path.name in forbidden_names or path.name.startswith("docent.env."):
+                  fail(f"local credential/config file must not be published: {path}")
+
+          print("Claude Code plugin sanity checks passed")
+          PY
diff --git a/plugins/docent/.claude-plugin/plugin.json b/plugins/docent/.claude-plugin/plugin.json
index 1410e14..9b26a96 100644
--- a/plugins/docent/.claude-plugin/plugin.json
+++ b/plugins/docent/.claude-plugin/plugin.json
@@ -1,5 +1,5 @@
 {
   "name": "docent",
-  "version": "0.1.8",
+
"version": "0.1.9", "description": "Docent AI analysis tools" } diff --git a/plugins/docent/.mcp.json b/plugins/docent/.mcp.json index c239498..a3f959c 100644 --- a/plugins/docent/.mcp.json +++ b/plugins/docent/.mcp.json @@ -3,7 +3,7 @@ "docent": { "type": "stdio", "command": "uv", - "args": ["tool", "run", "--from", "docent-python", "docent-mcp"] + "args": ["tool", "run", "--from", "docent-python>=0.1.73", "docent-mcp"] } } } diff --git a/plugins/docent/skills/docent/SKILL.md b/plugins/docent/skills/docent/SKILL.md index 3bfd11f..8134e2d 100644 --- a/plugins/docent/skills/docent/SKILL.md +++ b/plugins/docent/skills/docent/SKILL.md @@ -1,25 +1,21 @@ --- name: docent -description: Unified skill for the Docent AI platform. Includes instructions on how to analyze, report on, and ingest AI agent runs, as well as API references. +description: Docent is a platform for analyzing AI agent behavior. Always load this skill before interacting with the Docent platform. alwaysApply: true --- # Docent -This is the root skill for all Docent work. Use it whenever the user wants to analyze runs, ingest data, create or update reports, or look up how the Docent SDK works. +This is the root skill for all Docent work. This file is just a table of contents. In most cases you should read one of the guides below before starting to work with docent. Choose the guide that best matches your task. -## Choose the right guide - -- For analyzing or answering questions about agent runs, exploring collections: `./analysis.md` +- For exploring a collection of agent runs, analyzing data, answering questions about agent behavior: `./analysis.md` - For ingestion workflows that convert local logs or eval traces into Docent data: `./ingestion.md` - If the user is asking to manipulate data in the platform through code or the command line, see the SDK reference. 
-## API references +## Other available documentation - For the Readings API (`client.read`, `client.query`, batching, prompts, clustering): `./readings-reference.md` - For DQL syntax, schemas, quirks, and example queries: `./dql-reference.md` - For the reports API: `./report.md` (only if the user explicitly asks for a report) -- For ingestion-side data-model and conversion examples: the reference and pattern sections in `./ingestion.md` +- For ingestion-side data-model and conversion examples: `./ingestion-reference.md` - SDK reference is available by visiting [our online documentation](https://docs.transluce.org/llms.txt) - -Open only the sibling docs that match the user's task; do not load everything by default. diff --git a/plugins/docent/skills/docent/analysis.md b/plugins/docent/skills/docent/analysis.md index f774f69..5f55aa2 100644 --- a/plugins/docent/skills/docent/analysis.md +++ b/plugins/docent/skills/docent/analysis.md @@ -8,7 +8,7 @@ alwaysApply: true **The goal of a Docent analysis is to give the user justifiable trust in the results.** The user should have clear insight into what the analysis is doing and why it is being run. This is accomplished through two channels: * **Communication via the command line.** Explain what you found, what you plan to do, and why — before writing code. Surface blockers and intermediate findings in plain language. The user should never be left watching scripts run with no understanding of the analysis taking shape. -* **Readings in the Docent UI.** Readings make the analysis legible: the user can see every prompt sent to the LLM, every transcript analyzed, and every result returned — with citations back to the source material. Prefer readings over opaque DQL aggregations precisely because readings give the user a clear, inspectable visualization of the qualitative analysis performed. DQL summaries (counts, averages) are useful for orientation, but they are not self-explanatory the way a reading with cited evidence is. 
+* **Analysis plans in the Docent UI.** Analysis plans make the analysis legible: the user can see every prompt sent to the LLM, every transcript analyzed, and every result returned — with citations back to the source material. (Note: you may see references to "reading plans", which is an outdated term for analysis plans. They're the same thing.) You can interact with Docent by writing Python scripts that use the Docent SDK, and by calling Docent MCP tools. If Docent MCP tools are not available, alert the user that the Docent MCP server is not installed correctly. @@ -26,10 +26,8 @@ These apply throughout the entire analysis session: Good — explains the analytical choice so the user can redirect: > "Safety-monitoring is the broadest single safety indicator and it's scored for every run, so I'll use that as the primary ranking. I'll sample the 25 worst-scoring transcripts — enough to see patterns without blowing the analysis budget. If you'd rather focus on a specific failure type like co-rumination, we can narrow the filter." -* **Minimize wasted user attention.** Every tool call the user has to approve is a cost — and the approval screen shows the full code block, which can fill the user's entire screen and destroy context. Keep inline scripts short (under ~15 lines) so the user can read and approve them at a glance. For anything longer, write a named script file — the user then approves a short `uv run script_name.py` command instead of scrolling through 60 lines of inline Python. Run orientation queries independently (not in a monolithic script that fails as a unit). Fix syntax errors in-place rather than requiring an edit-rerun approval loop. -* **Speak in analysis terms, not platform terms.** The user is here to understand their data, not to learn Docent internals. Never use platform jargon in user-facing text. 
Translate to plain language: - - "reading" / "reading plan" → "analysis" - - "reading preset" → "saved analysis template" (or just omit — the user rarely needs to know) +* **Minimize wasted user attention.** Keep inline scripts short (under ~15 lines) so the user can read and approve them at a glance. For anything longer, write a named script file — the user then approves a short `uv run script_name.py` command instead of scrolling through 60 lines of inline Python. Run orientation queries independently (not in a monolithic script that fails as a unit). Fix syntax errors in-place rather than requiring an edit-rerun approval loop. +* **Avoid unnecessary docent-internal jargon.** The user is here to understand their data, not to learn Docent internals. - "flush" → never mention to the user - "DQL" / "DQL query" → "query," or just describe what you're checking - "template reading" / "scripted reading" → never mention; these are implementation details @@ -44,8 +42,6 @@ These apply throughout the entire analysis session: Good — narrates the investigation: > "Let me check how the models compare on safety scores — I'll look at the averages and the distribution of failures." - - Platform terminology belongs in code and code comments, not in conversation with the user. When in doubt, ask: would an analyst unfamiliar with Docent understand this sentence? * **Surface blocking errors to the user immediately.** If a script fails on permissions, unexpected data, or a problem you can't fix in one retry, tell the user what happened and why before attempting a fix. Don't silently retry multiple times — the user loses trust when they can't see what's going on. * **Don't raise concerns and then drop them.** If you notice a potential data integrity issue (e.g., "these score columns might be keyed by judge model, not subject model"), resolve it before proceeding — run a quick verification query, check the metadata, or ask the user. 
Raising a concern, saying "let me verify," and then continuing without verifying is worse than not noticing: the user now has false confidence that the issue was checked. @@ -68,11 +64,11 @@ client = Docent.from_url("https://docent.transluce.org/dashboard/668354d8-...") ``` This parses the domain and collection ID from the URL automatically. -The Docent SDK can be configured by a docent.env file in the working directory. The SDK will automatically discover and load a docent.env file if it exists. You do not need to explicitly source docent.env. Config files may use INI-style `[section]` headers for multi-profile support; select a profile with `Docent(profile="my-profile")` or the `DOCENT_PROFILE` environment variable. +The Docent SDK can be configured by a `docent.env` file. The SDK searches from the current working directory upward through parent directories, then falls back to `~/.docent/docent.env` if no local file exists. You do not need to explicitly source `docent.env`. Config files may use INI-style `[section]` headers for multi-profile support; select a profile with `Docent(profile="my-profile")` or the `DOCENT_PROFILE` environment variable. If you're not sure what collection the user is talking about: * If the user provides a Docent dashboard URL (e.g., `https://docent.transluce.org/dashboard/668354d8-...`), use `Docent.from_url()` or extract the collection ID from the last path segment (the UUID). -* Otherwise, check the `docent.env` file in the working directory for `DOCENT_COLLECTION_ID`. +* Otherwise, check the SDK-discovered `docent.env` file for `DOCENT_COLLECTION_ID`. * If neither is available, ask the user to paste the collection UUID. The main Docent deployment lives at https://docent.transluce.org but the user may connect a different deployment by overriding DOCENT_FRONTEND_URL in docent.env. The Docent SDK will print out the frontend URL when it is initialized, e.g. `Authenticating Docent client with frontend_url='https://docent.transluce.org'`. 
If you see a different frontend URL, use that URL in place of `https://docent.transluce.org` for any links. @@ -84,9 +80,10 @@ If you run into any issues or unexpected behavior with the Docent platform, paus * If authentication fails (HTTP 401) or no API key is configured, walk the user through setup: 1. Open the API keys page for them: `open https://docent.transluce.org/settings/api-keys` (macOS) or `xdg-open https://docent.transluce.org/settings/api-keys` (Linux). 2. Ask them to create a new API key (it will start with `dk_`). - 3. Write the key to a `docent.env` file in the working directory: `DOCENT_API_KEY=dk_...` (plus `DOCENT_API_URL` and `DOCENT_FRONTEND_URL` if not using the default instance). + 3. Write the key to a local `docent.env` file or `~/.docent/docent.env`: `DOCENT_API_KEY=dk_...` (plus `DOCENT_API_URL` and `DOCENT_FRONTEND_URL` if not using the default instance). 4. Verify connectivity by constructing a `Docent()` client — the constructor validates the API key automatically. -* If the SDK does not match what's documented in this SKILL.md, check whether the SDK is up to date. +* If the SDK does not match what's documented here, check whether the SDK is up to date. +* If the Docent MCP server is available but doesn't match the tools documented here, check whether the MCP server needs an upgrade (`uv tool upgrade docent`). If an upgrade was needed, ask the user to restart the session or MCP server. * Use the `get_reading_plan_results` MCP tool to inspect the results of an analysis. Call it with just `collection_id` and `plan_name` to see an overview of all steps and their statuses. Call it with an additional `step_name` to see the actual results for a specific step. * **When debugging, try first, ask second.** If the user asks you to debug a failed analysis and gives you a plan name, collection ID, or other identifying info, attempt the lookup immediately with whatever you have. 
A failed tool call is instant, informative, and free — it tells you exactly what went wrong. Asking the user to confirm inputs before trying adds a round-trip that produces nothing the tool call wouldn't have revealed faster. If the lookup fails, *then* ask for corrections with the error context in hand. @@ -99,26 +96,32 @@ When the user asks to see something in the Docent UI, or when you want to point | Collection dashboard | `https://docent.transluce.org/dashboard/{collection_id}` | | Agent run | `https://docent.transluce.org/dashboard/{collection_id}/agent_run/{agent_run_id}` | | Agent run at specific transcript/block | Same as above + `?transcript_idx={N}&block_idx={M}` | -| Reading plan | `https://docent.transluce.org/dashboard/{collection_id}/reading-plan/{reading_plan_id}` | +| Analysis plan | `https://docent.transluce.org/dashboard/{collection_id}/analysis-plan/{reading_plan_id}` | **When to use UI links instead of scripts:** * The user asks to "see" or "browse" something (e.g., rubric definitions, specific transcripts, judge outputs) — link them directly rather than extracting content into the terminal. * You want the user to inspect specific evidence — provide the URL so they can drill in. -* You're presenting analysis findings — include the reading plan URL so the user can verify claims. +* You're presenting analysis findings — include the analysis plan URL so the user can verify claims. **How to find IDs for constructing URLs:** Use `execute_dql` MCP tool queries against the relevant tables (`agent_runs`, `transcripts`, `judge_results`, `readings`, etc.) to look up IDs, then construct the URL. -## Reading transcripts (optional) +## Overview of analysis tools and terminology -You can use the get_agent_run_messages MCP tool to read the content of an individual agent run or transcript as needed. Use this sparingly; prefer readings for systematic analysis of agent behavior. 
However, you may decide to use get_agent_run_messages: -* To understand what a collection contains, if metadata doesn't make it clear -* To understand what a behavior of interest might concretely look like, when crafting a reading prompt to detect the behavior +DQL is a read-only subset of SQL that you can use to query agent runs in the docent database. DQL is useful for quantitative analysis of agent run metadata (e.g. which model gets the highest average score). DQL should never be used to inspect transcript content. Read ./dql-reference.md before using DQL. ---- +A reading is a structured batch of LLM calls. Readings are useful for qualitative analysis of agent run content (e.g. what mistakes is the agent making, how is it interacting with the user). Use readings instead of inspecting transcript content directly. See details in ./readings-reference.md. + +An analysis script is a Python script you write using the Docent SDK. An analysis script can perform DQL queries (client.query) and readings (client.read). + +When you run an analysis script, an analysis plan is displayed in the Docent UI. Each query and reading in the script is displayed as a separate card in the analysis plan. Readings require approval from the user before they are run. Results for both step types (DQL and reading) are displayed in interactive tables. -# Workflow +Once you have a question where qualitative analysis is clearly required, you can go ahead and create + run an analysis script with readings. If you need the user to clarify or refine the question, do that before writing the script. -This section describes the end-to-end process for a Docent analysis session. Follow it in order. +Note: the Docent UI is the primary place to view reading results. You do not need to fetch them, read them, and restate them to the user. If a summary or synthesis would be helpful, perform that as another reading in the same analysis script so it will show up in the UI.
If a structured aggregation of reading results would be helpful, perform that as another DQL query in the same analysis script. + +# Example workflow + +This section describes the end-to-end process for a Docent analysis session. ## Step 1: Orient and brief the user @@ -130,12 +133,14 @@ If the user provided a dashboard URL, use `Docent.from_url()` in all scripts thr Use the `get_metadata_fields` MCP tool to understand the structure of agent run metadata for the current collection. Agent runs contain metadata that varies by collection — do not make assumptions about its structure. -Also call `list_reading_presets` to check if the collection has any saved analysis templates. These can be reused and are worth knowing about before proposing analysis directions. Do not mention "reading presets" to the user; if relevant presets exist, describe them by what they analyze (e.g., "there's a saved analysis for failure classification that we can reuse"). +Also call `list_reading_presets` to check if the collection has any saved reading presets. These can be reused and are worth knowing about before proposing analysis directions. **Immediately after these calls return, tell the user what you see** in 2-3 sentences: what kind of data is in this collection, what the key dimensions are (e.g., models, tasks, environments), and what scores or metrics are available. This is the user's first orientation to the dataset — don't skip it, and don't jump straight into writing queries. ### 1b. Run orientation queries, reporting as you go +Read `./dql-reference.md` for detailed information on how to write DQL queries. + Explore the data with a small number of targeted queries — 2-3 is usually enough; don't write 5+ "just in case." **Always use the `execute_dql` MCP tool, never a Python script or local aggregation.** It runs read-only DQL directly without an approval round-trip. 
If you have a genuine reason to use Python here (e.g., chaining a couple of queries), the aggregations themselves must still go through `client.execute_dql` using `uv run python3 -c "..."`. **Before each query, explain which metrics you chose and why** — not just "let me check scores." The user should understand your analytical reasoning well enough to redirect it before you run the query. Because they don't see the raw query output (only your reported findings), your one-line framing is the only window they have into what you're learning and why it matters. @@ -144,7 +149,8 @@ Report each finding as you get it — one sentence per query is enough. If a que **When presenting numbers, always explain the scale.** Don't show a table of values without telling the user what they mean. Are these averages of binary 0/10 scores (i.e., pass rates)? Continuous scores on a 0-10 scale? Higher-is-better or higher-is-worse? If the metric names are opaque (e.g., "poetic escalation," "beneficent goal-directed tenacity"), give the user a one-line plain-language description of what each one measures. If you don't know exactly how a metric is defined, say so rather than letting the user assume your labels are precise. -**When formatting tables:** +**Formatting ASCII tables:** +You may format findings as ASCII tables where appropriate. Only use ASCII tables for quick, informal updates. For presenting aggregations or slices of final results, use client.query in your analysis script. For any table of individual agent runs or transcripts, use client.query. (The Docent UI makes it convenient to inspect individual transcripts, unlike an ASCII table.) 
* Use plain-language column headers, not internal field names (e.g., "Co-rumination" not "avg_co_rum") * Label the scale once (e.g., "All scores 0-10, higher = safer" or "Pass rate out of 252 runs") * Don't bold arbitrary values without explaining the logic — if you bold the worst values, say "worst in bold" @@ -207,7 +213,7 @@ ORDER BY run_count DESC If the user has not precisely stated what analysis they want you to run, now is a good time to check in. Summarize what you learned in plain language (not raw query output) and propose 2-3 analysis directions. Let the user choose which question they want to focus on. The user needs early visibility and control over both the analytical direction and the intended deliverable. -**Stop and wait for the user to respond.** Do not propose directions and then immediately commit to one. This is the most common violation of this step: +**Stop and wait for the user to respond.** Do not propose directions and then immediately commit to one. Bad — proposes then bulldozes: > "Here are three directions we could take: (1) safety failures, (2) hardest scenarios, (3) empathy vs. safety tradeoff. Assuming you'd pick option 1, let me go ahead and write the analysis..." @@ -288,26 +294,16 @@ Before running your first analysis script against a collection: ### Build incrementally -**Build incrementally, not monolithically.** A phase is one analytical step: summarize, cluster, classify, compare. Each phase becomes a separate run-and-review cycle. Write the first phase of your script (e.g., summarize transcripts + propose clusters), run it, confirm it works and report the intermediate results to the user. Then extend the script with the next phase and run again — earlier steps are cached and won't re-run. See the phased clustering example in `./readings-reference.md` for this pattern. +A phase is one analytical step: summarize, cluster, classify, compare. Each phase becomes a separate run-and-review cycle. 
Write the first phase of your script (e.g., summarize transcripts + propose clusters), run it, confirm it works and report the intermediate results to the user. Then extend the script with the next phase and run again — earlier steps are cached and won't re-run. See the phased clustering example in `./readings-reference.md` for this pattern. Do not write a script covering all phases at once. A monolithic script that fails on line 50 wastes all the work after it and forces a full debug-edit-rerun cycle. Worse, you spend your entire turn budget debugging DQL syntax instead of delivering results. The phased approach means each run is short, each failure is isolated, and the user sees intermediate progress. ### Running and communicating -Analyses appear in a web UI for the user to approve — this is a key control affordance. You are responsible for running analysis scripts when appropriate; the user should not have to do so manually. +Analysis plans appear in a web UI for the user to approve — this is a key control affordance. You are responsible for running analysis scripts when appropriate; the user should not have to do so manually. Prefer to run analysis scripts in the background, so that you can still communicate with the user if the script pauses to wait for approval. **Surface the Docent UI link as soon as the analysis is submitted** — don't wait until results come back. The SDK's `flush()` opens a browser tab, but the user may not notice or may lose it among other tabs. Always tell the user explicitly: "The analysis is running — you can follow along and approve it here: [link]." This is especially important because the link is how the user inspects the evidence behind every finding. -### Hierarchical synthesis for large result sets - -When synthesizing more than ~30 reading results into a single analysis, do NOT put all results into one prompt. Instead: - -1. 
**Batch**: Split results into groups of 15-20 using DQL (e.g., `LIMIT 20 OFFSET 0`, `LIMIT 20 OFFSET 20`, etc.) -2. **Summarize each batch**: Run a synthesis reading per batch that produces a structured intermediate summary -3. **Final synthesis**: Aggregate the batch summaries (which are now ~5-10 items) into a single final reading - -Alternatively, if the per-item readings produce structured output (e.g., categories/enums), use DQL aggregation over `reading_results.output` to produce counts and distributions — this avoids context limits entirely and gives exact numbers. - **Be explicit about partial data.** When `get_reading_plan_results` returns truncated output (e.g., 50 of 132 results visible), state the exact fraction you saw and caveat derived numbers. Prefer using query aggregation over `reading_results.output` to get complete counts rather than parsing truncated tool output. For example, to get the full distribution of a structured output field across all results, query `reading_results` directly: ```sql @@ -322,22 +318,13 @@ GROUP BY category ORDER BY cnt DESC ``` -### Multi-phase analyses and approval round-trips - -Some analyses require mid-script blocking (e.g., the clustering pattern accesses step 2 results to derive enum values for step 3). Be aware that: - -* The script will submit steps 1-2 for approval, then **block** waiting for results. If the user hasn't approved yet, the script may time out or fail. -* The user will need to approve **twice**: once for the initial analysis steps, and again after the script resumes and submits the next phase. -* **Warn the user upfront** about multi-approval flows, but in plain terms: "This analysis has two phases — first I'll summarize each transcript, then once we see the patterns, I'll classify them into categories. You'll need to approve each phase in the Docent UI." - ## Critical workflow rules These are specific rules that follow from the principles above. 
They apply throughout the analysis: -* **Never present opaque Python computation as analysis results.** Orientation queries (Step 1) are for *your* understanding and can use `execute_dql()` and local Python. But once you move past orientation into actual analysis (Step 3), findings must go through Docent's inspectable pipeline — DQL query steps visible in the UI and LLM analyses with citable evidence. If the user's question requires categorization, comparison, or synthesis, use Docent analyses, not a Python script that outputs a table. The user has no way to verify, inspect, or drill into results that come from opaque code. Metadata aggregations via DQL are acceptable as supporting context (e.g., counts, averages), but the analytical conclusions should come from inspectable analyses the user can review in the Docent UI. -* **Recognize when you're about to violate the rule above.** The most common trigger is a user question that *feels* computational — grouping, ranking, statistical comparison, interaction effects. Your instinct will be to write Python that solves the problem directly (clustering algorithms, composite scores, significance tests). When you notice this instinct, go back to the Step 2b translation table and express the work as LLM analyses and DQL aggregations instead. -* **Never fall back to manual synthesis when an analysis step fails.** If a synthesis step fails (e.g., context overflow), fix the analysis design (batch it, sample it, use structured aggregation) and re-submit. Do not absorb the synthesis work into opaque Python scripts or agent-side summarization — this defeats the core value of Docent's inspectable, citable analysis. If you must do agent-side aggregation as a stopgap (e.g., counting structured output fields via a query), explicitly flag to the user that this step is not inspectable in the Docent UI and offer to re-run it properly. 
-* **If the user asks you to "summarize the agent runs", "classify the results", or similar**, they do not necessarily mean that you (the coding agent) should do so directly. In most cases, it is better to use Docent's LLM analysis (readings) for this. +* **Never present opaque Python computation as analysis results.** Orientation queries (Step 1) are for *your* understanding and can use `execute_dql()` and local Python. But once you move past orientation into actual analysis (Step 3), findings must go through Docent's inspectable pipeline — DQL query steps visible in the UI and analysis-plan readings with citable evidence. If the user's question requires categorization, comparison, or synthesis, use Docent analyses, not a Python script that outputs a table. The user has no way to verify, inspect, or drill into results that come from opaque code. Metadata aggregations via DQL are acceptable as supporting context (e.g., counts, averages), but the analytical conclusions should come from inspectable analyses the user can review in the Docent UI. +* **Don't fall back to manual synthesis when an analysis step fails.** If a synthesis step fails (e.g., context overflow), fix the analysis design (batch it, sample it, use structured aggregation) and re-submit. Do not absorb the synthesis work into opaque Python scripts or agent-side summarization — this defeats the core value of Docent's inspectable, citable analysis. If you must do agent-side aggregation as a stopgap (e.g., counting structured output fields via a query), explicitly flag to the user that this step is not inspectable in the Docent UI and offer to re-run it properly. +* If the user asks you to "read the agent runs", "summarize 10 transcripts", "classify the results", or similar, that does not mean that you (the coding agent) should do so directly. Prefer to do this in an analysis plan using readings.
* **Be transparent about reused work.** This has two parts: - **Existing scripts:** If you find an analysis script already on disk from a prior session, don't silently reuse or overwrite it. Tell the user what it does, what analytical choices are embedded in it (thresholds, sample sizes, which dimensions), and ask whether to reuse it or write a fresh one. - **Cached results:** After `flush()` returns, check the output for cache indicators (e.g., "cached (5 results)" in step status). If results came back cached, tell the user immediately: "These results are from a prior session — I'm pulling existing results rather than re-running. Want me to force a fresh analysis?" Do not narrate cached results as if you just computed them. The user needs to know whether they're looking at fresh work or replayed results. diff --git a/plugins/docent/skills/docent/dql-reference.md b/plugins/docent/skills/docent/dql-reference.md index ace2b86..1ce3444 100644 --- a/plugins/docent/skills/docent/dql-reference.md +++ b/plugins/docent/skills/docent/dql-reference.md @@ -9,7 +9,7 @@ Queries can only run over a single collection by design. Choose the right method based on context: * **`execute_dql` MCP tool** — Use for ad-hoc exploration and orientation (Step 1). Runs DQL directly without requiring user approval of inline scripts. Preferred for all exploratory queries. -* **`client.query()`** — Use inside reading plan scripts. Auto-registers the query as a UI-visible step. Use this for DQL that feeds data into readings, or that you want the user to see alongside reading results. Pass `name="..."` to give the step a display name. +* **`client.query()`** — Use inside analysis plan scripts. The query and its results table appear in the Docent UI. Use this for DQL that feeds data into readings, or that you want the user to see alongside reading results. Pass `name="..."` to give the step a display name. 
* **`client.execute_dql()`** — Use inside Python scripts for internal logic (e.g., conditional logic between reading steps, or data that feeds into scripted readings). Results are NOT shown in the Docent UI. ```python @@ -21,7 +21,7 @@ collection_id = "" # (Optional) inspect available tables/columns schema = client.get_dql_schema(collection_id) -# In reading plan scripts: query as a UI-visible step +# In analysis plan scripts: query as a UI-visible step rows = client.query( collection_id, "SELECT agent_runs.id AS agent_run_id FROM agent_runs LIMIT 10", @@ -44,11 +44,9 @@ raw_rows = client.dql_result_to_dicts(result) | `transcripts` | Individual transcripts tied to an agent run; stores serialized messages and per-transcript metadata. | | `transcript_groups` | Hierarchical groupings of transcripts for runs. | | `judge_results` | Scored rubric outputs keyed by agent run and rubric version. | -| `results` | Individual LLM analysis results from result sets. | | `readings` | Reading definitions (template or scripted LLM analysis). | | `reading_results` | Results from running readings. | | `reading_result_links` | Junction table linking readings to their results. | -| `analysis_sessions` | Session containers grouping readings together. | ### `agent_runs` @@ -154,17 +152,6 @@ For scripted readings, `arguments_dict` holds arbitrary user-supplied metadata p | `reading_id` | FK to readings.id. | | `result_id` | FK to reading_results.id. | -### `analysis_sessions` - -| Column | Description | -| --- | --- | -| `id` | Session identifier (UUID). | -| `collection_id` | Collection that owns the session. | -| `name` | Display name (from session_name or source script). | -| `readings_json` | Ordered list of step entries (readings, dql_only, headings). | -| `created_at` | When the session was created. | -| `updated_at` | Last modification time. | - ## JSON Metadata Access Patterns Docent stores user-supplied metadata as JSON. 
Access using Postgres operators: @@ -254,26 +241,6 @@ HAVING COUNT(t.id) > 1 ORDER BY transcript_count DESC; ``` -### Flagged Judge Results - -```sql -SELECT - jr.agent_run_id, - jr.rubric_id, - jr.result_metadata->>'label' AS label, - jr.output->>'score' AS score -FROM judge_results jr -WHERE jr.result_metadata->>'severity' = 'high' - AND EXISTS ( - SELECT 1 - FROM agent_runs ar - WHERE ar.id = jr.agent_run_id - AND ar.metadata_json->>'environment' = 'prod' - ) -ORDER BY CAST(jr.output->>'score' AS DOUBLE PRECISION) DESC -LIMIT 25; -``` - ### Completion Rate by Environment (CTE pattern) ```sql @@ -297,32 +264,6 @@ GROUP BY environment ORDER BY total_runs DESC; ``` -### Latest Rubric Scores by Model - -```sql -WITH latest_scores AS ( - SELECT - agent_run_id, - MAX(rubric_version) AS rubric_version - FROM judge_results - WHERE rubric_id = 'helpful_response_v1' - GROUP BY agent_run_id -) -SELECT - ar.id, - ar.metadata_json->'model'->>'name' AS model_name, - jr.output->>'score' AS score, - jr.result_metadata->>'label' AS label -FROM latest_scores ls -JOIN judge_results jr - ON jr.agent_run_id = ls.agent_run_id - AND jr.rubric_version = ls.rubric_version - AND jr.rubric_id = 'helpful_response_v1' -JOIN agent_runs ar ON ar.id = jr.agent_run_id -WHERE ar.metadata_json->>'environment' = 'prod' -ORDER BY CAST(jr.output->>'score' AS DOUBLE PRECISION) DESC -LIMIT 15; -``` ### Reading Results for a Specific Reading @@ -346,7 +287,7 @@ LIMIT 50; - **Single statement**: Batches or multiple statements are rejected. - **Explicit projection**: Wildcard projections (`*`) are disallowed. List the columns you need. - **Collection scoping**: A single query can only access data within a single collection. -- **Limit enforcement**: Every query is capped at 10,000 rows. Use pagination (`OFFSET`/`LIMIT`) for larger result sets. +- **Limit enforcement**: Every query is capped at 10,000 rows. Use pagination (`OFFSET`/`LIMIT`) for larger row collections. 
- **JSON performance**: Heavy JSON traversal across large collections can be slow. Prefer top-level fields when available. - **Type awareness**: Cast values explicitly when precision matters. diff --git a/plugins/docent/skills/docent/ingestion-reference.md b/plugins/docent/skills/docent/ingestion-reference.md new file mode 100644 index 0000000..a07ef6e --- /dev/null +++ b/plugins/docent/skills/docent/ingestion-reference.md @@ -0,0 +1,472 @@ +# Docent Ingestion Reference + +Load this file only when you need concrete code or detailed patterns while following `./ingestion.md`. + +## Source Discovery Helpers + +Use these snippets as starting points. Adapt them to the source layout instead of treating them as required framework code. + +```python +from collections import Counter +from pathlib import Path + + +def build_folder_tree(path: str, max_depth: int = 5) -> dict | None: + path_obj = Path(path) + + def recurse(current: Path, depth: int) -> dict | None: + if depth > max_depth or not current.is_dir(): + return None + + children = {} + file_extensions = Counter() + + for item in sorted(current.iterdir()): + if item.is_dir(): + children[item.name] = recurse(item, depth + 1) + else: + file_extensions[item.suffix.lower() or "no_ext"] += 1 + + return { + "children": children, + "file_counts": dict(file_extensions), + "total_files": sum(file_extensions.values()), + } + + return recurse(path_obj, 0) + + +def find_repeatable_template(tree: dict) -> dict: + def signature(node: dict | None) -> tuple: + if node is None: + return () + child_names = tuple(sorted(node.get("children", {}).keys())) + file_exts = tuple(sorted(node.get("file_counts", {}).keys())) + return (child_names, file_exts) + + signatures = {} + + def collect(node: dict | None, path: str = "") -> None: + if node is None: + return + sig = signature(node) + signatures.setdefault(sig, []).append(path) + for name, child in node.get("children", {}).items(): + collect(child, f"{path}/{name}") + + collect(tree) + 
repeated = [(sig, paths) for sig, paths in signatures.items() if len(paths) > 1 and sig[0]] + if not repeated: + return {"template_structure": None, "note": "No repeating pattern found"} + + repeated.sort(key=lambda item: len(item[1]), reverse=True) + return { + "template_structure": repeated[0][0], + "instance_count": len(repeated[0][1]), + "example_paths": repeated[0][1][:3], + } + + +def detect_inspect_files(path: Path) -> list[str]: + return [str(file) for file in path.rglob("*.eval")] +``` + +```python +from pathlib import Path + + +def sample_files_strategically(path: Path, template_info: dict) -> list[Path]: + samples = [] + + for instance_path in template_info.get("example_paths", [])[:2]: + instance = path / instance_path.lstrip("/") + for subdir in ["trajs", "trajectories", "logs", "results", ""]: + candidate = instance / subdir if subdir else instance + if candidate.exists(): + samples.extend(list(candidate.glob("*.json"))[:1]) + samples.extend(list(candidate.glob("*.jsonl"))[:1]) + if samples: + break + + if not samples: + samples = list(path.rglob("*.json"))[:3] + list(path.rglob("*.jsonl"))[:2] + + return samples[:5] +``` + +```python +def infer_json_schema(data: dict | list, max_depth: int = 5) -> dict: + if max_depth == 0: + return {"type": "any", "note": "truncated"} + + if isinstance(data, dict): + return { + "type": "object", + "fields": { + key: infer_json_schema(value, max_depth - 1) + for key, value in data.items() + }, + } + + if isinstance(data, list): + if not data: + return {"type": "array", "items": "unknown"} + item_schemas = [infer_json_schema(item, max_depth - 1) for item in data[:3]] + return {"type": "array", "items": item_schemas[0], "sample_count": len(data)} + + return {"type": type(data).__name__, "example": repr(data)[:100]} +``` + +## Inspect AI Logs + +When `.eval` files are detected, prefer the built-in loader: + +```python +from inspect_ai.log import read_eval_log +from docent.loaders.load_inspect import load_inspect_log + 
+eval_log = read_eval_log("path/to/file.eval") +agent_runs = load_inspect_log(eval_log) +print(f"Loaded {len(agent_runs)} runs from Inspect log") +``` + +## Transcript Sanity Check Warnings + +`check_agent_runs`, `check_agent_run`, `check_transcript`, and `check_messages` +return warning-level `TranscriptCheck` objects. They do not reject data by +themselves, but ingestion scripts should treat them as conversion errors unless +the warning category is explicitly understood, documented in the ingestion plan, +and accepted by the user. + +All possible warning codes from `docent.data_models.chat.checks`: + +| Code | When it appears | Fix or acceptance guidance | +| --- | --- | --- | +| `empty_message` | A message has no visible text, structured reasoning, assistant tool calls, or tool error. | Drop source noise, or preserve omitted source data in metadata if it is important. | +| `system_message_after_conversation_start` | A system message appears after a user, assistant, or tool turn. | Move setup text into the initial system prompt, or document that the source intentionally changes instructions mid-run. | +| `conversation_starts_with_tool_message` | The first non-system message is a tool response. | Check whether the assistant tool call was omitted or split into another transcript. | +| `consecutive_assistant_messages` | Two assistant messages are adjacent. | Usually merge adjacent assistant text, reasoning blocks, and tool calls into one assistant message unless the split is intentional. | +| `consecutive_user_messages` | Two user messages are adjacent. | Check whether they are separate conversations, or merge them if the source represents one user turn in fragments. | +| `assistant_tool_calls_interrupted` | A non-tool message appears before all previous assistant tool calls receive tool responses. | Place tool responses immediately after the assistant message that requested them, before the next user or assistant turn. 
| +| `missing_tool_response` | An assistant tool call never receives a matching tool response by the end of the transcript. | Add a tool message with the same `tool_call_id`, or document why the source lacks the response. | +| `reasoning_embedded_as_text` | Assistant text contains reasoning markers such as `<think>`, `<thinking>`, `reasoning:`, or `thinking:` but has no structured reasoning content block. | Move reasoning into `{"type": "reasoning", "reasoning": ...}` and keep user-visible answer text in `{"type": "text", "text": ...}`. | +| `tool_call_missing_id` | An assistant tool call has a blank `id`. | Populate a stable id so the corresponding tool message can refer to it via `tool_call_id`. | +| `tool_call_missing_function` | An assistant tool call has an id but a blank function name. | Populate the tool function name if it exists in the source data. | +| `duplicate_tool_call_id_in_assistant_message` | One assistant message contains the same tool call id more than once. | Use unique tool call ids within each assistant turn. | +| `duplicate_tool_call_id` | A tool call id was already emitted by an earlier assistant message in the same transcript. | Preserve source ids only when they are globally unique per transcript; otherwise generate stable unique ids during conversion. | +| `tool_response_missing_id` | A tool message has a blank `tool_call_id`. | Set `tool_call_id` to the id of the assistant tool call that produced the response. | +| `orphan_tool_response` | A tool message references a `tool_call_id` that no previous assistant tool call emitted. | Check whether the assistant tool call was omitted, assigned a different id, or split into another transcript. | +| `duplicate_tool_response` | Multiple tool messages respond to the same `tool_call_id`. | Keep one tool response per tool call unless the source intentionally streams partial tool outputs. 
| +| `tool_response_function_mismatch` | A tool message function name does not match the function name on the referenced assistant tool call. | Use the function name from the assistant tool call, or document a source-specific reason for the mismatch. | + +## Base Ingestion Script Shape + +Use this shape for custom data. Fill in `load_data` and `convert_to_agent_run` based on the confirmed plan. + +```python +import os +from pathlib import Path +from typing import Any + +from docent import Docent +from docent.data_models import AgentRun, Transcript +from docent.data_models.chat import ( + check_agent_runs, + format_check_report, + parse_chat_message, +) + + +DATA_PATH = Path("path/to/data") +COLLECTION_NAME = "collection-name" +DOCENT_API_KEY = os.environ["DOCENT_API_KEY"] + + +def load_data(path: Path) -> list[dict[str, Any]]: + records: list[dict[str, Any]] = [] + # Implement according to the confirmed source structure. + return records + + +def convert_to_agent_run(record: dict[str, Any]) -> AgentRun: + raw_messages = record.get("messages") or record.get("traj") or [] + messages = [parse_chat_message(message) for message in raw_messages] + + transcript = Transcript( + messages=messages, + metadata={}, # transcript-level fields from the mapping + ) + + return AgentRun( + transcripts=[transcript], + metadata={ + # scores, identifiers, grouping fields, and other mapped metadata + }, + ) + + +raw_data = load_data(DATA_PATH) +print(f"Loaded {len(raw_data)} source records") + +sample_errors = [] +for index, record in enumerate(raw_data[:10]): + try: + convert_to_agent_run(record) + except Exception as exc: + sample_errors.append({"index": index, "error": str(exc)}) + +if sample_errors: + raise RuntimeError(f"Sample conversion failed: {sample_errors[:5]}") + +agent_runs = [] +conversion_errors = [] +for index, record in enumerate(raw_data): + try: + agent_runs.append(convert_to_agent_run(record)) + except Exception as exc: + conversion_errors.append({"index": index, 
"error": str(exc)}) + +print(f"Converted {len(agent_runs)}/{len(raw_data)} source records") +if conversion_errors: + raise RuntimeError( + "Full conversion had failures. Fix or explicitly document every skipped " + f"source record before upload. Examples: {conversion_errors[:5]}" + ) + +sanity_report = check_agent_runs(agent_runs) +print(format_check_report(sanity_report)) +if sanity_report.has_warnings: + raise RuntimeError( + "AgentRun sanity checks produced warnings. Fix conversion problems, or " + "document accepted warning categories in ingestion-plan.md and confirm " + "with the user before upload." + ) + +client = Docent(api_key=DOCENT_API_KEY) +collection_id = client.create_collection(name=COLLECTION_NAME, description="") +upload_result = client.add_agent_runs(collection_id, agent_runs) +print(upload_result) +print(f"https://docent.transluce.org/collection/{collection_id}") +``` + +## Message Parsing + +Prefer `parse_chat_message` for dictionaries: + +```python +from docent.data_models.chat import parse_chat_message + +user_msg = parse_chat_message({"role": "user", "content": "What is 2+2?"}) +assistant_msg = parse_chat_message({"role": "assistant", "content": "The answer is 4."}) +system_msg = parse_chat_message({"role": "system", "content": "You are helpful."}) +``` + +Direct construction is also available when you need precise control: + +```python +from docent.data_models.chat import AssistantMessage, SystemMessage, UserMessage + +user_msg = UserMessage(content="Hello") +assistant_msg = AssistantMessage(content="Hi", model="gpt-4") +system_msg = SystemMessage(content="You are helpful.") +``` + +## Reasoning Handling + +Pay attention to reasoning during source analysis and sample conversion. +Deterministic sanity checks catch obvious structural issues such as adjacent +assistant messages and embedded reasoning markers, but they cannot decide +whether a source's reasoning stream was represented correctly. 
+ +- Use `ContentReasoning` for visible reasoning summaries when the source exposes + them, and place those blocks on the same `AssistantMessage` as the answer text + and tool calls they belong to. +- If the source splits reasoning into separate assistant fragments, merge those + fragments into the following assistant message unless the split is semantically + intentional. +- Do not dump opaque or encrypted reasoning into user-visible text. Omit it or + preserve source-level counts/metadata, then document the omission in + `ingestion-plan.md`. +- During the sample conversion pass, inspect reasoning and tool-call turns + manually and record any accepted omissions or source-specific handling. + +```python +from docent.data_models.chat import AssistantMessage, ContentReasoning, ContentText + +assistant_msg = AssistantMessage( + content=[ + ContentReasoning(reasoning="The model's visible reasoning summary."), + ContentText(text="The answer shown to the user."), + ], +) +``` + +## Tool Calls + +Normalize raw tool calls before parsing messages if the source format differs from Docent's expected shape. 
+ +```python +from docent.data_models.chat import AssistantMessage, ToolCall, ToolMessage + +assistant_msg = AssistantMessage( + content="Let me search for that.", + tool_calls=[ + ToolCall( + id="call_123", + function="web_search", + arguments={"query": "weather today"}, + type="function", + ) + ], +) + +tool_msg = ToolMessage( + content="Sunny, 72F", + tool_call_id="call_123", + function="web_search", +) +``` + +```python +from typing import Any + +from docent.data_models.chat import ToolCall + + +def parse_tool_calls(raw_calls: list[dict[str, Any]]) -> list[ToolCall]: + calls = [] + for index, raw_call in enumerate(raw_calls): + function_payload = raw_call.get("function", {}) + calls.append( + ToolCall( + id=raw_call.get("id", f"call_{index}"), + function=function_payload.get("name", raw_call.get("name", "")), + arguments=function_payload.get( + "arguments", + raw_call.get("arguments", {}), + ), + type="function", + ) + ) + return calls +``` + +## Simple Flat Records + +```python +from typing import Any + +from docent.data_models import AgentRun, Transcript +from docent.data_models.chat import parse_chat_message + + +def convert_simple(record: dict[str, Any]) -> AgentRun: + messages = [parse_chat_message(message) for message in record["messages"]] + metadata = {key: value for key, value in record.items() if key != "messages"} + metadata["scores"] = {"reward": record.get("reward", 0)} + + return AgentRun( + transcripts=[Transcript(messages=messages)], + metadata=metadata, + ) +``` + +## Pass@k Evaluation + +Use `TranscriptGroup` for attempts that belong to the same task-level `AgentRun`. 
+ +```python +from typing import Any + +from docent.data_models import AgentRun, Transcript, TranscriptGroup +from docent.data_models.chat import parse_chat_message + + +def convert_pass_at_k(task_data: dict[str, Any]) -> AgentRun: + agent_run = AgentRun( + transcripts=[Transcript(messages=[])], + metadata={"task_id": task_data["task_id"]}, + ) + + groups = [] + transcripts = [] + + for index, attempt in enumerate(task_data["attempts"]): + group = TranscriptGroup( + name=f"Attempt {index + 1}", + agent_run_id=agent_run.id, + metadata={"k": index}, + ) + groups.append(group) + + transcript = Transcript( + messages=[parse_chat_message(message) for message in attempt["messages"]], + transcript_group_id=group.id, + metadata={"attempt": index}, + ) + transcripts.append(transcript) + + agent_run.transcripts = transcripts + agent_run.transcript_groups = groups + return agent_run +``` + +## Tree Or Branching Data + +Usually ingest each branch as its own `AgentRun`. Preserve tree structure in metadata. + +```python +from docent.data_models import AgentRun + +agent_run = AgentRun( + transcripts=[transcript], + metadata={ + "root_task_id": "task_123", + "branch_id": "branch_a_1", + "parent_branch_id": "branch_a", + "branch_depth": 2, + }, +) +``` + +## Multi-Agent Data + +Use one `Transcript` per agent in the same `AgentRun` when the agents share one episode-level outcome. + +```python +from docent.data_models import AgentRun, Transcript + +agent_run = AgentRun( + transcripts=[ + Transcript(messages=agent_1_messages, metadata={"agent_id": "agent_1"}), + Transcript(messages=agent_2_messages, metadata={"agent_id": "agent_2"}), + ], + metadata={ + "episode_id": "episode_42", + "scores": {"joint_reward": 0.85}, + }, +) +``` + +## Verification Snippet + +Prefer an SDK or API count when available. If count keys differ across SDK versions, log the raw collection details and manually verify the collection page. 
+ +```python +collection_info = client.get_collection(collection_id) +print(collection_info) + +uploaded_count = None +if collection_info: + for key in ["agent_run_count", "num_agent_runs", "n_agent_runs", "total_runs"]: + if key in collection_info: + uploaded_count = collection_info[key] + break + +print("VERIFICATION REPORT") +print(f"Source records: {len(raw_data)}") +print(f"Converted: {len(agent_runs)}") +print(f"Failed conversions: {len(conversion_errors)}") +print(f"Uploaded count: {uploaded_count if uploaded_count is not None else 'unknown'}") +print(f"Collection URL: https://docent.transluce.org/collection/{collection_id}") +``` diff --git a/plugins/docent/skills/docent/ingestion.md b/plugins/docent/skills/docent/ingestion.md index 4209e30..d4da82b 100644 --- a/plugins/docent/skills/docent/ingestion.md +++ b/plugins/docent/skills/docent/ingestion.md @@ -3,800 +3,191 @@ name: ingestion description: Structured workflow for ingesting agent run data into Docent. Use when the user wants to upload evaluation logs or agent transcripts to Docent. Triggers on phrases like "ingest into Docent", "upload to Docent", "import runs to Docent", or when working with agent evaluation data that needs to be loaded into Docent for analysis. --- -# **Docent Ingestion Skill** +# Docent Ingestion Skill -This skill provides a structured workflow for converting transcripts and evaluation logs into the correct format for ingestion to Docent, an agent analysis tool. +Use this workflow to convert local transcripts, agent logs, or evaluation traces into Docent `AgentRun` data and upload them to a Docent collection. -## **Overview of Docent** +Keep the main workflow lightweight. Load `./ingestion-reference.md` only when you need concrete SDK examples, conversion snippets, source-inspection helpers, or examples for Inspect AI, tool calls, pass@k, branching, or multi-agent data. -Docent is a trace analysis tool that helps researchers analyze and debug agents. 
Researchers upload a “collection” of traces (“agent runs”) into Docent, where the tool enables them to: +## Core Rules -* Engage in structured data analysis such as grouping and joining to understand trends and create charts -* Quickly view traces of interest and capture human annotations of traces through labeling and comments -* Run a semantic search over transcripts by running a user-provided query of each transcript in their collection, and the cluster the results to understand high-level patterns -* Draft, refine, and iterate with the user on detailed rubrics to capture fuzzy behaviors like sycophancy, cheating, verbosity, etc. +- Work in four stages: context, planning, ingestion, verification. +- Create and maintain `ingestion-plan.md` in the working directory. +- Do not upload until the user confirms the proposed collection name, Docent hierarchy, field mappings, and omitted data. +- Never silently skip source data. Any file or field not ingested must be documented with a reason and expected impact. +- Save ingestion code to a file such as `ingest.py` or `ingest_<dataset>.py`; do not rely on one-off inline Python for the final upload path. +- Use `parse_chat_message` from the Docent SDK for transcript messages, and make deliberate role mappings when the source roles differ from Docent's supported roles. +- Run deterministic `AgentRun` sanity checks before upload and resolve obvious conversion problems. -Docent accelerates researchers by helping them form hypotheses and directing them to read the most relevant transcripts. Researchers use Docent to qualitatively explain and understand shifts in quantitative metrics. Common use cases for Docent include: +## When Triggered -* Comparing between two checkpoints to understand a regression or to understand a quantitative tradeoff in their benchmark results -* Understanding an unexpected result. For instance, investigating why a checkpoint that receives high reward from a preference model (e.g. 
for code quality) appears to perform poorly with real users (e.g. PRs frequently rejected for low quality) -* Surfacing previously unknown failure modes. For instance, noticing that the timeout constraint is not explicit in an evaluation, causing thinking models to perform poorly compared to their non-thinking counterparts +If the user asks to "ingest", "upload", "import", or "move" traces, transcripts, or eval logs into Docent, briefly offer this structured workflow: -## **When to Offer This Workflow** +1. Gather context and credentials. +2. Inspect the data and propose a Docent organization. +3. Write and run an ingestion script. +4. Verify uploaded counts and warnings. -**Trigger conditions**: -The user mentions phrases like "ingest transcripts into Docent", "upload to Docent", "import runs to Docent,” “move data into Docent,” “upload traces to Docent” +If the user accepts or directly asks you to proceed, start Stage 1. If they decline, work freeform. -### **Initial offer:** +## Stage 1: Context -Offer the user a structured workflow for ingesting their transcripts into Docent. Briefly explain the four stages: +Before Python work, use an existing virtual environment if present. If no environment is active and `docent-python` is unavailable, ask before installing it. -1. **Context gathering**: User provides relevant context on their data, including the path to the data, how it was produced, and what kinds of analysis they would like to do -2. **Planning**: Understand how the user’s data is organized, plan an ingestion strategy, and recommend a suggested organization in Docent to the user. Surface the plan for user approval. - 1. Examine the overall data hierarchy by mapping the directory and file structure to understand if there are recurring patterns. - 2. For individual transcripts, identify all unique formats and create a template for ingesting each one. 
Map out a schema of each unique transcript format and map each field to the most appropriate class in Docent. - 3. Propose an organization structure in Docent (broken down into collections, agent runs, transcript groups, and transcripts) that fits the user’s analysis needs. -3. **Ingestion:** Given the suggested organization, plan how, write, and test a script that uploads all data from the user-provided directory to Docent. -4. **Testing**: After uploading to a collection in Docent, use the Docent SDK to pull down the collection data and verify that the metadata, transcript formats, and overall organization match expectations. +Collect only what is needed to plan: -Explain that you will ingest all the data provided in a directory of the user’s choosing. Explain the ask for context on the user’s analysis: while explaining is optional, it helps structure the data in Docent. Ask if they want to try this workflow or proceed freeform. +- API key: prefer `$DOCENT_API_KEY` or an SDK-discovered `docent.env` (current directory upward, then `~/.docent/docent.env`); ask only if neither is available. +- Data path: the file or directory to ingest. +- Optional context: what produced the data and what analysis the user wants to do in Docent. -If the user declines, work freeform. If the user accepts, proceed to Stage 1\. +Create `ingestion-plan.md` with this compact structure and append findings as the workflow proceeds: -## **Stage 1: Context Gathering** - -### **Environment Setup** - -Before running any Python commands, check for and activate a virtual environment: - -```shell -if [ -d "venv" ]; then - source venv/bin/activate -elif [ -d ".venv" ]; then - source .venv/bin/activate -fi -``` - -If there is no virtual environment present, prompt the user if they want to activate one -and proceed accordingly. - -Ensure the Docent SDK is installed. 
The package name is `docent-python`: - -```shell -pip install docent-python -``` - -### **Gathering Information** - -Collect only the essential information needed to start planning: - -- **API Key:** Check if `$DOCENT_API_KEY` is set in the environment or in a `docent.env` file. If not, ask: What is your Docent API key? (You can find or create one at: [https://docent.transluce.org/settings/api-keys](https://docent.transluce.org/settings/api-keys)) -- **Data Path:** What is the path to the files or directory you want to ingest? - -Once you have the data path, proceed to Stage 2 to analyze the data and create an ingestion plan. You will ask the user to confirm all details (including collection name, data context, and analysis goals) after presenting the plan. - -Create `ingestion-plan.md` in the working directory to log all decisions and findings throughout the workflow. Here is an example structure: - -``` +```markdown # Docent Ingestion Plan ## Configuration -- Data path: [from user] +- Data path: +- API key source: -## File Analysis -[to be filled in Stage 2a] +## Source Analysis +- File structure: +- Detected formats: +- Expected source record count: -## Schema -[to be filled in Stage 2b] +## Docent Model Orientation +- Documentation reviewed: +- Important SDK/model assumptions: -## Data Structure Proposal -[to be filled in Stage 2c] +## Proposed Docent Structure +- Collection: +- AgentRun unit: +- TranscriptGroup usage: +- Transcript usage: ## Field Mapping -[to be filled in Stage 2c] +| Source | Docent target | Notes | +| --- | --- | --- | ## Omitted Data -[MUST document any data not ingested and why] +| Field/File | Reason | Impact | +| --- | --- | --- | -## Plan Confirmation -- Collection name: [proposed, confirmed by user] -- Data context: [your understanding, confirmed by user] -- Analysis goals: [from user] +## Confirmation +- Collection name: +- Data context: +- Analysis goals: +- User confirmed: ## Execution Log -[to be filled in Stage 3] ## Verification 
-[to be filled in Stage 4 - compare expected vs actual counts] +- Source records: +- Converted: +- Failed conversions: +- Uploaded: +- Sanity warnings: +- Collection URL: ``` ---- - -## **Stage 2: Planning** - -### **Stage 2a: Understanding File Structure** +## Stage 2: Planning -Build understanding of the data organization to understand holistically how the user is storing their data and why they chose to organize it that way. Consider how this reflects on how they want their data stored in Docent. You can quickly get a sense of the data by using the appropriate strategies below. +### Orient on Docent Models -#### Build Structural Tree +Before designing the ingestion shape, review the ingestion-side SDK models and docs: -You can generate a folder-only tree with the following script, to see the overall directory structure. You may want to strategically list individual files in a few folders to understand them as well. +- Online SDK documentation: https://docs.transluce.org/llms.txt +- Local examples and snippets, as needed: `./ingestion-reference.md` -```py -import os -from pathlib import Path -from collections import Counter +At minimum, understand: -def build_folder_tree(path: str, max_depth: int = 5) -> dict: - """Build a tree of folder structure, detecting patterns.""" - path = Path(path) +- `Collection`, `AgentRun`, `TranscriptGroup`, and `Transcript` +- Message classes, `parse_chat_message`, supported roles, tool calls, and tool responses +- How the source represents reasoning, such as visible reasoning text, structured + summaries, opaque blobs, or split assistant fragments +- Where structured values belong: usually `AgentRun.metadata`, `Transcript.metadata`, scores, identifiers, and grouping fields +- ID behavior: the SDK assigns `AgentRun` IDs automatically - def _recurse(p: Path, depth: int) -> dict: - if depth > max_depth or not p.is_dir(): - return None +### Analyze Source Data - children = {} - file_extensions = Counter() +Inspect the data path enough 
to identify the repeatable unit that should become an `AgentRun`. - for item in sorted(p.iterdir()): - if item.is_dir(): - children[item.name] = _recurse(item, depth + 1) - else: - file_extensions[item.suffix.lower() or "no_ext"] += 1 +Look for: - return { - "children": children, - "file_counts": dict(file_extensions), - "total_files": sum(file_extensions.values()), - } +- Directory organization: experiment, model, checkpoint, date, task, sample, attempt, phase +- File formats: JSON, JSONL, Inspect `.eval`, logs, configs, metadata files +- Repeated templates: the same set of files or folders repeated across samples or experiments +- Transcript fields: `messages`, `conversation`, `dialogue`, `turns`, `traj`, `trajectory` +- Score and result fields: `score`, `reward`, `accuracy`, `correct`, `success`, `metric`, `result` +- Identifiers and grouping keys: `task_id`, `sample_id`, `episode`, `run_id`, `uuid` +- Special structures: pass@k attempts, tree/branching traces, multi-agent episodes, tool call sequences - return _recurse(path, 0) -``` +If Inspect `.eval` files are present, prefer the built-in Inspect loader. For mixed or unclear data, summarize your best interpretation and ask the user to confirm before coding. -Understanding what individual files are in a few folders may also be useful. List files in key directories to understand the naming conventions and file types present. +### Propose Docent Structure -#### Detect Naming Patterns +Most Docent analysis features, including rubrics, search, and clustering, operate at the `AgentRun` level. Structure data so each `AgentRun` is a meaningful analysis unit. -Examine folder and file names to understand the organizational logic. Sample a few names at different levels of the hierarchy and reason about what they might represent. 
+| Level | Use | +| --- | --- | +| `Collection` | One experiment, benchmark run, dataset, or cohesive ingestion batch | +| `AgentRun` | The primary item to analyze, compare, search, label, or score | +| `TranscriptGroup` | Attempts or phases within one `AgentRun`, such as pass@k | +| `Transcript` | One conversation history; use multiple transcripts for multi-agent runs | -Common patterns to look for (as suggestions, not strict rules): +Default: if unsure, make each independent task, episode, sample, or branch its own `AgentRun` with one `Transcript`. -- **Dates:** ISO format (2024-01-15), compact (20240115), or human-readable (jan\_15) -- **Model identifiers:** Model names, versions, or checkpoints -- **Sequential numbering:** run\_001, sample\_42, task\_5, episode\_100 -- **Experiment tags:** baseline, ablation, v2, control, treatment -- **Subdirectory conventions:** trajs/, logs/, results/, metadata/, configs/ +For tree or branching data, usually ingest each branch as its own `AgentRun` and use metadata such as `root_task_id`, `branch_id`, `parent_branch_id`, and `branch_depth` to preserve relationships. -Rather than pattern-matching, describe what you observe and hypothesize about the user's organizational intent. For example: +### Confirmation Gate -- "Folders appear to be organized by date, then by model name" -- "Each subfolder contains a `trajs/` directory with JSON files and a `config.yaml`" -- "File names include what looks like a task ID followed by an attempt number" +Before writing the final upload script, present the plan and wait for user confirmation. Include: -When listing out messages, you must use `parse_chat_message` from the Docent SDK. This means that you must -make an informed decision on each role that is provided and map it to one of the supported roles in Docent, -since the data provided might not use the same roles. 
+- Source structure and detected data type +- Proposed collection name +- Proposed `Collection` / `AgentRun` / `TranscriptGroup` / `Transcript` structure +- Key field mappings for messages, scores, identifiers, and metadata +- Any omitted files or fields, with reason and impact +- Expected source record count, if available +- Your understanding of the data context and analysis goals -Ask the user to confirm your interpretation if uncertain. +## Stage 3: Ingestion -#### Identify Repeatable Templates +For Inspect `.eval` files, use the built-in loader and proceed directly to sanity checks. See `./ingestion-reference.md` for the import pattern. -Find the structural unit that repeats across the directory (e.g., each experiment folder has the same subdirectory structure): +For custom data: -```py -def find_repeatable_template(tree: dict) -> dict: - """Find the pattern that repeats across the directory structure.""" +1. Write an ingestion script to the filesystem. +2. Load raw source records according to the confirmed file structure. +3. Convert a small sample into `AgentRun` objects. +4. Manually inspect sample turns with reasoning and tool calls to verify reasoning + was represented, merged, or intentionally omitted according to the plan. +5. Fix sample conversion issues. +6. Convert the full dataset and record conversion failures. +7. Run `check_agent_runs(agent_runs)` and inspect the formatted report. +8. Upload only after the conversion output and warnings match the confirmed plan. - def get_structure_signature(node: dict) -> tuple: - if node is None: - return () - children = node.get("children", {}) - child_names = tuple(sorted(children.keys())) - file_exts = tuple(sorted(node.get("file_counts", {}).keys())) - return (child_names, file_exts) +If a failure is not easily recoverable, such as unexpected data shape, authentication failure, API error, or ambiguous SDK error, stop and ask the user how they want to proceed. 
Include the exact error and the affected file or record when possible. - signatures = {} - def collect_signatures(node: dict, path: str = ""): - if node is None: - return - sig = get_structure_signature(node) - if sig not in signatures: - signatures[sig] = [] - signatures[sig].append(path) - for name, child in node.get("children", {}).items(): - collect_signatures(child, f"{path}/{name}") +### Sanity Checks - collect_signatures(tree) +`check_agent_runs` warnings are not necessarily schema errors, but they often reveal conversion mistakes. Fix warnings caused by data shaping. For warnings that may be legitimate, summarize categories, counts, and representative examples, then ask whether they are expected. - repeated = [(sig, paths) for sig, paths in signatures.items() - if len(paths) > 1 and sig[0]] - - if repeated: - repeated.sort(key=lambda x: len(x[1]), reverse=True) - return { - "template_structure": repeated[0][0], - "instance_count": len(repeated[0][1]), - "example_paths": repeated[0][1][:3], - } - return {"template_structure": None, "note": "No repeating pattern found"} -``` +Deterministic checks do not fully validate reasoning handling. Inspect source +reasoning during sample conversion, especially when the source stores reasoning +outside normal assistant text or splits reasoning from the answer/tool-call turn. -#### Detect Inspect AI Files - -Check for Inspect AI `.eval` files, which have a dedicated loader: - -```py -def detect_inspect_files(path: Path) -> list[str]: - """Detect Inspect .eval files that can use the built-in loader.""" - return [str(f) for f in path.rglob("*.eval")] -``` - -If Inspect `.eval` files are detected, use the built-in loader (see Stage 3). 
- -#### Decision Point - -Based on the structural analysis, determine next steps: - -| Structure Pattern | Action | -| :---- | :---- | -| Clear repeating template with trajs/logs subdirs | Proceed to schema inference on representative samples | -| Flat directory with consistent file types | Sample files directly for schema | -| Mixed/unclear structure | Ask user for clarification | -| Inspect .eval files present | Use built-in Inspect loader | -| No recognizable data files | Ask user to confirm path | - -Log the structural analysis to `ingestion-plan.md`. - ---- - -### **Stage 2b: Schema Inference** - -Sample files strategically based on the template structure identified in Stage 2a. - -#### Strategic Sampling - -```py -def sample_files_strategically(path: Path, template_info: dict) -> list[Path]: - """Sample files from representative locations within the template structure.""" - samples = [] - - if template_info.get("example_paths"): - for instance_path in template_info["example_paths"][:2]: - instance = path / instance_path.lstrip("/") - for subdir in ["trajs", "trajectories", "logs", "results", ""]: - candidate = instance / subdir if subdir else instance - if candidate.exists(): - json_files = list(candidate.glob("*.json"))[:1] - jsonl_files = list(candidate.glob("*.jsonl"))[:1] - samples.extend(json_files + jsonl_files) - if samples: - break - - if not samples: - samples = list(path.rglob("*.json"))[:3] + list(path.rglob("*.jsonl"))[:2] - - return samples[:5] -``` - -#### Infer Schema - -```py -def infer_json_schema(data: dict | list, max_depth: int = 5) -> dict: - """Recursively infer schema from JSON data.""" - if max_depth == 0: - return {"type": "any", "note": "truncated"} - - if isinstance(data, dict): - return { - "type": "object", - "fields": { - k: infer_json_schema(v, max_depth - 1) - for k, v in data.items() - } - } - elif isinstance(data, list): - if not data: - return {"type": "array", "items": "unknown"} - item_schemas = [infer_json_schema(item, 
max_depth - 1) for item in data[:3]] - return {"type": "array", "items": item_schemas[0], "sample_count": len(data)} - else: - return {"type": type(data).__name__, "example": repr(data)[:100]} -``` - -#### Classify Fields - -Identify fields that indicate transcript content, scores, and metadata: - -```py -TRANSCRIPT_INDICATORS = ["messages", "conversation", "transcript", "dialogue", "turns", "traj", "trajectory"] -SCORE_INDICATORS = ["score", "reward", "accuracy", "correct", "success", "metric", "result"] -ID_INDICATORS = ["id", "task_id", "sample_id", "episode", "run_id", "uuid"] - -def classify_fields(schema: dict) -> dict: - """Classify fields by their likely purpose.""" - classified = {"transcript": [], "scores": [], "identifiers": [], "metadata": []} - - def check_field(name: str, field_schema: dict, path: str = ""): - full_path = f"{path}.{name}" if path else name - name_lower = name.lower() - - if any(ind in name_lower for ind in TRANSCRIPT_INDICATORS): - classified["transcript"].append(full_path) - elif any(ind in name_lower for ind in SCORE_INDICATORS): - classified["scores"].append(full_path) - elif any(ind in name_lower for ind in ID_INDICATORS): - classified["identifiers"].append(full_path) - else: - classified["metadata"].append(full_path) - - if field_schema.get("type") == "object": - for sub_name, sub_schema in field_schema.get("fields", {}).items(): - check_field(sub_name, sub_schema, full_path) - - for name, field_schema in schema.get("fields", {}).items(): - check_field(name, field_schema) - - return classified -``` - -Log schema and field classification to `ingestion-plan.md`. - ---- - -### **Stage 2c: Docent Organization Proposal** - -Propose how to organize the data in Docent based on the user's analysis goals and data structure. - -#### Docent Hierarchy Best Practices - -**Critical:** Most Docent analysis features (rubrics, search, clustering) operate at the **AgentRun level**. 
Structure data accordingly: - -| Level | Purpose | When to Use | -| :---- | :---- | :---- | -| **Collection** | One experiment, benchmark run, or dataset | Usually one per ingestion; multiple if fundamentally different experiments | -| **AgentRun** | Primary analysis unit | One per complete unit you want to analyze, compare, or score. Rubrics run here. Search returns these. | -| **TranscriptGroup** | Logical groupings within an AgentRun | Multiple attempts (pass@k), phases of a task | -| **Transcript** | One agent's conversation history | One per agent in multi-agent setups; otherwise usually one per AgentRun | - -**Default:** If unsure, make each independent task/episode/sample its own AgentRun with a single Transcript. - -**Tree/branching data:** Ingest each branch as its own Transcript in its own AgentRun. Use metadata fields to identify how branches relate to each other (e.g., `parent_branch_id`, `branch_depth`, `root_task_id`). - -#### Data Pattern to Docent Mapping - -| Data Pattern | Collection | AgentRun | TranscriptGroup | Transcript | -| :---- | :---- | :---- | :---- | :---- | -| Simple evals | experiment | sample\_id, scores | — | messages | -| Pass@k | experiment | task\_id, best\_score | attempt\_k | messages per attempt | -| Tree/branching | experiment | one per branch, with metadata linking branches | — | messages for that branch | -| Multi-agent | experiment | episode\_id, joint\_scores | — | one per agent | - -#### Field Mapping - -Map each source field to a Docent location: - -| Source Field | Target Location | Target Field | Notes | -| :---- | :---- | :---- | :---- | -| messages | Transcript.messages | — | Convert via parse\_chat\_message | -| reward | AgentRun.metadata | scores.reward | | -| task\_id | AgentRun.metadata | task\_id | | - -#### Document Omitted Data - -**CRITICAL:** If ANY data will not be ingested, document it clearly: - -| Field/File | Reason for Omission | Impact | -| :---- | :---- | :---- | -| `debug_logs/` | Contains only 
debug output, not agent transcripts | None | -| `raw_api_responses` | Redundant with parsed messages | Low | - -**Never silently skip data.** - -#### Present Plan for Review - -Present the complete ingestion plan to the user and ask them to confirm all details: - -1. **Directory structure discovered** - what files/folders were found -2. **Data type detected** - what format the data appears to be in -3. **Proposed Docent hierarchy** - how data will be organized into collections, agent runs, transcript groups, and transcripts -4. **Key field mappings** - which fields map to scores, metadata, messages, etc. -5. **Omitted data** (if any) - what data will not be ingested and why -6. **Collection name** - propose a name based on the data, ask user to confirm or provide a different name -7. **Data context** - summarize your understanding of what this data represents (e.g., benchmark evaluation, agent task runs, multi-agent debate). Ask the user to confirm or clarify. -8. **Analysis goals** - ask what kinds of analysis they want to do in Docent (e.g., compare two model checkpoints, find failure modes, understand a metric regression). This helps ensure the data is structured appropriately. - -Wait for the user to confirm all details before proceeding to Stage 3. - ---- - -## **Stage 3: Ingestion** - -### **Environment Setup** - -Activate virtual environment if present: - -```shell -if [ -d "venv" ]; then - source venv/bin/activate -elif [ -d ".venv" ]; then - source .venv/bin/activate -fi -``` +Document any accepted warnings in `ingestion-plan.md` with counts and justification. 
-### **Handle Inspect AI Files** +## Stage 4: Verification -If Inspect `.eval` files were detected, use the built-in loader: +After upload, verify and log: -```py -from inspect_ai.log import read_eval_log -from docent.loaders.load_inspect import load_inspect_log +- Source records discovered +- Records converted successfully +- Conversion failures and representative errors +- Agent runs uploaded to Docent +- Whether source, converted, and uploaded counts match expectations +- Any accepted sanity warnings +- Collection URL -eval_log = read_eval_log("path/to/file.eval") -agent_runs = load_inspect_log(eval_log) -print(f"Loaded {len(agent_runs)} runs from Inspect log") -``` - -Skip to "Upload to Docent" below. - -### **Custom Data Loading** - -For non-Inspect data, build the ingestion script incrementally. - -**Important:** Always save ingestion scripts to the filesystem (e.g., `ingest.py` or `ingest_.py`) rather than running them inline. This aids in debugging, allows for iterative refinement, and provides a record of exactly how the data was ingested. - -**Error handling:** When running the ingestion script, if you encounter a failure that does not look easily recoverable (e.g., unexpected data format, authentication errors, API errors, or unclear error messages), stop and prompt the user for guidance rather than attempting repeated fixes. Describe the error clearly and ask how they would like to proceed. 
- -#### Load Data - -```py -import os -import json -from pathlib import Path -from docent import Docent -from docent.data_models import AgentRun, Transcript, TranscriptGroup -from docent.data_models.chat import parse_chat_message, ToolCall - -def load_data(path: str) -> list[dict]: - """Load data based on structure identified in Stage 2a.""" - path = Path(path) - records = [] - # Implementation based on detected template structure - return records - -raw_data = load_data(data_path) -print(f"Loaded {len(raw_data)} records") -``` - -#### Conversion Function - -```py -def convert_to_agent_run(record: dict) -> AgentRun: - """Convert a single record to AgentRun.""" - raw_messages = record.get("messages") or record.get("traj") or [] - messages = [parse_chat_message(m) for m in raw_messages] - - # Handle tool calls if present - for i, msg in enumerate(raw_messages): - if msg.get("role") == "assistant" and msg.get("tool_calls"): - messages[i].tool_calls = [ - ToolCall( - id=tc.get("id", f"call_{i}"), - function=tc.get("function", {}).get("name", tc.get("name", "")), - arguments=tc.get("function", {}).get("arguments", tc.get("arguments", {})), - type="function" - ) - for tc in msg["tool_calls"] - ] - - transcript = Transcript( - messages=messages, - metadata={...} # transcript-level metadata from mapping - ) - - return AgentRun( - transcripts=[transcript], - metadata={ - "scores": {...}, # from mapping - # other metadata from mapping - } - ) -``` - -#### Validation Loop - -Test conversion on a sample before full ingestion: - -```py -errors = [] -for i, record in enumerate(raw_data[:10]): - try: - agent_run = convert_to_agent_run(record) - print(f"✓ Record {i} converted successfully") - except Exception as e: - errors.append((i, str(e))) - print(f"✗ Record {i} failed: {e}") - -if errors: - print(f"\n{len(errors)} validation errors in first 10 records") -``` - -#### Full Conversion - -```py -agent_runs = [] -conversion_errors = [] - -for i, record in enumerate(raw_data): - 
try: - agent_runs.append(convert_to_agent_run(record)) - except Exception as e: - conversion_errors.append({"index": i, "error": str(e)}) - -print(f"Converted {len(agent_runs)}/{len(raw_data)} records") -if conversion_errors: - print(f"Errors ({len(conversion_errors)}): {conversion_errors[:5]}...") -``` - -### **Upload to Docent** - -```py -client = Docent(api_key=DOCENT_API_KEY) - -collection_id = client.create_collection( - name=collection_name, - description="", -) -print(f"Created collection: {collection_id}") - -client.add_agent_runs(collection_id, agent_runs) -print(f"Uploaded {len(agent_runs)} runs") - -print(f"View at: https://docent.transluce.org/collection/{collection_id}") -``` - ---- - -## **Stage 4: Testing & Verification** - -Verify that the upload succeeded and counts match expectations. - -### **Count Verification** - -```py -expected_runs = len(agent_runs) -failed_conversions = len(conversion_errors) -total_source_records = len(raw_data) - -print(f"\n{'='*50}") -print("VERIFICATION REPORT") -print(f"{'='*50}") -print(f"Source records found: {total_source_records}") -print(f"Successfully converted: {expected_runs}") -print(f"Failed to convert: {failed_conversions}") - -# Verify upload via Docent SDK -try: - collection_info = client.get_collection(collection_id) - uploaded_count = collection_info.get("agent_run_count", "unknown") - print(f"Uploaded to Docent: {uploaded_count}") - - if uploaded_count != expected_runs: - print(f"⚠️ WARNING: Count mismatch! 
Expected {expected_runs}, got {uploaded_count}") - else: - print(f"✓ Counts match!") -except Exception as e: - print(f"Could not verify upload count via API: {e}") - print(f"Please verify manually at: https://docent.transluce.org/collection/{collection_id}") -``` - -### **Log Verification Results** - -Update `ingestion-plan.md`: - -``` -## Verification - -### Counts -- Source records: [total_source_records] -- Converted successfully: [expected_runs] -- Conversion failures: [failed_conversions] -- Uploaded to Docent: [uploaded_count] -- **Status:** [MATCH / MISMATCH] - -### Errors (if any) -[List conversion errors with record index and error message] - -### Collection URL -https://docent.transluce.org/collection/[collection_id] -``` - ---- - -## **Reference** - -See the data model guidance and examples in this file for Docent data model documentation. - -For additional guidance on Docent data models and API usage, consult the official documentation: [https://docs.transluce.org/llms.txt](https://docs.transluce.org/llms.txt) - -## **Common Patterns** - -### **Inspect AI Logs** - -When `.eval` files detected, use the built-in loader: - -```py -from inspect_ai.log import read_eval_log -from docent.loaders.load_inspect import load_inspect_log - -eval_log = read_eval_log("path/to/file.eval") -agent_runs = load_inspect_log(eval_log) -``` - -### **Parsing Chat Messages** - -Use `parse_chat_message` to convert dictionaries to proper message objects: - -```py -from docent.data_models.chat import parse_chat_message - -# From dict - automatically determines message type from "role" -msg = parse_chat_message({ - "role": "user", - "content": "What's 2+2?" -}) - -msg = parse_chat_message({ - "role": "assistant", - "content": "The answer is 4." -}) - -msg = parse_chat_message({ - "role": "system", - "content": "You are a helpful assistant." 
-}) - -# Direct construction is also available -from docent.data_models.chat import UserMessage, AssistantMessage, SystemMessage -msg = UserMessage(content="Hello") -msg = AssistantMessage(content="Hi!", model="gpt-4") -``` - -### **Simple Dict to AgentRun** - -A common pattern for converting flat records: - -```py -from docent.data_models import AgentRun, Transcript -from docent.data_models.chat import parse_chat_message - -def convert_simple(record: dict) -> AgentRun: - messages = [parse_chat_message(m) for m in record["messages"]] - return AgentRun( - transcripts=[Transcript(messages=messages)], - metadata={ - "scores": {"reward": record.get("reward", 0)}, - **{k: v for k, v in record.items() if k != "messages"} - } - ) -``` - -### **Tool Calls** - -Handle assistant messages with tool calls and their responses: - -```py -from docent.data_models.chat import AssistantMessage, ToolMessage, ToolCall - -# Assistant making a tool call -assistant_msg = AssistantMessage( - content="Let me search for that.", - tool_calls=[ - ToolCall( - id="call_123", - function="web_search", - arguments={"query": "weather today"}, - type="function" - ) - ] -) - -# Tool response -tool_msg = ToolMessage( - content="Sunny, 72°F", - tool_call_id="call_123", - function="web_search" -) - -# Helper to parse tool calls from raw data -def parse_tool_calls(raw_calls: list) -> list[ToolCall]: - return [ - ToolCall( - id=tc["id"], - function=tc["function"]["name"], - arguments=tc["function"].get("arguments", {}), - type="function" - ) - for tc in raw_calls - ] -``` - -### **Pass@k Evaluation** - -Use `TranscriptGroup` for attempts: - -```py -from uuid import uuid4 -from docent.data_models import AgentRun, Transcript, TranscriptGroup - -def convert_pass_at_k(task_data: dict) -> AgentRun: - agent_run_id = str(uuid4()) - groups = [] - transcripts = [] - - for k, attempt in enumerate(task_data["attempts"]): - group = TranscriptGroup( - name=f"Attempt {k+1}", - agent_run_id=agent_run_id, - 
metadata={"k": k} - ) - groups.append(group) - - transcript = Transcript( - messages=[parse_chat_message(m) for m in attempt["messages"]], - transcript_group_id=group.id, - metadata={"attempt": k} - ) - transcripts.append(transcript) - - return AgentRun( - id=agent_run_id, - transcripts=transcripts, - transcript_groups=groups, - metadata={"task_id": task_data["task_id"]} - ) -``` - -### **Tree/Branching** - -Ingest each branch as its own `Transcript` in its own `AgentRun`. Use metadata to link branches: - -```py -AgentRun( - transcripts=[transcript], - metadata={ - "root_task_id": "task_123", - "branch_id": "branch_a_1", - "parent_branch_id": "branch_a", - "branch_depth": 2, - } -) -``` - -### **Multi-Agent** - -One `Transcript` per agent in the same `AgentRun`: - -```py -AgentRun( - transcripts=[ - Transcript(messages=agent_1_messages, metadata={"agent_id": "agent_1"}), - Transcript(messages=agent_2_messages, metadata={"agent_id": "agent_2"}), - ], - metadata={ - "episode_id": "episode_42", - "scores": {"joint_reward": 0.85} - } -) -``` - -### **Validation** - -Always validate by rendering before upload: - -```py -try: - _ = agent_run.text # Triggers validation - print("Valid") -except Exception as e: - print(f"Invalid: {e}") -``` +If the SDK cannot verify the uploaded count, provide the collection URL and record that manual verification is needed. diff --git a/plugins/docent/skills/docent/readings-reference.md b/plugins/docent/skills/docent/readings-reference.md index 35ea2db..990a475 100644 --- a/plugins/docent/skills/docent/readings-reference.md +++ b/plugins/docent/skills/docent/readings-reference.md @@ -12,11 +12,11 @@ Use scripted readings only when you need additional flexibility, e.g. varying th Readings are executed lazily: nothing runs until `flush()` is called. You normally do not need to call `flush()` manually. `flush()` is automatically called at script exit, and also anytime you attempt to access the output of a reading which has not been run yet. 
The system infers the execution DAG automatically. Re-running the same script is free: readings are content-addressed, so identical analyses reuse existing results. -When readings are flushed, they will appear in a web UI for the user to approve. The script will pause execution until the user approves the readings. They may also cancel the script and ask you to make changes. (Note: the reading plan interface in the web UI is read-only.) +When readings are flushed, they will appear as an analysis plan in the web UI for the user to approve. The script will pause execution until the user approves the readings. They may also cancel the script and ask you to make changes. (Note: the analysis plan interface in the web UI is read-only.) If you need a no-UI-approval flow for a trusted analysis, you may opt into SDK auto-approval by explicitly calling `client.flush(auto_approve=True)`. This reuses the same backend approval endpoint programmatically, including for dependent steps that are initially unresolved. -Some reading plans require mid-script blocking, for example if one step waits for reading results (using `.results`) in order to construct a later step. In these cases: +Some analysis plans require mid-script blocking, for example if one step waits for reading results (using `.results`) in order to construct a later step. In these cases: * The script may submit an initial set of steps for approval, then block waiting for results before it can continue. * The user may need to approve the plan more than once, unless you explicitly call `client.flush(auto_approve=True)` for each flush that should bypass manual approval. * Warn the user upfront about multi-approval flows so they know what to expect. 
@@ -32,8 +32,6 @@ You should feel free to iterate on your scripts, but avoid overwriting scripts w * Explore a new question on the same dataset -> create a new script * Take a different approach to the same question -> create a new script -Note: an obsolete version of the SDK provided an API called `LLMRequest`. If you encounter old code using LLMRequests, you can offer to migrate it to readings. - ## Core API ### `client.query(collection_id, dql, *, name=None) -> QueryResult` @@ -69,12 +67,13 @@ reading = client.read( For `ARRAY_AGG` columns, pass `is_list=True`: ```python -# agg = client.query(collection_id, f"SELECT array_agg(rr.id) AS results FROM reading_results rr ...") +# agg = client.query(collection_id, f"SELECT array_agg(rr.id ORDER BY rr.id) AS results FROM reading_results rr ...") reading = client.read( prompt_template=["Synthesize these results: ", agg.results.as_type("reading_result", is_list=True)], model="openai/gpt-5.4-mini", ) ``` +(Note: the ORDER BY is important. Without an ORDER BY, Postgres may later return results in a different order, invalidating the cache and triggering an expensive LLM call. If there's no natural order, you can order by ID.) **Scripted path** (explicit per-request prompts): ```python @@ -100,10 +99,15 @@ Parameters: - `reasoning_effort`: Optional `"minimal"` | `"low"` | `"medium"` | `"high"` - `max_new_tokens`: Optional maximum number of new tokens to generate per result - `collection_id`: Optional collection override (useful for scripted readings that don't infer it from a QueryResult) -- `cache_mode`: Controls caching granularity. The DQL query (if any) is always executed to resolve arguments regardless of cache mode. The content hash — covering prompt template, context config, model config, output schema, token limit, and resolved arguments — determines reading identity. 
- - `"reading"` (default): reuse an existing reading with matching content hash - - `"results"`: always create a new reading record, but reuse individual results to avoid redundant LLM calls - - `"none"`: no caching — force full re-evaluation +- `cache_mode`: Controls caching granularity. See "Cache modes" below. + +### Cache modes +The DQL query (if any) is always executed to resolve arguments regardless of cache mode. The content hash — covering prompt template, context config, model config, output schema, token limit, and resolved arguments — determines reading identity. +- `"reading"` (default): reuse an existing reading with matching content hash +- `"results"`: always create a new reading record, but reuse individual results to avoid redundant LLM calls +- `"none"`: no caching — force full re-evaluation + +Note: if some results for a reading succeeded and some errored, rerunning with `cache_mode="reading"` will not retry the errored results. This avoids wasting time retrying problematic prompts (e.g. too long, or blocked by LLM API safety filters). If you need to force a retry of all errored results, run with `cache_mode="results"`. ### Transcript slices @@ -294,9 +298,6 @@ Glob filter rules: * Common pitfall: do not set `transcript_group_names=GlobFilter(include=("*",))` when the user asks to render only a specific transcript name. Including all transcript groups makes all visible descendants render, so it can override the intended narrow transcript selection. In that case, make `transcript_group_names` exclude-all and set only `transcript_names=GlobFilter(include=("",))`. * Transcript group filtering is path-scoped. Including a nested group makes that group and its visible descendants render, and any ancestors needed to reach it may render as wrappers. It does not make sibling branches visible.
For example, if `G1` contains both `G2 -> G3` and `G2-prime`, including `G3` can render wrapper groups `G1` and `G2`, but `G2-prime` remains hidden unless it or one of its descendants is independently included. -### `client.show_query_result(query_result, name=None)` *(deprecated)* -Deprecated — `client.query()` now auto-registers a DQL-only step in the UI. Use the `name` parameter on `query()` instead. This method is a no-op for backwards compatibility. - ### `client.step_group(label) -> StepGroupContext` Opens a labeled step group in the session UI. Use as a context manager to auto-close the group scope: ```python @@ -356,15 +357,15 @@ Submits all pending readings to the server. Returns `plan_id` and per-entry `ent client.plan_name = "safety_failure_clustering" # Defaults to name of script ``` -Note: reading plans are grouped by name. -* If you create a new plan with the same name as an existing plan, it will be saved as a new version of the existing plan. Therefore, when you create a reading plan, give it a reasonably specific name to reduce chances of a collision. -* If you change the name of an existing reading plan, the new version will be saved as separate and unrelated. Therefore, you should avoid renaming reading plans unnecessarily. +Note: analysis plans are grouped by name. +* If you create a new plan with the same name as an existing plan, it will be saved as a new version of the existing plan. Therefore, when you create an analysis plan, give it a reasonably specific name to reduce chances of a collision. +* If you change the name of an existing analysis plan, the new version will be saved as separate and unrelated. Therefore, you should avoid renaming analysis plans unnecessarily. ### Default collection ID ```python client.default_collection_id = "" ``` -Used as a fallback when `flush()` resolves which collection to target. Automatically set from `DOCENT_COLLECTION_ID` in `docent.env` or the environment if present. 
Can also be passed to the `Docent()` constructor as `collection_id`. +Used as a fallback when `flush()` resolves which collection to target. Automatically set from `DOCENT_COLLECTION_ID` in the SDK-discovered `docent.env` or the environment if present. Can also be passed to the `Docent()` constructor as `collection_id`. ### Auto-flush On first `read()` call, an `atexit` handler is registered. Disable with `client.auto_flush = False`. @@ -396,7 +397,7 @@ If you need structured output, you may provide a JSON schema. String fields may optionally allow the LLM to cite parts of its input. * Fields such as "summary" or "description" or "explanation" should usually have citations. -* Do not include citations for fields such as "category", "classification" or any other field which is likely to be filtered on downstream. +* Enum fields must not have citations. * "Reasoning" fields should come before "decision" fields. That way, the LLM is generating the decision based on the reasoning, instead of justifying its decision post-hoc. ```python @@ -420,7 +421,6 @@ The quality of reading output depends on the quality of the prompt you write. The L * How detailed or brief should output be? A short paragraph is a good default, but it depends on the nature of the analysis. * If you're asking for an extensive (multi-paragraph) response, how should it be structured? Note: markdown is supported * If you are looking for a particular behavior, how exactly is that behavior defined? If you're proposing a specific definition, make sure the user signs off on it. -* If you are asking the LLM to analyze other reading results, remind it to cite those reading results, NOT the original transcripts which the results may refer to. ## Choosing clear names The name of each step (client.query and client.read) should fit on one line. Subject to that constraint, make step names descriptive. Ideally, the names make sense to a user without much context on your analysis.
A descriptive name does not have to be wordy. @@ -497,7 +497,7 @@ summarize = client.read( # Step 2: Propose clusters from the summaries summaries = client.query( collection_id, - f"SELECT array_agg(rr.id) AS summaries " + f"SELECT array_agg(rr.id ORDER BY rr.id) AS summaries " f"FROM reading_results rr " f"JOIN reading_result_links rrl ON rrl.result_id = rr.id " f"WHERE rrl.reading_id = '{summarize}' ", @@ -553,7 +553,7 @@ print(f"Proposed {len(category_names)} clusters: {', '.join(category_names)}") **Stop here.** Run this script, review the proposed clusters, and report them to the user. If the clusters look right, proceed to Phase 2. If not, adjust the summarization prompt or sample and re-run. Re-running is free for unchanged steps (results are cached). -**If something goes wrong:** Check DQL query syntax first (see `dql-reference.md` quirks). Common issues: missing `is_list=True` on aggregated columns, or an empty result set from the sample query. If the clusters are too broad or too narrow, adjust the number of requested categories in the Step 2 prompt or focus the summarization prompt on a more specific aspect of behavior. +**If something goes wrong:** Check DQL query syntax first (see `dql-reference.md` quirks). Common issues: missing `is_list=True` on aggregated columns, or no rows returned by the sample query. If the clusters are too broad or too narrow, adjust the number of requested categories in the Step 2 prompt or focus the summarization prompt on a more specific aspect of behavior. ### Phase 2: Classify using the proposed clusters @@ -586,3 +586,13 @@ extract = client.read( ``` Run the extended script. Steps 1-2 are cached and won't re-run — only Step 3 executes. The user approves the classification step, and results come back. + +## Example: hierarchical synthesis + +When synthesizing more than ~30 reading results into a single analysis, do NOT put all results into one prompt. Instead: + +1. 
**Batch**: Split results into groups of 15-20 using DQL with a stable `ORDER BY` (e.g., `ORDER BY rr.id LIMIT 20 OFFSET 0`, `ORDER BY rr.id LIMIT 20 OFFSET 20`, etc.). Without a fixed order, batches may overlap or skip results, and re-runs may miss the cache. +2. **Summarize each batch**: Run a synthesis reading per batch that produces a structured intermediate summary +3. **Final synthesis**: Aggregate the batch summaries (which are now ~5-10 items) into a single final reading + +Alternatively, if the per-item readings produce structured output (e.g., categories/enums), use DQL aggregation over `reading_results.output` to produce counts and distributions — this avoids context limits entirely and gives exact numbers. diff --git a/plugins/docent/skills/docent/report.md b/plugins/docent/skills/docent/report.md index 36037c5..e8e01c2 100644 --- a/plugins/docent/skills/docent/report.md +++ b/plugins/docent/skills/docent/report.md @@ -18,7 +18,7 @@ Use the report feature when: Do NOT use reports as a substitute for the analysis itself. First run the analysis using the Docent analysis skill, then generate the report from the results. **Preconditions:** -- Reports are created from reading plans. If there is no reading plan ID, tell the user they need to provide one. +- Reports are created from analysis plans. If there is no analysis plan ID, tell the user they need to provide one. - The user should be fairly clear about what they want the report to cover. If the scope, question, or audience is not clear enough to operationalize, ask before drafting. - Unless the user specifies another location, write the report markdown file in the current working directory. @@ -31,7 +31,7 @@ These should shape every decision about what to include and how to structure the The point of a report is to surface highly verifiable, traceable, actionable, and important insights for readers who are trying to make consequential decisions about agent behavior. - **Verifiable**: every meaningful claim should be backed by adjacent evidence, such as a DQL table, a reading result embed, or a citation to the exact underlying object.
-- **Traceable**: a reader should be able to follow a claim back to the exact reading plan, step, result, DQL query, or transcript that supports it. +- **Traceable**: a reader should be able to follow a claim back to the exact analysis plan, step, result, DQL query, or transcript that supports it. - **Important**: focus on findings that are consequential for the user's terminal goal, such as improving performance, reducing unsafe behavior, or making some other important behavior change. Findings that happen only a tiny fraction of the time are usually not report-worthy unless the user explicitly says they matter. - **Actionable**: recommendations must be specific enough to imply an intervention the user can make to the agent. Only include them when there is a strong, coherent, and sound argument that the intervention should improve the observed behavior. @@ -60,7 +60,7 @@ The reader can now verify the "strategy differences dominate" claim by inspectin ### No platform jargon -Do not use platform jargon ("reading plan," "DQL," "reading result") in report prose visible to the reader. Use plain descriptions ("analysis," "query," "result"). Platform terminology is fine in shortcode attributes and developer-facing code. +Do not use platform jargon ("analysis plan," "DQL," "reading result") in report prose visible to the reader. Use plain descriptions ("analysis," "query," "result"). Platform terminology is fine in shortcode attributes and developer-facing code. --- @@ -191,7 +191,7 @@ Attributes: Behavior: Uses the page's `collection_id` automatically. Shows row count, execution time, truncation info, and a toggle to show/hide the raw DQL. Authoring guidance: -- Keep queries short, explicit, and cheap. Add `LIMIT` unless the full result set is needed. +- Keep queries short, explicit, and cheap. Add `LIMIT` unless every row is genuinely needed. - Use the body to explain why this table matters, not to restate column names. 
- **Key pattern**: aggregate reading results via DQL rather than stating numbers in prose. A `::dql-table` computing a distribution is always preferable to "52% are X" in text, because the reader can inspect the query. @@ -231,11 +231,7 @@ Use inline citations inside markdown sentences to link claims to specific eviden This claim is grounded in ::citation{type="reading_result" collection_id="collection-uuid" reading_result_id="reading-result-uuid"}. ``` -Use `short="true"` for a compact icon-only citation: - -```md -See ::citation{type="analysis_result" collection_id="collection-uuid" result_set_id="result-set-uuid" result_id="result-uuid" short="true"} for details. -``` +Use `short="true"` for a compact icon-only citation. Rules: - This is inline text, not a block shortcode. @@ -254,7 +250,6 @@ Rules: | Type | Required fields | Optional fields | |---|---|---| -| `analysis_result` | `result_set_id`, `result_id` | | | `reading_result` | `reading_result_id` | | | `block_content` | `agent_run_id`, `transcript_id` | `block_idx` (default `0`), `content_idx` | | `agent_run_metadata` | `agent_run_id`, `metadata_key` | | @@ -271,7 +266,7 @@ Use inline citations sparingly — they are most valuable for anchoring specific A strong report usually follows this shape: -1. Start with a single-`#` H1 at the very top, then a short intro. The UI renders pill-style links to the collection and reading plan beneath the H1 using the frontmatter IDs — you do not need to author those links. +1. Start with a single-`#` H1 at the very top, then a short intro. The UI renders pill-style links to the collection and analysis plan beneath the H1 using the frontmatter IDs — you do not need to author those links. 2. Follow the intro with headings and short narrative that states a question or claim. 3. Put the supporting `::dql-table`, `::reading-result`, `::reading-results-table`, and/or inline citations immediately next to that claim. 4. 
Use `::callout` for a key takeaway, caveat, or recommendation only when adjacent evidence already supports it. @@ -324,7 +319,7 @@ Reference material for getting data into the report and saving it to Docent. ## MCP endpoints -- `get_reading_plan_results(collection_id, plan_name)` — reading plan overview with step statuses and result counts. +- `get_reading_plan_results(collection_id, plan_name)` — analysis plan overview with step statuses and result counts. - `get_reading_plan_results(collection_id, plan_name, step_name)` — concrete outputs for a specific step. - `get_metadata_fields(collection_id)` — helps when you need tables grouped or filtered by run metadata. - `list_reading_presets(collection_id, owned_only=True)` — useful when a report needs to contextualize preset-backed readings. Set `owned_only=False` to inspect all collection presets. @@ -333,11 +328,11 @@ Reference material for getting data into the report and saving it to Docent. ## SDK methods -- `client.list_reading_plans(collection_id, name=..., owned_only=True)` — find matching reading plans. Pass `owned_only=False` to search all visible plans. +- `client.list_reading_plans(collection_id, name=..., owned_only=True)` — find matching analysis plans. Pass `owned_only=False` to search all visible plans. - `client.get_reading_plan(collection_id, plan_id)` — full plan with step metadata, reading IDs, DQL step definitions. - `client.get_reading_results(collection_id, reading_id)` — raw reading results for qualitative inspection. - `client.execute_dql(collection_id, dql, reading_plan_id=plan_id)` plus `client.dql_result_to_dicts(...)` — materialize DQL-backed evidence tied to the plan. -- `client.query(...)` — supplemental quantitative support. Keep the report anchored to the cited reading plan. +- `client.query(...)` — supplemental quantitative support. Keep the report anchored to the cited analysis plan. 
## Persisting reports @@ -438,11 +433,11 @@ The moment you type a number in a markdown section, ask yourself: is there a DQL - Do not expect block shortcodes to work inside HTML embeds. - Do not rely on inline citations inside code fences or inline code. - Do not add `collection_id` to block shortcodes — they use the page's collection automatically. Do include `collection_id` on inline `::citation` shortcodes. -- Do not omit `LIMIT` in `::dql-table` queries unless the full result set is genuinely needed. +- Do not omit `LIMIT` in `::dql-table` queries unless every row is genuinely needed. ### Other mistakes -- Do not start a report without a reading plan ID. +- Do not start a report without an analysis plan ID. - Do not guess the report scope when the user's request is not clear enough. - Do not elevate a vanishingly rare edge case into a headline finding unless the user explicitly cares about it. - Do not recommend interventions unless you can explain why that intervention should improve the observed behavior.