Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
6d4c116
Spec for doc 2 skill
scosman Apr 6, 2026
ddf8a13
doc 2 skill architecture
scosman Apr 6, 2026
7cd3290
specs for doc 2 skill
scosman Apr 6, 2026
cadce40
implementation plan
scosman Apr 6, 2026
5c0f996
Phase 1: Add DocumentSkill data model and tag filtering for RAG runners
scosman Apr 6, 2026
c355bf4
Phase 2: Add DocSkillWorkflowRunner pipeline and SkillBuilder for doc…
scosman Apr 6, 2026
db4a997
Phase 3: Add doc skill API layer with CRUD, SSE run, progress, and so…
scosman Apr 6, 2026
3434bc7
Phase 4: Add doc skill frontend with templates, creation form, and ru…
scosman Apr 6, 2026
0770f67
Phase 5: Add doc skill list page, detail page, and docs entry point
scosman Apr 6, 2026
fb6036c
Phase 6: Add cross-linking between doc skills and generated skills, p…
scosman Apr 6, 2026
8934ad5
Fix CORS test when using custom frontend port
scosman Apr 6, 2026
8cdb641
UI polish: improve doc skill creation form layout, labels, and defaults
scosman Apr 6, 2026
4fd0ac7
Fix skill detail source link: rename to "Document Skill" and make ful…
scosman Apr 6, 2026
adb7fec
Fix double-create bug: await tick() before showing dialog, keep form …
scosman Apr 6, 2026
54e034f
Add skill file counts property and open folder action
scosman Apr 6, 2026
808ccba
better spacing for multi-intro
scosman Apr 7, 2026
c67491e
Redesign doc skill detail as entity page with config in left column
scosman Apr 7, 2026
a1fa19a
update API docs
scosman Apr 7, 2026
6f499bf
Merge branch 'scosman/safe_gen_schema' into scosman/doc_skills
scosman Apr 7, 2026
4864375
Fix skills dropdown not updating after doc skill creation
scosman Apr 7, 2026
8502a0c
Code review fixes: API docs, typed client, shared utilities
scosman Apr 7, 2026
4ec0fcf
Code review fixes: return model directly, fix extension stripping, sh…
scosman Apr 7, 2026
d9e8188
CR guidance
scosman Apr 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .agents/api_code_review.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,10 @@ async def delete_project(
- Docstrings containing code artifacts, raw `Args:` blocks, or formatting that doesn't read as clean prose in OpenAPI
- Pydantic models used in API request/response types (nested included) missing a class docstring, if the class name alone isn't obvious
- Custom string types with validator-based constraints that don't surface in the OpenAPI schema. Use `StringConstraints` in the `Annotated` type definition so `minLength`/`maxLength` appear automatically (see `FilenameString`, `SkillNameString` for examples). Don't duplicate constraints in individual `Field()` calls.

**Client Usage**

All client calls should use the typed API via `api_client.ts`. The API is auto-generated to `api_schema.d.ts` with the `generate_schema.sh` command.
- SSE calls are the only exception; they may use fetch as the typed client does not support SSE
- If an API isn't in api_schema, regenerate the schema -- don't fall back to fetch
- In `libs/types.ts` define exported types for the type names from auto-generation, eg `export type Task = components["schemas"]["Task"]`. Use these throughout codebase instead of long names from api_schema.d.ts
2 changes: 2 additions & 0 deletions app/desktop/desktop_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from app.desktop.studio_server.chat import connect_chat_api
from app.desktop.studio_server.copilot_api import connect_copilot_api
from app.desktop.studio_server.data_gen_api import connect_data_gen_api
from app.desktop.studio_server.doc_skill_api import connect_doc_skill_api
from app.desktop.studio_server.dev_tools import connect_dev_tools
from app.desktop.studio_server.eval_api import connect_evals_api
from app.desktop.studio_server.finetune_api import connect_fine_tune_api
Expand Down Expand Up @@ -71,6 +72,7 @@ def make_app(tk_root: tk.Tk | None = None):
connect_import_api(app, tk_root=tk_root)
connect_tool_servers_api(app)
connect_skill_api(app)
connect_doc_skill_api(app)
connect_prompt_optimization_job_api(app)
connect_copilot_api(app)
connect_dev_tools(app)
Expand Down
358 changes: 358 additions & 0 deletions app/desktop/studio_server/doc_skill_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,358 @@
import asyncio
import json
import logging
from typing import Annotated, Awaitable, Callable

from fastapi import FastAPI, HTTPException, Path
from fastapi.responses import StreamingResponse
from kiln_ai.adapters.rag.progress import LogMessage
from kiln_ai.datamodel.chunk import ChunkerConfig
from kiln_ai.datamodel.document_skill import DocumentSkill
from kiln_ai.datamodel.extraction import Document, ExtractorConfig
from kiln_ai.datamodel.project import Project
from kiln_ai.datamodel.skill import Skill
from pydantic import BaseModel

from kiln_ai.adapters.rag.deduplication import filter_documents_by_tags
from kiln_server.project_api import project_from_id
from kiln_server.utils.agent_checks.policy import ALLOW_AGENT, DENY_AGENT

from .doc_skill_pipeline import (
DocSkillProgress,
DocSkillWorkflowRunner,
DocSkillWorkflowRunnerConfig,
)

logger = logging.getLogger(__name__)


class CreateDocSkillRequest(BaseModel):
    """Request to create a new document skill configuration."""

    # Display name of the doc skill record itself.
    name: str
    # Name to give the generated skill once the pipeline completes.
    skill_name: str
    # Header content for the generated skill — presumably prepended to the
    # skill body; confirm against the SkillBuilder.
    skill_content_header: str
    # Optional human-readable description.
    description: str | None = None
    # ID of the ExtractorConfig used to extract document contents.
    extractor_config_id: str
    # ID of the ChunkerConfig used to chunk extracted documents.
    chunker_config_id: str
    # When set, only documents carrying these tags are included; None means all
    # project documents (see _get_filtered_documents).
    document_tags: list[str] | None = None
    # Whether to strip file extensions — presumably from document names in the
    # generated skill; confirm against the pipeline.
    strip_file_extensions: bool = True


class UpdateDocSkillRequest(BaseModel):
    """Request to archive or unarchive a document skill."""

    # Target archived state; the update endpoint also mirrors this onto the
    # generated skill when one exists.
    is_archived: bool


class DocSkillProgressRequest(BaseModel):
    """Request for batch doc skill progress. If doc_skill_ids is None, returns all."""

    # Doc skill IDs to report on; None requests progress for every doc skill
    # in the project. Unknown IDs are silently skipped.
    doc_skill_ids: list[str] | None = None


class DocSkillSourceResponse(BaseModel):
    """Links a generated skill back to its source document skill."""

    # Both fields are None when no doc skill produced the given skill.
    doc_skill_id: str | None
    doc_skill_name: str | None


def _get_doc_skill(project: Project, doc_skill_id: str) -> DocumentSkill:
    """Load a doc skill by ID from the project, raising a 404 if it doesn't exist."""
    loaded = DocumentSkill.from_id_and_parent_path(doc_skill_id, project.path)
    if loaded is not None:
        return loaded
    raise HTTPException(status_code=404, detail="Doc skill not found.")


def _get_filtered_documents(project: Project, tags: list[str] | None) -> list[Document]:
    """Return the project's documents, filtered by tags when tags is not None."""
    documents = project.documents(readonly=True)
    if tags is not None:
        return filter_documents_by_tags(documents, tags)
    return documents


def compute_doc_skill_progress(
    project: Project, doc_skill: DocumentSkill
) -> DocSkillProgress:
    """Compute extraction/chunking progress counts for a doc skill.

    Once the skill has been created (skill_id set) the pipeline is complete,
    so all documents are reported as extracted and chunked. Otherwise each
    matching document is inspected for an extraction with this doc skill's
    extractor config, and for chunks produced by this doc skill's chunker
    config on any such extraction.
    """
    # Hoisted: both branches need the same filtered document list.
    docs = _get_filtered_documents(project, doc_skill.document_tags)

    if doc_skill.skill_id is not None:
        return DocSkillProgress(
            total_document_count=len(docs),
            total_document_extracted_count=len(docs),
            total_document_chunked_count=len(docs),
            skill_created=True,
        )

    extracted = 0
    chunked = 0
    for doc in docs:
        # Single pass over extractions (the previous version iterated twice).
        matching_extractions = [
            ext
            for ext in doc.extractions()
            if ext.extractor_config_id == doc_skill.extractor_config_id
        ]
        if not matching_extractions:
            continue
        extracted += 1
        # A document counts as chunked if ANY matching extraction has chunks
        # for this chunker config. The previous version stopped at the first
        # matching extraction, which could miss chunks on a later one.
        if any(
            cd.chunker_config_id == doc_skill.chunker_config_id
            for ext in matching_extractions
            for cd in ext.chunked_documents()
        ):
            chunked += 1

    return DocSkillProgress(
        total_document_count=len(docs),
        total_document_extracted_count=extracted,
        total_document_chunked_count=chunked,
        skill_created=False,
    )


async def _build_workflow_runner(
    project: Project, doc_skill: DocumentSkill
) -> DocSkillWorkflowRunner:
    """Resolve the doc skill's extractor and chunker configs and assemble a runner.

    Raises HTTPException (422) when either referenced config is missing or unset.
    """
    extractor = (
        ExtractorConfig.from_id_and_parent_path(
            doc_skill.extractor_config_id, project.path
        )
        if doc_skill.extractor_config_id
        else None
    )
    if extractor is None:
        raise HTTPException(status_code=422, detail="Extractor config not found.")

    chunker = (
        ChunkerConfig.from_id_and_parent_path(
            doc_skill.chunker_config_id, project.path
        )
        if doc_skill.chunker_config_id
        else None
    )
    if chunker is None:
        raise HTTPException(status_code=422, detail="Chunker config not found.")

    runner_config = DocSkillWorkflowRunnerConfig(
        doc_skill=doc_skill,
        project=project,
        extractor_config=extractor,
        chunker_config=chunker,
    )
    # Seed the runner with current progress so the SSE stream starts from
    # an accurate baseline.
    return DocSkillWorkflowRunner(
        runner_config, compute_doc_skill_progress(project, doc_skill)
    )


def _serialize_progress(progress: DocSkillProgress) -> dict:
return {
"total_document_count": progress.total_document_count,
"total_document_extracted_count": progress.total_document_extracted_count,
"total_document_extracted_error_count": progress.total_document_extracted_error_count,
"total_document_chunked_count": progress.total_document_chunked_count,
"total_document_chunked_error_count": progress.total_document_chunked_error_count,
"skill_created": progress.skill_created,
"logs": [
{"message": log.message, "level": log.level}
for log in (progress.logs or [])
],
}


async def run_doc_skill_workflow_with_status(
    runner_factory: Callable[[], Awaitable[DocSkillWorkflowRunner]],
) -> StreamingResponse:
    """Run a doc skill workflow and stream its progress as Server-Sent Events.

    The runner is built lazily inside the stream (via runner_factory) so that
    construction failures are reported as SSE error events rather than raised
    after the response headers are sent. Each progress update is emitted as a
    JSON "data:" event, and the stream always ends with a literal
    "data: complete" event — even after an error.
    """

    async def event_generator():
        # Track the last progress seen so error events can carry prior counts.
        latest_progress = DocSkillProgress()

        try:
            runner = await runner_factory()
            async for progress in runner.run():
                # Copy so the error paths below can't mutate the runner's object.
                latest_progress = progress.model_copy()
                data = _serialize_progress(progress)
                yield f"data: {json.dumps(data, ensure_ascii=False)}\n\n"
        except asyncio.TimeoutError:
            # Raised when the runner times out waiting for its lock (e.g. a
            # concurrent pipeline run); surfaced to the client as an error log.
            logger.info("Doc skill workflow runner timed out waiting for lock")
            latest_progress.logs = [
                LogMessage(
                    level="error",
                    message="Timed out after waiting for the lock to be acquired. This may be due to a concurrent pipeline running. You may retry in a few minutes.",
                )
            ]
            data = _serialize_progress(latest_progress)
            yield f"data: {json.dumps(data, ensure_ascii=False)}\n\n"
        except Exception as e:
            # Catch-all boundary: log with traceback and report on the stream,
            # since an HTTP error can no longer be returned mid-stream.
            logger.error(
                f"Unexpected server error running doc skill workflow: {e}",
                exc_info=True,
            )
            latest_progress.logs = [
                LogMessage(
                    level="error",
                    message=f"Unexpected server error: {e}",
                )
            ]
            data = _serialize_progress(latest_progress)
            yield f"data: {json.dumps(data, ensure_ascii=False)}\n\n"

        # Sentinel event the frontend uses to know the run has finished.
        yield "data: complete\n\n"

    return StreamingResponse(
        content=event_generator(),
        media_type="text/event-stream",
    )


# Annotated path-parameter aliases shared by the routes below; the
# descriptions surface in the generated OpenAPI schema.
ProjectId = Annotated[str, Path(description="The unique identifier of the project.")]
DocSkillId = Annotated[str, Path(description="The unique identifier of the doc skill.")]
SkillId = Annotated[str, Path(description="The unique identifier of the skill.")]


def connect_doc_skill_api(app: FastAPI):
    """Register doc skill CRUD, SSE run, batch progress, and source-lookup routes.

    Handler descriptions are kept as comments (not docstrings) where adding a
    docstring would change the served OpenAPI description.
    """

    # Create a new doc skill and persist it under the project.
    @app.post(
        "/api/projects/{project_id}/doc_skills",
        tags=["Doc Skills"],
        summary="Create Doc Skill",
        openapi_extra=DENY_AGENT,
    )
    async def create_doc_skill(
        project_id: ProjectId, request: CreateDocSkillRequest
    ) -> DocumentSkill:
        project = project_from_id(project_id)

        doc_skill = DocumentSkill(
            name=request.name,
            skill_name=request.skill_name,
            skill_content_header=request.skill_content_header,
            description=request.description,
            extractor_config_id=request.extractor_config_id,
            chunker_config_id=request.chunker_config_id,
            document_tags=request.document_tags,
            strip_file_extensions=request.strip_file_extensions,
        )
        # Parent must be set before saving so the file lands under the project.
        doc_skill.parent = project
        doc_skill.save_to_file()
        return doc_skill

    # List all doc skills in the project (archived included).
    @app.get(
        "/api/projects/{project_id}/doc_skills",
        tags=["Doc Skills"],
        summary="List Doc Skills",
        openapi_extra=ALLOW_AGENT,
    )
    async def list_doc_skills(project_id: ProjectId) -> list[DocumentSkill]:
        project = project_from_id(project_id)
        doc_skills = project.document_skills(readonly=True)
        return doc_skills

    # Fetch a single doc skill by ID; 404 when missing.
    @app.get(
        "/api/projects/{project_id}/doc_skills/{doc_skill_id}",
        tags=["Doc Skills"],
        summary="Get Doc Skill",
        openapi_extra=ALLOW_AGENT,
    )
    async def get_doc_skill(
        project_id: ProjectId, doc_skill_id: DocSkillId
    ) -> DocumentSkill:
        project = project_from_id(project_id)
        doc_skill = _get_doc_skill(project, doc_skill_id)
        return doc_skill

    # Archive/unarchive a doc skill, mirroring the state onto its generated skill.
    @app.patch(
        "/api/projects/{project_id}/doc_skills/{doc_skill_id}",
        tags=["Doc Skills"],
        summary="Update Doc Skill",
        openapi_extra=DENY_AGENT,
    )
    async def update_doc_skill(
        project_id: ProjectId, doc_skill_id: DocSkillId, request: UpdateDocSkillRequest
    ) -> DocumentSkill:
        project = project_from_id(project_id)
        doc_skill = _get_doc_skill(project, doc_skill_id)

        doc_skill.is_archived = request.is_archived
        doc_skill.save_to_file()

        # Keep the generated skill's archived state in sync, when one exists.
        if doc_skill.skill_id:
            skill = Skill.from_id_and_parent_path(doc_skill.skill_id, project.path)
            if skill:
                skill.is_archived = request.is_archived
                skill.save_to_file()

        return doc_skill

    @app.get(
        "/api/projects/{project_id}/doc_skills/{doc_skill_id}/run",
        tags=["Doc Skills"],
        summary="Run Doc Skill Pipeline",
        openapi_extra=DENY_AGENT,
    )
    async def run_doc_skill(
        project_id: ProjectId, doc_skill_id: DocSkillId
    ) -> StreamingResponse:
        """Triggers the extraction → chunking → skill creation pipeline via SSE. Uses GET for EventSource compatibility."""
        project = project_from_id(project_id)
        doc_skill = _get_doc_skill(project, doc_skill_id)

        # Guard: archived or already-built doc skills can't be (re)run.
        if doc_skill.is_archived:
            raise HTTPException(
                status_code=422, detail="Cannot run an archived doc skill."
            )
        if doc_skill.skill_id is not None:
            raise HTTPException(
                status_code=422, detail="This doc skill has already been built."
            )

        # Deferred construction: config-resolution errors are reported on the
        # SSE stream rather than failing the request after headers are sent.
        async def runner_factory():
            return await _build_workflow_runner(project, doc_skill)

        return await run_doc_skill_workflow_with_status(runner_factory)

    @app.post(
        "/api/projects/{project_id}/doc_skills/progress",
        tags=["Doc Skills"],
        summary="Get Doc Skill Progress",
        openapi_extra=ALLOW_AGENT,
    )
    async def get_doc_skill_progress(
        project_id: ProjectId,
        request: DocSkillProgressRequest,
    ) -> dict[str, DocSkillProgress]:
        """Batch endpoint: returns progress for specified doc skill IDs, or all in project."""
        project = project_from_id(project_id)
        doc_skills: list[DocumentSkill] = []

        if request.doc_skill_ids is not None:
            # Unknown IDs are skipped; if none resolve, return an empty map.
            for doc_skill_id in request.doc_skill_ids:
                doc_skill = DocumentSkill.from_id_and_parent_path(
                    doc_skill_id, project.path
                )
                if doc_skill is None:
                    continue
                doc_skills.append(doc_skill)
            if not doc_skills:
                return {}
        else:
            doc_skills = project.document_skills(readonly=True)

        return {
            ds.id: compute_doc_skill_progress(project, ds)
            for ds in doc_skills
            if ds.id is not None
        }

    # Reverse lookup: which doc skill (if any) generated a given skill.
    @app.get(
        "/api/projects/{project_id}/skills/{skill_id}/doc_skill_source",
        tags=["Doc Skills"],
        summary="Get Doc Skill Source for Skill",
        openapi_extra=ALLOW_AGENT,
    )
    async def get_doc_skill_source(
        project_id: ProjectId, skill_id: SkillId
    ) -> DocSkillSourceResponse:
        project = project_from_id(project_id)
        # Linear scan over the project's doc skills; returns None fields on no match.
        for ds in project.document_skills(readonly=True):
            if ds.skill_id == skill_id:
                return DocSkillSourceResponse(
                    doc_skill_id=ds.id, doc_skill_name=ds.name
                )
        return DocSkillSourceResponse(doc_skill_id=None, doc_skill_name=None)
Loading
Loading