-
-
') center/contain no-repeat",
- mask: "url('data:image/svg+xml;utf8, ') center/contain no-repeat",
- }}/>
-
-
-
Perspicacité
-
v2 · POC · eDIAM
-
-
-
- {/* New chat */}
-
-
- New chat
-
-
-
- {/* Quick switcher (cmd+K) */}
-
-
-
- Quick switcher…
- K
-
-
-
- {/* Search chats */}
-
-
-
-
-
-
-
- {/* Convo lists */}
-
- Yesterday
- {CONVOS.yesterday.map(c => (
- { setView("chat"); setActiveConvo(c.id); }}/>
- ))}
- Earlier this week
- {CONVOS.earlier.map(c => (
- { setView("chat"); setActiveConvo(c.id); }}/>
- ))}
-
- See all 125 conversations
-
-
-
- {/* Bottom nav */}
-
- } label="Knowledge bases" badge="12"
- active={view==="knowledge"} onClick={() => setView("knowledge")}/>
- } label="Literature survey" badge="3"
- active={view==="survey"} onClick={() => setView("survey")}/>
-
-
- {/* Theme + settings */}
-
- setTheme(theme==="light" ? "dark" : "light")}
- className="flex-1 flex items-center gap-2 px-3 py-2 rounded-xl border border-line bg-card text-[13px] hover:bg-paper-2 transition">
- {theme==="light" ? : }
- {theme==="light" ? "Light" : "Dark"}
-
-
-
-
-
-
- {/* Institutional footer */}
-
-
N
-
-
ICN UMR 7272 · 3iA Côte d'Azur
-
CNRS · Université Côte d'Azur
-
-
-
- );
-}
-
-function SectionLabel({ children, className = "" }) {
- return
{children}
;
-}
-
-function ConvoItem({ c, active, onClick }) {
- return (
-
- {c.title}
- {c.mode}
-
- );
-}
-
-function NavItem({ icon, label, badge, active, onClick }) {
- return (
-
- {icon}
- {label}
- {badge && {badge} }
-
- );
-}
-
-// ---------- Halo (signature device) ----------
-function Halo({ size = 720, x = "55%", y = "-22%", breathing = false, opacity }) {
- return (
-
- );
-}
-
-// ---------- Home view (empty-state) ----------
-function HomeView({ mode, onPick }) {
- const m = MODES.find(x => x.id === mode) ?? MODES[0];
- return (
-
-
-
-
Perspicacité · eDIAM
-
- Ask the literature.
-
-
- {m.label} mode · {m.sub}
-
-
-
- {SUGGESTED.map((s, i) => (
-
onPick(s.q, s.mode)}
- className="group text-left rounded-xl border border-line bg-card hover:border-line-strong hover:bg-paper-2 transition px-4 py-3.5 relative">
- {s.q}
-
- {MODES.find(x=>x.id===s.mode)?.label}
-
-
-
- ))}
-
-
-
- );
-}
-
-// ---------- Chat view ----------
-function ChatView({ conversation, thinkingOpen, setThinkingOpen, running, onAsk }) {
- const scrollRef = useRef(null);
- useEffect(() => {
- if (!scrollRef.current) return;
- scrollRef.current.scrollTop = scrollRef.current.scrollHeight;
- }, [conversation]);
-
- return (
-
- {/* halo */}
-
-
-
-
-
- {conversation.map((m, i) =>
- m.role === "user"
- ?
- :
- )}
-
-
- );
-}
-
-function UserMessage({ text }) {
- return (
-
- );
-}
-
-function AssistantMessage({ msg, thinkingOpen, setThinkingOpen, running }) {
- return (
-
- {/* mode tag */}
-
-
- {running && Working… }
- {!running && msg.done && {(msg.tokens?.elapsed ?? 18.4).toFixed(1)}s · ↑{msg.tokens?.up ?? 0} ↓{msg.tokens?.down ?? 0} }
-
-
- {/* Thinking trail */}
- {msg.thinking?.length > 0 && (
-
-
setThinkingOpen(o => !o)}
- className="w-full flex items-center gap-3 px-4 py-2.5 text-[12.5px] mute hover:bg-paper-2/60 transition">
-
- Thinking trail
-
- {msg.thinking.length} step{msg.thinking.length===1?"":"s"}{running ? " · live" : ""}
-
-
-
- {thinkingOpen && (
-
- {msg.thinking.map((s, i) => (
-
- {s.t}
-
- {s.k}
-
- {s.text}
- {s.meta && {s.meta} }
-
- ))}
- {running && (
-
- ∙∙∙
- …
-
- )}
-
- )}
-
- )}
-
- {/* Body */}
-
- {msg.body.map((b, i) => {
- if (b.kind === "p") return
{renderInline(b.text, msg.sources)}
;
- if (b.kind === "list") return (
-
- );
- return null;
- })}
- {running && msg.body.length > 0 &&
}
-
-
- {/* Sources */}
- {msg.sources?.length > 0 && (
- <>
-
-
- {msg.sources.map(s => )}
-
- >
- )}
-
- {/* Action row */}
- {msg.done && (
-
-
} label="Copy"/>
- } label="Export"/>
- }label="Pin"/>
-
- Follow-up suggestions
-
- )}
-
- );
-}
-
-function ActionBtn({ icon, label }) {
- return (
-
- {icon}{label}
-
- );
-}
-
-function stepStyle(k) {
- // outlined colored chips
- const map = {
- plan: { color: "var(--cnrs-blue)", bg: "color-mix(in srgb, var(--cnrs-blue) 12%, transparent)" },
- search: { color: "#5d6a78", bg: "color-mix(in srgb, #5d6a78 12%, transparent)" },
- fetch: { color: "var(--cnrs-coral)", bg: "color-mix(in srgb, var(--cnrs-coral) 14%, transparent)" },
- reflect: { color: "var(--cnrs-sage)", bg: "color-mix(in srgb, var(--cnrs-sage) 14%, transparent)" },
- compose: { color: "#a07a14", bg: "color-mix(in srgb, #ffeb6e 50%, transparent)" },
- };
- const s = map[k] ?? map.plan;
- return "";
-}
-
-// Render inline text and turn [N] markers into superscript citation pills
-function renderInline(text, sources) {
- if (!sources) return text;
- const parts = [];
- let last = 0;
- const re = /\[(\d+)\]/g;
- let m;
- while ((m = re.exec(text)) !== null) {
- if (m.index > last) parts.push(text.slice(last, m.index));
- const n = parseInt(m[1], 10);
- const src = sources.find(s => s.n === n);
- parts.push(
-
e.preventDefault()}
- className="cite-pill inline-flex items-center justify-center -translate-y-[2px] mx-[1px]"
- style={{
- background: src?.color ?? "var(--cnrs-blue)",
- color: "white", fontFamily: "var(--font-mono)",
- fontSize: 10, fontWeight: 600, width: 16, height: 16,
- borderRadius: 5, lineHeight: 1, letterSpacing: 0,
- }}
- title={src?.title}>
- {n}
-
- );
- last = m.index + m[0].length;
- }
- if (last < text.length) parts.push(text.slice(last));
- return parts;
-}
-
-function SourceCard({ s }) {
- return (
-
e.preventDefault()}
- className="group rounded-xl border border-line bg-card hover:bg-paper-2 hover:border-line-strong transition px-3.5 py-3 flex items-start gap-3">
- {s.letter}
-
-
{s.title}
-
- {s.venue}
- ·
- {s.year}
-
-
- [{s.n}]
-
- );
-}
-
-function ModeBadge({ mode }) {
- const m = MODES.find(x => x.id === mode) ?? MODES[0];
- return (
-
- {m.label}
-
- );
-}
-
-// ---------- Knowledge bases view ----------
-function KnowledgeView() {
- return (
-
-
-
-
-
-
Knowledge bases
-
Your literature corpora.
-
Curate DOIs and BibTeX into searchable, embedded knowledge bases.
-
-
- New KB
-
-
-
-
-
-
-
-
-
{KNOWLEDGE_BASES.length} total
-
-
Sort
-
Recent
-
-
-
- {KNOWLEDGE_BASES.map(k => (
-
-
- {k.title}
- {k.desc}
-
-
-
-
-
- Created {k.created}
-
- ))}
-
-
-
- );
-}
-
-function Stat({ label, value }) {
- return (
-
- );
-}
-
-// ---------- Literature survey view ----------
-function SurveyView() {
- const cols = [
- { id: "claim", label: "Claim", w: "26%" },
- { id: "ms2mol", label: "MS2Mol-XL", w: "18%" },
- { id: "specbert", label: "SpecBERT-2", w: "18%" },
- { id: "metamsflow",label: "MetaMS-Flow",w: "18%" },
- { id: "halo1", label: "Halo-1", w: "20%" },
- ];
- const rows = [
- ["Top-1 structure recovery on CASMI-22", "0.58 [1]", "0.51 [2]", "0.61 [3] ⚠︎", "—"],
- ["Pretraining objective", "Contrastive (MS²↔SMILES) [1]", "MLM on spectra [2]", "Diffusion [3]", "Domain-pretrained [4]"],
- ["Parameter count", "1.4B [1]", "340M [2]", "≈ 700M [3]", "220M [4]"],
- ["License", "Apache-2.0 [1]", "Non-commercial [2]", "Apache-2.0 [3]", "Internal [4]"],
- ["Fine-tunable in <1% labelled regime", "Reported [1]", "Yes [2]", "Unclear", "Yes [4]"],
- ];
-
- return (
-
-
-
-
-
-
Literature survey
-
MS foundation models — comparison.
-
Side-by-side claims across the four 2025–2026 candidate systems. Cells link back to the supporting passage.
-
-
- Export
- Add claim
-
-
-
-
-
-
-
- {cols.map(c => (
- {c.label}
- ))}
-
-
-
- {rows.map((r, i) => (
-
- {r.map((cell, j) => (
- {cell}
- ))}
-
- ))}
-
-
-
-
-
-
-
- );
-}
-
-// ---------- Composer ----------
-function Composer({ mode, setMode, input, setInput, dbCount, dbTotal, openDbs, send, running, stop, elapsed }) {
- const taRef = useRef(null);
- // auto-grow
- useEffect(() => {
- if (!taRef.current) return;
- taRef.current.style.height = "auto";
- taRef.current.style.height = Math.min(220, taRef.current.scrollHeight) + "px";
- }, [input]);
-
- const onKey = (e) => {
- if (e.key === "Enter" && !e.shiftKey) { e.preventDefault(); send(); }
- };
-
- return (
-
- {/* Status bar (Perplexity-style) */}
- {(running) && (
-
-
-
-
- Sending query…
-
- ·
- {elapsed.toFixed(1)}s
- ·
- 14 ↑ 0 ↓
- ·
- {mode}
- ·
- deepseek-v4-flash
-
- Stop
-
-
-
- )}
-
-
-
-
- Perspicacité may make mistakes. Verify against the linked sources.
-
-
- );
-}
-
-// ---------- Database picker (popover) ----------
-function DbPicker({ dbs, setDbs, onClose }) {
- const [q, setQ] = useState("");
- const filtered = DATABASES.filter(d => d.label.toLowerCase().includes(q.toLowerCase()));
- const kbItems = filtered.filter(d => d.kind === "kb");
- const webItems = filtered.filter(d => d.kind === "web");
-
- const toggle = (id) => setDbs(prev => ({ ...prev, [id]: !prev[id] }));
- const allOn = (kind, on) => setDbs(prev => {
- const next = { ...prev };
- DATABASES.forEach(d => { if (d.kind === kind) next[d.id] = on; });
- return next;
- });
-
- return (
-
-
-
e.stopPropagation()}
- className="absolute left-1/2 -translate-x-1/2 bottom-32 w-[520px] rounded-2xl border border-line-strong bg-card shadow-2xl overflow-hidden">
-
-
- setQ(e.target.value)}
- placeholder="Search databases…"
- className="flex-1 bg-transparent outline-none text-[14px] ink placeholder:opacity-50"/>
- Esc
-
-
- allOn("kb", true)} onNone={() => allOn("kb", false)}>
- {kbItems.map(d => (
- toggle(d.id)}/>
- ))}
-
- allOn("web", true)} onNone={() => allOn("web", false)}>
- {webItems.map(d => (
- toggle(d.id)}/>
- ))}
-
-
-
- {Object.values(dbs).filter(Boolean).length} selected
- Done
-
-
-
- );
-}
-
-function DbGroup({ title, count, onAll, onNone, children }) {
- return (
-
-
-
{title}
-
({count})
-
-
Select all
-
·
-
None
-
- {children}
-
- );
-}
-
-function DbRow({ d, on, onToggle }) {
- return (
-
-
- {on && }
-
- {d.label}
-
- {d.kind === "kb"
- ? <>{d.papers} papers · {d.chunks} chunks>
- : <>{d.papers}>
- }
-
-
- );
-}
-
-// ---------- Command palette ----------
-function CommandPalette({ onClose, onNav, onAsk, onNewChat, setMode }) {
- const [q, setQ] = useState("");
- const all = useMemo(() => ([
- { kind: "action", id: "new", label: "Start new chat", hint: "↵", run: onNewChat, key: "new chat" },
- { kind: "action", id: "kb", label: "Go to Knowledge bases", hint: "G K", run: () => onNav("knowledge"), key: "knowledge bases" },
- { kind: "action", id: "sv", label: "Go to Literature survey", hint: "G S", run: () => onNav("survey"), key: "literature survey" },
- ...MODES.map(m => ({ kind: "mode", id: "mode-"+m.id, label: "Mode: " + m.label, hint: m.id.slice(0,3).toUpperCase(), run: () => setMode(m.id), key: m.label })),
- ...CONVOS.yesterday.concat(CONVOS.earlier).map(c => ({ kind: "convo", id: c.id, label: c.title, hint: c.mode, run: () => onNav("chat"), key: c.title })),
- ...SUGGESTED.map((s, i) => ({ kind: "ask", id: "ask-"+i, label: "Ask: " + s.q, hint: MODES.find(m=>m.id===s.mode)?.label, run: () => onAsk(s.q, s.mode), key: s.q })),
- ]), []);
- const filtered = q.trim()
- ? all.filter(x => x.key.toLowerCase().includes(q.toLowerCase()))
- : all.slice(0, 12);
- const [sel, setSel] = useState(0);
-
- const onKey = (e) => {
- if (e.key === "ArrowDown") { e.preventDefault(); setSel(s => Math.min(filtered.length - 1, s + 1)); }
- else if (e.key === "ArrowUp") { e.preventDefault(); setSel(s => Math.max(0, s - 1)); }
- else if (e.key === "Enter") { e.preventDefault(); filtered[sel]?.run(); }
- };
-
- useEffect(() => { setSel(0); }, [q]);
-
- return (
-
-
-
e.stopPropagation()}
- className="absolute left-1/2 top-[16%] -translate-x-1/2 w-[640px] rounded-2xl border border-line-strong bg-card shadow-2xl overflow-hidden">
-
-
- setQ(e.target.value)} onKeyDown={onKey}
- placeholder="Search commands, chats, ask the literature…"
- className="flex-1 bg-transparent outline-none text-[15px] ink placeholder:opacity-50"/>
- Esc
-
-
- {filtered.length === 0 && (
-
- No matches. Press Enter to ask: "{q}"
-
- )}
- {filtered.map((x, i) => (
-
setSel(i)}
- className={"w-full flex items-center gap-3 px-4 py-2.5 text-left text-[13.5px] " +
- (i === sel ? "bg-cnrs-yellow/30" : "")}>
-
- {x.kind}
-
- {x.label}
- {x.hint}
-
- ))}
-
-
- ↑ ↓ navigate
- ↵ run
- Esc close
-
- Perspicacité · v2 POC
-
-
-
- );
-}
-
-// ---------- mount ----------
-ReactDOM.createRoot(document.getElementById("app")).render(
);
diff --git a/pyproject.toml b/pyproject.toml
index 5bc02e0b..006f9d08 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,9 +2,31 @@
name = "perspicacite"
version = "2.0.0"
description = "AI-powered scientific literature research assistant"
+readme = { file = "README.md", content-type = "text/markdown" }
license = { file = "LICENSE" }
+authors = [
+ { name = "Lucas Pradi" },
+ { name = "Louis-Félix Nothias" },
+ { name = "HolobiomicsLab" },
+]
+keywords = [
+ "rag",
+ "literature-review",
+ "knowledge-base",
+ "mcp",
+ "scientific-research",
+ "retrieval-augmented-generation",
+]
classifiers = [
+ "Development Status :: 4 - Beta",
"License :: OSI Approved :: Apache Software License",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.12",
+ "Operating System :: OS Independent",
+ "Environment :: Web Environment",
+ "Intended Audience :: Science/Research",
+ "Topic :: Scientific/Engineering",
+ "Topic :: Text Processing :: Indexing",
]
requires-python = ">=3.12"
dependencies = [
@@ -38,6 +60,13 @@ dependencies = [
"pathspec>=1.1.1",
]
+[project.urls]
+Homepage = "https://github.com/HolobiomicsLab/Perspicacite-AI"
+Repository = "https://github.com/HolobiomicsLab/Perspicacite-AI"
+Issues = "https://github.com/HolobiomicsLab/Perspicacite-AI/issues"
+Changelog = "https://github.com/HolobiomicsLab/Perspicacite-AI/blob/main/CHANGELOG.md"
+Paper = "https://ceur-ws.org/Vol-4085/paper77.pdf"
+
[project.optional-dependencies]
dev = [
"pytest>=8.0.0",
@@ -205,17 +234,122 @@ filterwarnings = [
]
[tool.ruff]
-target-version = "py311"
+target-version = "py312" # matches requires-python = ">=3.12"
line-length = 100
[tool.ruff.lint]
-select = ["E", "F", "I", "N", "W", "UP", "B", "SIM", "TCH", "RUF"]
+select = ["E", "F", "I", "N", "W", "UP", "B", "SIM", "TC", "RUF"]
+ignore = [
+ # --- Owned by `ruff format` (see line-length above) -------------------
+ # We don't also gate the lint check on layout/whitespace rules that the
+ # formatter is the single source of truth for. The pre-commit ruff-format
+ # hook cleans these on every touched file.
+ "E501", # line-too-long
+ "W291", # trailing whitespace
+ "W293", # blank line with whitespace
+ "E701", # multiple statements on one line (colon)
+ "E702", # multiple statements on one line (semicolon)
+ # --- Domain false-positives -------------------------------------------
+ # Ambiguous-unicode: this is a scientific + French codebase. µ, °, ×, –,
+ # α/β and accented French text are intentional notation, not typos.
+ "RUF001", # ambiguous char in a string
+ "RUF002", # ambiguous char in a docstring
+ "RUF003", # ambiguous char in a comment
+ # Pydantic and FastAPI resolve type annotations at *runtime*, so moving
+ # imports into `if TYPE_CHECKING:` blocks (what these rules want) breaks
+ # model/route annotation resolution. Keep the imports at module scope.
+ "TC001", # typing-only first-party import
+ "TC002", # typing-only third-party import
+ "TC003", # typing-only standard-library import
+ # Scientific variable names (T, pH, Kd, dG, ...) are domain-conventional.
+ "N806", # non-lowercase variable in function
+ "N814", # camelcase imported as constant (e.g. `ElementTree as ET`)
+ # Nested `with` blocks are acceptable and often clearer than a single
+ # parenthesized context manager; not worth mechanical churn to combine.
+ "SIM117", # multiple-with-statements
+ # Likewise, a nested `if A: if B:` is often clearer than collapsing into a
+ # single 3-4 term boolean chain (the remaining sites read better nested).
+ "SIM102", # collapsible-if
+ # Module-level imports placed after code are intentional here: a logging
+ # bootstrap (web/app.py, logging.py must configure structlog before
+ # importing anything that binds a logger at import time) and a
+ # pure-functions-before-heavy-imports layout in the cache modules. None
+ # are bugs; mechanically hoisting them risks the bootstrap ordering.
+ "E402", # module-import-not-at-top-of-file
+ # `(str, Enum)` is deliberate: these enums are serialised to JSON and
+ # compared as strings across the API/MCP boundary. Switching to `StrEnum`
+ # (what UP042 wants) changes `str()`/format/JSON semantics and would break
+ # wire compatibility. Keep the explicit mixin.
+ "UP042", # replace-str-enum
+]
+
+[tool.ruff.lint.per-file-ignores]
+# FastAPI puts callables (File/Query/Form/Depends) in argument defaults by
+# design, so B008 (function-call-in-default) is a false positive for handlers.
+"src/perspicacite/web/routers/*" = ["B008"]
+# Test code has different conventions than library code. These rules are
+# idiomatic noise in tests, not signal:
+# N802 - descriptive capitalised test names (test_..._AU, ..._HEAD encode
+# the field/SHA under test on purpose).
+# N818 - fake exception classes (FakeAuth, _AuthFail) deliberately omit the
+# "Error" suffix to read as stubs, not real error types.
+# E741 - `for l in lines:` when parsing fixture text line-by-line.
+# B017 - `pytest.raises(Exception)` is sometimes the point of the assertion.
+# RUF012- mutable class attrs on mock/recorder fixture classes are harmless.
+# SIM115- short-lived `open()` in a test body doesn't need a context manager.
+"tests/**" = ["N802", "N818", "E741", "B017", "RUF012", "SIM115"]
[tool.mypy]
-python_version = "3.11"
-strict = true
-warn_return_any = true
+python_version = "3.12"
+# Pragmatic-but-enforced (was blanket `strict = true`). We keep mypy's
+# *semantic* bug-catchers — the checks that find real defects — and check the
+# bodies of unannotated functions, but we deliberately do NOT gate on
+# annotation-completeness rules (every def fully typed, every generic
+# parameterised, every return narrowed). In a large research codebase those
+# produce churn without proportional safety. The bug-catchers stay on by
+# mypy default; the five noisy completeness flags below are explicitly off.
+check_untyped_defs = true # check bodies of unannotated fns (where many real bugs hide)
warn_unused_configs = true
+warn_redundant_casts = true
+warn_unused_ignores = true # keep `# type: ignore` comments honest
+strict_equality = true # flag non-overlapping-type comparisons
+# Off — annotation-completeness noise, not bug signal:
+disallow_untyped_defs = false # was -> no-untyped-def
+disallow_incomplete_defs = false # was -> no-untyped-def
+disallow_untyped_calls = false # was -> no-untyped-call
+disallow_any_generics = false # was -> type-arg
+warn_return_any = false # was -> no-any-return
+
+[[tool.mypy.overrides]]
+# Untyped / stubless third-party deps we don't control. Pulling in stub
+# packages (types-pandas, types-lxml, types-PyYAML, ...) for each is out of
+# scope for this hardening pass; suppress missing-import noise instead.
+module = [
+ "bibtexparser.*",
+ "bm25s.*",
+ "browser_cookie3.*",
+ "fitz.*",
+ "indicium.*",
+ "indicium_adapters.*",
+ "lxml.*",
+ "pandas.*",
+ "pyoxigraph.*",
+ "pytesseract.*",
+ "rank_bm25.*",
+ "rdflib.*",
+ "scilex.*",
+ "yaml.*",
+]
+ignore_missing_imports = true
+
+[[tool.mypy.overrides]]
+# indicium_layer.store types its SPARQL result loop against rdflib, which only
+# ships with the `indicia` extra. CI's lint env installs base deps only, so
+# rdflib resolves to Any there and a `# type: ignore[union-attr]` that is
+# genuinely *needed* in a full local/indicia checkout reads as *unused* in CI.
+# The divergence is expected — don't gate on unused-ignore for this module.
+module = ["perspicacite.indicium_layer.store"]
+warn_unused_ignores = false
[tool.coverage.run]
source = ["src/perspicacite"]
diff --git a/scripts/check_install.py b/scripts/check_install.py
index 8dc44880..4cc74c1b 100644
--- a/scripts/check_install.py
+++ b/scripts/check_install.py
@@ -55,7 +55,7 @@ def _check_venv() -> tuple[str, str]:
def _check_config() -> tuple[str, str]:
config = REPO_ROOT / "config.yml"
if not config.exists():
- return "FAIL", "config.yml missing — run: cp config.example.yml config.yml"
+ return "FAIL", "config.yml missing — run: cp config/example.yml config.yml"
return "OK", "config.yml present"
diff --git a/src/perspicacite/cli.py b/src/perspicacite/cli.py
index b3483c3f..cbf054b5 100644
--- a/src/perspicacite/cli.py
+++ b/src/perspicacite/cli.py
@@ -1,6 +1,7 @@
"""Command-line interface for Perspicacité v2."""
import asyncio
+import contextlib
import sys
from datetime import datetime
from pathlib import Path
@@ -9,6 +10,15 @@
import click
from perspicacite import __version__
+from perspicacite.cli_helpers import (
+ _add_bibtex_to_existing_kb,
+ _build_app_state_for_cli,
+ _create_kb_from_bibtex,
+ _print_github_repo_summary,
+ _print_skill_bundle_summary,
+ _run_query,
+ _start_mcp_and_web,
+)
from perspicacite.config import load_config
from perspicacite.logging import get_logger, setup_logging
from perspicacite.pipeline.asb.run_ingest import ingest_asb_run as ingest_asb_run_pipeline
@@ -52,6 +62,7 @@ def cli(ctx: click.Context, config: Path | None, verbose: bool) -> None:
try:
cfg = load_config(str(config) if config else None)
except Exception as e:
+ logger.debug("config load failed", error=str(e))
click.echo(f"Error loading config: {e}", err=True)
sys.exit(1)
@@ -220,6 +231,7 @@ def create_kb(
async def _create_empty() -> dict[str, Any]:
state = AppState()
await state.initialize()
+ assert state.session_store is not None
existing = await state.session_store.get_kb_metadata(name)
if existing is not None:
raise FileExistsError(f"KB '{name}' already exists")
@@ -390,6 +402,7 @@ def list_kb(ctx: click.Context, as_json: bool) -> None:
async def _run() -> None:
state = AppState()
await state.initialize()
+ assert state.session_store is not None
kbs = await state.session_store.list_kbs()
if as_json:
import json as _json
@@ -527,173 +540,6 @@ async def fail(self, jid, err):
asyncio.run(_run())
-def _start_mcp_and_web(config, app) -> None:
- """Start MCP server and web server on a single port."""
- import asyncio
- from contextlib import asynccontextmanager
-
- import uvicorn
-
- # Initialize MCP state
- from perspicacite.mcp.server import mcp, mcp_state
-
- asyncio.run(mcp_state.initialize(config))
-
- # Get MCP ASGI app
- mcp_app = mcp.http_app()
-
- # Combine web app + MCP app lifespans
- original_lifespan = app.router.lifespan_context
-
- @asynccontextmanager
- async def combined_lifespan(app_instance):
- async with original_lifespan(app_instance), mcp_app.lifespan(app_instance):
- yield
-
- app.router.lifespan_context = combined_lifespan
-
- # Mount MCP ASGI app — its internal routes are at /mcp
- app.mount("/", mcp_app)
-
- # Run single server
- uvicorn.run(
- app,
- host=config.server.host,
- port=config.server.port,
- reload=config.server.reload,
- )
-
-
-async def _create_kb_from_bibtex(
- config: Any,
- *,
- kb_name: str,
- bib_path: Path,
- description: str | None,
- session_db: Path,
- chroma_dir: Path,
-) -> dict[str, Any]:
- from perspicacite.pipeline.bibtex_kb import create_kb_from_bibtex
-
- return await create_kb_from_bibtex(
- config,
- kb_name=kb_name,
- bib_path=bib_path,
- description=description,
- session_db=session_db,
- chroma_dir=chroma_dir,
- )
-
-
-async def _add_bibtex_to_existing_kb(
- config: Any,
- *,
- kb_name: str,
- bib_path: Path,
- session_db: Path,
- chroma_dir: Path,
-) -> dict[str, Any]:
- from perspicacite.pipeline.bibtex_kb import add_bibtex_to_existing_kb
-
- return await add_bibtex_to_existing_kb(
- config,
- kb_name=kb_name,
- bib_path=bib_path,
- session_db=session_db,
- chroma_dir=chroma_dir,
- )
-
-
-async def _run_query(
- config: Any,
- query: str,
- kb: str,
- mode: str,
- provider: str,
- model: str | None,
-) -> None:
- """Run a RAG query and print the answer + sources to stdout."""
- from perspicacite.models.rag import RAGMode, RAGRequest
- from perspicacite.web.state import AppState
-
- state = AppState()
- await state.initialize()
-
- # Verify the KB exists so we fail fast with a clear message instead
- # of letting the RAG engine spit a chroma error.
- if await state.session_store.get_kb_metadata(kb) is None:
- click.echo(f"\nError: KB '{kb}' not found. List with: perspicacite list-kb", err=True)
- sys.exit(1)
-
- mode_map = {
- "basic": RAGMode.BASIC,
- "advanced": RAGMode.ADVANCED,
- "deep_research": RAGMode.DEEP_RESEARCH,
- "profound": RAGMode.PROFOUND, # backward-compat alias
- "contradiction": RAGMode.CONTRADICTION,
- }
- rag_mode = mode_map.get(mode, RAGMode.BASIC)
-
- # Effective model/provider: explicit flag → config default → dataclass default.
- eff_provider = provider or config.llm.default_provider
- eff_model = model or config.llm.default_model
-
- request = RAGRequest(
- query=query,
- kb_name=kb,
- mode=rag_mode,
- stream=False,
- provider=eff_provider,
- model=eff_model,
- )
-
- # Use the same RAGEngine the web/MCP layers use.
- full_answer_parts: list[str] = []
- sources: list[dict[str, Any]] = []
- try:
- async for event in state.rag_engine.query_stream(request):
- etype = getattr(event, "event", None)
- data = getattr(event, "data", None)
- if etype == "content" and data:
- # data is a JSON envelope { "delta": "..." }
- try:
- import json as _json
- delta = _json.loads(data).get("delta", "")
- except Exception:
- delta = str(data)
- if delta:
- full_answer_parts.append(delta)
- elif etype == "source" and data:
- try:
- import json as _json
- s = _json.loads(data)
- sources.append(s)
- except Exception:
- pass
- elif etype == "error" and data:
- click.echo(f"\n❌ Error from RAG engine: {data}", err=True)
- sys.exit(1)
- except Exception as exc:
- click.echo(f"\n❌ Query failed: {exc}", err=True)
- sys.exit(1)
-
- answer = "".join(full_answer_parts).strip()
- click.echo("\n📝 Answer:")
- if not answer:
- click.echo(" (no answer — KB might be empty for this query)")
- else:
- click.echo(answer)
- if sources:
- click.echo("\n📎 Sources:")
- for i, s in enumerate(sources, 1):
- title = s.get("title") or s.get("doi") or "(untitled)"
- year = s.get("year")
- doi = s.get("doi")
- tag = f" ({year})" if year else ""
- doi_tag = f" doi:{doi}" if doi else ""
- click.echo(f" [{i}] {title}{tag}{doi_tag}")
-
-
@cli.command(name="screen-papers")
@click.option(
"--input",
@@ -879,6 +725,7 @@ def build_capsule_cmd(ctx, paper_id: str, kb: str, force: bool) -> None:
async def _run() -> None:
state = AppState()
await state.initialize()
+ assert state.session_store is not None
kb_meta = await state.session_store.get_kb_metadata(kb)
if kb_meta is None:
click.echo(f"Error: KB '{kb}' not found", err=True)
@@ -919,6 +766,7 @@ def build_capsules_cmd(ctx, kb_name: str, force: bool) -> None:
async def _run() -> None:
state = AppState()
await state.initialize()
+ assert state.session_store is not None
kb_meta = await state.session_store.get_kb_metadata(kb_name)
if kb_meta is None:
click.echo(f"Error: KB '{kb_name}' not found", err=True)
@@ -937,6 +785,7 @@ async def _run() -> None:
counts[status] = counts.get(status, 0) + 1
click.echo(f" {paper.id}: {status}")
except Exception as exc:
+ logger.warning("capsule build failed", error=str(exc))
counts["errored"] += 1
click.echo(f" {paper.id}: errored — {exc}", err=True)
click.echo(f"Summary: {counts}")
@@ -976,6 +825,7 @@ def fetch_resources_cmd(
async def _run() -> None:
state = AppState()
await state.initialize()
+ assert state.session_store is not None
kb_meta = await state.session_store.get_kb_metadata(kb_name)
if kb_meta is None:
click.echo(f"Error: KB '{kb_name}' not found", err=True)
@@ -1069,7 +919,7 @@ def import_browser_cookies_cmd(
" uv pip install -e \".[cookies]\"",
err=True,
)
- raise SystemExit(2)
+ raise SystemExit(2) from None
from http.cookiejar import MozillaCookieJar
from pathlib import Path
@@ -1093,7 +943,7 @@ def import_browser_cookies_cmd(
"least once. On macOS you may need to grant keychain access.",
err=True,
)
- raise SystemExit(1)
+ raise SystemExit(1) from None
domain_filters = [d.lower() for d in (domains or ())]
jar = MozillaCookieJar()
@@ -1110,10 +960,8 @@ def import_browser_cookies_cmd(
seen_hosts[host] = seen_hosts.get(host, 0) + 1
jar.save(str(out), ignore_discard=True, ignore_expires=True)
- try:
+ with contextlib.suppress(OSError):
out.chmod(0o600)
- except OSError:
- pass
click.echo(
f"Wrote {matched} of {total} cookies to {out} "
@@ -1335,6 +1183,7 @@ def delete_kb_cmd(
async def _run() -> None:
state = AppState()
await state.initialize()
+ assert state.session_store is not None
kb = await state.session_store.get_kb_metadata(name)
if not kb:
click.echo(f"KB '{name}' not found.", err=True)
@@ -1353,6 +1202,7 @@ async def _run() -> None:
await state.vector_store.delete_collection(kb.collection_name)
collection_dropped = True
except Exception as exc:
+ logger.warning("chroma collection delete failed", error=str(exc))
click.echo(
f"Warning: failed to delete Chroma collection "
f"{kb.collection_name}: {exc}", err=True,
@@ -1731,15 +1581,6 @@ def check_cookies_cmd(
sys.exit(1)
-async def _build_app_state_for_cli(config: Any) -> Any:
- """Test seam: thin wrapper so unit tests can patch this without
- constructing the full AppState."""
- from perspicacite.web.state import AppState
- state = AppState()
- await state.initialize()
- return state
-
-
@cli.command("ingest-asb-run")
@click.argument(
"asb_run_dir",
@@ -1819,38 +1660,6 @@ async def _run() -> dict:
click.echo(f" workflow: {len(dag['nodes'])} nodes, {len(dag['edges'])} edges")
-def _print_github_repo_summary(summary: IngestSummary) -> None:
- """Human-readable summary line for the raw-repo path."""
- coords = ""
- if summary.repo_org and summary.repo_name:
- coords = f" ({summary.repo_org}/{summary.repo_name}"
- if summary.commit_sha:
- coords += f"@{summary.commit_sha}"
- coords += ")"
- click.echo(f"GitHub repo ingested into KB: {summary.kb_name}{coords}")
- click.echo(f" files: {summary.files_added}")
- click.echo(f" chunks: {summary.chunks_added}")
-
-
-def _print_skill_bundle_summary(summary: IngestSummary) -> None:
- """Human-readable summary line for a single bundle ingest."""
- suffix = ""
- if summary.bundle_name:
- suffix = f" (bundle: {summary.bundle_name})"
- click.echo(f"Skill bundle ingested into KB: {summary.kb_name}{suffix}")
- click.echo(f" files: {summary.files_added}")
- click.echo(f" chunks: {summary.chunks_added}")
- click.echo(f" linked papers: {summary.linked_papers_added}")
- if summary.linked_papers_skipped_non_doi:
- kinds = ", ".join(
- f"{kind}={value}"
- for kind, value in summary.linked_papers_skipped_non_doi[:5]
- )
- click.echo(
- f" skipped (non-DOI): {len(summary.linked_papers_skipped_non_doi)} ({kinds})"
- )
-
-
@cli.command("ingest-github-repo")
@click.argument("url", type=str)
@click.option("--kb-name", required=True, help="Target KB name.")
@@ -1944,10 +1753,7 @@ def ingest_skill_bundle_cmd(
# raw string (the orchestrator parses URLs).
source_arg: Path | str
candidate = Path(source)
- if candidate.exists():
- source_arg = candidate
- else:
- source_arg = source
+ source_arg = candidate if candidate.exists() else source
async def _run() -> IngestSummary:
app_state = await _build_app_state_for_cli(ctx.obj.get("config"))
diff --git a/src/perspicacite/cli_helpers.py b/src/perspicacite/cli_helpers.py
new file mode 100644
index 00000000..4181d541
--- /dev/null
+++ b/src/perspicacite/cli_helpers.py
@@ -0,0 +1,230 @@
+"""Implementation helpers for the Perspicacité CLI commands.
+
+These functions hold the business logic the ``@cli.command()`` wrappers in
+[cli.py](cli.py) delegate to. They are re-imported into ``perspicacite.cli`` so
+that ``perspicacite.cli.<name>`` stays a valid attribute (several unit tests
+``monkeypatch.setattr`` these names there).
+"""
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+import click
+
+from perspicacite.logging import get_logger
+from perspicacite.pipeline.github_kb import IngestSummary
+
+logger = get_logger("perspicacite.cli_helpers")
+
+
+def _start_mcp_and_web(config, app) -> None:
+ """Start MCP server and web server on a single port."""
+ import asyncio
+ from contextlib import asynccontextmanager
+
+ import uvicorn
+
+ # Initialize MCP state
+ from perspicacite.mcp.server import mcp, mcp_state
+
+ asyncio.run(mcp_state.initialize(config))
+
+ # Get MCP ASGI app
+ mcp_app = mcp.http_app()
+
+ # Combine web app + MCP app lifespans
+ original_lifespan = app.router.lifespan_context
+
+ @asynccontextmanager
+ async def combined_lifespan(app_instance):
+ async with original_lifespan(app_instance), mcp_app.lifespan(app_instance):
+ yield
+
+ app.router.lifespan_context = combined_lifespan
+
+ # Mount MCP ASGI app — its internal routes are at /mcp
+ app.mount("/", mcp_app)
+
+ # Run single server
+ uvicorn.run(
+ app,
+ host=config.server.host,
+ port=config.server.port,
+ reload=config.server.reload,
+ )
+
+
+async def _create_kb_from_bibtex(
+ config: Any,
+ *,
+ kb_name: str,
+ bib_path: Path,
+ description: str | None,
+ session_db: Path,
+ chroma_dir: Path,
+) -> dict[str, Any]:
+ from perspicacite.pipeline.bibtex_kb import create_kb_from_bibtex
+
+ return await create_kb_from_bibtex(
+ config,
+ kb_name=kb_name,
+ bib_path=bib_path,
+ description=description,
+ session_db=session_db,
+ chroma_dir=chroma_dir,
+ )
+
+
+async def _add_bibtex_to_existing_kb(
+ config: Any,
+ *,
+ kb_name: str,
+ bib_path: Path,
+ session_db: Path,
+ chroma_dir: Path,
+) -> dict[str, Any]:
+ from perspicacite.pipeline.bibtex_kb import add_bibtex_to_existing_kb
+
+ return await add_bibtex_to_existing_kb(
+ config,
+ kb_name=kb_name,
+ bib_path=bib_path,
+ session_db=session_db,
+ chroma_dir=chroma_dir,
+ )
+
+
+async def _run_query(
+ config: Any,
+ query: str,
+ kb: str,
+ mode: str,
+ provider: str,
+ model: str | None,
+) -> None:
+ """Run a RAG query and print the answer + sources to stdout."""
+ from perspicacite.models.rag import RAGMode, RAGRequest
+ from perspicacite.web.state import AppState
+
+ state = AppState()
+ await state.initialize()
+ assert state.session_store is not None
+
+ # Verify the KB exists so we fail fast with a clear message instead
+ # of letting the RAG engine spit a chroma error.
+ if await state.session_store.get_kb_metadata(kb) is None:
+ click.echo(f"\nError: KB '{kb}' not found. List with: perspicacite list-kb", err=True)
+ sys.exit(1)
+
+ mode_map = {
+ "basic": RAGMode.BASIC,
+ "advanced": RAGMode.ADVANCED,
+ "deep_research": RAGMode.DEEP_RESEARCH,
+ "profound": RAGMode.PROFOUND, # backward-compat alias
+ "contradiction": RAGMode.CONTRADICTION,
+ }
+ rag_mode = mode_map.get(mode, RAGMode.BASIC)
+
+ # Effective model/provider: explicit flag → config default → dataclass default.
+ eff_provider = provider or config.llm.default_provider
+ eff_model = model or config.llm.default_model
+
+ request = RAGRequest(
+ query=query,
+ kb_name=kb,
+ mode=rag_mode,
+ provider=eff_provider,
+ model=eff_model,
+ )
+
+ # Use the same RAGEngine the web/MCP layers use.
+ full_answer_parts: list[str] = []
+ sources: list[dict[str, Any]] = []
+ try:
+ async for event in state.rag_engine.query_stream(request):
+ etype = getattr(event, "event", None)
+ data = getattr(event, "data", None)
+ if etype == "content" and data:
+ # data is a JSON envelope { "delta": "..." }
+ try:
+ import json as _json
+ delta = _json.loads(data).get("delta", "")
+ except Exception as exc:
+ logger.debug("content delta parse failed", error=str(exc))
+ delta = str(data)
+ if delta:
+ full_answer_parts.append(delta)
+ elif etype == "source" and data:
+ try:
+ import json as _json
+ s = _json.loads(data)
+ sources.append(s)
+ except (json.JSONDecodeError, ValueError, TypeError):
+ pass
+ elif etype == "error" and data:
+ click.echo(f"\n❌ Error from RAG engine: {data}", err=True)
+ sys.exit(1)
+ except Exception as exc:
+ logger.warning("query stream failed", error=str(exc))
+ click.echo(f"\n❌ Query failed: {exc}", err=True)
+ sys.exit(1)
+
+ answer = "".join(full_answer_parts).strip()
+ click.echo("\n📝 Answer:")
+ if not answer:
+ click.echo(" (no answer — KB might be empty for this query)")
+ else:
+ click.echo(answer)
+ if sources:
+ click.echo("\n📎 Sources:")
+ for i, s in enumerate(sources, 1):
+ title = s.get("title") or s.get("doi") or "(untitled)"
+ year = s.get("year")
+ doi = s.get("doi")
+ tag = f" ({year})" if year else ""
+ doi_tag = f" doi:{doi}" if doi else ""
+ click.echo(f" [{i}] {title}{tag}{doi_tag}")
+
+
+async def _build_app_state_for_cli(config: Any) -> Any:
+ """Test seam: thin wrapper so unit tests can patch this without
+ constructing the full AppState."""
+ from perspicacite.web.state import AppState
+ state = AppState()
+ await state.initialize()
+ return state
+
+
+def _print_github_repo_summary(summary: IngestSummary) -> None:
+ """Human-readable summary line for the raw-repo path."""
+ coords = ""
+ if summary.repo_org and summary.repo_name:
+ coords = f" ({summary.repo_org}/{summary.repo_name}"
+ if summary.commit_sha:
+ coords += f"@{summary.commit_sha}"
+ coords += ")"
+ click.echo(f"GitHub repo ingested into KB: {summary.kb_name}{coords}")
+ click.echo(f" files: {summary.files_added}")
+ click.echo(f" chunks: {summary.chunks_added}")
+
+
+def _print_skill_bundle_summary(summary: IngestSummary) -> None:
+ """Human-readable summary line for a single bundle ingest."""
+ suffix = ""
+ if summary.bundle_name:
+ suffix = f" (bundle: {summary.bundle_name})"
+ click.echo(f"Skill bundle ingested into KB: {summary.kb_name}{suffix}")
+ click.echo(f" files: {summary.files_added}")
+ click.echo(f" chunks: {summary.chunks_added}")
+ click.echo(f" linked papers: {summary.linked_papers_added}")
+ if summary.linked_papers_skipped_non_doi:
+ kinds = ", ".join(
+ f"{kind}={value}"
+ for kind, value in summary.linked_papers_skipped_non_doi[:5]
+ )
+ click.echo(
+ f" skipped (non-DOI): {len(summary.linked_papers_skipped_non_doi)} ({kinds})"
+ )
diff --git a/src/perspicacite/config/loader.py b/src/perspicacite/config/loader.py
index aa20ff56..68a11acc 100644
--- a/src/perspicacite/config/loader.py
+++ b/src/perspicacite/config/loader.py
@@ -46,7 +46,7 @@ def load_yaml_file(path: Path) -> dict[str, Any] | None:
def load_from_env() -> dict[str, Any]:
"""Load configuration overrides from environment variables."""
- overrides = {}
+ overrides: dict[str, Any] = {}
# Map of env var -> config path. Keep secrets (api keys, tokens) here so
# they don't have to live in config.yml. Standard ZOTERO_API_KEY also
@@ -72,16 +72,19 @@ def load_from_env() -> dict[str, Any]:
}
for env_var, (section, key) in env_mappings.items():
- if value := os.environ.get(env_var):
+ if raw := os.environ.get(env_var):
# Convert types
- if value.lower() in ("true", "false"):
- value = value.lower() == "true"
- elif value.isdigit():
- value = int(value)
+ converted: str | bool | int
+ if raw.lower() in ("true", "false"):
+ converted = raw.lower() == "true"
+ elif raw.isdigit():
+ converted = int(raw)
+ else:
+ converted = raw
if section not in overrides:
overrides[section] = {}
- overrides[section][key] = value
+ overrides[section][key] = converted
return overrides
diff --git a/src/perspicacite/indicium_layer/builder.py b/src/perspicacite/indicium_layer/builder.py
index 5fcb3105..e3c9dbc1 100644
--- a/src/perspicacite/indicium_layer/builder.py
+++ b/src/perspicacite/indicium_layer/builder.py
@@ -41,7 +41,6 @@
from perspicacite.indicium_layer.pruner import build_candidate_pairs
from perspicacite.indicium_layer.queries import (
ASB_NS,
- INDICIUM_NS,
IRI_ANCHOR_STATUS,
IRI_ASSERTED_BY,
IRI_CAPTION,
diff --git a/src/perspicacite/indicium_layer/invalidation.py b/src/perspicacite/indicium_layer/invalidation.py
index e4351301..fd402b9d 100644
--- a/src/perspicacite/indicium_layer/invalidation.py
+++ b/src/perspicacite/indicium_layer/invalidation.py
@@ -6,7 +6,7 @@
import indicium
-from perspicacite.indicium_layer.manifest import Manifest # noqa: TC001
+from perspicacite.indicium_layer.manifest import Manifest
def compute_paper_hash(text: str) -> str:
diff --git a/src/perspicacite/indicium_layer/store.py b/src/perspicacite/indicium_layer/store.py
index d9667f4f..ab1b83ce 100644
--- a/src/perspicacite/indicium_layer/store.py
+++ b/src/perspicacite/indicium_layer/store.py
@@ -25,9 +25,8 @@
from rdflib import Dataset, URIRef
from rdflib import Literal as RdflibLiteral
-# rdflib ≥ 6.0 deprecates ConjunctiveGraph in favour of Dataset.
-# Dataset is a drop-in for our usage (add/query named graphs).
-ConjunctiveGraph = Dataset # alias for backward compat in type hints below
+# rdflib ≥ 6.0 deprecates ConjunctiveGraph in favour of Dataset, a drop-in for
+# our add/query-named-graphs usage.
_RDF_SUBJECT = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#subject")
_RDF_PREDICATE = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate")
@@ -59,7 +58,7 @@ def __init__(
backend = "oxigraph" if data_dir is not None else "memory"
self._backend = backend
if backend == "memory":
- self._g: ConjunctiveGraph | None = ConjunctiveGraph()
+ self._g: Dataset | None = Dataset()
self._oxistore = None
elif backend == "oxigraph":
self._g = None
@@ -116,6 +115,7 @@ def contains_iri(self, iri: str) -> bool:
assert self._g is not None
return any(self._g.quads((URIRef(iri), None, None, None)))
# oxigraph: cheap ASK
+ assert self._oxistore is not None # guaranteed by __init__ when backend == "oxigraph"
ask = f"ASK {{ {{ <{iri}> ?p ?o }} UNION {{ GRAPH ?g {{ <{iri}> ?p ?o }} }} }}"
return bool(self._oxistore.query(ask))
@@ -141,7 +141,7 @@ def _select_rdflib(self, sparql: str) -> list[dict[str, str]]:
out: list[dict[str, str]] = []
for row in results:
d: dict[str, str] = {}
- for var, val in row.asdict().items():
+ for var, val in row.asdict().items(): # type: ignore[union-attr] # rdflib ResultRow always has asdict(); union includes non-row types due to broad query return stub
if val is None:
continue
d[str(var)] = str(val.toPython()) if isinstance(val, RdflibLiteral) else str(val)
@@ -153,8 +153,10 @@ def _select_rdflib(self, sparql: str) -> list[dict[str, str]]:
def _add_oxi(self, s: str, p: str, o: str | LiteralTuple, graph: str | None) -> None:
import pyoxigraph as oxi
+ assert self._oxistore is not None # guaranteed by __init__ when backend == "oxigraph"
subj = oxi.NamedNode(s)
pred = oxi.NamedNode(p)
+ obj: Any # NamedNode or Literal depending on branch; both are valid RDF term types
if isinstance(o, tuple):
_, val, dt = o
obj = oxi.Literal(
@@ -167,6 +169,7 @@ def _add_oxi(self, s: str, p: str, o: str | LiteralTuple, graph: str | None) ->
self._oxistore.add(oxi.Quad(subj, pred, obj, ctx))
def _select_oxi(self, sparql: str) -> list[dict[str, str]]:
+ assert self._oxistore is not None # guaranteed by __init__ when backend == "oxigraph"
solutions = self._oxistore.query(sparql)
out: list[dict[str, str]] = []
for sol in solutions:
diff --git a/src/perspicacite/integrations/zotero_ingest.py b/src/perspicacite/integrations/zotero_ingest.py
index 9d2535d3..65c06eef 100644
--- a/src/perspicacite/integrations/zotero_ingest.py
+++ b/src/perspicacite/integrations/zotero_ingest.py
@@ -308,7 +308,8 @@ async def build_kbs_from_zotero(
# Attach notes
try:
notes = await client.get_item_notes(it["key"])
- except Exception:
+ except Exception as exc:
+ logger.debug("zotero notes fetch failed", error=str(exc))
notes = []
if notes:
note_block = "\n\n# Notes\n\n" + "\n\n".join(notes)
diff --git a/src/perspicacite/integrations/zotero_license.py b/src/perspicacite/integrations/zotero_license.py
index 82207f4b..62697f89 100644
--- a/src/perspicacite/integrations/zotero_license.py
+++ b/src/perspicacite/integrations/zotero_license.py
@@ -12,6 +12,10 @@
import httpx
+from perspicacite.logging import get_logger
+
+logger = get_logger("perspicacite.integrations.zotero_license")
+
_PERMISSIVE_SPDX_PREFIXES = (
"CC0",
"CC-BY-4", "CC-BY-3", "CC-BY-2", "CC-BY-1",
@@ -168,7 +172,8 @@ async def classify(
# 4. Heuristic
return self._store(doi, self.heuristic(is_oa=is_oa or False))
- except Exception:
+ except Exception as exc:
+ logger.debug("zotero license probe failed", error=str(exc))
# Never crash the caller over a license lookup failure.
return LicenseInfo(spdx=None, classification="unknown", policy="reflavor", source="unknown")
finally:
@@ -194,8 +199,8 @@ async def _from_crossref(
info.source = "crossref"
if info.classification != "unknown":
return info
- except Exception:
- pass
+ except Exception as exc:
+ logger.debug("crossref_license_lookup_failed", doi=doi, error=str(exc))
return None
async def _from_openalex(
@@ -220,5 +225,6 @@ async def _from_openalex(
if info.classification != "unknown":
return info, is_oa
return None, is_oa
- except Exception:
+ except Exception as exc:
+ logger.debug("openalex_license_lookup_failed", doi=doi, error=str(exc))
return None, False
diff --git a/src/perspicacite/jobs/registry.py b/src/perspicacite/jobs/registry.py
index 426aa291..6d23f975 100644
--- a/src/perspicacite/jobs/registry.py
+++ b/src/perspicacite/jobs/registry.py
@@ -3,6 +3,7 @@
from __future__ import annotations
import asyncio
+import contextlib
import json
import uuid
from collections.abc import AsyncIterator
@@ -108,10 +109,11 @@ async def get(self, job_id: str) -> dict[str, Any] | None:
row = await cur.fetchone()
if row is None:
return None
- d = {k: row[k] for k in row.keys()}
+ # aiosqlite.Row is NOT a dict: iterating it yields column *values*, so we
+ # must go through .keys() to get column names. noqa: SIM118 (the rule
+ # assumes a real mapping where `in row` == `in row.keys()`).
+ d = {k: row[k] for k in row.keys()} # noqa: SIM118
if d.get("result"):
- try:
+ with contextlib.suppress(json.JSONDecodeError):
d["result"] = json.loads(d["result"])
- except json.JSONDecodeError:
- pass
return d
diff --git a/src/perspicacite/llm/agent_cli.py b/src/perspicacite/llm/agent_cli.py
index ec7c500d..9a2012a8 100644
--- a/src/perspicacite/llm/agent_cli.py
+++ b/src/perspicacite/llm/agent_cli.py
@@ -51,8 +51,8 @@
claude-sonnet-4-5: sonnet # names into CLI aliases
claude-haiku-4-5: haiku
-See ``config.claude_code.example.yml``, ``config.codex.example.yml``,
-``config.openclaw.example.yml``, and ``config.hermes.example.yml`` for
+See ``config/providers/claude-code.yml``, ``config/providers/codex.yml``,
+``config/providers/openclaw.yml``, and ``config/providers/hermes.yml`` for
ready-made presets.
**Caveat: rate limits are shared with your interactive agent
@@ -65,6 +65,7 @@
from __future__ import annotations
import asyncio
+import contextlib
import json
import os
import time
@@ -300,7 +301,7 @@ async def complete(
await proc.wait()
raise RuntimeError(
f"{self.provider_label}: CLI timed out after {self.timeout}s"
- )
+ ) from None
latency_ms = (time.monotonic() - t0) * 1000.0
if proc.returncode != 0:
@@ -308,10 +309,8 @@ async def complete(
out_str = (stdout or b"").decode("utf-8", errors="replace")
err = err_full[:500]
if out_path:
- try:
+ with contextlib.suppress(OSError):
os.unlink(out_path)
- except OSError:
- pass
# Detect rate-limit signals — raise structured error so the
# orchestrator / Wave 3.2 fallback chain can react.
from perspicacite.llm.errors import (
@@ -349,10 +348,8 @@ async def complete(
with open(out_path, encoding="utf-8", errors="replace") as fh:
raw = fh.read().strip()
finally:
- try:
+ with contextlib.suppress(OSError):
os.unlink(out_path)
- except OSError:
- pass
else:
raw = stdout.decode("utf-8", errors="replace").strip()
text, in_tokens, out_tokens, details = self._parse_output_full(raw)
@@ -389,8 +386,8 @@ async def complete(
completion_tokens=out_tokens,
latency_ms=latency_ms,
)
- except Exception:
- pass
+ except Exception as exc:
+ logger.debug("provenance_collector_emit_failed", provider=self.provider_label, error=str(exc))
# F2 (audit 2026-05-15): budget tracker — push usage even when
# no provenance collector is active. Prefer the CLI-reported
diff --git a/src/perspicacite/llm/claude_cli.py b/src/perspicacite/llm/claude_cli.py
index 72d8834b..d09cd04a 100644
--- a/src/perspicacite/llm/claude_cli.py
+++ b/src/perspicacite/llm/claude_cli.py
@@ -7,7 +7,7 @@
working unchanged.
See ``agent_cli.py`` for the underlying implementation and
-``config.claude_code.example.yml`` for a config-only setup.
+``config/providers/claude-code.yml`` for a config-only setup.
"""
from __future__ import annotations
@@ -48,7 +48,7 @@
}
-def ClaudeCLIClient(
+def ClaudeCLIClient( # noqa: N802 — back-compat factory named after the retired class
*,
executable: str = "claude",
timeout: float = 180.0,
diff --git a/src/perspicacite/llm/client.py b/src/perspicacite/llm/client.py
index 57861793..6caeaf75 100644
--- a/src/perspicacite/llm/client.py
+++ b/src/perspicacite/llm/client.py
@@ -57,7 +57,7 @@ def _emit_usage_telemetry(
if sink is None:
return
try:
- from perspicacite.rag.telemetry import emit_tokens, emit_cost
+ from perspicacite.rag.telemetry import emit_cost, emit_tokens
emit_tokens(
sink,
input_tokens=prompt_tokens,
@@ -66,21 +66,21 @@ def _emit_usage_telemetry(
provider=provider,
)
emit_cost(sink, usd=cost_usd, model=model, provider=provider)
- except Exception:
- pass
+ except Exception as exc:
+ logger.debug("usage_telemetry_emit_failed", error=str(exc))
# F9 (audit 2026-05-15): LiteLLM prints a "Give Feedback / Get Help"
# banner to stderr on every error, plus an "If you need to debug…" info
# line. These pollute our structured logs and operator terminals. Silence
# them at module load. The banner is gated on ``litellm.suppress_debug_info``
# (see litellm/utils.py and litellm/router.py).
-import logging as _stdlib_logging # noqa: E402
+import logging as _stdlib_logging
try:
import litellm as _litellm
_litellm.suppress_debug_info = True
-except Exception: # pragma: no cover — litellm is a hard dep
- pass
+except Exception as exc: # pragma: no cover — litellm is a hard dep
+ logger.debug("litellm debug suppression failed", error=str(exc))
_stdlib_logging.getLogger("LiteLLM").setLevel(_stdlib_logging.ERROR)
_stdlib_logging.getLogger("litellm").setLevel(_stdlib_logging.ERROR)
@@ -158,20 +158,7 @@ def _should_trigger_free_fallback(exc: Exception) -> bool:
if isinstance(exc, AuthError):
return True
msg = str(exc).lower()
- if any(k in msg for k in (
- "not a valid model",
- "model not found",
- "no endpoints found",
- "quota",
- "billing",
- "credit balance",
- "usage limit",
- "insufficient",
- "invalid api key",
- "authentication",
- )):
- return True
- return False
+ return bool(any(k in msg for k in ("not a valid model", "model not found", "no endpoints found", "quota", "billing", "credit balance", "usage limit", "insufficient", "invalid api key", "authentication")))
def _is_deterministic_fail(exc: Exception) -> bool:
@@ -189,7 +176,7 @@ def _is_deterministic_fail(exc: Exception) -> bool:
from perspicacite.llm.budget import BudgetExceededError
if isinstance(exc, BudgetExceededError):
return True
- except Exception: # pragma: no cover — module always importable
+ except ImportError: # pragma: no cover — module always importable
pass
# Detect the wrapped class without invoking _maybe_wrap_error
# (avoids circular logic with the retry decorator).
@@ -379,7 +366,7 @@ class AsyncLLMClient:
def __init__(self, config: LLMConfig):
self.config = config
- self._litellm = None
+ self._litellm: Any = None # optional dep; assigned lazily in _get_litellm
# Cache one AgentCLIClient instance per provider key (claude_cli,
# agent_cli, plus any user-defined alias).
self._agent_clis: dict[str, Any] = {}
@@ -457,7 +444,7 @@ def _get_agent_cli_client(self, provider: str) -> Any:
"hermes}.example.yml presets."
)
client = AgentCLIClient(
- executable=cli_cfg.executable,
+ executable=cli_cfg.executable or "", # guard already raised above if None/falsy
provider_label=provider,
prompt_via=cli_cfg.prompt_via,
prompt_flag=cli_cfg.prompt_flag,
@@ -520,8 +507,19 @@ def _build_model_string(self, provider: str, model: str) -> str:
@retry(
# F1 (audit 2026-05-15): never retry on deterministic-fail errors
# — auth errors won't suddenly become valid; budget breaches won't
- # heal. Retry every OTHER exception.
- retry=retry_if_exception(lambda e: not _is_deterministic_fail(e)),
+ # heal. Retry every OTHER *Exception*.
+ #
+ # `isinstance(e, Exception)` is load-bearing: tenacity catches
+ # BaseException, so without it the predicate would retry on
+ # asyncio.CancelledError (a BaseException, not an Exception). That
+ # swallows cancellation from asyncio.wait_for/timeout and makes every
+ # caller-side timeout ineffective: all 3 retry attempts run their full
+ # HTTP timeout to completion before the (now-stale) deadline is
+ # noticed. Letting cancellation propagate is what makes those timeouts
+ # actually bound wall-clock.
+ retry=retry_if_exception(
+ lambda e: isinstance(e, Exception) and not _is_deterministic_fail(e)
+ ),
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=2, max=10),
reraise=True,
@@ -1107,7 +1105,9 @@ async def complete(
)
# All free-tier models also failed — re-raise the last error.
- raise last_exc
+ # `from None` suppresses the implicit "during handling of
+ # primary_exc" chaining; last_exc already carries the real cause.
+ raise last_exc from None
async def complete_with_fallback(
self,
diff --git a/src/perspicacite/llm/embeddings.py b/src/perspicacite/llm/embeddings.py
index 0a98a311..9d5a90f5 100644
--- a/src/perspicacite/llm/embeddings.py
+++ b/src/perspicacite/llm/embeddings.py
@@ -52,7 +52,7 @@ def __init__(
):
self.model = model
self.batch_size = batch_size
- self._litellm = None
+ self._litellm: Any = None # optional dep; assigned lazily in _get_litellm
self._dimension = self._get_dimension()
def _get_litellm(self) -> Any:
@@ -186,7 +186,7 @@ def _best_device() -> str:
return "mps"
if torch.cuda.is_available():
return "cuda"
- except Exception:
+ except (ImportError, AttributeError):
pass
return "cpu"
@@ -281,7 +281,7 @@ def _get_model(self) -> Any:
raise ImportError(
"sentence-transformers not installed. "
"Install with: pip install sentence-transformers"
- )
+ ) from None
return self._model
@property
@@ -435,7 +435,7 @@ async def embed_query(self, texts: list[str]) -> list[list[float]]:
)
else:
# Apply a string prefix to each query text
- prefixed = [prompt_prefix + t for t in valid_texts]
+ prefixed = [(prompt_prefix or "") + t for t in valid_texts]
embeddings = await asyncio.get_running_loop().run_in_executor(
None,
lambda: model.encode(
@@ -623,7 +623,7 @@ async def embed(
# Partition input by routed provider; preserve original index.
buckets: dict[int, tuple[EmbeddingProvider, list[int], list[str]]] = {}
- for i, (t, ctype) in enumerate(zip(texts, content_types)):
+ for i, (t, ctype) in enumerate(zip(texts, content_types, strict=True)):
prov = self._by_type.get(ctype, self._default)
key = id(prov)
if key not in buckets:
@@ -635,7 +635,7 @@ async def embed(
out: list[list[float] | None] = [None] * len(texts)
for prov, indices, batch_texts in buckets.values():
vecs = await prov.embed(batch_texts)
- for idx, v in zip(indices, vecs):
+ for idx, v in zip(indices, vecs, strict=True):
out[idx] = v
if any(v is None for v in out):
@@ -704,7 +704,7 @@ async def embed(
out: list[list[float] | None] = [None] * len(texts)
miss_indices: list[int] = []
miss_texts: list[str] = []
- for i, (t, k) in enumerate(zip(texts, keys)):
+ for i, (t, k) in enumerate(zip(texts, keys, strict=True)):
if k is None:
out[i] = zero # empty/whitespace stays zero-vector
elif k in hits:
@@ -717,7 +717,7 @@ async def embed(
new_vecs = await self.inner.embed(miss_texts)
# Write to cache + slot into out in original order.
put_items: list[tuple[str, str, list[float]]] = []
- for idx, vec in zip(miss_indices, new_vecs):
+ for idx, vec in zip(miss_indices, new_vecs, strict=True):
out[idx] = vec
k = keys[idx]
if k is not None:
@@ -759,7 +759,7 @@ async def embed_query(self, texts: list[str]) -> list[list[float]]:
out: list[list[float] | None] = [None] * len(texts)
miss_indices: list[int] = []
miss_texts: list[str] = []
- for i, (t, k) in enumerate(zip(texts, keys)):
+ for i, (t, k) in enumerate(zip(texts, keys, strict=True)):
if k is None:
out[i] = zero
elif k in hits:
@@ -771,7 +771,7 @@ async def embed_query(self, texts: list[str]) -> list[list[float]]:
if miss_texts:
new_vecs = await self.inner.embed_query(miss_texts)
put_items: list[tuple[str, str, list[float]]] = []
- for idx, vec in zip(miss_indices, new_vecs):
+ for idx, vec in zip(miss_indices, new_vecs, strict=True):
out[idx] = vec
k = keys[idx]
if k is not None:
@@ -862,7 +862,7 @@ def _build_single(m: str) -> EmbeddingProvider:
by_type: dict[str, EmbeddingProvider] = {}
for ctype, ctype_model in embedding_models_per_type.items():
by_type[ctype] = _build_single(ctype_model)
- inner = TypedEmbeddingProvider(default=inner, by_content_type=by_type)
+ inner = TypedEmbeddingProvider(default=inner, by_content_type=by_type) # type: ignore[assignment] # TypedEmbeddingProvider.dimension is int|None; Protocol requires int; runtime safe
if not cache_enabled:
return inner
diff --git a/src/perspicacite/llm/errors.py b/src/perspicacite/llm/errors.py
index 758dd2f5..bacd819b 100644
--- a/src/perspicacite/llm/errors.py
+++ b/src/perspicacite/llm/errors.py
@@ -12,8 +12,13 @@
from __future__ import annotations
import re
+from collections.abc import Callable
from dataclasses import dataclass
+from perspicacite.logging import get_logger
+
+logger = get_logger("perspicacite.llm.errors")
+
class LLMError(RuntimeError):
"""Base class for Perspicacité LLM errors."""
@@ -53,7 +58,7 @@ def __init__(self, message: str, *, provider: str = "unknown"):
# (compiled pattern, retry_seconds_extractor). Extractors return None
# when no usable retry hint is available. First match wins.
-_RATE_LIMIT_PATTERNS: list[tuple[re.Pattern[str], callable]] = [
+_RATE_LIMIT_PATTERNS: list[tuple[re.Pattern[str], Callable[[re.Match[str]], int | None]]] = [
# Claude Code: "Rate limit reached. Try again in 1h 23m."
(
re.compile(r"rate\s*limit\s*reached.*?try\s*again\s*in\s*"
@@ -97,7 +102,8 @@ def detect_rate_limit(text: str) -> _RateLimitHit | None:
if m:
try:
seconds = extractor(m)
- except Exception:
+ except Exception as exc:
+ logger.debug("retry-after extract failed", error=str(exc))
seconds = None
return _RateLimitHit(retry_after_seconds=seconds)
return None
diff --git a/src/perspicacite/llm/tokens.py b/src/perspicacite/llm/tokens.py
index e1b86245..0433ea21 100644
--- a/src/perspicacite/llm/tokens.py
+++ b/src/perspicacite/llm/tokens.py
@@ -62,7 +62,7 @@ def count_tokens(text: str, model: str | None = None) -> int:
encoder = tiktoken.encoding_for_model(model)
return len(encoder.encode(text))
- except Exception:
+ except (ImportError, KeyError):
pass
# Fall back to character-based estimation
@@ -165,7 +165,7 @@ def truncate_messages(
return system_messages
# Keep most recent messages that fit
- kept_messages = []
+ kept_messages: list[dict[str, str]] = []
current_count = 0
for message in reversed(other_messages):
diff --git a/src/perspicacite/logging.py b/src/perspicacite/logging.py
index a5a96bfd..5728d646 100644
--- a/src/perspicacite/logging.py
+++ b/src/perspicacite/logging.py
@@ -59,13 +59,14 @@ def setup_logging(config: LoggingConfig, *, stream: Any = None) -> None:
return
# Choose a renderer for the final output
+ renderer: Any # JSONRenderer and ConsoleRenderer share no common structlog base
if config.format == "json":
renderer = structlog.processors.JSONRenderer()
else:
renderer = structlog.dev.ConsoleRenderer(colors=False)
# Shared processors that run for *all* structlog loggers
- shared_processors = [
+ shared_processors: list[Any] = [
structlog.contextvars.merge_contextvars,
structlog.stdlib.add_log_level,
structlog.stdlib.add_logger_name,
diff --git a/src/perspicacite/mcp/progress_adapter.py b/src/perspicacite/mcp/progress_adapter.py
index dcb83c81..8bf68130 100644
--- a/src/perspicacite/mcp/progress_adapter.py
+++ b/src/perspicacite/mcp/progress_adapter.py
@@ -17,6 +17,10 @@
import time
from typing import Any
+from perspicacite.logging import get_logger
+
+logger = get_logger("perspicacite.mcp.progress_adapter")
+
class MCPProgressAdapter:
"""Forwards RAG telemetry events to ``ctx.report_progress``."""
@@ -102,6 +106,7 @@ async def on_event(self, event: dict[str, Any]) -> None:
total=self._total,
message=msg,
)
- except Exception:
+ except Exception as exc:
+ logger.debug("progress callback failed", error=str(exc))
# Never let MCP transport hiccups break the RAG pipeline.
return
diff --git a/src/perspicacite/mcp/server.py b/src/perspicacite/mcp/server.py
index f47809ab..d6410465 100644
--- a/src/perspicacite/mcp/server.py
+++ b/src/perspicacite/mcp/server.py
@@ -27,12 +27,21 @@
from __future__ import annotations
import asyncio
+import contextlib
import json
import uuid
+from collections.abc import Callable
from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+ from fastmcp import FastMCP
from perspicacite.logging import get_logger
+from perspicacite.pipeline.asb.collection_ingest import (
+ ingest_asb_skill_collection,
+)
+from perspicacite.pipeline.asb.edam_filter import edam_pre_filter
from perspicacite.pipeline.asb.response import build_asb_response_metadata
from perspicacite.pipeline.asb.run_ingest import ingest_asb_run as ingest_asb_run_pipeline
from perspicacite.pipeline.github_kb import (
@@ -44,10 +53,6 @@
from perspicacite.pipeline.github_kb import (
ingest_skill_bundle as ingest_skill_bundle_pipeline,
)
-from perspicacite.pipeline.asb.collection_ingest import (
- ingest_asb_skill_collection,
-)
-from perspicacite.pipeline.asb.edam_filter import edam_pre_filter
from perspicacite.rag.paper_metadata_codec import decode_paper_metadata_json
logger = get_logger("perspicacite.mcp.server")
@@ -57,7 +62,7 @@
mcp = FastMCP("perspicacite")
except ImportError:
- mcp = None
+ mcp = None # type: ignore[assignment] # fastmcp not installed; mcp is a FastMCP[Any] at runtime
Context = Any # type: ignore[misc, assignment]
@@ -292,7 +297,8 @@ def _unbrace(s: str) -> str:
http_client=http_client,
enable_browser=enable_browser,
)
- except Exception:
+ except Exception as exc:
+ logger.debug("DOI resolution failed", error=str(exc))
resolved_doi = None
promoted = {
@@ -556,8 +562,8 @@ async def search_literature(
if _prov._last_quota_warning is not None:
mcp_warnings.append(_prov._last_quota_warning)
break
- except Exception:
- pass
+ except Exception as exc:
+ logger.debug("mcp_search_literature_warnings_collection_failed", error=str(exc))
# Crossref-enrich the returned papers (fills missing abstracts etc.).
if enrich and papers:
@@ -565,7 +571,7 @@ async def search_literature(
try:
papers = await asyncio.wait_for(enrich_papers(papers), timeout=10.0)
- except asyncio.TimeoutError:
+ except TimeoutError:
logger.warning(
"mcp_search_literature_enrich_timeout", n_papers=len(papers)
)
@@ -588,7 +594,8 @@ async def search_literature(
)
if already:
continue
- except Exception:
+ except Exception as exc:
+ logger.debug("dedup check failed", error=str(exc))
pass # dedup is best-effort; don't drop on error
filtered_papers.append(paper)
papers = filtered_papers
@@ -596,7 +603,7 @@ async def search_literature(
# Convert Paper models to dicts
results = []
for p in papers:
- pd = {
+ pd: dict[str, Any] = {
"id": p.id,
"title": p.title,
"year": p.year,
@@ -1250,8 +1257,11 @@ async def search_knowledge_base(
from perspicacite.llm.embeddings import create_embedding_provider
_kb_model_name = kb_meta.embedding_model.split("|")[0].strip()
+
# Normalise: strip the "st:" routing prefix before comparing names.
- _norm = lambda s: s.removeprefix("st:").strip()
+ def _norm(s: str) -> str:
+ return s.removeprefix("st:").strip()
+
if _norm(_kb_model_name) == _norm(state.embedding_provider.model_name):
_kb_embedding = state.embedding_provider
else:
@@ -1598,15 +1608,16 @@ async def add_papers_to_kb(
"springer_api_key": pdf_config.springer_api_key,
}
- from perspicacite.pipeline.download import retrieve_paper_content
import asyncio as _asyncio_local
+ from perspicacite.pipeline.download import retrieve_paper_content
+
async with httpx.AsyncClient(timeout=120.0, follow_redirects=True) as client:
# First pass: handle papers that need no network (pre-supplied
# full_text, skip_content_fetch, missing/non-canonical DOI). These
# are O(1) per paper and don't benefit from parallelism.
fetch_idxs: list[int] = []
- for i, (paper, pd) in enumerate(zip(paper_models, papers)):
+ for i, (paper, pd) in enumerate(zip(paper_models, papers, strict=True)):
# Caller can supply pre-fetched text directly via `full_text`
# (or pass `skip_content_fetch=True`) to bypass the slow
# Crossref/PMC/Unpaywall lookup loop. Useful for benchmark
@@ -1638,6 +1649,8 @@ async def add_papers_to_kb(
async def _fetch(idx: int) -> tuple[int, str | None, bool]:
paper = paper_models[idx]
try:
+ if paper.doi is None:
+ return idx, None, False
result = await retrieve_paper_content(
paper.doi,
url=paper.url,
@@ -1648,7 +1661,8 @@ async def _fetch(idx: int) -> tuple[int, str | None, bool]:
if result.success and result.full_text:
return idx, result.full_text, True
return idx, None, False
- except Exception:
+ except Exception as exc:
+ logger.debug("full-text fetch failed", error=str(exc))
return idx, None, False
if fetch_idxs:
@@ -1861,14 +1875,12 @@ async def generate_report(
# Emit the task_id immediately via ctx so the client can cancel.
if ctx is not None:
- try:
+ with contextlib.suppress(Exception):
await ctx.report_progress(
progress=0,
total=100,
message=f"Task started — task_id={task_id}",
)
- except Exception:
- pass
# Bind ctx for any nested LLM call via sampling. We use the
# contextvar token directly here (rather than the `with` form) to
@@ -2028,10 +2040,8 @@ def __init__(self, *sinks: Any) -> None:
def append(self, event: dict) -> None:
self.events.append(event)
for s in self._sinks:
- try:
+ with contextlib.suppress(Exception):
s.append(event)
- except Exception:
- pass
async def on_event_async(self, event: dict) -> None:
self.events.append(event)
@@ -2042,8 +2052,8 @@ async def on_event_async(self, event: dict) -> None:
await fn(event)
else:
s.append(event)
- except Exception:
- pass
+ except Exception as exc:
+ logger.debug("mcp_generate_report_telemetry_sink_fanout_failed", error=str(exc))
rag_request.telemetry_sink = _FanOutSink( # type: ignore[attr-defined]
_progress_sink, _response_collector
@@ -2111,7 +2121,7 @@ async def on_event_async(self, event: dict) -> None:
if _err.get("reason") == "cancelled":
cancelled_reason = "cancelled"
break
- except asyncio.TimeoutError:
+ except TimeoutError:
logger.warning(
"mcp_generate_report_timeout",
query=query,
@@ -2182,7 +2192,8 @@ def _build_provenance(cycles: int) -> dict[str, Any]:
validation_report: str | None = None
if extract_claims:
try:
- from perspicacite.pipeline.claims import extract_claims as _extract_claims, validate_claims
+ from perspicacite.pipeline.claims import extract_claims as _extract_claims
+ from perspicacite.pipeline.claims import validate_claims
_passages = [
{
"chunk_text": s.get("section") or s.get("title", ""),
@@ -2195,7 +2206,7 @@ def _build_provenance(cycles: int) -> dict[str, Any]:
domain_adapter = None
if domains:
try:
- from indicium_adapters import discover_adapters, compose_adapters
+ from indicium_adapters import compose_adapters, discover_adapters
discovered = discover_adapters()
valid = [discovered[d] for d in domains if d in discovered]
domain_adapter = compose_adapters(valid) if valid else None
@@ -2327,7 +2338,8 @@ async def screen_papers(
"abstract": r.abstract or md.get("abstract") or "",
}
)
- except Exception:
+ except Exception as exc:
+ logger.debug("screen_papers metadata fetch failed", error=str(exc))
items.append({"doi": doi, "title": doi, "abstract": ""})
else:
items.append({"title": c, "abstract": ""})
@@ -2352,11 +2364,11 @@ async def screen_papers(
results = _bm25(items, reference=query, method="bm25", threshold=threshold)
screened = []
- for r in results[:max_results]:
- entry: dict = {"score": r.score, "kept": r.kept, "reason": r.reason}
- if r.item.get("doi"):
- entry["doi"] = r.item["doi"]
- entry["title"] = r.item.get("title")
+ for sr in results[:max_results]:
+ entry: dict = {"score": sr.score, "kept": sr.kept, "reason": sr.reason}
+ if sr.item.get("doi"):
+ entry["doi"] = sr.item["doi"]
+ entry["title"] = sr.item.get("title")
screened.append(entry)
logger.info(
"mcp_screen_papers",
@@ -2486,6 +2498,7 @@ async def add_dois_to_kb(
**pdf_kwargs,
)
except Exception as e:
+ logger.warning("add_dois_to_kb download failed", error=str(e))
failed.append({"doi": doi, "reason": str(e)})
dl["failed"] += 1
continue
@@ -2743,6 +2756,7 @@ async def push_to_zotero(
try:
paper, doi, url = await _resolve_push_input(inp, http_client=http_client)
except Exception as exc:
+ logger.warning("push input resolution failed", error=str(exc))
route_err = str(exc)
if route_err is not None:
failed.append({"input": inp, "reason": route_err})
@@ -2838,6 +2852,7 @@ async def push_to_zotero(
)
entry["attached_pdf"] = bool(att_key)
except Exception as exc:
+ logger.warning("push_to_zotero pdf attach failed", error=str(exc))
entry["pdf_attach_error"] = str(exc)
elif pdf_path is None:
entry["attached_pdf"] = False
@@ -2890,6 +2905,7 @@ async def push_to_zotero(
entry["transcript_chars"] = len(md)
attached_transcript = True
except Exception as exc:
+ logger.warning("push_to_zotero transcript attach failed", error=str(exc))
entry["transcript_attach_error"] = str(exc)
# HTML attach: always for URL-route items, or as a
@@ -2927,6 +2943,7 @@ async def push_to_zotero(
entry["html_source"] = html_attach.tier
entry["html_chars"] = html_attach.char_count
except Exception as exc:
+ logger.warning("push_to_zotero html attach failed", error=str(exc))
entry["html_attach_error"] = str(exc)
# Step 3 (optional): supplementary attachments from capsule.
@@ -2955,6 +2972,7 @@ async def push_to_zotero(
if att_key:
attached_si.append(f.name)
except Exception as exc:
+ logger.warning("supplementary attach failed", error=str(exc))
si_errors.append({"file": f.name, "error": str(exc)})
entry["attached_supplementary"] = attached_si
if si_errors:
@@ -2962,6 +2980,7 @@ async def push_to_zotero(
created.append(entry)
except Exception as exc:
+ logger.warning("push_to_zotero item failed", error=str(exc))
failed.append({"input": inp, "reason": str(exc)})
logger.info(
@@ -3083,6 +3102,7 @@ async def push_notes_to_zotero(
)
created.append({"note_key": note_key, "parent_key": item_key})
except Exception as exc:
+ logger.warning("push_notes_to_zotero note create failed", error=str(exc))
failed.append({"input": entry, "reason": str(exc)})
logger.info(
@@ -3155,6 +3175,7 @@ async def ingest_url(
try:
paper = await extract_url(url, http_client=http_client)
except Exception as exc:
+ logger.warning("url extraction failed", error=str(exc))
return _json_error(f"url_extraction_failed: {exc}")
result: dict[str, Any] = {
@@ -3186,6 +3207,7 @@ async def ingest_url(
key = await zotero.create_item(paper)
result["zotero_key"] = key
except Exception as exc:
+ logger.warning("zotero create_item failed", error=str(exc))
result["zotero_error"] = str(exc)
if attach_html and result.get("zotero_key"):
@@ -3209,6 +3231,7 @@ async def ingest_url(
result["html_tier"] = cap.tier
result["html_chars"] = cap.char_count
except Exception as exc:
+ logger.warning("zotero html attach failed", error=str(exc))
result["html_attach_error"] = str(exc)
return _json_ok(result)
@@ -3305,6 +3328,7 @@ async def fail(self, jid: str, err: str) -> None:
job_id="mcp-inline",
)
except Exception as exc:
+ logger.warning("build_kbs_from_zotero build failed", error=str(exc))
return {"error": str(exc)}
if reg.err is not None:
return {"error": reg.err}
@@ -3461,6 +3485,7 @@ async def add_local_papers_to_kb(
parsed = await state.pdf_parser.parse(fp)
full_text = parsed.text or None
except Exception as exc:
+ logger.warning("local doc PDF parse failed", error=str(exc))
results.append(
{"file": raw_file, "status": "error", "reason": f"PDF parse failed: {exc}"}
)
@@ -3469,6 +3494,7 @@ async def add_local_papers_to_kb(
try:
full_text = fp.read_text(encoding="utf-8", errors="replace") or None
except Exception as exc:
+ logger.warning("local doc text read failed", error=str(exc))
results.append(
{"file": raw_file, "status": "error", "reason": f"Read failed: {exc}"}
)
@@ -3505,6 +3531,7 @@ async def add_local_papers_to_kb(
total_chunks += n
results.append({"file": raw_file, "title": title, "status": "ok", "chunks": n})
except Exception as exc:
+ logger.warning("local doc add_papers failed", error=str(exc))
results.append(
{"file": raw_file, "title": title, "status": "error", "reason": str(exc)}
)
@@ -3643,6 +3670,7 @@ def _arxiv_id_from_url(u: str) -> str | None:
}
)
except Exception as exc:
+ logger.warning("url fetch failed", error=str(exc))
results.append(
{
"url": url,
@@ -3675,6 +3703,7 @@ def _arxiv_id_from_url(u: str) -> str | None:
url=original_url,
)
except Exception as exc:
+ logger.warning("arxiv paper content fetch failed", error=str(exc))
for r in results:
if r.get("doi") == doi:
r["status"] = "arxiv_fetch_failed"
@@ -3685,14 +3714,14 @@ def _arxiv_id_from_url(u: str) -> str | None:
if r.get("doi") == doi:
r["status"] = "arxiv_no_content"
continue
- md = pc.metadata or {}
+ paper_meta: dict[str, Any] = pc.metadata or {}
p = Paper(
id=doi,
- title=md.get("title") or doi,
- authors=[Author(name=a) for a in (md.get("authors") or [])],
- year=md.get("year"),
+ title=paper_meta.get("title") or doi,
+ authors=[Author(name=a) for a in (paper_meta.get("authors") or [])],
+ year=paper_meta.get("year"),
doi=doi,
- abstract=pc.abstract or md.get("abstract"),
+ abstract=pc.abstract or paper_meta.get("abstract"),
full_text=pc.full_text,
source=PaperSource.OPENALEX,
content_type=pc.content_type,
@@ -3833,6 +3862,7 @@ async def build_capsules_for_kb(
counts[status] = counts.get(status, 0) + 1
per_paper.append({"paper_id": paper.id, **res})
except Exception as exc:
+ logger.warning("capsule build failed for paper", error=str(exc))
counts["errored"] += 1
per_paper.append({"paper_id": paper.id, "status": "errored", "error": str(exc)})
return {"total": len(rows), **counts, "per_paper": per_paper}
@@ -4009,6 +4039,7 @@ async def route_kbs(
method: str = "bm25",
top_k: int = 3,
score_threshold: float = 0.1,
+ kb_names: list[str] | None = None,
ctx: Context | None = None,
) -> dict:
"""Pick the most-relevant KBs for a query without actually running it.
@@ -4021,7 +4052,9 @@ async def route_kbs(
Args:
query: The research question.
candidate_kbs: Optional restricted list (KB names). ``None`` =
- consider every KB in the session store.
+ consider every KB in the session store. ``kb_names`` is accepted
+ as an alias (matching the param name every other multi-KB tool
+ uses); if both are given, ``candidate_kbs`` wins.
method: ``"bm25"`` (default, no LLM call) or ``"llm"`` (one
cheap LLM call scores every KB; better on semantic
mismatches).
@@ -4037,6 +4070,10 @@ async def route_kbs(
if isinstance(state, str):
return {"error": state}
+ # `kb_names` is an alias for `candidate_kbs` (every sibling tool names this
+ # arg `kb_names`, so agents reach for that first).
+ candidate_kbs = candidate_kbs or kb_names
+
all_kbs = await state.session_store.list_kbs()
if candidate_kbs:
wanted = set(candidate_kbs)
@@ -4373,6 +4410,7 @@ async def delete_knowledge_base(
await state.vector_store.delete_collection(kb.collection_name)
collection_dropped = True
except Exception as exc:
+ logger.warning("delete_kb collection drop failed", error=str(exc))
collection_error = str(exc)
deleted = await state.session_store.delete_kb_metadata(name)
logger.info(
@@ -4701,10 +4739,7 @@ async def ingest_skill_bundle(
# orchestrator parses URL strings itself.
source_arg: Path | str
candidate = Path(source)
- if candidate.exists():
- source_arg = candidate
- else:
- source_arg = source
+ source_arg = candidate if candidate.exists() else source
try:
summary = await ingest_skill_bundle_pipeline(
@@ -4859,7 +4894,8 @@ def _encode_cursor(start: int) -> str:
def _decode_cursor(cursor: str) -> int:
try:
return int(_base64.b64decode(cursor.encode()).decode())
- except Exception:
+ except Exception as exc:
+ logger.debug("cursor decode failed", error=str(exc))
return 0
@@ -5063,7 +5099,7 @@ async def zotero_get_paper_resources(
matched = [
it
for it in items
- if (it.get("data") or {}).get("DOI", "").lower().strip() == doi.lower().strip()
+ if (it.get("data") or {}).get("DOI", "").lower().strip() == (doi or "").lower().strip()
]
if not matched:
return {"error": "PAPER_NOT_FOUND", "message": f"DOI {doi} not in library"}
@@ -5085,7 +5121,7 @@ async def zotero_get_paper_resources(
return {"error": "ZOTERO_ERROR", "message": str(exc)}
item_doi = (zotero_item.get("data") or {}).get("DOI") or doi or ""
- item_key = zotero_item.get("key") or zotero_key
+ item_key = str(zotero_item.get("key") or zotero_key or "")
clf = LicenseClassifier()
async with httpx.AsyncClient() as http:
@@ -5163,6 +5199,8 @@ async def zotero_get_attachment_bytes(
# Fetch the attachment metadata first (filename, contentType, tags
# that may encode role_hint or license).
+ import httpx
+
c = await client._client()
try:
meta_r = await c.get(
@@ -5371,6 +5409,7 @@ async def fail(self, jid: str, err: str) -> None:
job_id="mcp-inline",
)
except Exception as exc:
+ logger.warning("zotero_ingest_collection_to_kb build failed", error=str(exc))
return {"error": str(exc)}
if reg.err is not None:
return {"error": reg.err}
@@ -5456,6 +5495,7 @@ async def web_search(
optimize_query=bool(optimize_query),
)
except Exception as exc:
+ logger.warning("web_search failed", error=str(exc))
return _json.dumps(
{
"papers": [],
@@ -5587,6 +5627,7 @@ async def search_by_passage(
try:
from perspicacite.retrieval.passage_search import search_passages
+ retriever: Any # MultiKBRetriever | DynamicKnowledgeBase — share no common typed base
# Multi-KB path
if kb_names and len(kb_names) > 1:
from perspicacite.retrieval.multi_kb import (
@@ -5748,6 +5789,7 @@ async def get_relevant_passages(
try:
from perspicacite.retrieval.passage_search import search_passages
+ retriever: Any # MultiKBRetriever | DynamicKnowledgeBase — share no common typed base
# Build retriever (same pattern as search_by_passage).
if kb_names and len(kb_names) > 1:
from perspicacite.retrieval.multi_kb import (
@@ -5940,7 +5982,7 @@ async def extract_parameters_from_passages(
dedup_key=lambda r: (r.get("name"), r.get("units")),
model=model,
)
- except asyncio.TimeoutError:
+ except TimeoutError:
logger.warning(
"mcp_extract_parameters_timeout",
n_passages=len(passage_objs),
@@ -6052,8 +6094,9 @@ async def extract_failure_modes_from_passages(
prompt = _FAILURE_EXTRACTION_PROMPT.format(context=context or "general")
# Hard cap: entire extraction (all batches) must finish within 80s.
- # Per-batch asyncio.wait_for(50s) doesn't help when LiteLLM/tenacity
- # retries eat CancelledError — the outer timeout is the real guard.
+ # Effective only because the LLM client's retry predicate lets
+ # CancelledError propagate (it no longer retries on cancellation); the
+ # per-batch wait_for(50s) and this outer cap both now bound wall-clock.
try:
async with asyncio.timeout(80.0):
records = await extract_structured(
@@ -6066,7 +6109,7 @@ async def extract_failure_modes_from_passages(
dedup_key=lambda r: (str(r.get("symptom", "")).strip().lower(),),
model=model,
)
- except asyncio.TimeoutError:
+ except TimeoutError:
logger.warning(
"mcp_extract_failure_modes_timeout",
n_passages=len(passage_objs),
@@ -6144,7 +6187,7 @@ async def extract_claims_from_passages(
adapter = None
if domains:
try:
- from indicium_adapters import discover_adapters, compose_adapters
+ from indicium_adapters import compose_adapters, discover_adapters
discovered = discover_adapters()
valid = [discovered[d] for d in domains if d in discovered]
adapter = compose_adapters(valid) if valid else None
@@ -6375,7 +6418,7 @@ async def query_claim_graph(
from perspicacite.indicium_layer import queries as _q
- _query_table = {
+ _query_table: dict[str, Callable[..., Any]] = {
"claims_supporting": _q.claims_supporting,
"claims_disputing": _q.claims_disputing,
"evidence_trace": _q.evidence_trace,
@@ -6531,7 +6574,8 @@ async def claim_graph_export(
raw_jld = store._g.serialize(format="json-ld")
try:
jld_list = json.loads(raw_jld) if isinstance(raw_jld, str) else raw_jld
- except Exception:
+ except Exception as exc:
+ logger.debug("claim_graph_export json-ld parse failed", error=str(exc))
jld_list = []
if format == "jsonld":
result = jld_list
@@ -6578,7 +6622,8 @@ async def claim_graph_export(
raw_jld = g.serialize(format="json-ld")
try:
jld_list = json.loads(raw_jld) if isinstance(raw_jld, str) else raw_jld
- except Exception:
+ except Exception as exc:
+ logger.debug("claim_graph_export json-ld parse failed", error=str(exc))
jld_list = []
if format == "jsonld":
result = jld_list
@@ -6737,7 +6782,7 @@ async def get_info() -> str:
# KB resources (Wave 5.1)
# =============================================================================
-from perspicacite.mcp import resources as _resources # noqa: E402
+from perspicacite.mcp import resources as _resources
@mcp.resource("perspicacite://kbs")
@@ -6768,7 +6813,7 @@ async def _kb_log_resource(name: str) -> str:
# Canned prompts (Wave 5.2)
# =============================================================================
-from perspicacite.mcp import prompts as _prompts # noqa: E402
+from perspicacite.mcp import prompts as _prompts
@mcp.prompt()
diff --git a/src/perspicacite/memory/session_store.py b/src/perspicacite/memory/session_store.py
index 65e34b2f..e5627d98 100644
--- a/src/perspicacite/memory/session_store.py
+++ b/src/perspicacite/memory/session_store.py
@@ -106,7 +106,8 @@ async def init_db(self) -> None:
"USING fts5(content, conversation_id UNINDEXED)"
)
self._fts_available = True
- except Exception:
+ except Exception as exc:
+ logger.debug("fts table unavailable", error=str(exc))
self._fts_available = False
if self._fts_available:
cur = await db.execute("SELECT count(*) FROM messages_fts")
@@ -149,10 +150,10 @@ async def get_conversation(self, conv_id: str) -> Conversation | None:
db.row_factory = aiosqlite.Row
# Get conversation
- row = await db.execute_fetchall(
+ row = list(await db.execute_fetchall(
"SELECT * FROM conversations WHERE id = ?",
(conv_id,),
- )
+ ))
if not row:
return None
@@ -304,7 +305,8 @@ async def search_conversations(self, query: str, limit: int = 20) -> list[dict[s
else:
raise RuntimeError("fts unavailable")
rows = await cur.fetchall()
- except Exception:
+ except Exception as exc:
+ logger.debug("fts search failed, falling back to like", error=str(exc))
like = f"%{q}%"
cur = await db.execute(
"SELECT conversation_id, substr(content, 1, 200) AS snippet "
@@ -358,10 +360,10 @@ async def get_kb_metadata(self, name: str) -> KnowledgeBase | None:
async with aiosqlite.connect(self.db_path) as db:
db.row_factory = aiosqlite.Row
- rows = await db.execute_fetchall(
+ rows = list(await db.execute_fetchall(
"SELECT * FROM kb_metadata WHERE name = ?",
(name,),
- )
+ ))
if not rows:
return None
@@ -516,7 +518,7 @@ async def delete_all_conversations(self) -> int:
"""
async with aiosqlite.connect(self.db_path) as db:
# Get count before deletion
- row = await db.execute_fetchall("SELECT COUNT(*) as count FROM conversations")
+ row = list(await db.execute_fetchall("SELECT COUNT(*) as count FROM conversations"))
count = row[0]["count"] if row else 0
# Purge the entire FTS index
diff --git a/src/perspicacite/models/documents.py b/src/perspicacite/models/documents.py
index c09f7ee2..9bc856e0 100644
--- a/src/perspicacite/models/documents.py
+++ b/src/perspicacite/models/documents.py
@@ -1,6 +1,6 @@
"""Document chunk models."""
-from typing import Any, Literal, Optional
+from typing import Any, Literal
from pydantic import BaseModel, Field
@@ -42,7 +42,7 @@ class ChunkMetadata(BaseModel):
symbol_name: str | None = None
symbol_kind: str | None = None # "function" | "class" | "method" | "cell" | "module"
parent_class: str | None = Field(
- None,
+ default=None,
description="If symbol_kind is a method, the enclosing class name. None otherwise.",
)
start_line: int | None = None # 1-indexed inclusive
@@ -63,7 +63,7 @@ class ChunkMetadata(BaseModel):
# round-trips through Chroma's scalar-only per-doc metadata. None
# for non-bundle papers. Decoded back to a dict at the retrieval
# boundary (see DynamicKnowledgeBase.search_two_pass).
- paper_metadata_json: Optional[str] = None
+ paper_metadata_json: str | None = None
def __repr__(self) -> str:
return (
diff --git a/src/perspicacite/models/papers.py b/src/perspicacite/models/papers.py
index 8433e789..01dbdb3d 100644
--- a/src/perspicacite/models/papers.py
+++ b/src/perspicacite/models/papers.py
@@ -1,10 +1,11 @@
"""Paper and document models."""
+import contextlib
from datetime import datetime
from enum import Enum
from typing import Any
-from pydantic import BaseModel, Field, field_validator
+from pydantic import BaseModel, Field, PrivateAttr, field_validator
class PaperSource(str, Enum):
@@ -104,6 +105,12 @@ class Paper(BaseModel):
license: str | None = None # OA license id from discovery (e.g. "cc-by")
metadata: dict[str, Any] = Field(default_factory=dict)
+ # Runtime-only tag (never serialized): which KB this Paper instance was
+ # loaded from. Set on enumeration paths (kb.py, cli.py) and read via
+ # getattr in fetch_orchestrator. Declared as a PrivateAttr so pydantic v2
+ # permits the assignment (plain attribute assignment would raise).
+ _kb_name: str | None = PrivateAttr(default=None)
+
@field_validator("year")
@classmethod
def validate_year(cls, v: int | None) -> int | None:
@@ -172,10 +179,8 @@ def from_bibtex(cls, entry: dict[str, Any]) -> "Paper":
year = None
year_str = entry.get("year")
if year_str:
- try:
+ with contextlib.suppress(ValueError):
year = int(year_str)
- except ValueError:
- pass
# Generate ID from DOI or PMID, or create from title
doi = entry.get("doi")
diff --git a/src/perspicacite/models/rag.py b/src/perspicacite/models/rag.py
index ffcff2f3..7178a7e3 100644
--- a/src/perspicacite/models/rag.py
+++ b/src/perspicacite/models/rag.py
@@ -338,6 +338,7 @@ class StreamEvent(BaseModel):
"figure_ref", # Figure-reference attachment (sub-project C, 2026-05-15)
"metadata", # Run summary: iteration_count, completion_reason
"diagnostic", # Early-return diagnostics: papers_searched, filtered_as_known, …
+ "revision", # Post-answer corrected text (e.g. copyright filter); client may render or ignore
]
data: str # JSON-encoded payload
diff --git a/src/perspicacite/models/search.py b/src/perspicacite/models/search.py
index 63ec6a64..26ca096d 100644
--- a/src/perspicacite/models/search.py
+++ b/src/perspicacite/models/search.py
@@ -57,7 +57,7 @@ class RetrievedChunk(BaseModel):
chunk: DocumentChunk
score: float = Field(le=1.0)
- retrieval_method: Literal["vector", "bm25", "hybrid"] = "vector"
+ retrieval_method: Literal["vector", "bm25", "hybrid", "reranked"] = "vector"
def __repr__(self) -> str:
return (
diff --git a/src/perspicacite/pipeline/asb/run_ingest.py b/src/perspicacite/pipeline/asb/run_ingest.py
index bcb1cd82..5de6442b 100644
--- a/src/perspicacite/pipeline/asb/run_ingest.py
+++ b/src/perspicacite/pipeline/asb/run_ingest.py
@@ -20,9 +20,10 @@
import json
import logging
-from datetime import datetime, timezone
+from collections.abc import Iterable
+from datetime import UTC, datetime
from pathlib import Path
-from typing import Any, Iterable
+from typing import Any
from perspicacite.pipeline.asb.card_parser import parse_cards
from perspicacite.pipeline.asb.chunk_producer import (
@@ -251,7 +252,8 @@ async def ingest_asb_run(
try:
import yaml
corpus = yaml.safe_load(corpus_path.read_text()) or {}
- except Exception:
+ except Exception as exc:
+ logger.debug("corpus.yaml parse failed extra=%s", {"error": str(exc)})
corpus = {}
known_dois = {
((p or {}).get("doi") or "").lower().strip()
@@ -400,7 +402,7 @@ def _kb_description(
def _now_iso() -> str:
- return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+ return datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
async def _make_or_get_kb(name: str, *, description: str = "", app_state: Any = None):
@@ -421,7 +423,7 @@ async def _make_or_get_kb(name: str, *, description: str = "", app_state: Any =
)
# Production path: re-use search_to_kb's _create_kb_if_missing pattern.
from perspicacite.pipeline.search_to_kb import _create_kb_if_missing
- kb_meta, _created = await _create_kb_if_missing(
+ _kb_meta, _created = await _create_kb_if_missing(
app_state, name, description,
)
# Construct the in-memory DynamicKnowledgeBase backed by that metadata.
diff --git a/src/perspicacite/pipeline/asb/skill_kb_writer.py b/src/perspicacite/pipeline/asb/skill_kb_writer.py
index 2dc4319c..9db29c97 100644
--- a/src/perspicacite/pipeline/asb/skill_kb_writer.py
+++ b/src/perspicacite/pipeline/asb/skill_kb_writer.py
@@ -6,7 +6,7 @@
from __future__ import annotations
import json
-from datetime import datetime, timezone
+from datetime import UTC, datetime
from pathlib import Path
@@ -52,8 +52,10 @@ def write_skill_kb_entries(
if " | " in rest:
tail = " | " + rest.split(" | ", 1)[1]
new_notes = (prefix + stamp + tail).strip()
- # Collapse leading " | " if prefix is empty
- new_notes = new_notes.lstrip(" | ")
+ # Collapse a leading " | " separator if prefix is empty. Use
+ # removeprefix (exact substring), not lstrip, which strips any
+ # leading run of ' ' and '|' chars — the classic multi-char footgun.
+ new_notes = new_notes.removeprefix(" | ")
data["notes"] = new_notes
else:
sep = " | " if original_notes else ""
@@ -65,4 +67,4 @@ def write_skill_kb_entries(
def _now_iso() -> str:
"""UTC RFC3339 timestamp (Z-suffixed)."""
- return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+ return datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
diff --git a/src/perspicacite/pipeline/asb/skill_parser.py b/src/perspicacite/pipeline/asb/skill_parser.py
index 96c94d9f..2022e0d2 100644
--- a/src/perspicacite/pipeline/asb/skill_parser.py
+++ b/src/perspicacite/pipeline/asb/skill_parser.py
@@ -66,10 +66,7 @@ def _parse_one_skill(*, skill_dir: Path, index_entry: dict) -> ParsedSkill:
tools_list = (
tools_raw.get("tools") if isinstance(tools_raw, dict) else tools_raw
) or []
- if isinstance(envs_raw, dict):
- envs_list = envs_raw.get("environments") or []
- else:
- envs_list = envs_raw or []
+ envs_list = envs_raw.get("environments") or [] if isinstance(envs_raw, dict) else envs_raw or []
if isinstance(params_raw, dict):
params_list = params_raw.get("parameters") or []
else:
@@ -78,14 +75,11 @@ def _parse_one_skill(*, skill_dir: Path, index_entry: dict) -> ParsedSkill:
papers_list = papers_raw.get("papers") or []
else:
papers_list = papers_raw or []
- if isinstance(links_raw, dict):
- links_list = links_raw.get("links") or []
- else:
- links_list = links_raw or []
+ links_list = links_raw.get("links") or [] if isinstance(links_raw, dict) else links_raw or []
return ParsedSkill(
- slug=slug,
- name=index_entry.get("name", slug),
+ slug=slug or "",
+ name=index_entry.get("name") or slug or "",
description=index_entry.get("description") or frontmatter.get("description") or "",
edam_operation=index_entry.get("edam_operation") or frontmatter.get("edam_operation"),
edam_topics=frontmatter.get("edam_topics") or [],
diff --git a/src/perspicacite/pipeline/bibtex_kb.py b/src/perspicacite/pipeline/bibtex_kb.py
index a512fb91..bcc054b7 100644
--- a/src/perspicacite/pipeline/bibtex_kb.py
+++ b/src/perspicacite/pipeline/bibtex_kb.py
@@ -2,6 +2,7 @@
from __future__ import annotations
+import contextlib
import re
from pathlib import Path
from typing import Any
@@ -215,9 +216,11 @@ async def enrich_papers_with_pdf(
local_path = _parse_local_file_field(paper.pdf_url)
if local_path:
try:
- result = await pdf_parser.parse(local_path)
- if result.text:
- paper.full_text = result.text
+ # pdf_parser.parse returns ParsedContent; keep a separate var
+ # so the later `result` (PaperContent) has its own type.
+ parsed_result = await pdf_parser.parse(local_path)
+ if parsed_result.text:
+ paper.full_text = parsed_result.text
stats["local_pdf"] += 1
continue
except Exception as ex:
@@ -353,10 +356,8 @@ async def create_kb_from_bibtex(
chunks_added = await dkb.add_papers(papers, include_full_text=True)
except Exception:
logger.exception("bibtex_kb_embed_failed", collection=collection_name)
- try:
+ with contextlib.suppress(Exception):
await vector_store.delete_collection(collection_name)
- except Exception:
- pass
import aiosqlite
async with aiosqlite.connect(session_store.db_path) as db:
diff --git a/src/perspicacite/pipeline/capsule_builder.py b/src/perspicacite/pipeline/capsule_builder.py
index 9112380a..dc20c845 100644
--- a/src/perspicacite/pipeline/capsule_builder.py
+++ b/src/perspicacite/pipeline/capsule_builder.py
@@ -306,8 +306,8 @@ async def build_capsule(
existing = json.loads(meta_path.read_text())
if existing.get("capsule_version", "0.0") >= app_state.config.capsule.min_version:
return {"status": "skipped", "capsule_dir": str(cap)}
- except Exception:
- pass # fall through and rebuild
+ except Exception as exc:
+ logger.debug("existing capsule metadata read failed", error=str(exc)) # fall through and rebuild
# 1. Parse PDF if available
text = ""
@@ -371,7 +371,8 @@ async def _write_supplementary_manifest(cap: Path, *, paper: Paper) -> int:
try:
from perspicacite.pipeline.download.supplementary import discover_supplementary
result = await discover_supplementary(doi)
- except Exception:
+ except Exception as exc:
+ logger.debug("supplementary discovery failed", error=str(exc))
return 0
items = result.get("items") or []
if not items:
diff --git a/src/perspicacite/pipeline/checkpoint.py b/src/perspicacite/pipeline/checkpoint.py
index ea3d36c7..fe188469 100644
--- a/src/perspicacite/pipeline/checkpoint.py
+++ b/src/perspicacite/pipeline/checkpoint.py
@@ -9,6 +9,7 @@
from __future__ import annotations
+import contextlib
import json
import os
import time
@@ -108,7 +109,5 @@ def save(self, state: CheckpointState) -> None:
os.replace(tmp, self.path)
def delete(self) -> None:
- try:
+ with contextlib.suppress(FileNotFoundError):
self.path.unlink()
- except FileNotFoundError:
- pass
diff --git a/src/perspicacite/pipeline/chunking.py b/src/perspicacite/pipeline/chunking.py
index 453139bb..3eddc137 100644
--- a/src/perspicacite/pipeline/chunking.py
+++ b/src/perspicacite/pipeline/chunking.py
@@ -117,7 +117,7 @@ def _chunk_by_semantic(
paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
chunks = []
- current_chunk = []
+ current_chunk: list[str] = []
current_length = 0
chunk_index = 0
@@ -202,7 +202,7 @@ def _chunk_by_section(
return _chunk_by_tokens(text, paper, config)
current_section = "Introduction"
- for i, part in enumerate(parts):
+ for _i, part in enumerate(parts):
if not part.strip():
continue
diff --git a/src/perspicacite/pipeline/chunking_advanced.py b/src/perspicacite/pipeline/chunking_advanced.py
index 8c86b56c..f6a53e14 100644
--- a/src/perspicacite/pipeline/chunking_advanced.py
+++ b/src/perspicacite/pipeline/chunking_advanced.py
@@ -31,25 +31,33 @@
logger = get_logger("perspicacite.pipeline.chunking_advanced")
# Optional imports for advanced features
+np: Any
try:
import numpy as np
-except Exception:
- np = None
+except Exception as exc:
+ logger.debug("numpy not available", error=str(exc))
+ np = None # optional dep fallback
+tiktoken: Any
try:
import tiktoken
-except Exception:
- tiktoken = None
+except Exception as exc:
+ logger.debug("tiktoken not available", error=str(exc))
+ tiktoken = None # optional dep fallback
+AutoTokenizer: Any
try:
from transformers import AutoTokenizer
-except Exception:
- AutoTokenizer = None
+except Exception as exc:
+ logger.debug("transformers not available", error=str(exc))
+ AutoTokenizer = None # optional dep fallback
+SentenceTransformer: Any
try:
from sentence_transformers import SentenceTransformer
-except Exception:
- SentenceTransformer = None
+except Exception as exc:
+ logger.debug("sentence_transformers not available", error=str(exc))
+ SentenceTransformer = None # optional dep fallback
# =============================================================================
@@ -207,7 +215,7 @@ def _split_sentences(text: str) -> list[str]:
text = text.replace("\r\n", "\n").replace("\r", "\n")
parts = [p.strip() for p in text.split("\n") if p.strip()]
sentences: list[str] = []
- buffer = []
+ buffer: list[str] = []
for part in parts:
chunks = re.split(r"(?<=[.!?])\s+", part)
@@ -384,7 +392,7 @@ def append_chunk(end_index: int):
cur_tokens = 0
cur_centroid = None
- for idx, (s, v) in enumerate(zip(sentences, sent_vecs)):
+ for idx, (s, v) in enumerate(zip(sentences, sent_vecs, strict=True)):
s_tokens = len(encode(s))
if not cur_sent_indices:
@@ -681,6 +689,7 @@ def _split_span_by_tokens(span_text: str) -> list[str]:
"Reply ONLY with the JSON object."
)
+ window_chunks: list[str] = [] # declared once here; assigned in both branches below
try:
response = await llm_client.complete(prompt, temperature=0.0, max_tokens=1024)
except Exception as e:
@@ -704,41 +713,43 @@ def _split_span_by_tokens(span_text: str) -> list[str]:
for sp in raw_spans:
if not isinstance(sp, dict):
continue
- s = sp.get("start")
- e = sp.get("end")
- if isinstance(s, int) and isinstance(e, int):
- spans.append((s, e))
- except Exception as e:
- logger.warning(f"chunking: failed to parse agentic spans: {e}")
+ sp_s = sp.get("start")
+ sp_e = sp.get("end")
+ if isinstance(sp_s, int) and isinstance(sp_e, int):
+ spans.append((sp_s, sp_e))
+ except Exception as parse_err:
+ # Python 3 unbinds the except-variable at block exit, so use a
+ # dedicated name here instead of reusing a local such as `e`.
+ logger.warning(f"chunking: failed to parse agentic spans: {parse_err}")
# Normalize spans → sorted, clipped, contiguous
norm_spans: list[tuple[int, int]] = []
if spans:
spans.sort(key=lambda x: (x[0], x[1]))
cur = 0
- for s, e in spans:
- s = max(0, min(s, wlen))
- e = max(0, min(e, wlen))
- if e < s:
- s, e = e, s
- if s > cur:
- norm_spans.append((cur, s))
- cur = s
- if e > cur:
- norm_spans.append((cur, e))
- cur = e
+ for ns, ne in spans:
+ ns = max(0, min(ns, wlen))
+ ne = max(0, min(ne, wlen))
+ if ne < ns:
+ ns, ne = ne, ns
+ if ns > cur:
+ norm_spans.append((cur, ns))
+ cur = ns
+ if ne > cur:
+ norm_spans.append((cur, ne))
+ cur = ne
if cur < wlen:
norm_spans.append((cur, wlen))
else:
norm_spans = [(0, wlen)]
# Enforce token constraints: merge small, split large
- window_chunks: list[str] = []
+ window_chunks = []
buffer = pending_small if pending_small else ""
pending_small = ""
- for s, e in norm_spans:
- span_txt = win_text[s:e]
+ for rs, re_ in norm_spans:
+ span_txt = win_text[rs:re_]
candidate = (buffer + (" " if buffer and span_txt else "") + span_txt).strip()
if not candidate:
continue
diff --git a/src/perspicacite/pipeline/chunking_code.py b/src/perspicacite/pipeline/chunking_code.py
index 31d6ab1d..42649f62 100644
--- a/src/perspicacite/pipeline/chunking_code.py
+++ b/src/perspicacite/pipeline/chunking_code.py
@@ -14,9 +14,12 @@
import re
from typing import Any
+from perspicacite.logging import get_logger
from perspicacite.models.documents import ChunkMetadata, DocumentChunk
from perspicacite.models.papers import Paper
+logger = get_logger("perspicacite.pipeline.chunking_code")
+
_DOCSTRING_MAX = 500
# When a class' source segment exceeds this many characters we additionally
@@ -72,9 +75,8 @@ def _chunk_python_ast(
if isinstance(node, ast.Import):
for alias in node.names:
imports.append(alias.name.split(".")[0])
- elif isinstance(node, ast.ImportFrom):
- if node.module:
- imports.append(node.module.split(".")[0])
+ elif isinstance(node, ast.ImportFrom) and node.module:
+ imports.append(node.module.split(".")[0])
chunks: list[DocumentChunk] = []
base_id = paper.id
@@ -306,7 +308,8 @@ def _chunk_notebook(text: str, paper: Paper, *, file_path: str) -> list[Document
importlib.import_module("tree_sitter_languages")
HAS_TREE_SITTER = True
-except Exception:
+except Exception as exc:
+ logger.debug("tree_sitter_languages not available", error=str(exc))
HAS_TREE_SITTER = False
@@ -337,15 +340,17 @@ def _chunk_treesitter(
return None
try:
from tree_sitter_languages import get_parser # type: ignore
- except Exception:
+ except ImportError:
return None
try:
parser = get_parser(language)
- except Exception:
+ except Exception as exc:
+ logger.debug("treesitter_get_parser_failed", language=language, error=str(exc))
return None
try:
tree = parser.parse(text.encode("utf-8"))
- except Exception:
+ except Exception as exc:
+ logger.debug("treesitter_parse_failed", language=language, error=str(exc))
return None
lines = text.splitlines()
@@ -402,7 +407,7 @@ def _ts_extract_name(node: Any) -> str | None:
if child.type in ("identifier", "type_identifier", "name"):
try:
return child.text.decode("utf-8")
- except Exception:
+ except (AttributeError, UnicodeDecodeError):
return None
# Some grammars nest the name one level deeper (e.g. function_declarator).
for child in node.children:
diff --git a/src/perspicacite/pipeline/chunking_dispatch.py b/src/perspicacite/pipeline/chunking_dispatch.py
index be188396..1a011c1e 100644
--- a/src/perspicacite/pipeline/chunking_dispatch.py
+++ b/src/perspicacite/pipeline/chunking_dispatch.py
@@ -16,7 +16,7 @@
import hashlib
import re
from pathlib import Path
-from typing import Any
+from typing import Any, Literal, cast
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
@@ -200,7 +200,12 @@ def _to_chunk_config(config: Any) -> ChunkConfig:
if method not in {"token", "semantic", "agentic", "section_aware"}:
method = "token"
chunk_size, chunk_overlap = _chunk_size_overlap(config)
- return ChunkConfig(method=method, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+ # cast: method is validated against the allowed set above
+ return ChunkConfig(
+ method=cast("Literal['token', 'semantic', 'agentic', 'section_aware']", method),
+ chunk_size=chunk_size,
+ chunk_overlap=chunk_overlap,
+ )
async def chunk_document(
diff --git a/src/perspicacite/pipeline/cite_graph.py b/src/perspicacite/pipeline/cite_graph.py
index 28901bc0..0b221bef 100644
--- a/src/perspicacite/pipeline/cite_graph.py
+++ b/src/perspicacite/pipeline/cite_graph.py
@@ -18,8 +18,11 @@
from dataclasses import dataclass, field
from perspicacite.config.schema import CiteGraphConfig
+from perspicacite.logging import get_logger
from perspicacite.pipeline.external.fetch_github import fetch_github_repo
+logger = get_logger("perspicacite.pipeline.cite_graph")
+
_GITHUB_REPO_RE = re.compile(r"github\.com/([\w.-]+/[\w.-]+)", re.IGNORECASE)
@@ -215,7 +218,10 @@ def apply_cite_graph_filters(
OPENALEX_BASE,
_fetch_seed_work,
fetch_cited_by_works,
- openalex_id_for_doi,
+ # Re-exported into this module's namespace so the DOI-resolution path is
+ # patchable/observable from tests (test_cite_graph_openalex_id asserts the
+ # openalex_id path skips it). Not called directly in this module's body.
+ openalex_id_for_doi, # noqa: F401
)
@@ -297,7 +303,8 @@ def _hit_from_oa_work(work: dict) -> CiteHit | None:
positions.append((i, word))
positions.sort()
abstract = " ".join(w for _, w in positions)
- except Exception:
+ except Exception as exc:
+ logger.debug("abstract inverted-index reconstruction failed", error=str(exc))
abstract = None
return CiteHit(
doi=doi, title=title, year=year, venue=venue,
@@ -421,7 +428,8 @@ async def enrich_kb_from_cite_graph(
if isinstance(blob, dict):
scripts = (blob.get("scripts") or [])[:3]
h.scripts = list(scripts)
- except Exception:
+ except Exception as exc:
+ logger.debug("github_repo_fetch_failed", doi=h.doi, error=str(exc))
continue
return top
diff --git a/src/perspicacite/pipeline/download/__init__.py b/src/perspicacite/pipeline/download/__init__.py
index 2c96ae19..16dc3e74 100644
--- a/src/perspicacite/pipeline/download/__init__.py
+++ b/src/perspicacite/pipeline/download/__init__.py
@@ -48,29 +48,29 @@
from .unpaywall import get_open_access_url
__all__ = [
- # Unified pipeline (preferred)
- "retrieve_paper_content",
+ "ContentResult",
+ "DownloadResult",
+ "PDFDownloader",
"PaperContent",
"PaperDiscovery",
- # Legacy (will be removed after full migration)
- "get_pdf_with_fallback",
+ "aaas",
+ "acs",
+ "alternative",
+ "arxiv",
+ "elsevier",
"get_content_with_fallback",
# Common utilities
"get_open_access_url",
"get_pdf_from_alternative_endpoint",
- "DownloadResult",
- "ContentResult",
- "PDFDownloader",
+ # Legacy (will be removed after full migration)
+ "get_pdf_with_fallback",
+ "openalex_oa",
+ "pmc",
+ # Unified pipeline (preferred)
+ "retrieve_paper_content",
+ "rsc",
+ "springer",
# Publisher modules
"unpaywall",
- "arxiv",
"wiley",
- "elsevier",
- "aaas",
- "acs",
- "rsc",
- "springer",
- "alternative",
- "openalex_oa",
- "pmc",
]
diff --git a/src/perspicacite/pipeline/download/alternative.py b/src/perspicacite/pipeline/download/alternative.py
index e3ed6676..2a6fa613 100644
--- a/src/perspicacite/pipeline/download/alternative.py
+++ b/src/perspicacite/pipeline/download/alternative.py
@@ -59,8 +59,9 @@ async def download_from_alternative_endpoint(
# Look for PDF in tags
embeds = soup.find_all("embed", type="application/pdf")
for embed in embeds:
- src = embed.get("src")
- if src:
+ src_raw = embed.get("src")
+ if src_raw:
+ src: str = str(src_raw) # BeautifulSoup attribute values type as str|AttributeValueList
pdf_url = src if src.startswith(("http:", "https:")) else urljoin(url, src)
logger.info("alternative_endpoint_pdf_found", source="embed", url=pdf_url)
pdf_response = await client.get(pdf_url)
@@ -70,18 +71,20 @@ async def download_from_alternative_endpoint(
# Look for PDF in