Skip to content

Commit dbe3fdb

Browse files
Merge pull request #31 from PromptExecution/codex-issue-23-document-queue
feat: add document inventory queue
2 parents 8d66bb7 + 59dd2cf commit dbe3fdb

6 files changed

Lines changed: 494 additions & 10 deletions

File tree

.codex

Whitespace-only changes.

crates/ledgerr-mcp/src/contract.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ pub const PUBLISHED_TOOLS: [ToolContractSpec; 7] = [
4848
"ingest_pdf",
4949
"ingest_rows",
5050
"get_raw_context",
51+
"document_inventory",
5152
],
5253
},
5354
ToolContractSpec {
@@ -254,6 +255,13 @@ pub enum DocumentsArgs {
254255
workbook_path: PathBuf,
255256
rows: Vec<TransportRow>,
256257
},
258+
DocumentInventory {
259+
directory: PathBuf,
260+
#[serde(default)]
261+
recursive: bool,
262+
#[serde(default, skip_serializing_if = "Vec::is_empty")]
263+
statuses: Vec<String>,
264+
},
257265
}
258266

259267
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)]

crates/ledgerr-mcp/src/lib.rs

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,59 @@ pub struct GetRawContextResponse {
9494
pub bytes: Vec<u8>,
9595
}
9696

97+
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
98+
pub enum DocumentQueueStatusRequest {
99+
InvalidName,
100+
Ready,
101+
Ingested,
102+
}
103+
104+
impl DocumentQueueStatusRequest {
105+
pub fn as_str(self) -> &'static str {
106+
match self {
107+
Self::InvalidName => "invalid_name",
108+
Self::Ready => "ready",
109+
Self::Ingested => "ingested",
110+
}
111+
}
112+
113+
pub fn parse(value: &str) -> Option<Self> {
114+
match value {
115+
"invalid_name" => Some(Self::InvalidName),
116+
"ready" => Some(Self::Ready),
117+
"ingested" => Some(Self::Ingested),
118+
_ => None,
119+
}
120+
}
121+
}
122+
123+
#[derive(Debug, Clone, PartialEq, Eq)]
124+
pub struct DocumentInventoryRequest {
125+
pub directory: PathBuf,
126+
pub recursive: bool,
127+
pub statuses: Vec<DocumentQueueStatusRequest>,
128+
}
129+
130+
#[derive(Debug, Clone, PartialEq, Eq)]
131+
pub struct DocumentRecordResponse {
132+
pub file_name: String,
133+
pub document_path: String,
134+
pub raw_context_ref: String,
135+
pub status: DocumentQueueStatusRequest,
136+
pub blocked_reason: Option<String>,
137+
pub next_hint: String,
138+
pub vendor: Option<String>,
139+
pub account_id: Option<String>,
140+
pub year_month: Option<String>,
141+
pub document_type: Option<String>,
142+
pub ingested_tx_ids: Vec<String>,
143+
}
144+
145+
#[derive(Debug, Clone, PartialEq, Eq)]
146+
pub struct DocumentInventoryResponse {
147+
pub documents: Vec<DocumentRecordResponse>,
148+
}
149+
97150
#[derive(Debug, Clone, PartialEq, Eq)]
98151
pub struct SampleTxRequest {
99152
pub tx_id: String,
@@ -290,6 +343,10 @@ impl From<FilenameError> for ToolError {
290343

291344
pub trait TurboLedgerTools {
292345
fn list_accounts(&self) -> Result<Vec<AccountSummary>, ToolError>;
346+
fn document_inventory(
347+
&self,
348+
request: DocumentInventoryRequest,
349+
) -> Result<DocumentInventoryResponse, ToolError>;
293350
fn validate_source_filename(&self, file_name: &str) -> Result<StatementFilename, ToolError>;
294351
fn ingest_statement_rows(
295352
&self,
@@ -453,6 +510,13 @@ impl TurboLedgerService {
453510
})
454511
}
455512

513+
pub fn document_inventory_tool(
514+
&self,
515+
request: DocumentInventoryRequest,
516+
) -> Result<DocumentInventoryResponse, ToolError> {
517+
self.document_inventory(request)
518+
}
519+
456520
pub fn ontology_upsert_entities(
457521
&self,
458522
request: OntologyUpsertEntitiesRequest,
@@ -940,6 +1004,39 @@ impl TurboLedgerTools for TurboLedgerService {
9401004
Ok(out)
9411005
}
9421006

1007+
fn document_inventory(
1008+
&self,
1009+
request: DocumentInventoryRequest,
1010+
) -> Result<DocumentInventoryResponse, ToolError> {
1011+
// Queue discovery is intentionally derived on demand from the filesystem plus
1012+
// known ingested artifacts. That keeps the first cut deterministic and avoids
1013+
// introducing claim/prioritization state before the queue semantics settle.
1014+
let directory =
1015+
resolve_document_inventory_directory(self.workbook_path(), &request.directory)?;
1016+
let known_source_refs = self
1017+
.classification_state
1018+
.lock()
1019+
.map_err(|_| ToolError::Internal("classification lock poisoned".to_string()))?
1020+
.tx_rows
1021+
.iter()
1022+
.map(|(tx_id, row)| (tx_id.clone(), PathBuf::from(&row.source_ref)))
1023+
.collect::<Vec<_>>();
1024+
let mut documents = collect_document_paths(&directory, request.recursive)?
1025+
.into_iter()
1026+
.map(|path| build_document_record(self, &known_source_refs, path))
1027+
.collect::<Result<Vec<_>, _>>()?;
1028+
documents.retain(|document| {
1029+
request.statuses.is_empty() || request.statuses.contains(&document.status)
1030+
});
1031+
documents.sort_by(|left, right| {
1032+
document_status_rank(left.status)
1033+
.cmp(&document_status_rank(right.status))
1034+
.then_with(|| left.file_name.cmp(&right.file_name))
1035+
.then_with(|| left.document_path.cmp(&right.document_path))
1036+
});
1037+
Ok(DocumentInventoryResponse { documents })
1038+
}
1039+
9431040
fn validate_source_filename(&self, file_name: &str) -> Result<StatementFilename, ToolError> {
9441041
Ok(StatementFilename::parse(file_name)?)
9451042
}
@@ -1621,6 +1718,158 @@ fn now_timestamp() -> String {
16211718
}
16221719
}
16231720

1721+
fn resolve_document_inventory_directory(
1722+
workbook_path: &std::path::Path,
1723+
directory: &std::path::Path,
1724+
) -> Result<PathBuf, ToolError> {
1725+
if directory
1726+
.components()
1727+
.any(|component| component == std::path::Component::ParentDir)
1728+
{
1729+
return Err(ToolError::InvalidInput(
1730+
"directory must not contain parent traversal components".to_string(),
1731+
));
1732+
}
1733+
1734+
let resolved = if directory.is_absolute() {
1735+
directory.to_path_buf()
1736+
} else {
1737+
let base = workbook_path
1738+
.parent()
1739+
.filter(|parent| !parent.as_os_str().is_empty())
1740+
.map(std::path::Path::to_path_buf)
1741+
.unwrap_or(std::env::current_dir().map_err(|e| ToolError::Internal(e.to_string()))?);
1742+
base.join(directory)
1743+
};
1744+
1745+
if !resolved.is_dir() {
1746+
return Err(ToolError::InvalidInput(format!(
1747+
"directory '{}' does not exist or is not a directory",
1748+
resolved.display()
1749+
)));
1750+
}
1751+
Ok(resolved)
1752+
}
1753+
1754+
fn collect_document_paths(
1755+
directory: &std::path::Path,
1756+
recursive: bool,
1757+
) -> Result<Vec<PathBuf>, ToolError> {
1758+
let mut documents = Vec::new();
1759+
collect_document_paths_into(directory, recursive, &mut documents)?;
1760+
documents.sort();
1761+
Ok(documents)
1762+
}
1763+
1764+
fn collect_document_paths_into(
1765+
directory: &std::path::Path,
1766+
recursive: bool,
1767+
documents: &mut Vec<PathBuf>,
1768+
) -> Result<(), ToolError> {
1769+
let mut entries = std::fs::read_dir(directory)
1770+
.map_err(|e| ToolError::Internal(e.to_string()))?
1771+
.collect::<Result<Vec<_>, _>>()
1772+
.map_err(|e| ToolError::Internal(e.to_string()))?;
1773+
entries.sort_by_key(|entry| entry.path());
1774+
1775+
for entry in entries {
1776+
let path = entry.path();
1777+
if path.is_dir() {
1778+
if recursive {
1779+
collect_document_paths_into(&path, true, documents)?;
1780+
}
1781+
continue;
1782+
}
1783+
let is_pdf = path
1784+
.extension()
1785+
.and_then(|ext| ext.to_str())
1786+
.map(|ext| ext.eq_ignore_ascii_case("pdf"))
1787+
.unwrap_or(false);
1788+
if is_pdf {
1789+
documents.push(path);
1790+
}
1791+
}
1792+
1793+
Ok(())
1794+
}
1795+
1796+
fn build_document_record(
1797+
service: &TurboLedgerService,
1798+
known_source_refs: &[(String, PathBuf)],
1799+
document_path: PathBuf,
1800+
) -> Result<DocumentRecordResponse, ToolError> {
1801+
let file_name = document_path
1802+
.file_name()
1803+
.and_then(|name| name.to_str())
1804+
.ok_or_else(|| {
1805+
ToolError::InvalidInput("document path must have a UTF-8 filename".to_string())
1806+
})?
1807+
.to_string();
1808+
1809+
match service.validate_source_filename(&file_name) {
1810+
Ok(parsed) => {
1811+
let raw_context_ref = document_path.with_extension("rkyv");
1812+
let ingested_tx_ids = known_source_refs
1813+
.iter()
1814+
.filter(|(_, source_ref)| source_ref_matches(source_ref, &raw_context_ref))
1815+
.map(|(tx_id, _)| tx_id.clone())
1816+
.collect::<Vec<_>>();
1817+
let status = if ingested_tx_ids.is_empty() {
1818+
DocumentQueueStatusRequest::Ready
1819+
} else {
1820+
DocumentQueueStatusRequest::Ingested
1821+
};
1822+
1823+
Ok(DocumentRecordResponse {
1824+
file_name,
1825+
document_path: document_path.display().to_string(),
1826+
raw_context_ref: raw_context_ref.display().to_string(),
1827+
status,
1828+
blocked_reason: None,
1829+
next_hint: if ingested_tx_ids.is_empty() {
1830+
"call_proxy_ingest_pdf".to_string()
1831+
} else {
1832+
"review_existing_rows".to_string()
1833+
},
1834+
vendor: Some(parsed.vendor),
1835+
account_id: Some(parsed.account),
1836+
year_month: Some(format!("{:04}-{:02}", parsed.year, parsed.month)),
1837+
document_type: Some(parsed.doc_type),
1838+
ingested_tx_ids,
1839+
})
1840+
}
1841+
Err(_) => Ok(DocumentRecordResponse {
1842+
file_name,
1843+
document_path: document_path.display().to_string(),
1844+
raw_context_ref: document_path.with_extension("rkyv").display().to_string(),
1845+
status: DocumentQueueStatusRequest::InvalidName,
1846+
blocked_reason: Some("invalid_contract_name".to_string()),
1847+
next_hint: "rename_then_retry".to_string(),
1848+
vendor: None,
1849+
account_id: None,
1850+
year_month: None,
1851+
document_type: None,
1852+
ingested_tx_ids: Vec::new(),
1853+
}),
1854+
}
1855+
}
1856+
1857+
fn source_ref_matches(source_ref: &std::path::Path, expected: &std::path::Path) -> bool {
1858+
let source_canonical = std::fs::canonicalize(source_ref).ok();
1859+
let expected_canonical = std::fs::canonicalize(expected).ok();
1860+
source_canonical.as_ref() == expected_canonical.as_ref()
1861+
|| source_ref == expected
1862+
|| source_ref.file_name() == expected.file_name()
1863+
}
1864+
1865+
fn document_status_rank(status: DocumentQueueStatusRequest) -> u8 {
1866+
match status {
1867+
DocumentQueueStatusRequest::Ingested => 0,
1868+
DocumentQueueStatusRequest::Ready => 1,
1869+
DocumentQueueStatusRequest::InvalidName => 2,
1870+
}
1871+
}
1872+
16241873
fn persisted_state_path(workbook_path: &std::path::Path) -> PathBuf {
16251874
PathBuf::from(format!("{}.ledgerr-state.json", workbook_path.display()))
16261875
}

0 commit comments

Comments
 (0)