@@ -94,6 +94,59 @@ pub struct GetRawContextResponse {
9494 pub bytes : Vec < u8 > ,
9595}
9696
97+ #[ derive( Debug , Clone , Copy , PartialEq , Eq , PartialOrd , Ord ) ]
98+ pub enum DocumentQueueStatusRequest {
99+ InvalidName ,
100+ Ready ,
101+ Ingested ,
102+ }
103+
104+ impl DocumentQueueStatusRequest {
105+ pub fn as_str ( self ) -> & ' static str {
106+ match self {
107+ Self :: InvalidName => "invalid_name" ,
108+ Self :: Ready => "ready" ,
109+ Self :: Ingested => "ingested" ,
110+ }
111+ }
112+
113+ pub fn parse ( value : & str ) -> Option < Self > {
114+ match value {
115+ "invalid_name" => Some ( Self :: InvalidName ) ,
116+ "ready" => Some ( Self :: Ready ) ,
117+ "ingested" => Some ( Self :: Ingested ) ,
118+ _ => None ,
119+ }
120+ }
121+ }
122+
123+ #[ derive( Debug , Clone , PartialEq , Eq ) ]
124+ pub struct DocumentInventoryRequest {
125+ pub directory : PathBuf ,
126+ pub recursive : bool ,
127+ pub statuses : Vec < DocumentQueueStatusRequest > ,
128+ }
129+
130+ #[ derive( Debug , Clone , PartialEq , Eq ) ]
131+ pub struct DocumentRecordResponse {
132+ pub file_name : String ,
133+ pub document_path : String ,
134+ pub raw_context_ref : String ,
135+ pub status : DocumentQueueStatusRequest ,
136+ pub blocked_reason : Option < String > ,
137+ pub next_hint : String ,
138+ pub vendor : Option < String > ,
139+ pub account_id : Option < String > ,
140+ pub year_month : Option < String > ,
141+ pub document_type : Option < String > ,
142+ pub ingested_tx_ids : Vec < String > ,
143+ }
144+
145+ #[ derive( Debug , Clone , PartialEq , Eq ) ]
146+ pub struct DocumentInventoryResponse {
147+ pub documents : Vec < DocumentRecordResponse > ,
148+ }
149+
97150#[ derive( Debug , Clone , PartialEq , Eq ) ]
98151pub struct SampleTxRequest {
99152 pub tx_id : String ,
@@ -290,6 +343,10 @@ impl From<FilenameError> for ToolError {
290343
291344pub trait TurboLedgerTools {
292345 fn list_accounts ( & self ) -> Result < Vec < AccountSummary > , ToolError > ;
346+ fn document_inventory (
347+ & self ,
348+ request : DocumentInventoryRequest ,
349+ ) -> Result < DocumentInventoryResponse , ToolError > ;
293350 fn validate_source_filename ( & self , file_name : & str ) -> Result < StatementFilename , ToolError > ;
294351 fn ingest_statement_rows (
295352 & self ,
@@ -453,6 +510,13 @@ impl TurboLedgerService {
453510 } )
454511 }
455512
513+ pub fn document_inventory_tool (
514+ & self ,
515+ request : DocumentInventoryRequest ,
516+ ) -> Result < DocumentInventoryResponse , ToolError > {
517+ self . document_inventory ( request)
518+ }
519+
456520 pub fn ontology_upsert_entities (
457521 & self ,
458522 request : OntologyUpsertEntitiesRequest ,
@@ -940,6 +1004,39 @@ impl TurboLedgerTools for TurboLedgerService {
9401004 Ok ( out)
9411005 }
9421006
1007+ fn document_inventory (
1008+ & self ,
1009+ request : DocumentInventoryRequest ,
1010+ ) -> Result < DocumentInventoryResponse , ToolError > {
1011+ // Queue discovery is intentionally derived on demand from the filesystem plus
1012+ // known ingested artifacts. That keeps the first cut deterministic and avoids
1013+ // introducing claim/prioritization state before the queue semantics settle.
1014+ let directory =
1015+ resolve_document_inventory_directory ( self . workbook_path ( ) , & request. directory ) ?;
1016+ let known_source_refs = self
1017+ . classification_state
1018+ . lock ( )
1019+ . map_err ( |_| ToolError :: Internal ( "classification lock poisoned" . to_string ( ) ) ) ?
1020+ . tx_rows
1021+ . iter ( )
1022+ . map ( |( tx_id, row) | ( tx_id. clone ( ) , PathBuf :: from ( & row. source_ref ) ) )
1023+ . collect :: < Vec < _ > > ( ) ;
1024+ let mut documents = collect_document_paths ( & directory, request. recursive ) ?
1025+ . into_iter ( )
1026+ . map ( |path| build_document_record ( self , & known_source_refs, path) )
1027+ . collect :: < Result < Vec < _ > , _ > > ( ) ?;
1028+ documents. retain ( |document| {
1029+ request. statuses . is_empty ( ) || request. statuses . contains ( & document. status )
1030+ } ) ;
1031+ documents. sort_by ( |left, right| {
1032+ document_status_rank ( left. status )
1033+ . cmp ( & document_status_rank ( right. status ) )
1034+ . then_with ( || left. file_name . cmp ( & right. file_name ) )
1035+ . then_with ( || left. document_path . cmp ( & right. document_path ) )
1036+ } ) ;
1037+ Ok ( DocumentInventoryResponse { documents } )
1038+ }
1039+
9431040 fn validate_source_filename ( & self , file_name : & str ) -> Result < StatementFilename , ToolError > {
9441041 Ok ( StatementFilename :: parse ( file_name) ?)
9451042 }
@@ -1621,6 +1718,158 @@ fn now_timestamp() -> String {
16211718 }
16221719}
16231720
1721+ fn resolve_document_inventory_directory (
1722+ workbook_path : & std:: path:: Path ,
1723+ directory : & std:: path:: Path ,
1724+ ) -> Result < PathBuf , ToolError > {
1725+ if directory
1726+ . components ( )
1727+ . any ( |component| component == std:: path:: Component :: ParentDir )
1728+ {
1729+ return Err ( ToolError :: InvalidInput (
1730+ "directory must not contain parent traversal components" . to_string ( ) ,
1731+ ) ) ;
1732+ }
1733+
1734+ let resolved = if directory. is_absolute ( ) {
1735+ directory. to_path_buf ( )
1736+ } else {
1737+ let base = workbook_path
1738+ . parent ( )
1739+ . filter ( |parent| !parent. as_os_str ( ) . is_empty ( ) )
1740+ . map ( std:: path:: Path :: to_path_buf)
1741+ . unwrap_or ( std:: env:: current_dir ( ) . map_err ( |e| ToolError :: Internal ( e. to_string ( ) ) ) ?) ;
1742+ base. join ( directory)
1743+ } ;
1744+
1745+ if !resolved. is_dir ( ) {
1746+ return Err ( ToolError :: InvalidInput ( format ! (
1747+ "directory '{}' does not exist or is not a directory" ,
1748+ resolved. display( )
1749+ ) ) ) ;
1750+ }
1751+ Ok ( resolved)
1752+ }
1753+
1754+ fn collect_document_paths (
1755+ directory : & std:: path:: Path ,
1756+ recursive : bool ,
1757+ ) -> Result < Vec < PathBuf > , ToolError > {
1758+ let mut documents = Vec :: new ( ) ;
1759+ collect_document_paths_into ( directory, recursive, & mut documents) ?;
1760+ documents. sort ( ) ;
1761+ Ok ( documents)
1762+ }
1763+
1764+ fn collect_document_paths_into (
1765+ directory : & std:: path:: Path ,
1766+ recursive : bool ,
1767+ documents : & mut Vec < PathBuf > ,
1768+ ) -> Result < ( ) , ToolError > {
1769+ let mut entries = std:: fs:: read_dir ( directory)
1770+ . map_err ( |e| ToolError :: Internal ( e. to_string ( ) ) ) ?
1771+ . collect :: < Result < Vec < _ > , _ > > ( )
1772+ . map_err ( |e| ToolError :: Internal ( e. to_string ( ) ) ) ?;
1773+ entries. sort_by_key ( |entry| entry. path ( ) ) ;
1774+
1775+ for entry in entries {
1776+ let path = entry. path ( ) ;
1777+ if path. is_dir ( ) {
1778+ if recursive {
1779+ collect_document_paths_into ( & path, true , documents) ?;
1780+ }
1781+ continue ;
1782+ }
1783+ let is_pdf = path
1784+ . extension ( )
1785+ . and_then ( |ext| ext. to_str ( ) )
1786+ . map ( |ext| ext. eq_ignore_ascii_case ( "pdf" ) )
1787+ . unwrap_or ( false ) ;
1788+ if is_pdf {
1789+ documents. push ( path) ;
1790+ }
1791+ }
1792+
1793+ Ok ( ( ) )
1794+ }
1795+
1796+ fn build_document_record (
1797+ service : & TurboLedgerService ,
1798+ known_source_refs : & [ ( String , PathBuf ) ] ,
1799+ document_path : PathBuf ,
1800+ ) -> Result < DocumentRecordResponse , ToolError > {
1801+ let file_name = document_path
1802+ . file_name ( )
1803+ . and_then ( |name| name. to_str ( ) )
1804+ . ok_or_else ( || {
1805+ ToolError :: InvalidInput ( "document path must have a UTF-8 filename" . to_string ( ) )
1806+ } ) ?
1807+ . to_string ( ) ;
1808+
1809+ match service. validate_source_filename ( & file_name) {
1810+ Ok ( parsed) => {
1811+ let raw_context_ref = document_path. with_extension ( "rkyv" ) ;
1812+ let ingested_tx_ids = known_source_refs
1813+ . iter ( )
1814+ . filter ( |( _, source_ref) | source_ref_matches ( source_ref, & raw_context_ref) )
1815+ . map ( |( tx_id, _) | tx_id. clone ( ) )
1816+ . collect :: < Vec < _ > > ( ) ;
1817+ let status = if ingested_tx_ids. is_empty ( ) {
1818+ DocumentQueueStatusRequest :: Ready
1819+ } else {
1820+ DocumentQueueStatusRequest :: Ingested
1821+ } ;
1822+
1823+ Ok ( DocumentRecordResponse {
1824+ file_name,
1825+ document_path : document_path. display ( ) . to_string ( ) ,
1826+ raw_context_ref : raw_context_ref. display ( ) . to_string ( ) ,
1827+ status,
1828+ blocked_reason : None ,
1829+ next_hint : if ingested_tx_ids. is_empty ( ) {
1830+ "call_proxy_ingest_pdf" . to_string ( )
1831+ } else {
1832+ "review_existing_rows" . to_string ( )
1833+ } ,
1834+ vendor : Some ( parsed. vendor ) ,
1835+ account_id : Some ( parsed. account ) ,
1836+ year_month : Some ( format ! ( "{:04}-{:02}" , parsed. year, parsed. month) ) ,
1837+ document_type : Some ( parsed. doc_type ) ,
1838+ ingested_tx_ids,
1839+ } )
1840+ }
1841+ Err ( _) => Ok ( DocumentRecordResponse {
1842+ file_name,
1843+ document_path : document_path. display ( ) . to_string ( ) ,
1844+ raw_context_ref : document_path. with_extension ( "rkyv" ) . display ( ) . to_string ( ) ,
1845+ status : DocumentQueueStatusRequest :: InvalidName ,
1846+ blocked_reason : Some ( "invalid_contract_name" . to_string ( ) ) ,
1847+ next_hint : "rename_then_retry" . to_string ( ) ,
1848+ vendor : None ,
1849+ account_id : None ,
1850+ year_month : None ,
1851+ document_type : None ,
1852+ ingested_tx_ids : Vec :: new ( ) ,
1853+ } ) ,
1854+ }
1855+ }
1856+
1857+ fn source_ref_matches ( source_ref : & std:: path:: Path , expected : & std:: path:: Path ) -> bool {
1858+ let source_canonical = std:: fs:: canonicalize ( source_ref) . ok ( ) ;
1859+ let expected_canonical = std:: fs:: canonicalize ( expected) . ok ( ) ;
1860+ source_canonical. as_ref ( ) == expected_canonical. as_ref ( )
1861+ || source_ref == expected
1862+ || source_ref. file_name ( ) == expected. file_name ( )
1863+ }
1864+
1865+ fn document_status_rank ( status : DocumentQueueStatusRequest ) -> u8 {
1866+ match status {
1867+ DocumentQueueStatusRequest :: Ingested => 0 ,
1868+ DocumentQueueStatusRequest :: Ready => 1 ,
1869+ DocumentQueueStatusRequest :: InvalidName => 2 ,
1870+ }
1871+ }
1872+
16241873fn persisted_state_path ( workbook_path : & std:: path:: Path ) -> PathBuf {
16251874 PathBuf :: from ( format ! ( "{}.ledgerr-state.json" , workbook_path. display( ) ) )
16261875}
0 commit comments