1+ use std:: path:: Path ;
2+
/// Represents a detected file format.
///
/// The enum carries no data, so it is `Copy` and can be compared, hashed and
/// used as a map/set key without cloning.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileFormat {
    /// PDF document.
    Pdf,
    /// Word document (`.docx`).
    Docx,
    /// Excel workbook (`.xlsx`).
    Xlsx,
    /// Legacy Excel workbook (`.xls`).
    Xls,
    /// PowerPoint presentation (`.pptx`).
    Pptx,
    /// PNG image.
    Png,
    /// JPEG image (`.jpg` / `.jpeg`).
    Jpeg,
    /// TIFF image (`.tif` / `.tiff`).
    Tiff,
    /// BMP image.
    Bmp,
    /// JSON document.
    Json,
    /// XML document.
    Xml,
    /// HTML document.
    Html,
    /// Plain text (also used for Markdown and CSV extensions).
    Text,
    /// The format could not be determined.
    Unknown,
}
21+
22+ impl FileFormat {
23+ /// Convert to Ruby symbol representation
24+ pub fn to_symbol ( & self ) -> & ' static str {
25+ match self {
26+ FileFormat :: Pdf => "pdf" ,
27+ FileFormat :: Docx => "docx" ,
28+ FileFormat :: Xlsx => "xlsx" ,
29+ FileFormat :: Xls => "xls" ,
30+ FileFormat :: Pptx => "pptx" ,
31+ FileFormat :: Png => "png" ,
32+ FileFormat :: Jpeg => "jpeg" ,
33+ FileFormat :: Tiff => "tiff" ,
34+ FileFormat :: Bmp => "bmp" ,
35+ FileFormat :: Json => "json" ,
36+ FileFormat :: Xml => "xml" ,
37+ FileFormat :: Html => "xml" , // HTML is treated as XML in Ruby
38+ FileFormat :: Text => "text" ,
39+ FileFormat :: Unknown => "unknown" ,
40+ }
41+ }
42+ }
43+
44+ /// Central format detection logic
45+ pub struct FormatDetector ;
46+
47+ impl FormatDetector {
48+ /// Detect format from filename and content
49+ /// Prioritizes content detection over extension when both are available
50+ pub fn detect ( filename : Option < & str > , content : Option < & [ u8 ] > ) -> FileFormat {
51+ // First try content-based detection if content is provided
52+ if let Some ( data) = content {
53+ let format = Self :: detect_from_content ( data) ;
54+ // If we got a definitive format from content, use it
55+ if !matches ! ( format, FileFormat :: Text | FileFormat :: Unknown ) {
56+ return format;
57+ }
58+ }
59+
60+ // Fall back to extension-based detection
61+ if let Some ( name) = filename {
62+ let ext_format = Self :: detect_from_extension ( name) ;
63+ if ext_format != FileFormat :: Unknown {
64+ return ext_format;
65+ }
66+ }
67+
68+ // If content detection returned Text and no extension match, return Text
69+ if let Some ( data) = content {
70+ let format = Self :: detect_from_content ( data) ;
71+ if format == FileFormat :: Text {
72+ return FileFormat :: Text ;
73+ }
74+ }
75+
76+ FileFormat :: Unknown
77+ }
78+
79+ /// Detect format from file extension
80+ pub fn detect_from_extension ( filename : & str ) -> FileFormat {
81+ let path = Path :: new ( filename) ;
82+ let ext = match path. extension ( ) . and_then ( |s| s. to_str ( ) ) {
83+ Some ( e) => e. to_lowercase ( ) ,
84+ None => return FileFormat :: Unknown ,
85+ } ;
86+
87+ match ext. as_str ( ) {
88+ "pdf" => FileFormat :: Pdf ,
89+ "docx" => FileFormat :: Docx ,
90+ "xlsx" => FileFormat :: Xlsx ,
91+ "xls" => FileFormat :: Xls ,
92+ "pptx" => FileFormat :: Pptx ,
93+ "png" => FileFormat :: Png ,
94+ "jpg" | "jpeg" => FileFormat :: Jpeg ,
95+ "tiff" | "tif" => FileFormat :: Tiff ,
96+ "bmp" => FileFormat :: Bmp ,
97+ "json" => FileFormat :: Json ,
98+ "xml" => FileFormat :: Xml ,
99+ "html" | "htm" => FileFormat :: Html ,
100+ "txt" | "text" | "md" | "markdown" | "csv" => FileFormat :: Text ,
101+ _ => FileFormat :: Unknown ,
102+ }
103+ }
104+
105+ /// Detect format from file content (magic bytes)
106+ pub fn detect_from_content ( data : & [ u8 ] ) -> FileFormat {
107+ if data. is_empty ( ) {
108+ return FileFormat :: Text ; // Empty files are treated as text
109+ }
110+
111+ // PDF
112+ if data. len ( ) >= 4 && data. starts_with ( b"%PDF" ) {
113+ return FileFormat :: Pdf ;
114+ }
115+
116+ // PNG
117+ if data. len ( ) >= 8 && data. starts_with ( & [ 0x89 , 0x50 , 0x4E , 0x47 , 0x0D , 0x0A , 0x1A , 0x0A ] ) {
118+ return FileFormat :: Png ;
119+ }
120+
121+ // JPEG
122+ if data. len ( ) >= 3 && data. starts_with ( & [ 0xFF , 0xD8 , 0xFF ] ) {
123+ return FileFormat :: Jpeg ;
124+ }
125+
126+ // BMP
127+ if data. len ( ) >= 2 && data. starts_with ( b"BM" ) {
128+ return FileFormat :: Bmp ;
129+ }
130+
131+ // TIFF (little-endian or big-endian)
132+ if data. len ( ) >= 4 {
133+ if data. starts_with ( b"II\x2A \x00 " ) || data. starts_with ( b"MM\x00 \x2A " ) {
134+ return FileFormat :: Tiff ;
135+ }
136+ }
137+
138+ // OLE Compound Document (old Excel/Word)
139+ if data. len ( ) >= 4 && data. starts_with ( & [ 0xD0 , 0xCF , 0x11 , 0xE0 ] ) {
140+ return FileFormat :: Xls ; // Old Office format, usually Excel
141+ }
142+
143+ // ZIP archive (could be DOCX, XLSX, PPTX)
144+ if data. len ( ) >= 2 && data. starts_with ( b"PK" ) {
145+ return Self :: detect_office_format ( data) ;
146+ }
147+
148+ // XML
149+ if data. len ( ) >= 5 {
150+ let start = String :: from_utf8_lossy ( & data[ 0 ..5 . min ( data. len ( ) ) ] ) ;
151+ if start. starts_with ( "<?xml" ) || start. starts_with ( "<!" ) {
152+ return FileFormat :: Xml ;
153+ }
154+ }
155+
156+ // HTML
157+ if data. len ( ) >= 14 {
158+ let start = String :: from_utf8_lossy ( & data[ 0 ..14 . min ( data. len ( ) ) ] ) . to_lowercase ( ) ;
159+ if start. contains ( "<!doctype" ) || start. contains ( "<html" ) {
160+ return FileFormat :: Html ;
161+ }
162+ }
163+
164+ // JSON
165+ if let Some ( & first_non_ws) = data. iter ( ) . find ( |& & b| !b" \t \n \r " . contains ( & b) ) {
166+ if first_non_ws == b'{' || first_non_ws == b'[' {
167+ return FileFormat :: Json ;
168+ }
169+ }
170+
171+ // Default to text for unrecognized formats
172+ FileFormat :: Text
173+ }
174+
175+ /// Detect specific Office format from ZIP data
176+ fn detect_office_format ( data : & [ u8 ] ) -> FileFormat {
177+ // Look for Office-specific directory names in first 2KB of ZIP
178+ let check_len = 2000 . min ( data. len ( ) ) ;
179+ let content = String :: from_utf8_lossy ( & data[ 0 ..check_len] ) ;
180+
181+ // Check for format-specific markers
182+ if content. contains ( "word/" ) || content. contains ( "word/_rels" ) {
183+ FileFormat :: Docx
184+ } else if content. contains ( "xl/" ) || content. contains ( "xl/_rels" ) {
185+ FileFormat :: Xlsx
186+ } else if content. contains ( "ppt/" ) || content. contains ( "ppt/_rels" ) {
187+ FileFormat :: Pptx
188+ } else {
189+ // Default to XLSX for generic ZIP (most common Office format)
190+ FileFormat :: Xlsx
191+ }
192+ }
193+
194+
195+ /// Get all supported extensions
196+ pub fn supported_extensions ( ) -> Vec < & ' static str > {
197+ vec ! [
198+ "pdf" , "docx" , "xlsx" , "xls" , "pptx" ,
199+ "png" , "jpg" , "jpeg" , "tiff" , "tif" , "bmp" ,
200+ "json" , "xml" , "html" , "htm" ,
201+ "txt" , "text" , "md" , "markdown" , "csv"
202+ ]
203+ }
204+ }
205+
#[cfg(test)]
mod tests {
    use super::*;

    /// A `%PDF` header is recognized from content alone.
    #[test]
    fn test_detect_pdf() {
        let detected = FormatDetector::detect_from_content(b"%PDF-1.5\n ");
        assert_eq!(detected, FileFormat::Pdf);
    }

    /// The full 8-byte PNG signature is recognized from content alone.
    #[test]
    fn test_detect_png() {
        let signature: [u8; 8] = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
        let detected = FormatDetector::detect_from_content(&signature);
        assert_eq!(detected, FileFormat::Png);
    }

    /// Extension lookup ignores case and maps known extensions.
    #[test]
    fn test_detect_from_extension() {
        let cases = [
            ("document.pdf", FileFormat::Pdf),
            ("Document.PDF", FileFormat::Pdf),
            ("data.xlsx", FileFormat::Xlsx),
        ];
        for (name, expected) in cases.iter() {
            assert_eq!(&FormatDetector::detect_from_extension(name), expected);
        }
    }

    /// Empty content is classified as text.
    #[test]
    fn test_empty_data() {
        let nothing: &[u8] = &[];
        assert_eq!(FormatDetector::detect_from_content(nothing), FileFormat::Text);
    }
}
0 commit comments