Skip to content

Commit e1a9ec4

Browse files
authored
Merge pull request #11 from cpetersen/refactors
Refactors
2 parents 2be63b8 + 0283ba2 commit e1a9ec4

12 files changed

Lines changed: 2041 additions & 304 deletions

File tree

Rakefile

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,34 @@ Rake::ExtensionTask.new("parsekit", spec) do |ext|
1818
ext.cross_platform = %w[x86_64-linux arm64-darwin x86_64-darwin aarch64-linux]
1919
end
2020

21+
# Work around rake-compiler trying to stage non-existent build artifacts
22+
# This happens when dependencies generate files during their build process
23+
# Create dummy files for the ones that cause errors to satisfy rake-compiler
24+
task :before_compile do
25+
# Common build artifacts that rake-compiler tries to copy but don't exist after clean
26+
problem_files = [
27+
"ext/parsekit/target/release/build/clang-sys-*/out/common.rs",
28+
"ext/parsekit/target/release/build/mupdf-sys-*/out/bindings.rs",
29+
"ext/parsekit/target/release/build/rb-sys-*/out/*.rs",
30+
"ext/parsekit/target/release/build/typenum-*/out/*.rs",
31+
"ext/parsekit/target/release/build/rav1e-*/out/*.rs"
32+
]
33+
34+
problem_files.each do |pattern|
35+
Dir.glob(pattern).each do |file|
36+
# These files will be regenerated during the actual build
37+
# We just need them to exist to prevent rake errors
38+
unless File.exist?(file)
39+
FileUtils.mkdir_p(File.dirname(file))
40+
FileUtils.touch(file)
41+
end
42+
end
43+
end
44+
end
45+
46+
# Ensure our workaround runs before compilation
47+
task compile: :before_compile
48+
2149
# Default task runs compile then tests
2250
task default: [:compile, :spec]
2351

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
use std::path::Path;
2+
3+
/// Represents a detected file format
4+
#[derive(Debug, Clone, PartialEq)]
5+
pub enum FileFormat {
6+
Pdf,
7+
Docx,
8+
Xlsx,
9+
Xls,
10+
Pptx,
11+
Png,
12+
Jpeg,
13+
Tiff,
14+
Bmp,
15+
Json,
16+
Xml,
17+
Html,
18+
Text,
19+
Unknown,
20+
}
21+
22+
impl FileFormat {
23+
/// Convert to Ruby symbol representation
24+
pub fn to_symbol(&self) -> &'static str {
25+
match self {
26+
FileFormat::Pdf => "pdf",
27+
FileFormat::Docx => "docx",
28+
FileFormat::Xlsx => "xlsx",
29+
FileFormat::Xls => "xls",
30+
FileFormat::Pptx => "pptx",
31+
FileFormat::Png => "png",
32+
FileFormat::Jpeg => "jpeg",
33+
FileFormat::Tiff => "tiff",
34+
FileFormat::Bmp => "bmp",
35+
FileFormat::Json => "json",
36+
FileFormat::Xml => "xml",
37+
FileFormat::Html => "xml", // HTML is treated as XML in Ruby
38+
FileFormat::Text => "text",
39+
FileFormat::Unknown => "unknown",
40+
}
41+
}
42+
}
43+
44+
/// Central format detection logic
45+
pub struct FormatDetector;
46+
47+
impl FormatDetector {
48+
/// Detect format from filename and content
49+
/// Prioritizes content detection over extension when both are available
50+
pub fn detect(filename: Option<&str>, content: Option<&[u8]>) -> FileFormat {
51+
// First try content-based detection if content is provided
52+
if let Some(data) = content {
53+
let format = Self::detect_from_content(data);
54+
// If we got a definitive format from content, use it
55+
if !matches!(format, FileFormat::Text | FileFormat::Unknown) {
56+
return format;
57+
}
58+
}
59+
60+
// Fall back to extension-based detection
61+
if let Some(name) = filename {
62+
let ext_format = Self::detect_from_extension(name);
63+
if ext_format != FileFormat::Unknown {
64+
return ext_format;
65+
}
66+
}
67+
68+
// If content detection returned Text and no extension match, return Text
69+
if let Some(data) = content {
70+
let format = Self::detect_from_content(data);
71+
if format == FileFormat::Text {
72+
return FileFormat::Text;
73+
}
74+
}
75+
76+
FileFormat::Unknown
77+
}
78+
79+
/// Detect format from file extension
80+
pub fn detect_from_extension(filename: &str) -> FileFormat {
81+
let path = Path::new(filename);
82+
let ext = match path.extension().and_then(|s| s.to_str()) {
83+
Some(e) => e.to_lowercase(),
84+
None => return FileFormat::Unknown,
85+
};
86+
87+
match ext.as_str() {
88+
"pdf" => FileFormat::Pdf,
89+
"docx" => FileFormat::Docx,
90+
"xlsx" => FileFormat::Xlsx,
91+
"xls" => FileFormat::Xls,
92+
"pptx" => FileFormat::Pptx,
93+
"png" => FileFormat::Png,
94+
"jpg" | "jpeg" => FileFormat::Jpeg,
95+
"tiff" | "tif" => FileFormat::Tiff,
96+
"bmp" => FileFormat::Bmp,
97+
"json" => FileFormat::Json,
98+
"xml" => FileFormat::Xml,
99+
"html" | "htm" => FileFormat::Html,
100+
"txt" | "text" | "md" | "markdown" | "csv" => FileFormat::Text,
101+
_ => FileFormat::Unknown,
102+
}
103+
}
104+
105+
/// Detect format from file content (magic bytes)
106+
pub fn detect_from_content(data: &[u8]) -> FileFormat {
107+
if data.is_empty() {
108+
return FileFormat::Text; // Empty files are treated as text
109+
}
110+
111+
// PDF
112+
if data.len() >= 4 && data.starts_with(b"%PDF") {
113+
return FileFormat::Pdf;
114+
}
115+
116+
// PNG
117+
if data.len() >= 8 && data.starts_with(&[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]) {
118+
return FileFormat::Png;
119+
}
120+
121+
// JPEG
122+
if data.len() >= 3 && data.starts_with(&[0xFF, 0xD8, 0xFF]) {
123+
return FileFormat::Jpeg;
124+
}
125+
126+
// BMP
127+
if data.len() >= 2 && data.starts_with(b"BM") {
128+
return FileFormat::Bmp;
129+
}
130+
131+
// TIFF (little-endian or big-endian)
132+
if data.len() >= 4 {
133+
if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
134+
return FileFormat::Tiff;
135+
}
136+
}
137+
138+
// OLE Compound Document (old Excel/Word)
139+
if data.len() >= 4 && data.starts_with(&[0xD0, 0xCF, 0x11, 0xE0]) {
140+
return FileFormat::Xls; // Old Office format, usually Excel
141+
}
142+
143+
// ZIP archive (could be DOCX, XLSX, PPTX)
144+
if data.len() >= 2 && data.starts_with(b"PK") {
145+
return Self::detect_office_format(data);
146+
}
147+
148+
// XML
149+
if data.len() >= 5 {
150+
let start = String::from_utf8_lossy(&data[0..5.min(data.len())]);
151+
if start.starts_with("<?xml") || start.starts_with("<!") {
152+
return FileFormat::Xml;
153+
}
154+
}
155+
156+
// HTML
157+
if data.len() >= 14 {
158+
let start = String::from_utf8_lossy(&data[0..14.min(data.len())]).to_lowercase();
159+
if start.contains("<!doctype") || start.contains("<html") {
160+
return FileFormat::Html;
161+
}
162+
}
163+
164+
// JSON
165+
if let Some(&first_non_ws) = data.iter().find(|&&b| !b" \t\n\r".contains(&b)) {
166+
if first_non_ws == b'{' || first_non_ws == b'[' {
167+
return FileFormat::Json;
168+
}
169+
}
170+
171+
// Default to text for unrecognized formats
172+
FileFormat::Text
173+
}
174+
175+
/// Detect specific Office format from ZIP data
176+
fn detect_office_format(data: &[u8]) -> FileFormat {
177+
// Look for Office-specific directory names in first 2KB of ZIP
178+
let check_len = 2000.min(data.len());
179+
let content = String::from_utf8_lossy(&data[0..check_len]);
180+
181+
// Check for format-specific markers
182+
if content.contains("word/") || content.contains("word/_rels") {
183+
FileFormat::Docx
184+
} else if content.contains("xl/") || content.contains("xl/_rels") {
185+
FileFormat::Xlsx
186+
} else if content.contains("ppt/") || content.contains("ppt/_rels") {
187+
FileFormat::Pptx
188+
} else {
189+
// Default to XLSX for generic ZIP (most common Office format)
190+
FileFormat::Xlsx
191+
}
192+
}
193+
194+
195+
/// Get all supported extensions
196+
pub fn supported_extensions() -> Vec<&'static str> {
197+
vec![
198+
"pdf", "docx", "xlsx", "xls", "pptx",
199+
"png", "jpg", "jpeg", "tiff", "tif", "bmp",
200+
"json", "xml", "html", "htm",
201+
"txt", "text", "md", "markdown", "csv"
202+
]
203+
}
204+
}
205+
206+
#[cfg(test)]
207+
mod tests {
208+
use super::*;
209+
210+
#[test]
211+
fn test_detect_pdf() {
212+
let pdf_data = b"%PDF-1.5\n";
213+
assert_eq!(FormatDetector::detect_from_content(pdf_data), FileFormat::Pdf);
214+
}
215+
216+
#[test]
217+
fn test_detect_png() {
218+
let png_data = &[0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
219+
assert_eq!(FormatDetector::detect_from_content(png_data), FileFormat::Png);
220+
}
221+
222+
#[test]
223+
fn test_detect_from_extension() {
224+
assert_eq!(FormatDetector::detect_from_extension("document.pdf"), FileFormat::Pdf);
225+
assert_eq!(FormatDetector::detect_from_extension("Document.PDF"), FileFormat::Pdf);
226+
assert_eq!(FormatDetector::detect_from_extension("data.xlsx"), FileFormat::Xlsx);
227+
}
228+
229+
#[test]
230+
fn test_empty_data() {
231+
assert_eq!(FormatDetector::detect_from_content(&[]), FileFormat::Text);
232+
}
233+
}

ext/parsekit/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use magnus::{function, prelude::*, Error, Ruby};
22

33
mod parser;
44
mod error;
5+
mod format_detector;
56

67
/// Initialize the ParseKit module and its submodules
78
#[magnus::init]

0 commit comments

Comments
 (0)