Initial program

Simon Smith · Simon Smith · commit 36083f877cd4 · 2025-12-11T09:02:43.000+01:00
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -0,0 +1,6 @@
+[package]
+name = "csv_splitter"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
diff --git a/README.md b/README.md
@@ -1,2 +1,8 @@
 # csv_splitter
-A simple efficient program to split a large CSV file by a number of lines
+A simple efficient program to split a large CSV file by a max number of lines.
+It could technically be used for other line-based file types, but note that it copies the first line (the header) into each new file chunk.
+
+Usage:
+```
+csv_splitter <input.csv> <output_dir> <lines_per_file>
+```
diff --git a/src/main.rs b/src/main.rs
@@ -0,0 +1,126 @@
+use std::env;
+use std::fs::{self, File};
+use std::io::{self, BufRead, BufReader, BufWriter, Write};
+use std::path::{Path, PathBuf};
+
+/// Program entry point: delegates to `run` and, if it reports an I/O error,
+/// prints that error to stderr and terminates with exit status 1.
+fn main() {
+    match run() {
+        Ok(()) => {}
+        Err(err) => {
+            eprintln!("Error: {err}");
+            std::process::exit(1);
+        }
+    }
+}
+
+/// Parse the command line and perform the split.
+///
+/// Expects three positional arguments: `<input.csv> <output_dir> <lines_per_file>`.
+/// A missing argument prints the usage line and exits with status 1; a
+/// `lines_per_file` that does not parse as `u64`, or that is zero, prints a
+/// dedicated message and exits with status 1. Any arguments after the third
+/// are not inspected.
+///
+/// # Errors
+/// Returns whatever I/O error `split_csv_by_lines` reports.
+fn run() -> io::Result<()> {
+    /// Print the usage line to stderr and terminate the process.
+    fn usage() -> ! {
+        eprintln!("Usage: csv_splitter <input.csv> <output_dir> <lines_per_file>");
+        std::process::exit(1)
+    }
+
+    // Skip argv[0] (the program name); the rest are consumed in order.
+    let mut cli = env::args().skip(1);
+
+    let Some(input_path) = cli.next().map(PathBuf::from) else {
+        usage()
+    };
+    let Some(output_dir) = cli.next().map(PathBuf::from) else {
+        usage()
+    };
+    let Some(raw_count) = cli.next() else {
+        usage()
+    };
+
+    let lines_per_file: u64 = match raw_count.parse() {
+        Ok(count) => count,
+        Err(_) => {
+            eprintln!("lines_per_file must be a positive integer");
+            std::process::exit(1);
+        }
+    };
+
+    // Zero parses fine as u64 but would make every chunk empty, so reject it.
+    if lines_per_file == 0 {
+        eprintln!("lines_per_file must be > 0");
+        std::process::exit(1);
+    }
+
+    split_csv_by_lines(&input_path, &output_dir, lines_per_file)
+}
+
+/// Split a CSV into multiple files inside `output_dir`, each containing:
+/// - the original header (the first line of the input, copied verbatim)
+/// - up to `lines_per_file` *data* lines (the header is not counted)
+///
+/// Output files are named `chunk_000001.csv`, `chunk_000002.csv`, ... and
+/// `output_dir` is created if it does not exist. An input that has a header
+/// but no data lines still produces `chunk_000001.csv` holding only the header.
+///
+/// Lines are handled as raw bytes terminated by `\n`, so the input does not
+/// have to be valid UTF-8 and line endings are preserved exactly. Splitting is
+/// purely line-based: a quoted CSV field that contains a newline is counted as
+/// several lines and may end up divided between two chunks.
+///
+/// `lines_per_file` is expected to be greater than zero (the caller validates
+/// this).
+///
+/// # Errors
+/// Returns an error if the input cannot be opened or read, if it is empty
+/// (`UnexpectedEof`), or if any output file cannot be created, written or
+/// flushed.
+fn split_csv_by_lines(input_path: &Path, output_dir: &Path, lines_per_file: u64) -> io::Result<()> {
+    /// File name of the `chunk_index`-th output file (zero-padded to six digits).
+    fn make_filename(chunk_index: u64) -> String {
+        format!("chunk_{chunk_index:06}.csv")
+    }
+
+    /// Create the output file for `chunk_index` and immediately write the header.
+    fn open_chunk(
+        output_dir: &Path,
+        chunk_index: u64,
+        header: &[u8],
+    ) -> io::Result<BufWriter<File>> {
+        let file = File::create(output_dir.join(make_filename(chunk_index)))?;
+        let mut writer = BufWriter::with_capacity(1024 * 1024, file);
+        writer.write_all(header)?;
+        Ok(writer)
+    }
+
+    // Ensure output directory exists
+    fs::create_dir_all(output_dir)?;
+
+    // Open input with a reasonably large buffer
+    let mut reader = BufReader::with_capacity(1024 * 1024, File::open(input_path)?);
+
+    // Read the header as bytes: it is only ever copied, never interpreted, so
+    // there is no reason to reject input that is not valid UTF-8.
+    let mut header = Vec::new();
+    if reader.read_until(b'\n', &mut header)? == 0 {
+        return Err(io::Error::new(
+            io::ErrorKind::UnexpectedEof,
+            "Input file is empty; no header found",
+        ));
+    }
+
+    let mut chunk_index: u64 = 1;
+    let mut lines_in_chunk: u64 = 0;
+
+    // The first chunk is always created, even when no data lines follow.
+    let mut writer = open_chunk(output_dir, chunk_index, &header)?;
+
+    // Reused buffer for each input line (data lines)
+    let mut line = Vec::new();
+
+    loop {
+        line.clear();
+        if reader.read_until(b'\n', &mut line)? == 0 {
+            break; // EOF
+        }
+
+        // Rotate only once we hold a line that needs a new home; this way a
+        // header-only trailing chunk is never created in the first place.
+        if lines_in_chunk == lines_per_file {
+            // Flush explicitly: dropping a BufWriter discards write errors.
+            writer.flush()?;
+            chunk_index += 1;
+            lines_in_chunk = 0;
+            writer = open_chunk(output_dir, chunk_index, &header)?;
+        }
+
+        writer.write_all(&line)?;
+        lines_in_chunk += 1;
+    }
+
+    // Surface any error from writing out the final chunk's buffered data.
+    writer.flush()
+}