|
| 1 | +use std::env; |
| 2 | +use std::fs::{self, File}; |
| 3 | +use std::io::{self, BufRead, BufReader, BufWriter, Write}; |
| 4 | +use std::path::{Path, PathBuf}; |
| 5 | + |
| 6 | +fn main() { |
| 7 | + if let Err(err) = run() { |
| 8 | + eprintln!("Error: {err}"); |
| 9 | + std::process::exit(1); |
| 10 | + } |
| 11 | +} |
| 12 | + |
| 13 | +fn run() -> io::Result<()> { |
| 14 | + // Args: <input.csv> <output_dir> <lines_per_file> |
| 15 | + let mut args = env::args().skip(1); |
| 16 | + let print_usage_and_exit = || -> ! { |
| 17 | + eprintln!("Usage: csv_splitter <input.csv> <output_dir> <lines_per_file>"); |
| 18 | + std::process::exit(1); |
| 19 | + }; |
| 20 | + |
| 21 | + let input_path = match args.next() { |
| 22 | + Some(p) => PathBuf::from(p), |
| 23 | + None => print_usage_and_exit(), |
| 24 | + }; |
| 25 | + |
| 26 | + let output_dir = match args.next() { |
| 27 | + Some(p) => PathBuf::from(p), |
| 28 | + None => print_usage_and_exit(), |
| 29 | + }; |
| 30 | + |
| 31 | + let lines_per_file: u64 = match args.next() { |
| 32 | + Some(s) => s.parse().unwrap_or_else(|_| { |
| 33 | + eprintln!("lines_per_file must be a positive integer"); |
| 34 | + std::process::exit(1); |
| 35 | + }), |
| 36 | + None => print_usage_and_exit(), |
| 37 | + }; |
| 38 | + |
| 39 | + if lines_per_file == 0 { |
| 40 | + eprintln!("lines_per_file must be > 0"); |
| 41 | + std::process::exit(1); |
| 42 | + } |
| 43 | + |
| 44 | + split_csv_by_lines(&input_path, &output_dir, lines_per_file) |
| 45 | +} |
| 46 | + |
/// Splits a CSV file into numbered chunk files inside `output_dir`.
///
/// The first line of `input_path` is treated as the header and is written at the top
/// of every chunk. Each chunk then receives up to `lines_per_file` *data* lines (the
/// header is not counted). Chunks are named `chunk_000001.csv`, `chunk_000002.csv`,
/// and so on. `output_dir` is created if it does not exist. An input that contains
/// only a header still produces `chunk_000001.csv` holding just that header.
///
/// Lines are delimited by `\n` and copied byte-for-byte, so the input does not need to
/// be valid UTF-8 and existing line endings are preserved. Splitting is purely
/// line-based: a quoted CSV field that contains an embedded newline is not recognised
/// and may end up cut across two chunks.
///
/// # Errors
///
/// - `io::ErrorKind::InvalidInput` if `lines_per_file` is 0.
/// - `io::ErrorKind::UnexpectedEof` if the input file is empty (no header line).
/// - Any I/O error raised while creating the directory, reading the input, or
///   writing and flushing a chunk.
fn split_csv_by_lines(input_path: &Path, output_dir: &Path, lines_per_file: u64) -> io::Result<()> {
    /// Buffer size used for the reader and for every chunk writer (1 MiB).
    const BUF_CAPACITY: usize = 1024 * 1024;

    /// Builds the file name for the 1-based chunk number `chunk_index`.
    fn make_filename(chunk_index: u64) -> String {
        format!("chunk_{:06}.csv", chunk_index)
    }

    /// Creates the file for `chunk_index` and immediately writes the header into it.
    fn open_chunk(
        output_dir: &Path,
        chunk_index: u64,
        header: &[u8],
    ) -> io::Result<BufWriter<File>> {
        let file = File::create(output_dir.join(make_filename(chunk_index)))?;
        let mut writer = BufWriter::with_capacity(BUF_CAPACITY, file);
        writer.write_all(header)?;
        Ok(writer)
    }

    // Validate before touching the file system so a bad argument has no side effects.
    if lines_per_file == 0 {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "lines_per_file must be greater than 0",
        ));
    }

    fs::create_dir_all(output_dir)?;

    let input_file = File::open(input_path)?;
    let mut reader = BufReader::with_capacity(BUF_CAPACITY, input_file);

    // Read raw bytes rather than a `String`: the content is never interpreted, so
    // there is no reason to reject inputs in a non-UTF-8 encoding.
    let mut header = Vec::new();
    if reader.read_until(b'\n', &mut header)? == 0 {
        return Err(io::Error::new(
            io::ErrorKind::UnexpectedEof,
            "Input file is empty; no header found",
        ));
    }

    let mut chunk_index: u64 = 1;
    let mut lines_in_chunk: u64 = 0;

    // The first chunk is always created, even when no data lines follow the header.
    let mut writer = open_chunk(output_dir, chunk_index, &header)?;

    // Single buffer reused for every data line.
    let mut line = Vec::new();

    loop {
        line.clear();
        if reader.read_until(b'\n', &mut line)? == 0 {
            break; // EOF
        }

        // Rotate only once there is a line that needs a new chunk. This way a
        // header-only trailing file is never created when the number of data lines
        // is an exact multiple of `lines_per_file`.
        if lines_in_chunk == lines_per_file {
            // Flush explicitly: dropping a `BufWriter` discards any flush error.
            writer.flush()?;
            chunk_index += 1;
            writer = open_chunk(output_dir, chunk_index, &header)?;
            lines_in_chunk = 0;
        }

        writer.write_all(&line)?;
        lines_in_chunk += 1;
    }

    // Surface write errors for the final chunk instead of losing them on drop.
    writer.flush()?;

    Ok(())
}
0 commit comments