Skip to content

Commit 36083f8

Browse files
author
Simon Smith
committed
Initial program
1 parent 393c663 commit 36083f8

4 files changed

Lines changed: 146 additions & 1 deletion

File tree

Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[package]
2+
name = "csv_splitter"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies]

README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,8 @@
11
# csv_splitter
2-
A simple efficient program to split a large CSV file by a number of lines
2+
A simple efficient program to split a large CSV file by a max number of lines.
3+
It could technically be used for other line-based file types, but note that the first line is always treated as a header and copied into each new file chunk.
4+
5+
Usage:
6+
```
7+
csv_splitter <input.csv> <output_dir> <lines_per_file>
8+
```

src/main.rs

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
use std::env;
2+
use std::fs::{self, File};
3+
use std::io::{self, BufRead, BufReader, BufWriter, Write};
4+
use std::path::{Path, PathBuf};
5+
6+
fn main() {
7+
if let Err(err) = run() {
8+
eprintln!("Error: {err}");
9+
std::process::exit(1);
10+
}
11+
}
12+
13+
fn run() -> io::Result<()> {
14+
// Args: <input.csv> <output_dir> <lines_per_file>
15+
let mut args = env::args().skip(1);
16+
let print_usage_and_exit = || -> ! {
17+
eprintln!("Usage: csv_splitter <input.csv> <output_dir> <lines_per_file>");
18+
std::process::exit(1);
19+
};
20+
21+
let input_path = match args.next() {
22+
Some(p) => PathBuf::from(p),
23+
None => print_usage_and_exit(),
24+
};
25+
26+
let output_dir = match args.next() {
27+
Some(p) => PathBuf::from(p),
28+
None => print_usage_and_exit(),
29+
};
30+
31+
let lines_per_file: u64 = match args.next() {
32+
Some(s) => s.parse().unwrap_or_else(|_| {
33+
eprintln!("lines_per_file must be a positive integer");
34+
std::process::exit(1);
35+
}),
36+
None => print_usage_and_exit(),
37+
};
38+
39+
if lines_per_file == 0 {
40+
eprintln!("lines_per_file must be > 0");
41+
std::process::exit(1);
42+
}
43+
44+
split_csv_by_lines(&input_path, &output_dir, lines_per_file)
45+
}
46+
47+
/// Split a CSV into multiple files, each with:
/// - the original header
/// - up to `lines_per_file` *data* lines (header is not counted)
///
/// Output files are written to `output_dir` (created if missing) and are named
/// `chunk_000001.csv`, `chunk_000002.csv`, and so on. Lines are copied as raw
/// bytes split on `\n`, so the input does not have to be valid UTF-8 and line
/// endings are preserved exactly. A header-only input still produces one chunk
/// containing just the header. A `lines_per_file` of `0` is treated as `1`.
///
/// # Errors
///
/// Returns `io::ErrorKind::UnexpectedEof` when the input file is empty, and
/// otherwise any I/O error raised while creating the directory, reading the
/// input, or writing and flushing a chunk.
fn split_csv_by_lines(input_path: &Path, output_dir: &Path, lines_per_file: u64) -> io::Result<()> {
    /// Buffer size used for both the reader and every chunk writer.
    const BUFFER_CAPACITY: usize = 1024 * 1024;

    /// File name of the chunk with the given 1-based index.
    fn make_filename(chunk_index: u64) -> String {
        format!("chunk_{:06}.csv", chunk_index)
    }

    /// Create the chunk file for `chunk_index` and immediately write the header.
    fn open_chunk(
        output_dir: &Path,
        chunk_index: u64,
        header: &[u8],
    ) -> io::Result<BufWriter<File>> {
        let file = File::create(output_dir.join(make_filename(chunk_index)))?;
        let mut writer = BufWriter::with_capacity(BUFFER_CAPACITY, file);
        writer.write_all(header)?;
        Ok(writer)
    }

    // A limit of zero would otherwise never allow a line into a chunk.
    let lines_per_file = lines_per_file.max(1);

    // Ensure output directory exists
    fs::create_dir_all(output_dir)?;

    // Open input with a reasonably large buffer
    let mut reader = BufReader::with_capacity(BUFFER_CAPACITY, File::open(input_path)?);

    // Read header line (including its line terminator, if any)
    let mut header = Vec::new();
    if reader.read_until(b'\n', &mut header)? == 0 {
        return Err(io::Error::new(
            io::ErrorKind::UnexpectedEof,
            "Input file is empty; no header found",
        ));
    }

    let mut chunk_index: u64 = 1;
    let mut lines_in_chunk: u64 = 0;

    // The first chunk is always created, so a header-only input yields one file.
    let mut writer = open_chunk(output_dir, chunk_index, &header)?;

    // Reused buffer for each input line (data lines)
    let mut line = Vec::new();

    loop {
        line.clear();
        if reader.read_until(b'\n', &mut line)? == 0 {
            break; // EOF
        }

        // Roll over only once there is a line for the next chunk, so no
        // header-only trailing file is ever created.
        if lines_in_chunk >= lines_per_file {
            // Flush explicitly: dropping a BufWriter discards flush errors.
            writer.flush()?;
            chunk_index += 1;
            lines_in_chunk = 0;
            writer = open_chunk(output_dir, chunk_index, &header)?;
        }

        writer.write_all(&line)?;
        lines_in_chunk += 1;
    }

    // Surface any write error from the final chunk instead of losing it on drop.
    writer.flush()
}

0 commit comments

Comments
 (0)