Skip to content

Commit 07a828f

Browse files
Replaced the `csv` crate with a custom CSV parser and writer
1 parent 5120d8c commit 07a828f

File tree

4 files changed

+182
-60
lines changed

4 files changed

+182
-60
lines changed

Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ visitor = ["sqlparser_derive"]
4747
[dependencies]
4848
bigdecimal = { version = "0.4.1", features = ["serde"], optional = true }
4949
log = "0.4"
50-
csv = "1.4.0"
5150
recursive = { version = "0.1.1", optional = true}
5251

5352
serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true }

src/ast/mod.rs

Lines changed: 44 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4576,19 +4576,21 @@ impl fmt::Display for Statement {
45764576
}
45774577

45784578
let mut null_symbol = "\\N";
4579-
let mut writer_builder = csv::WriterBuilder::new();
4579+
let mut delimiter = '\t';
4580+
let mut quote = '"';
4581+
let mut escape = '\\';
45804582

45814583
// Apply options
45824584
for option in options {
45834585
match option {
45844586
CopyOption::Delimiter(c) => {
4585-
writer_builder.delimiter(*c as u8);
4587+
delimiter = *c;
45864588
}
45874589
CopyOption::Quote(c) => {
4588-
writer_builder.quote(*c as u8);
4590+
quote = *c;
45894591
}
45904592
CopyOption::Escape(c) => {
4591-
writer_builder.escape(*c as u8);
4593+
escape = *c;
45924594
}
45934595
CopyOption::Null(null) => {
45944596
null_symbol = null;
@@ -4601,25 +4603,19 @@ impl fmt::Display for Statement {
46014603
for option in legacy_options {
46024604
match option {
46034605
CopyLegacyOption::Delimiter(c) => {
4604-
writer_builder.delimiter(*c as u8);
4605-
}
4606-
CopyLegacyOption::Header => {
4607-
writer_builder.has_headers(true);
4606+
delimiter = *c;
46084607
}
46094608
CopyLegacyOption::Null(null) => {
46104609
null_symbol = null;
46114610
}
46124611
CopyLegacyOption::Csv(csv_options) => {
46134612
for csv_option in csv_options {
46144613
match csv_option {
4615-
CopyLegacyCsvOption::Header => {
4616-
writer_builder.has_headers(true);
4617-
}
46184614
CopyLegacyCsvOption::Quote(c) => {
4619-
writer_builder.quote(*c as u8);
4615+
quote = *c;
46204616
}
46214617
CopyLegacyCsvOption::Escape(c) => {
4622-
writer_builder.escape(*c as u8);
4618+
escape = *c;
46234619
}
46244620
_ => {}
46254621
}
@@ -4631,19 +4627,43 @@ impl fmt::Display for Statement {
46314627

46324628
if !values.is_empty() {
46334629
writeln!(f, ";")?;
4634-
let mut writer = writer_builder.from_writer(vec![]);
4630+
4631+
// Simple CSV writer
46354632
for row in values {
4636-
writer
4637-
.write_record(
4638-
row.iter()
4639-
.map(|column| column.as_deref().unwrap_or(null_symbol)),
4640-
)
4641-
.map_err(|_| fmt::Error)?
4633+
for (idx, column) in row.iter().enumerate() {
4634+
if idx > 0 {
4635+
write!(f, "{}", delimiter)?;
4636+
}
4637+
4638+
let field_value = column.as_deref().unwrap_or(null_symbol);
4639+
4640+
// Check if field needs quoting
4641+
let needs_quoting = field_value.contains(delimiter)
4642+
|| field_value.contains(quote)
4643+
|| field_value.contains('\n')
4644+
|| field_value.contains('\r');
4645+
4646+
if needs_quoting {
4647+
write!(f, "{}", quote)?;
4648+
for ch in field_value.chars() {
4649+
if ch == quote {
4650+
// Escape quote by doubling it
4651+
write!(f, "{}{}", quote, quote)?;
4652+
} else if ch == escape {
4653+
// Escape escape character
4654+
write!(f, "{}{}", escape, escape)?;
4655+
} else {
4656+
write!(f, "{}", ch)?;
4657+
}
4658+
}
4659+
write!(f, "{}", quote)?;
4660+
} else {
4661+
write!(f, "{}", field_value)?;
4662+
}
4663+
}
4664+
writeln!(f)?;
46424665
}
4643-
writer.flush().map_err(|_| fmt::Error)?;
4644-
let data = String::from_utf8(writer.into_inner().map_err(|_| fmt::Error)?)
4645-
.map_err(|_| fmt::Error)?;
4646-
write!(f, "{}", data)?;
4666+
46474667
write!(f, "\\.")?;
46484668
}
46494669
Ok(())

src/parser/mod.rs

Lines changed: 114 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -9542,25 +9542,22 @@ impl<'a> Parser<'a> {
95429542
return self.expected("COPY ... FROM STDIN with CSV body", self.peek_token());
95439543
};
95449544

9545-
let mut reader_builder = csv::ReaderBuilder::new();
9546-
reader_builder.has_headers(false);
9547-
9545+
let mut delimiter = '\t';
9546+
let mut quote = '"';
9547+
let mut escape = '\\';
95489548
let mut null_symbol = "\\N";
95499549

95509550
// Apply options
95519551
for option in options {
95529552
match option {
95539553
CopyOption::Delimiter(c) => {
9554-
reader_builder.delimiter(*c as u8);
9555-
}
9556-
CopyOption::Header(has_header) => {
9557-
reader_builder.has_headers(*has_header);
9554+
delimiter = *c;
95589555
}
95599556
CopyOption::Quote(c) => {
9560-
reader_builder.quote(*c as u8);
9557+
quote = *c;
95619558
}
95629559
CopyOption::Escape(c) => {
9563-
reader_builder.escape(Some(*c as u8));
9560+
escape = *c;
95649561
}
95659562
CopyOption::Null(null) => {
95669563
null_symbol = null;
@@ -9573,25 +9570,19 @@ impl<'a> Parser<'a> {
95739570
for option in legacy_options {
95749571
match option {
95759572
CopyLegacyOption::Delimiter(c) => {
9576-
reader_builder.delimiter(*c as u8);
9577-
}
9578-
CopyLegacyOption::Header => {
9579-
reader_builder.has_headers(true);
9573+
delimiter = *c;
95809574
}
95819575
CopyLegacyOption::Null(null) => {
95829576
null_symbol = null;
95839577
}
95849578
CopyLegacyOption::Csv(csv_options) => {
95859579
for csv_option in csv_options {
95869580
match csv_option {
9587-
CopyLegacyCsvOption::Header => {
9588-
reader_builder.has_headers(true);
9589-
}
95909581
CopyLegacyCsvOption::Quote(c) => {
9591-
reader_builder.quote(*c as u8);
9582+
quote = *c;
95929583
}
95939584
CopyLegacyCsvOption::Escape(c) => {
9594-
reader_builder.escape(Some(*c as u8));
9585+
escape = *c;
95959586
}
95969587
_ => {}
95979588
}
@@ -9601,28 +9592,116 @@ impl<'a> Parser<'a> {
96019592
}
96029593
}
96039594

9595+
// Simple CSV parser
96049596
let mut result = vec![];
9605-
let mut reader = reader_builder.from_reader(body.as_bytes());
9606-
for record in reader.records() {
9607-
let record = match record {
9608-
Ok(rec) => rec,
9609-
Err(e) => {
9610-
return Err(ParserError::ParserError(format!(
9611-
"Error parsing CSV data: {}",
9612-
e
9613-
)))
9614-
}
9615-
};
9616-
let mut row = vec![];
9617-
for field in record.iter() {
9618-
if field == null_symbol {
9619-
row.push(None);
9597+
let mut current_row = vec![];
9598+
let mut current_field = String::new();
9599+
let mut in_quotes = false;
9600+
let mut chars = body.chars().peekable();
9601+
let mut expected_column_count: Option<usize> = None;
9602+
let mut row_number = 0;
9603+
9604+
while let Some(ch) = chars.next() {
9605+
if in_quotes {
9606+
if ch == quote {
9607+
// Check if it's an escaped quote
9608+
if let Some(&next_ch) = chars.peek() {
9609+
if next_ch == quote {
9610+
// Escaped quote
9611+
current_field.push(quote);
9612+
chars.next();
9613+
} else {
9614+
// End of quoted field
9615+
in_quotes = false;
9616+
}
9617+
} else {
9618+
// End of quoted field at end of input
9619+
in_quotes = false;
9620+
}
9621+
} else if ch == escape {
9622+
// Escape character
9623+
if let Some(next_ch) = chars.next() {
9624+
current_field.push(next_ch);
9625+
}
9626+
} else {
9627+
current_field.push(ch);
9628+
}
9629+
} else if ch == quote {
9630+
in_quotes = true;
9631+
} else if ch == delimiter {
9632+
// End of field
9633+
if current_field == null_symbol {
9634+
current_row.push(None);
96209635
} else {
9621-
row.push(Some(field.to_string()));
9636+
current_row.push(Some(current_field.clone()));
9637+
}
9638+
current_field.clear();
9639+
} else if ch == '\n' || ch == '\r' {
9640+
// End of record
9641+
if ch == '\r' {
9642+
// Skip \n if it follows \r
9643+
if let Some(&'\n') = chars.peek() {
9644+
chars.next();
9645+
}
9646+
}
9647+
if !current_field.is_empty() || !current_row.is_empty() {
9648+
if current_field == null_symbol {
9649+
current_row.push(None);
9650+
} else {
9651+
current_row.push(Some(current_field.clone()));
9652+
}
9653+
current_field.clear();
9654+
9655+
// Validate column count
9656+
row_number += 1;
9657+
if let Some(expected) = expected_column_count {
9658+
if current_row.len() != expected {
9659+
return Err(ParserError::ParserError(format!(
9660+
"CSV row {} has {} columns, but expected {} columns based on first row",
9661+
row_number,
9662+
current_row.len(),
9663+
expected
9664+
)));
9665+
}
9666+
} else {
9667+
// First row establishes the expected column count
9668+
expected_column_count = Some(current_row.len());
9669+
}
9670+
9671+
result.push(current_row.clone());
9672+
current_row.clear();
9673+
}
9674+
} else {
9675+
current_field.push(ch);
9676+
}
9677+
}
9678+
9679+
// Handle remaining field/row
9680+
if !current_field.is_empty() || !current_row.is_empty() {
9681+
if current_field == null_symbol {
9682+
current_row.push(None);
9683+
} else {
9684+
current_row.push(Some(current_field));
9685+
}
9686+
9687+
// Validate column count for last row
9688+
row_number += 1;
9689+
if let Some(expected) = expected_column_count {
9690+
if current_row.len() != expected {
9691+
return Err(ParserError::ParserError(format!(
9692+
"CSV row {} has {} columns, but expected {} columns based on first row",
9693+
row_number,
9694+
current_row.len(),
9695+
expected
9696+
)));
96229697
}
96239698
}
9624-
result.push(row);
9699+
// Note: if this is the first and only row, we don't need to set expected_column_count
9700+
// since there's nothing to validate against
9701+
9702+
result.push(current_row);
96259703
}
9704+
96269705
Ok(result)
96279706
}
96289707

tests/sqlparser_postgres.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,6 +1045,30 @@ fn parse_copy_from_stdin() {
10451045
12,KARL,BERRY,2017-11-02 19:15:42.308637+08,11.001
10461046
\."#;
10471047
pg_and_generic().verified_stmt(sql_comma_separated);
1048+
1049+
let incorrect_csv_sql = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM STDIN (FORMAT csv, DELIMITER ',');
1050+
1,PENELOPE,GUINESS,2006-02-15 09:34:33,0.11111
1051+
2,NICK,WAHLBERG,2006-02-15 09:34:33
1052+
\."#;
1053+
let parsed = pg_and_generic().parse_sql_statements(incorrect_csv_sql);
1054+
assert_eq!(
1055+
parsed.unwrap_err(),
1056+
ParserError::ParserError(
1057+
"CSV row 2 has 4 columns, but expected 5 columns based on first row".to_string()
1058+
)
1059+
);
1060+
1061+
let mixed_incorrect_separators = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM STDIN (FORMAT csv, DELIMITER ',');
1062+
1,PENELOPE,GUINESS,2006-02-15 09:34:33,0.11111
1063+
2 NICK WAHLBERG 2006-02-15 09:34:33,0.22222
1064+
\."#;
1065+
let parsed = pg_and_generic().parse_sql_statements(mixed_incorrect_separators);
1066+
assert_eq!(
1067+
parsed.unwrap_err(),
1068+
ParserError::ParserError(
1069+
"CSV row 2 has 2 columns, but expected 5 columns based on first row".to_string()
1070+
)
1071+
);
10481072
}
10491073

10501074
#[test]

0 commit comments

Comments
 (0)