Skip to content

Commit 4fe9b8e

Browse files
committed
[tabular] multi-line support
1 parent e6c30e6 commit 4fe9b8e

26 files changed

Lines changed: 494 additions & 72 deletions

src/base/auto_mem.hh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -300,7 +300,7 @@ public:
300300

301301
auto_buffer& append(std::string_view sv)
302302
{
303-
if (this->ab_size + sv.length() < this->ab_capacity) {
303+
if (this->ab_size + sv.length() > this->ab_capacity) {
304304
this->expand_by(sv.length() + 1024);
305305
}
306306
memcpy(&this->ab_buffer[this->ab_size], sv.data(), sv.length());

src/base/separated_string.cc

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -197,8 +197,22 @@ separated_string::iterator::update()
197197
bool saw_quote = false;
198198
const char* quote_start = nullptr;
199199
const char* quote_end = nullptr;
200+
this->i_unterminated_quote = false;
200201

201202
const char* p = this->i_pos;
203+
// Continuation case: the prior chunk ended mid-quote and the
204+
// caller stitched the next chunk on. Begin parsing this cell
205+
// already inside the quoted region — value bytes start at p,
206+
// not after a leading `"`. Only applies to the very first cell
207+
// of the buffer.
208+
if (this->i_parent.ss_resume && this->i_parent.ss_resume->rs_in_quote
209+
&& p == ss.ss_str)
210+
{
211+
in_quotes = true;
212+
saw_quote = true;
213+
quote_start = p;
214+
state = OTHER;
215+
}
202216
while (p < data_end) {
203217
if (!in_quotes && *p == sep_ch) {
204218
if (sep_ch == ' ' && p + 1 < data_end) {
@@ -320,6 +334,14 @@ separated_string::iterator::update()
320334
}
321335
this->i_next_pos = (p < data_end) ? p + 1 : data_end;
322336

337+
// The parse loop only exits with `in_quotes` still set when it
338+
// ran off data_end without seeing a close-quote. Surface that
339+
// to the caller so they can decide whether more bytes are
340+
// pending (e.g. a CSV cell whose value spans physical lines).
341+
if (in_quotes) {
342+
this->i_unterminated_quote = true;
343+
}
344+
323345
if (saw_quote) {
324346
// Use the span between the quotes. An unterminated
325347
// quote takes everything up to the current position.

src/base/separated_string.hh

Lines changed: 47 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -59,8 +59,29 @@ struct separated_string {
5959

6060
const char* ss_str;
6161
size_t ss_len;
62+
// Cross-buffer parser state. Captured by `iterator::suspend()`
63+
// when a buffer ends mid-row (typically a quoted cell whose
64+
// closing `"` lives in a not-yet-arrived chunk) and replayed by
65+
// assigning the snapshot to `ss_resume` on the next buffer.
66+
// Carries:
67+
// - rs_index — the row-level cell index the next iterator
68+
// should report on its first cell, so a continuation that
69+
// finishes column 7 of 10 reports indices 7..9 instead of
70+
// restarting at 0.
71+
// - rs_in_quote — when true, the first cell starts already
72+
// inside an open quoted region: no leading `"` required,
73+
// separators stay suppressed until a closing `"`, and `""`
74+
// is still an embedded literal. If the close never arrives,
75+
// the trailing cell again carries `unterminated_quote()` and
76+
// the caller can keep stitching.
77+
struct resume_state {
78+
// Index the continuation iterator reports for its first cell;
// a default-constructed snapshot starts at 0.
size_t rs_index{0};
79+
// When true, the first cell of the next buffer is parsed as
// already inside an open quoted region.
bool rs_in_quote{false};
80+
};
81+
6282
char ss_separator{','};
6383
std::optional<size_t> ss_expected_count;
84+
std::optional<resume_state> ss_resume;
6485

6586
// Records the caller's pointer and length as given; the bytes are
// not copied by this constructor.
separated_string(const char* str, size_t len) : ss_str(str), ss_len(len) {}
6687

@@ -103,10 +124,17 @@ struct separated_string {
103124
// show" from "at data_end, natural end" so operator==(end())
104125
// doesn't cut iteration short.
105126
bool i_in_ghost{false};
127+
// True when update() reached data_end while still inside a
128+
// quoted region — the closing `"` was never seen because the
129+
// buffer was cut short. Only the trailing cell can ever
130+
// carry it. Callers (e.g. multi-line CSV ingest) can use it
131+
// to decide whether to splice in more bytes and re-parse.
132+
bool i_unterminated_quote{false};
106133

107134
// Builds an iterator over `ss` positioned at `pos`; every position
// member starts out equal to `pos`.
iterator(const separated_string& ss, const char* pos)
108135
: i_parent(ss), i_pos(pos), i_next_pos(pos), i_value_start(pos),
109-
i_value_end(pos), i_index(0)
136+
i_value_end(pos),
137+
// Start from the resumed row-level index when the parent has an
// ss_resume snapshot; otherwise start counting at 0.
i_index(ss.ss_resume ? ss.ss_resume->rs_index : 0)
110138
{
111139
// Parse the cell that begins at `pos` right away.
this->update();
112140
}
@@ -140,6 +168,24 @@ struct separated_string {
140168

141169
// Accessor for the i_kind member of this iterator.
cell_kind kind() const { return this->i_kind; }
142170

171+
// True only on the trailing cell when the buffer ended inside
172+
// an open quoted region. Stays false on every other cell.
173+
bool unterminated_quote() const
174+
{
175+
// update() resets this to false before scanning a cell and sets
// it when the scan finishes with in_quotes still true.
return this->i_unterminated_quote;
176+
}
177+
178+
// Snapshot the cross-buffer parser state at the current cell.
179+
// Typical use: when `unterminated_quote()` returns true on the
180+
// trailing cell, capture the snapshot, accumulate the cell's
181+
// partial value into your own buffer, then assign the snapshot
182+
// to the next buffer's `ss_resume` so the continuation iterator
183+
// picks up at the right index already inside the open quote.
184+
resume_state suspend() const
185+
{
186+
// Aggregate order matches resume_state: rs_index takes i_index,
// rs_in_quote takes i_unterminated_quote. On a cell that did not
// end inside an open quote, rs_in_quote is therefore false.
return {this->i_index, this->i_unterminated_quote};
187+
}
188+
143189
bool operator==(const iterator& other) const
144190
{
145191
return (&this->i_parent == &other.i_parent)

src/base/separated_string.tests.cc

Lines changed: 136 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -271,6 +271,142 @@ TEST_CASE("unterminated quote captures rest of input")
271271
CHECK(cells[0].value == "start,here,no-close");
272272
}
273273

274+
TEST_CASE("unterminated_quote flag flips only on the trailing open cell")
275+
{
276+
// Two cells: a plain `a`, then a cell that opens a quote and never
// closes it before the buffer ends.
auto input = std::string(R"(a,"unterminated cell)");
277+
separated_string ss(input.data(), input.length());
278+
std::vector<bool> flags;
279+
// Record the flag reported on each cell, in iteration order.
for (auto iter = ss.begin(); iter != ss.end(); ++iter) {
280+
flags.push_back(iter.unterminated_quote());
281+
}
282+
REQUIRE(flags.size() == 2);
283+
// Only the second (trailing) cell is expected to carry the flag.
CHECK_FALSE(flags[0]);
284+
CHECK(flags[1]);
285+
}
286+
287+
TEST_CASE("unterminated_quote stays false when every quote is closed")
288+
{
289+
// Three quoted cells, each with its closing quote present.
auto input = std::string(R"("one","two","three")");
290+
separated_string ss(input.data(), input.length());
291+
// No cell should report an open quote at end of buffer.
for (auto iter = ss.begin(); iter != ss.end(); ++iter) {
292+
CHECK_FALSE(iter.unterminated_quote());
293+
}
294+
}
295+
296+
TEST_CASE("unterminated_quote sees through `\"\"` escapes")
297+
{
298+
// The `""` is an embedded literal, not a close-quote — so the
299+
// outer quoted region is still open at end-of-buffer.
300+
auto input = std::string(R"("a""b,rest)");
301+
separated_string ss(input.data(), input.length());
302+
// Expect exactly one cell: it carries the flag, and the comma
// inside the open quote does not start a second cell.
auto iter = ss.begin();
303+
REQUIRE(iter != ss.end());
304+
CHECK(iter.unterminated_quote());
305+
++iter;
306+
CHECK(iter == ss.end());
307+
}
308+
309+
TEST_CASE("ss_resume: continuation closes and emits remaining cells")
310+
{
311+
// Caller previously parsed `"line one` (open quote), is now
312+
// feeding the rest after a `\n` glue: `line two",x,y`.
313+
auto input = std::string(R"(line two",x,y)");
314+
separated_string ss(input.data(), input.length());
315+
// resume_state{0, true}: report index 0 on the first cell and
// begin parsing already inside an open quoted region.
ss.ss_resume = separated_string::resume_state{0, true};
316+
auto cells = std::vector<parsed_cell>{};
317+
for (auto iter = ss.begin(); iter != ss.end(); ++iter) {
318+
cells.push_back(parsed_cell{(*iter).to_string(), iter.kind()});
319+
// The quote closes inside this buffer, so no cell is flagged.
CHECK_FALSE(iter.unterminated_quote());
320+
}
321+
REQUIRE(cells.size() == 3);
322+
CHECK(cells[0].value == "line two");
323+
CHECK(cells[1].value == "x");
324+
CHECK(cells[2].value == "y");
325+
}
326+
327+
TEST_CASE("ss_resume: continuation still unterminated re-flags")
328+
{
329+
// Open-quote chunk N+1: still no closing `"` in this buffer
330+
// either. Caller will need to keep stitching.
331+
auto input = std::string("more middle text");
332+
separated_string ss(input.data(), input.length());
333+
// resume_state{0, true}: report index 0 on the first cell and
// begin parsing already inside an open quoted region.
ss.ss_resume = separated_string::resume_state{0, true};
334+
auto iter = ss.begin();
335+
REQUIRE(iter != ss.end());
336+
// The whole buffer is the cell value, and the flag is set again.
CHECK((*iter).to_string() == "more middle text");
337+
CHECK(iter.unterminated_quote());
338+
++iter;
339+
CHECK(iter == ss.end());
340+
}
341+
342+
TEST_CASE("ss_resume: `\"\"` inside a continuation stays embedded")
343+
{
344+
// `""` in a continuation chunk is still a literal `"`, not a
345+
// close-quote; the surrounding region remains open until a
346+
// lone `"`.
347+
auto input = std::string(R"(has ""embedded"" still open)");
348+
separated_string ss(input.data(), input.length());
349+
// resume_state{0, true}: report index 0 on the first cell and
// begin parsing already inside an open quoted region.
ss.ss_resume = separated_string::resume_state{0, true};
350+
auto iter = ss.begin();
351+
REQUIRE(iter != ss.end());
352+
// The expected value keeps both doubled-quote pairs exactly as
// they appear in the input.
CHECK((*iter).to_string() == R"(has ""embedded"" still open)");
353+
CHECK(iter.unterminated_quote());
354+
}
355+
356+
TEST_CASE("ss_resume: continuation that closes immediately")
357+
{
358+
// Edge case: the buffer is just the close-quote, separator, next
359+
// cell. The leading character closes the open region and yields
360+
// an empty cell value (no bytes from this chunk belonged to the
361+
// quoted run).
362+
auto input = std::string(R"(",tail)");
363+
separated_string ss(input.data(), input.length());
364+
// resume_state{0, true}: report index 0 on the first cell and
// begin parsing already inside an open quoted region.
ss.ss_resume = separated_string::resume_state{0, true};
365+
auto cells = std::vector<parsed_cell>{};
366+
for (auto iter = ss.begin(); iter != ss.end(); ++iter) {
367+
cells.push_back(parsed_cell{(*iter).to_string(), iter.kind()});
368+
}
369+
// One empty cell for the closed quote, then the plain `tail` cell.
REQUIRE(cells.size() == 2);
370+
CHECK(cells[0].value.empty());
371+
CHECK(cells[1].value == "tail");
372+
}
373+
374+
TEST_CASE("ss_resume: suspend() preserves row-level cell index")
375+
{
376+
// Buffer 1: 3 closed cells, then a 4th that opens a quote and
377+
// never closes. suspend() snapshot should report index 3 +
378+
// in-quote.
379+
auto buf1 = std::string(R"(a,b,c,"open and unfinished)");
380+
separated_string ss1(buf1.data(), buf1.length());
381+
std::optional<separated_string::resume_state> snap;
382+
// Take the snapshot from whichever cell reports an open quote.
for (auto iter = ss1.begin(); iter != ss1.end(); ++iter) {
383+
if (iter.unterminated_quote()) {
384+
snap = iter.suspend();
385+
}
386+
}
387+
REQUIRE(snap.has_value());
388+
CHECK(snap->rs_index == 3);
389+
CHECK(snap->rs_in_quote);
390+
391+
// Buffer 2: continuation closes the cell, then two more. Indices
392+
// emitted should be 3, 4, 5 — the snapshot's row-level position
393+
// carried across the buffer boundary.
394+
auto buf2 = std::string(R"(rest of cell",d,e)");
395+
separated_string ss2(buf2.data(), buf2.length());
396+
// Install the snapshot before begin() so the first iterator is
// constructed with the resumed index and in-quote state.
ss2.ss_resume = *snap;
397+
std::vector<size_t> indices;
398+
std::vector<std::string> values;
399+
for (auto iter = ss2.begin(); iter != ss2.end(); ++iter) {
400+
indices.push_back(iter.index());
401+
values.push_back((*iter).to_string());
402+
}
403+
REQUIRE(indices.size() == 3);
404+
CHECK(indices == std::vector<size_t>{3, 4, 5});
405+
CHECK(values[0] == "rest of cell");
406+
CHECK(values[1] == "d");
407+
CHECK(values[2] == "e");
408+
}
409+
274410
TEST_CASE("newlines inside a quoted cell are preserved verbatim")
275411
{
276412
auto cells = tokenize("\"line1\nline2\",next");

0 commit comments

Comments
 (0)