tstack
diff --git a/‎NEWS.md‎
Lines changed: 15 additions & 7 deletions b/‎NEWS.md‎
Lines changed: 15 additions & 7 deletions
diff --git a/‎docs/schemas/format-v1.schema.json‎
Lines changed: 1 addition & 1 deletion b/‎docs/schemas/format-v1.schema.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/source/formats.rst‎
Lines changed: 30 additions & 1 deletion b/‎docs/source/formats.rst‎
Lines changed: 30 additions & 1 deletion
diff --git a/‎src/base/intern_string.cc‎
Lines changed: 41 additions & 2 deletions b/‎src/base/intern_string.cc‎
Lines changed: 41 additions & 2 deletions
diff --git a/‎src/base/intern_string.hh‎
Lines changed: 30 additions & 0 deletions b/‎src/base/intern_string.hh‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎src/base/intern_string.tests.cc‎
Lines changed: 31 additions & 0 deletions b/‎src/base/intern_string.tests.cc‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎src/base/separated_string.cc‎
Lines changed: 109 additions & 2 deletions b/‎src/base/separated_string.cc‎
Lines changed: 109 additions & 2 deletions
diff --git a/‎src/base/separated_string.hh‎
Lines changed: 4 additions & 1 deletion b/‎src/base/separated_string.hh‎
Lines changed: 4 additions & 1 deletion
@@ -10,13 +10,6 @@ Features:
   The `z`/`Z` keys can also be used to increase/decrease the
   context by one in the LOG, TEXT, and TIMELINE views.  Context
   lines are styled using the new `context-line` theme style.
-* Added a log format for the `fsck_apfs` and `fsck_hfs` tools on
-  macOS, covering both the `started`/`completed` lifecycle lines
-  and legacy `run` entries.  This replaces the previous
-  `fsck_hfs_log` format, which only matched the start lines.
-  The new format exposes `device`, `tool`, and `action` fields,
-  groups messages by device in the TIMELINE view, and highlights
-  `error:` lines and `FILESYSTEM CLEAN` status messages.
 * Added a built-in `metrics_log` format that recognizes CSV
   files whose first column header is `Time`/`Timestamp`/`ts`/
   `Date...` and whose subsequent rows begin with a parseable
@@ -43,6 +36,21 @@ Features:
     target any table, including search-table columns.
   - `:clear-timeline-metric <label>` removes metrics.
   Up to four metrics can be added.
+* Added support for "tabular" formats (e.g. CSV, TSV).
+  The format definition for this type of file sets
+  `file-type` to `tabular` and then defines the known
+  columns.  When opening a file of this type, the
+  separator will be automatically detected and the header
+  compared against the columns defined in the tabular
+  formats.  If a good match is found, it will be used as
+  the format for the file.
+* Added a log format for the `fsck_apfs` and `fsck_hfs` tools on
+  macOS, covering both the `started`/`completed` lifecycle lines
+  and legacy `run` entries.  This replaces the previous
+  `fsck_hfs_log` format, which only matched the start lines.
+  The new format exposes `device`, `tool`, and `action` fields,
+  groups messages by device in the TIMELINE view, and highlights
+  `error:` lines and `FILESYSTEM CLEAN` status messages.
 * Log format value definitions now accept a `unit` object
   with `suffix` and `divisor` properties.  `suffix` specifies
   how numeric fields are humanized. `divisor` normalizes
 
@@ -943,7 +943,7 @@
                     "enum": [
                         "text",
                         "json",
-                        "csv"
+                        "tabular"
                     ]
                 },
                 "max-unrecognized-lines": {
 
@@ -55,6 +55,23 @@ See the following formats that are built into lnav as examples:
 * `cloudflare_log.json <https://github.com/tstack/lnav/blob/master/src/formats/cloudflare_log.json>`_
 * `github_events_log.json <https://github.com/tstack/lnav/blob/master/src/formats/github_events_log.json>`_
 
+.. _tabular_format:
+
+Tabular files
+-------------
+
+Delimited files (CSV, TSV, and similar) can be parsed by declaring
+a format with :code:`"file-type": "tabular"`.  The first row of the
+file must be a header naming each column; the separator is
+auto-detected from the header and is one of comma, tab, semicolon,
+pipe (:code:`|`), or runs of two-or-more spaces.
+
+Each column is mapped to a :code:`value` definition by name.  The
+standard field bindings work the same as for other types of formats.
+A row may use a single :code:`-` or :code:`--` to indicate that
+:code:`opid-field` or :code:`thread-id-field` is absent for that
+row.
+
 logfmt
 ------
 
@@ -194,7 +211,19 @@ object with the following fields:
     The `PCRE2 <http://www.pcre.org>`_ library is used by **lnav** to do all
     regular expression matching.
 
-:json: True if each log line is JSON-encoded.
+:file-type: The shape of the file.  One of:
+
+  :text: Plain-text log files matched by one or more
+    :code:`regex` patterns.  This is the default.
+  :json: Each line is a JSON object (JSON-lines).  The
+    :code:`value` definitions name the JSON properties to
+    extract and :code:`line-format` controls how messages
+    are rendered.
+  :tabular: A delimited file whose first row is a header
+    naming each column.  See :ref:`tabular_format`.
+
+:json: (Deprecated, use :code:`"file-type": "json"` instead.) True if
+  each log line is JSON-encoded.
 
 :converter: An object that describes how an input file can be detected and
   then converted to a form that can be interpreted by **lnav**.  For
 
@@ -776,8 +776,8 @@ string_fragment::transform_codepoints(
         }
         auto cp = read_res.unwrap();
         auto new_cp = xform(cp);
-        ww898::utf::utf8::write(
-            new_cp, [&out](const char b) { out.push_back(b); });
+        ww898::utf::utf8::write(new_cp,
+                                [&out](const char b) { out.push_back(b); });
     }
     return out;
 }
@@ -817,6 +817,45 @@ string_fragment::column_width() const
     return retval;
 }
 
+// Decode the code point at the cursor without moving the cursor.
+//
+// Returns std::nullopt when the cursor is at the end of the fragment.
+// Otherwise, returns the decoded code point or, if the bytes at the
+// cursor cannot be decoded, the value of the single byte at the cursor
+// (0-255).
+std::optional<uint32_t>
+string_fragment::cursor_impl::lookahead() const
+{
+    if (this->ci_next_index >= this->ci_end) {
+        return std::nullopt;
+    }
+
+    int32_t index = this->ci_next_index;
+    // Feed the decoder a NUL once the fragment is exhausted so that a
+    // sequence truncated by the end of the fragment never pulls in
+    // bytes that live beyond ci_end.  NOTE(review): presumably the
+    // decoder rejects NUL as a continuation byte and reports an error,
+    // which lands in the single-byte fallback below -- confirm.
+    auto read_res = ww898::utf::utf8::read([this, &index]() {
+        return index < this->ci_end ? this->ci_string[index++] : '\0';
+    });
+    if (read_res.isErr()) {
+        // Go through unsigned char so a byte >= 0x80 is not
+        // sign-extended where plain char is a signed type.
+        return static_cast<uint32_t>(
+            static_cast<unsigned char>(this->ci_string[this->ci_next_index]));
+    }
+    return read_res.unwrap();
+}
+
+// Consume and return the code point at the cursor.
+//
+// Every call, including one made at the end of the fragment, first
+// shifts the value returned by the previous call into the slot that is
+// reported by lookbehind().
+//
+// Returns std::nullopt when the cursor is at the end of the fragment.
+// Otherwise, returns the decoded code point or, if the bytes at the
+// cursor cannot be decoded, the value of the single byte at the cursor
+// (0-255); in that case the cursor advances by exactly one byte.
+std::optional<uint32_t>
+string_fragment::cursor_impl::next()
+{
+    this->ci_lookbehind = this->ci_next_lookbehind;
+    if (this->ci_next_index >= this->ci_end) {
+        return std::nullopt;
+    }
+
+    int32_t index = this->ci_next_index;
+    // Feed the decoder a NUL once the fragment is exhausted so that a
+    // sequence truncated by the end of the fragment never pulls in
+    // bytes beyond ci_end and ci_next_index can never move past ci_end.
+    // NOTE(review): presumably the decoder rejects NUL as a
+    // continuation byte and reports an error -- confirm.
+    auto read_res = ww898::utf::utf8::read([this, &index]() {
+        return index < this->ci_end ? this->ci_string[index++] : '\0';
+    });
+    uint32_t retval;
+    if (read_res.isErr()) {
+        // Go through unsigned char so a byte >= 0x80 is not
+        // sign-extended where plain char is a signed type.
+        retval = static_cast<unsigned char>(
+            this->ci_string[this->ci_next_index]);
+        this->ci_next_index += 1;
+    } else {
+        retval = read_res.unwrap();
+        this->ci_next_index = index;
+    }
+    this->ci_next_lookbehind = retval;
+    return retval;
+}
+
 struct single_producer : string_fragment_producer {
     explicit single_producer(const string_fragment& sf) : sp_frag(sf) {}
 
 
@@ -783,6 +783,36 @@ struct string_fragment {
 
     uint64_t bloom_bits() const;
 
+    // Forward-only cursor over the code points of a string_fragment.
+    //
+    // Instances are created through string_fragment::cursor(); the
+    // constructor is private.  The cursor stores the fragment's raw
+    // character pointer rather than a copy of the text.
+    // NOTE(review): presumably the underlying buffer must outlive the
+    // cursor -- confirm against callers.
+    class cursor_impl {
+    public:
+        // The code point that came before the one most recently
+        // returned by next().  Empty until next() has produced two
+        // values.  Once next() reports the end of the fragment, this is
+        // the last code point that was produced.
+        std::optional<uint32_t> lookbehind() const
+        {
+            return this->ci_lookbehind;
+        }
+
+        // The code point at the cursor, without consuming it.  Empty at
+        // the end of the fragment.
+        std::optional<uint32_t> lookahead() const;
+
+        // Consume and return the code point at the cursor.  Empty at the
+        // end of the fragment.
+        std::optional<uint32_t> next();
+
+    private:
+        friend string_fragment;
+
+        // Start at the first byte of the parent fragment and stop at
+        // its end offset.
+        explicit cursor_impl(const string_fragment& parent)
+            : ci_string(parent.sf_string),
+              ci_end(parent.sf_end),
+              ci_next_index(parent.sf_begin)
+        {
+        }
+
+        // Base pointer taken from the parent; the offsets below index
+        // into it.
+        const char* ci_string;
+        // Offset one past the last byte to visit.
+        int32_t ci_end;
+        // Offset of the next byte to decode.
+        int32_t ci_next_index;
+        // Value reported by lookbehind().
+        std::optional<uint32_t> ci_lookbehind;
+        // Value most recently returned by next(); it is moved into
+        // ci_lookbehind at the start of the following next() call.
+        std::optional<uint32_t> ci_next_lookbehind;
+    };
+
+    // Create a cursor positioned at the start of this fragment.
+    cursor_impl cursor() const { return cursor_impl(*this); }
+
     const char* sf_string;
     int32_t sf_begin;
     int32_t sf_end;
 
@@ -453,3 +453,34 @@ TEST_CASE("string_fragment::word helpers with wide chars")
         CHECK(sf.curr_word(4) == std::optional<int>(0));
     }
 }
+
+TEST_CASE("string_fragment::cursor")
+{
+    {
+        // An empty fragment has nothing behind, ahead, or to consume.
+        const auto input = ""_frag;
+        auto cursor = input.cursor();
+        CHECK_FALSE(cursor.lookbehind().has_value());
+        CHECK_FALSE(cursor.lookahead().has_value());
+        CHECK_FALSE(cursor.next().has_value());
+    }
+    {
+        const auto input = "hello"_frag;
+        auto cursor = input.cursor();
+        CHECK_FALSE(cursor.lookbehind().has_value());
+        // lookahead() does not consume anything.
+        CHECK('h' == cursor.lookahead());
+        CHECK('h' == cursor.lookahead());
+        CHECK('h' == cursor.next());
+        // lookbehind() is the code point before the one just returned,
+        // so it is still empty after the first next().
+        CHECK_FALSE(cursor.lookbehind().has_value());
+        CHECK('e' == cursor.lookahead());
+        CHECK('e' == cursor.next());
+        CHECK('h' == cursor.lookbehind());
+        CHECK('l' == cursor.next());
+        CHECK('l' == cursor.next());
+        CHECK('o' == cursor.next());
+        // Hitting the end shifts the last code point into lookbehind()
+        // and further calls leave it there.
+        CHECK_FALSE(cursor.next().has_value());
+        CHECK('o' == cursor.lookbehind());
+        CHECK_FALSE(cursor.next().has_value());
+        CHECK('o' == cursor.lookbehind());
+        CHECK_FALSE(cursor.lookahead().has_value());
+    }
+    {
+        // A multi-byte UTF-8 sequence (U+00E9) is reported as a single
+        // code point and consumed as a unit.
+        const auto input = "\xc3\xa9!"_frag;
+        auto cursor = input.cursor();
+        CHECK(0xe9u == cursor.lookahead());
+        CHECK(0xe9u == cursor.next());
+        CHECK('!' == cursor.lookahead());
+        CHECK('!' == cursor.next());
+        CHECK(0xe9u == cursor.lookbehind());
+        CHECK_FALSE(cursor.lookahead().has_value());
+        CHECK_FALSE(cursor.next().has_value());
+        CHECK('!' == cursor.lookbehind());
+    }
+}
@@ -40,6 +40,97 @@ is_suffix_char(char ch)
     return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '%';
 }
 
+// Guess the column separator used by a header line.
+//
+// Candidates are comma, tab, semicolon, vertical bar, and runs of spaces.
+// Occurrences inside double quotes are ignored.  Returns the candidate
+// with the strictly highest tally, or std::nullopt when nothing was
+// seen, the best tally is shared, or a quote is left open.  A line that
+// starts with spaces can only ever be reported as space-separated.
+std::optional<char>
+separated_string::detect_separator(const string_fragment& str)
+{
+    struct candidate {
+        char c_char;
+        size_t c_hits{0};
+    };
+
+    // Positions of the candidates within the tally below.
+    enum : size_t { COMMA, TAB, SEMI, VBAR, SPACE };
+
+    std::array<candidate, 5> tally = {{
+        {','},
+        {'\t'},
+        {';'},
+        {'|'},
+        {' '},
+    }};
+
+    auto cur = str.cursor();
+
+    // Consume any indentation, remembering whether there was some.
+    auto indented = false;
+    for (; cur.lookahead() == ' '; indented = true) {
+        (void) cur.next();
+    }
+
+    auto quoted = false;
+    for (auto ch = cur.next(); ch; ch = cur.next()) {
+        const auto prev = cur.lookbehind();
+        const auto peek = cur.lookahead();
+
+        if (ch == '"') {
+            quoted = !quoted;
+            continue;
+        }
+        if (quoted) {
+            continue;
+        }
+
+        // Punctuation only counts when it has a neighbor on both sides
+        // and neither neighbor is a space.
+        const bool tight = prev && peek && prev != ' ' && peek != ' ';
+
+        switch (*ch) {
+            case '\t':
+                // A run of tabs is counted once and a tab at the very
+                // start of the input is not counted.
+                if (prev && prev != '\t') {
+                    tally[TAB].c_hits += 1;
+                }
+                break;
+            case ',':
+                if (tight) {
+                    tally[COMMA].c_hits += 1;
+                }
+                break;
+            case ';':
+                if (tight) {
+                    tally[SEMI].c_hits += 1;
+                }
+                break;
+            case '|':
+                if (tight) {
+                    tally[VBAR].c_hits += 1;
+                }
+                break;
+            case ' ':
+                // Count the first space of a run of two or more that
+                // follows a non-space.
+                if (prev && prev != ' ' && peek == ' ') {
+                    tally[SPACE].c_hits += 1;
+                }
+                break;
+            default:
+                break;
+        }
+    }
+
+    if (indented) {
+        if (tally[SPACE].c_hits > 0) {
+            return ' ';
+        }
+        return std::nullopt;
+    }
+
+    if (quoted) {
+        return std::nullopt;
+    }
+
+    // Find the highest tally and how many candidates share it.
+    size_t top_hits = 0;
+    size_t top_holders = 0;
+    char top_char = '\0';
+    for (const auto& cand : tally) {
+        if (cand.c_hits > top_hits) {
+            top_hits = cand.c_hits;
+            top_char = cand.c_char;
+            top_holders = 1;
+        } else if (cand.c_hits == top_hits) {
+            top_holders += 1;
+        }
+    }
+
+    if (top_hits == 0 || top_holders != 1) {
+        return std::nullopt;
+    }
+
+    return top_char;
+}
+
 std::string
 separated_string::unescape_quoted(string_fragment sf)
 {
@@ -108,7 +199,22 @@ separated_string::iterator::update()
     const char* p = this->i_pos;
     while (p < data_end) {
         if (!in_quotes && *p == sep_ch) {
-            break;
+            if (sep_ch == ' ' && p + 1 < data_end) {
+                if ((!this->i_parent.ss_expected_count
+                     || this->i_index + 1
+                         < this->i_parent.ss_expected_count.value())
+                    && p + 1 < data_end && *(p + 1) == ' ')
+                {
+                    while (p + 1 < data_end && *(p + 1) == ' ') {
+                        p += 1;
+                    }
+                    break;
+                }
+                state = TRAIL_WS;
+                p += 1;
+            } else {
+                break;
+            }
         }
         const char c = *p;
 
@@ -206,7 +312,8 @@ separated_string::iterator::update()
     // end of input, convention says one more empty cell should be
     // emitted.  Defer it to the next update() call via
     // i_pending_ghost so the user still sees the current cell first.
-    if (p < data_end && p + 1 == data_end) {
+    if (p < data_end && p + 1 == data_end && this->i_parent.ss_separator != ' ')
+    {
         this->i_pending_ghost = true;
     }
     this->i_next_pos = (p < data_end) ? p + 1 : data_end;
 
@@ -55,9 +55,12 @@ struct separated_string {
         other,  // anything else — text, identifiers, JSON blobs, etc.
     };
 
+    static std::optional<char> detect_separator(const string_fragment& str);
+
     const char* ss_str;
     size_t ss_len;
     char ss_separator{','};
+    std::optional<size_t> ss_expected_count;
 
     separated_string(const char* str, size_t len) : ss_str(str), ss_len(len) {}
 
@@ -119,9 +122,9 @@ struct separated_string {
 
         iterator& operator++()
         {
+            this->i_index += 1;
             this->i_pos = this->i_next_pos;
             this->update();
-            this->i_index += 1;
 
             return *this;
         }
Original file line number	Diff line number	Diff line change
`@@ -943,7 +943,7 @@`
`943`	`943`	`"enum": [`
`944`	`944`	`"text",`
`945`	`945`	`"json",`
`946`		`- "csv"`
	`946`	`+ "tabular"`
`947`	`947`	`]`
`948`	`948`	`},`
`949`	`949`	`"max-unrecognized-lines": {`