Skip to content

Commit 15549cb

Browse files
authored
feat(indent): opt-in extensions for non-YAML indentation languages (commentExcept, rawBlock, flowColonSeparator) (#41)
1 parent 70a8019 commit 15549cb

4 files changed

Lines changed: 283 additions & 2 deletions

File tree

src/gen-lexer.ts

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,7 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) {
304304
const kNewlineModeTok = kOf(newline?.token ?? null);
305305
const kIndentTok = kOf(indent?.indentToken ?? null), kDedentTok = kOf(indent?.dedentToken ?? null), kIndentNewlineTok = kOf(indent?.newlineToken ?? null);
306306
const kBlockScalarTok = kOf(indent?.blockScalar?.token ?? null);
307+
const kRawBlockTok = kOf(indent?.rawBlock?.token ?? null);
307308
const kPlainCont = kOf(plainContinuationTokenName);
308309
const tColon = puLitOf.get(':') ?? 0;
309310

@@ -343,6 +344,14 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) {
343344
// denoted, so the "newline-or-EOF" alternation is unchanged.
344345
const blockScalarSig = /[|>](?:[1-9][+-]?|[+-][1-9]?|[+-]|)[ \t]*(?:(?<=[ \t])#[^\n]*)?(?:\r?\n|$)/y;
345346
if (indent?.blockScalar) indentTokenNames.add(indent.blockScalar.token);
347+
// Raw content blocks: a line-TRAILING introducer (e.g. Pug-style `tag:mode` at end of a line)
348+
// whose SIGNATURE must match from the introducer char through end-of-line. Sticky, like
349+
// blockScalarSig. `introChar` is the first char of the signature's match (a cheap pre-filter).
350+
const rawBlockSig = indent?.rawBlock
351+
? new RegExp(indent.rawBlock.signature ?? ':(?:[A-Za-z][A-Za-z0-9-]*)?[ \\t]*(?:\\r?\\n|$)', 'y')
352+
: null;
353+
const rawBlockChar = indent?.rawBlock?.introChar ?? ':';
354+
if (indent?.rawBlock) indentTokenNames.add(indent.rawBlock.token);
346355
// Col-0 strings (`---`/`...`) that always end a block scalar — a document boundary outranks
347356
// indentation — and, when one heads the introducer's line, mark a document-ROOT scalar.
348357
const blockScalarDocMarkers = indent?.blockScalar?.documentMarkers ?? [];
@@ -684,7 +693,10 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) {
684693
throw new Error(`Tab character used in indentation at offset ${p}`);
685694
}
686695
}
687-
if (lineComment && source.startsWith(lineComment, p)) { // comment-only line — ignored
696+
if (lineComment && source.startsWith(lineComment, p)
697+
// commentExcept: a comment introducer immediately followed by this string is NOT a
698+
// comment line (e.g. `//` strip-comments vs `//!` doc-comments) — fall through to tokens.
699+
&& !(indent?.commentExcept && source.startsWith(indent.commentExcept, p + lineComment.length))) {
688700
let e = p; while (e < source.length && source[e] !== '\n') e++;
689701
pos = e; pendingComment = true; continue; // next iteration consumes the newline
690702
}
@@ -904,6 +916,58 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) {
904916
continue;
905917
}
906918

919+
// ── Raw content block: a line-TRAILING `:mode` introducer (per
920+
// indent.rawBlock.signature, matched at this position through end of line) captures all
921+
// following lines more indented than the introducer's line as ONE verbatim token (blank
922+
// lines included); the token text itself begins at the introducer char, so it also carries the header's `:mode` part. The introducer must be GLUED to preceding line content (`script:`,
923+
// `article:md`) or sit at the line lead (`:md` — implicit element). The analogue of the
924+
// YAML block scalar above, but introduced at line END rather than by a leading `|`/`>`. ──
925+
if (indent?.rawBlock && flowDepth === 0 && rawBlockSig && source[pos] === rawBlockChar
926+
&& ((rawBlockSig.lastIndex = pos), rawBlockSig.test(source))) {
927+
let lineBegin = pos; while (lineBegin > 0 && source[lineBegin - 1] !== '\n') lineBegin--;
928+
const beforeText = source.slice(lineBegin, pos);
929+
// GLUED means: the introducer follows the line's tag-head/attrs with NO top-level
930+
// whitespace anywhere before it (whitespace inside balanced parens/quotes is fine —
931+
// `div(a="1" b):md`). A trailing colon after inline TEXT (`label Size:`) has a
932+
// top-level space, so it stays text and never opens a raw block.
933+
const glued = beforeText.length > 0 && /\S/.test(beforeText) && (() => {
934+
let depth = 0, quote = '';
935+
const lead = beforeText.match(/^[ \t]*/)![0].length; // leading indentation is fine
936+
for (let i = lead; i < beforeText.length; i++) {
937+
const ch = beforeText[i];
938+
if (quote) { if (ch === quote) quote = ''; continue; }
939+
if (ch === '"' || ch === "'" || ch === '`') quote = ch;
940+
else if (ch === '(') depth++;
941+
else if (ch === ')') depth = Math.max(0, depth - 1);
942+
else if ((ch === ' ' || ch === '\t') && depth === 0) return false;
943+
}
944+
return true;
945+
})();
946+
const atLead = /^[ \t]*$/.test(beforeText);
947+
if (glued || atLead) {
948+
const startPos = pos;
949+
const parent = indentStack[indentStack.length - 1];
950+
let p = pos; while (p < source.length && source[p] !== '\n') p++; if (p < source.length) p++; // skip the header line
951+
while (p < source.length) {
952+
let q = p, c = 0;
953+
while (q < source.length && source[q] === ' ') { q++; c++; }
954+
if (q >= source.length) { p = q; break; }
955+
if (source[q] === '\n' || source[q] === '\r') { // blank line — part of the block
956+
p = q + 1; if (source[q] === '\r' && source[p] === '\n') p++;
957+
continue;
958+
}
959+
if (c > parent) { // content line — more indented than the introducer's line
960+
let e = q; while (e < source.length && source[e] !== '\n') e++; p = e < source.length ? e + 1 : e;
961+
}
962+
else break; // dedent → the raw block ends
963+
}
964+
push(mkNamed(indent.rawBlock.token, source.slice(startPos, p), startPos, kRawBlockTok));
965+
pos = p;
966+
lineStart = true;
967+
continue;
968+
}
969+
}
970+
907971
// Close an interpolation hole (interpClose at baseline depth) → resume the template span.
908972
if (templateStack.length > 0 && source.startsWith(tplInterpClose, pos)) {
909973
const depth = templateStack[templateStack.length - 1];
@@ -982,7 +1046,10 @@ export function createLexer(grammar: CstGrammar, intern?: LexerIntern) {
9821046
// the separator — emit it as the `:` punctuation literal here. Gated on flow (block-context `:`
9831047
// separators are handled by the KEY-position lookaheads). yaml-test-suite 5MUD / 5T43 / 9MMW
9841048
// / C2DT / K3WX (quoted key) and the flow-collection-key cohort.
985-
if (indent && flowDepth > 0 && source[pos] === ':') {
1049+
// flowColonSeparator: false disables the YAML `"key":value` / `}: value` flow
1050+
// separator carve-out, for indentation grammars with `:name`-shaped tokens that
1051+
// may legally follow a quoted value or a flow-close delimiter.
1052+
if (indent && indent.flowColonSeparator !== false && flowDepth > 0 && source[pos] === ':') {
9861053
const prevTok = tokens[tokens.length - 1];
9871054
if (prevTok && (stringTokenNames.has(prevTok.type) || (prevTok.type === '' && flowCloseSet.has(prevTok.text)))) {
9881055
push(mkPu(':', pos, tColon));

src/types.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,23 @@ export interface IndentConfig {
326326
// control sigil, not content; absent → the block-scalar token's own scope (introducer reads as the
327327
// body string). The body always keeps the token scope; only the introducer capture is re-scoped.
328328
blockScalar?: { introducers: string[]; token: string; documentMarkers?: string[]; indicatorScope?: string };
329+
// Set false to disable the YAML flow `:` key-separator carve-out (a `:` glued after a quoted
330+
// scalar / flow-close is forced punctuation). Indentation grammars with `:name`-shaped tokens
331+
// (bound-attribute shorthand) need those to survive after values. Default true (YAML behavior).
332+
flowColonSeparator?: boolean;
333+
// A comment introducer immediately followed by this string is NOT a comment line — it falls
334+
// through to ordinary tokenization (e.g. comment '//' + commentExcept '!' → `//!` doc-comment
335+
// lines lex as real tokens and stay visible to the indent stack, while `//` lines vanish).
336+
commentExcept?: string;
337+
// Raw content blocks: a line-TRAILING introducer (`tag:mode` at end of line, or a bare `:mode`
338+
// at the line lead) captures all following more-indented lines as ONE verbatim token — the
339+
// analogue of `blockScalar` for languages whose raw regions are introduced from the END of a
340+
// line (Pug-style filters/content modes) rather than by a leading `|`/`>`. `signature` is a
341+
// sticky-regex SOURCE matched at the introducer char through end-of-line (default
342+
// `:(?:[A-Za-z][A-Za-z0-9-]*)?[ \t]*(?:\r?\n|$)`); `introChar` is its first char (a cheap
343+
// pre-filter compared against ONE source char, so it must be a single character; default ':'). The introducer must be GLUED to the line's content (no top-level
344+
// whitespace before it — whitespace inside balanced parens/quotes is fine) or sit at line lead.
345+
rawBlock?: { token: string; signature?: string; introChar?: string };
329346
// Compact-notation indicators (YAML `-` / `?`): a block entry indicator whose nested node begins
330347
// INLINE on the same line (`- item: a`, `? - x`). The node's true indentation is then the column
331348
// of its first char AFTER the indicator, not the indicator's own column — so a following SIBLING

test/check.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ const GATES: Gate[] = [
3838
{ group: 'vue', name: 'directives', args: ['test/vue-directives.ts'] },
3939
{ group: 'vue', name: 'embed-boundary', args: ['test/vue-embed-boundary.ts'] },
4040
{ group: 'vue', name: 'interp-expr', args: ['test/vue-interp-expr.ts'] },
41+
{ group: 'core', name: 'indent-extensions', args: ['test/indent-extensions.ts'] },
4142
{ group: 'yaml', name: 'issue12-regressions', args: ['test/yaml-issue12-regressions.ts'] },
4243
{ group: 'yaml', name: 'depth-witnesses', args: ['test/yaml-depth-witnesses.ts'] },
4344
{ group: 'yaml', name: 'depth-sites', args: ['test/depth-sites.ts'] },

0 commit comments

Comments
 (0)