feat: expose tokenSplitChars/tokenMergeChar; fix Leipzig gloss examples

dani-polani · claude · dani-polani · commit f37353f1972f · 2026-06-23T04:15:21.000+03:00
The gloss examples rendered "1SG.NOM" as "1SGNOM" because the default
tokenSplitChars (".-|") consumes the period as a boundary and does not
draw it. Leipzig glosses pack features into one morpheme with periods
(go.PST.IPFV = one token), so the period must be preserved.

- Add settings.tokenSplitChars and settings.tokenMergeChar to the API
- Rework gloss examples to the verified interface config: gloss on top,
  source middle, translation bottom; tokenSplitChars "-|"; gloss-source
  pair arcs hidden with 12px gap; source-translation arcs shown
- Replace broken German 4-line example with a clean 4-tier French stack
  (gloss / IPA / source / translation)
- Document the "split char is not rendered" gotcha on /api
- Update OpenAPI schema, SKILL.md, references/api.md
- 3 new tests for tokenization settings (62 total)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/bitext/src/lib/api/align.test.ts b/bitext/src/lib/api/align.test.ts
@@ -172,6 +172,43 @@ describe('buildAlignUrl', () => {
 		expect(state.settings.lineOpacity).toBe(0.7);
 	});
 
+	it('keeps periods inside gloss tokens when tokenSplitChars omits the dot', () => {
+		// With default ".-|", "1SG.NOM" would split into two tokens. With "-|" it stays one,
+		// so word index 1 of the gloss line is "go.PST.IPFV" and the connection resolves.
+		const result = buildAlignUrl(ORIGIN, {
+			lines: ['1SG.NOM go.PST.IPFV', 'Я ходил'],
+			alignments: [
+				[0, 0, 1, 0],
+				[0, 1, 1, 1]
+			],
+			settings: { tokenSplitChars: '-|' }
+		});
+		if (!('url' in result)) throw new Error('expected url');
+		const state = decodeState(new URL(result.url).searchParams.get('data'));
+		expect(state.settings.tokenSplitChars).toBe('-|');
+		// Connection from gloss word 1 must point at the single "go.PST.IPFV" token (l0-1).
+		const c = state.project.connections.find((conn) => conn.lowerTokenId === 'l1-1');
+		expect(c?.upperTokenId).toBe('l0-1');
+	});
+
+	it('resolves a gloss word index that only exists under the default split chars', () => {
+		// Whitespace alone gives line 1 two words; the default ".-|" also splits on the dots,
+		// yielding 4 tokens (1SG, NOM, PST, IPFV). tokenSplitChars is deliberately omitted.
+		const result = buildAlignUrl(ORIGIN, {
+			lines: ['Я ходил', '1SG.NOM PST.IPFV'],
+			alignments: [[0, 1, 1, 3]]
+		});
+		if (!('url' in result)) throw new Error('expected url');
+		const state = decodeState(new URL(result.url).searchParams.get('data'));
+		expect(state.project.connections[0]?.lowerTokenId).toBe('l1-3'); // "IPFV"
+	});
+
+	it('rejects invalid tokenMergeChar (more than one character)', () => {
+		expect(
+			parseAlignBody({ lines: ['a', 'b'], settings: { tokenMergeChar: '++' } })
+		).toMatchObject({ err: expect.stringContaining('tokenMergeChar') });
+	});
+
 	it('applies per-line options (font, sizePx, rtl)', () => {
 		const result = buildAlignUrl(ORIGIN, {
 			lines: [
diff --git a/bitext/src/lib/api/align.ts b/bitext/src/lib/api/align.ts
@@ -48,6 +48,18 @@ export interface SettingsInput {
 	showNumbers?: boolean;
 	/** Tint word tokens in the color of their connection. */
 	colorTokensByLink?: boolean;
+	/**
+	 * Characters (besides whitespace) that split text into separate word tokens.
+	 * Default is ".-|". For Leipzig glosses, set "-|" so periods stay inside a token
+	 * (e.g. "go.PST.IPFV" is one token instead of three). The split character itself is
+	 * not drawn, so every character listed here disappears from the rendered text.
+	 */
+	tokenSplitChars?: string;
+	/**
+	 * Single character that joins parts into one alignment token while rendering as a space
+	 * (e.g. "is+playing" shows "is playing" but counts as one word). Default is "+".
+	 */
+	tokenMergeChar?: string;
 }
 
 /** Per-adjacent-pair controls. `upper` and `lower` are 0-based line indices (lower = upper + 1). */
@@ -119,6 +131,10 @@ function parseSettingsInput(val: unknown): { ok: SettingsInput } | { err: string
 		return { err: 'settings.showNumbers must be a boolean' };
 	if (v.colorTokensByLink !== undefined && typeof v.colorTokensByLink !== 'boolean')
 		return { err: 'settings.colorTokensByLink must be a boolean' };
+	if (v.tokenSplitChars !== undefined && typeof v.tokenSplitChars !== 'string')
+		return { err: 'settings.tokenSplitChars must be a string' };
+	if (v.tokenMergeChar !== undefined && (typeof v.tokenMergeChar !== 'string' || v.tokenMergeChar.length > 1))
+		return { err: 'settings.tokenMergeChar must be a single character' };
 
 	return {
 		ok: {
@@ -129,7 +145,9 @@ function parseSettingsInput(val: unknown): { ok: SettingsInput } | { err: string
 			lineThickness: typeof v.lineThickness === 'number' ? v.lineThickness : undefined,
 			lineOpacity: typeof v.lineOpacity === 'number' ? v.lineOpacity : undefined,
 			showNumbers: typeof v.showNumbers === 'boolean' ? v.showNumbers : undefined,
-			colorTokensByLink: typeof v.colorTokensByLink === 'boolean' ? v.colorTokensByLink : undefined
+			colorTokensByLink: typeof v.colorTokensByLink === 'boolean' ? v.colorTokensByLink : undefined,
+			tokenSplitChars: typeof v.tokenSplitChars === 'string' ? v.tokenSplitChars : undefined,
+			tokenMergeChar: typeof v.tokenMergeChar === 'string' ? v.tokenMergeChar : undefined
 		}
 	};
 }
diff --git a/bitext/src/routes/api/+page.svelte b/bitext/src/routes/api/+page.svelte
@@ -352,6 +352,27 @@
 					<td class={tdTypeClass}>true</td>
 					<td class={tdDescClass}>Tint word tokens in the color of their connection.</td>
 				</tr>
+				<tr>
+					<td class={tdClass}>tokenSplitChars</td>
+					<td class={tdTypeClass}>string</td>
+					<td class={tdTypeClass}><span class={codeClass}>.-|</span></td>
+					<td class={tdDescClass}
+						>Characters (besides whitespace) that split text into tokens. The split character is
+						<strong>not</strong> rendered. Set to <span class={codeClass}>-|</span> to keep periods
+						inside Leipzig gloss morphemes (e.g. <span class={codeClass}>go.PST.IPFV</span> stays one
+						token).</td
+					>
+				</tr>
+				<tr class="bg-gray-50/50 dark:bg-gray-800/20">
+					<td class={tdClass}>tokenMergeChar</td>
+					<td class={tdTypeClass}>string (1 char)</td>
+					<td class={tdTypeClass}><span class={codeClass}>+</span></td>
+					<td class={tdDescClass}
+						>Joins parts into one token while rendering as a space, e.g. <span class={codeClass}
+							>is+playing</span
+						> → "is playing" (one word).</td
+					>
+				</tr>
 			</tbody>
 		</table>
 	</div>
@@ -422,29 +443,31 @@
 		</table>
 	</div>
 
-	<h3 class={subheadingClass}>Example — 3 lines, gloss row with tighter gap and no connectors</h3>
+	<h3 class={subheadingClass}>Example — interlinear (Leipzig) gloss</h3>
 	<pre class="{preClass} mt-3">{`curl -X POST ${apiBase}/api/align \\
   -H "Content-Type: application/json" \\
   -d '{
     "lines": [
+      { "text": "1SG.NOM go.PST.IPFV", "sizePx": 22 },
       { "text": "Я ходил", "sizePx": 40 },
-      { "text": "1SG.NOM PST.IPFV", "sizePx": 22 },
       { "text": "I have been going", "sizePx": 36 }
     ],
     "alignments": [
-      [0, 0, 1, 0], [0, 0, 1, 1],
-      [0, 1, 1, 2], [0, 1, 1, 3]
+      [0, 0, 1, 0], [0, 1, 1, 1],
+      [1, 0, 2, 0],
+      [1, 1, 2, 1], [1, 1, 2, 2], [1, 1, 2, 3]
     ],
+    "settings": { "tokenSplitChars": "-|" },
     "pairs": [
-      { "upper": 0, "lower": 1, "gapPx": 12, "showConnectors": false },
-      { "upper": 1, "lower": 2, "gapPx": 80, "showConnectors": false }
+      { "upper": 0, "lower": 1, "gapPx": 12, "showConnectors": false }
     ]
   }'`}</pre>
 	<p class="mt-2 text-sm text-gray-500 dark:text-gray-400">
-		Gloss is adjacent to source (lines 0–1) with a 12 px gap and hidden arcs — tokens are colored
-		but no lines are drawn. Dots in the gloss text are split characters: <span class={codeClass}>"1SG.NOM"</span>
-		becomes two tokens (word 0 = <em>1SG</em>, word 1 = <em>NOM</em>). The free translation sits
-		below with a larger gap.
+		The gloss tier sits directly above the source (lines 0–1) with a tight 12 px gap and hidden arcs
+		— its tokens are color-matched to the source words but no lines are drawn. The source→translation
+		pair keeps its arcs. Note <span class={codeClass}>"tokenSplitChars": "-|"</span>: this drops the
+		period from the split set so a Leipzig morpheme like <span class={codeClass}>go.PST.IPFV</span>
+		stays a single token instead of rendering as <span class={codeClass}>goPSTIPFV</span>.
 	</p>
 
 	<!-- ── Word indices ────────────────────────────────────────── -->
@@ -458,6 +481,14 @@
 		<span class={codeClass}>|</span> also create word boundaries. Punctuation is not split into separate
 		tokens by default.
 	</p>
+	<p class="mt-3">
+		The split character itself is <strong>not rendered</strong> — it is consumed as a boundary. So
+		<span class={codeClass}>go.PST.IPFV</span> with the default split set displays as three tokens
+		<span class={codeClass}>go</span> <span class={codeClass}>PST</span> <span class={codeClass}
+			>IPFV</span
+		> with the dots removed. If you need the period to stay (common in Leipzig glosses), override
+		<span class={codeClass}>settings.tokenSplitChars</span> to <span class={codeClass}>"-|"</span>.
+	</p>
 	<p class="mt-3">Example — <em>"Bonjour le monde"</em>:</p>
 	<div class={tableClass}>
 		<table class="min-w-full divide-y divide-gray-200 text-sm dark:divide-gray-700">
diff --git a/bitext/src/routes/api/align/openapi.json/+server.ts b/bitext/src/routes/api/align/openapi.json/+server.ts
@@ -205,6 +205,17 @@ export const GET: RequestHandler = ({ url }) => {
 						colorTokensByLink: {
 							type: 'boolean',
 							description: 'Tint word tokens in the color of their connection. Default: true.'
+						},
+						tokenSplitChars: {
+							type: 'string',
+							description:
+								'Characters (besides whitespace) that split text into separate word tokens. Default: ".-|". For Leipzig glosses set "-|" so periods stay inside a token (e.g. "go.PST.IPFV" is one token). The split character itself is not rendered.'
+						},
+						tokenMergeChar: {
+							type: 'string',
+							maxLength: 1,
+							description:
+								'Single character that joins parts into one token while rendering as a space (e.g. "is+playing" → "is playing", one word). Default: "+".'
 						}
 					}
 				},
diff --git a/word-aligner-skill.zip b/word-aligner-skill.zip
diff --git a/word-aligner-skill/SKILL.md b/word-aligner-skill/SKILL.md
@@ -67,30 +67,36 @@ If uncertain about tokenization, call `GET https://aligner.tinygods.dev/api/alig
 }
 ```
 
-**Interlinear gloss**: place the gloss line directly under the source (adjacent), connect source→gloss tokens, hide the arcs with `showConnectors: false`, use a small gap (12px). Put the free translation below the gloss with a larger gap and no connectors.
+**Interlinear (Leipzig) gloss** — three lines: gloss on top, source in the middle, free translation at the bottom.
 
-`"1SG.NOM PST.IPFV"` — dots are default split chars, so this yields 4 tokens: `1SG`[0] `NOM`[1] `PST`[2] `IPFV`[3].
+Important: a Leipzig gloss uses periods to pack grammatical features into one morpheme (`go.PST.IPFV` = "go" + past + imperfective, **one** token). The default `tokenSplitChars` is `".-|"`, which would split on the period and hide it — rendering `goPSTIPFV`. To keep the periods, set `"tokenSplitChars": "-|"` (drop the dot).
+
+Layout rules:
+- Gloss sits directly above the source: hide its connector arcs (`showConnectors: false`) and use a tight gap (`gapPx: 12`). The gloss tokens still get colors via their connections to the source.
+- The source→translation pair keeps its arcs (omit it from `pairs`).
+- Connect each gloss token to its source word, each source word to its translation word(s).
 
 ```json
 {
   "lines": [
+    {"text": "1SG.NOM go.PST.IPFV", "sizePx": 22},
     {"text": "Я ходил", "sizePx": 40},
-    {"text": "1SG.NOM PST.IPFV", "sizePx": 22},
     {"text": "I have been going", "sizePx": 36}
   ],
   "alignments": [
-    [0,0,1,0], [0,0,1,1],
-    [0,1,1,2], [0,1,1,3]
+    [0,0,1,0], [0,1,1,1],
+    [1,0,2,0],
+    [1,1,2,1], [1,1,2,2], [1,1,2,3]
   ],
+  "settings": {"tokenSplitChars": "-|"},
   "pairs": [
-    {"upper": 0, "lower": 1, "gapPx": 12, "showConnectors": false},
-    {"upper": 1, "lower": 2, "gapPx": 80, "showConnectors": false}
+    {"upper": 0, "lower": 1, "gapPx": 12, "showConnectors": false}
   ]
 }
 ```
 
-Gloss tokens inherit colors from their source-word group. Arcs are hidden on both pairs; only the color coding is visible.
+Line 0 (gloss) has 2 whitespace-separated tokens: `1SG.NOM`[0] and `go.PST.IPFV`[1]. "ходил" maps to "have been going" (one-to-many, shared color); the gloss above it is color-matched but arc-free.
 
 ## Full parameter reference
 
-See [references/api.md](references/api.md) for the complete parameter tables: `LineInput`, `SettingsInput` (palette, lineStyle, lineThickness, lineOpacity, background, theme, showNumbers, colorTokensByLink), and `PairInput` (gapPx, showConnectors).
+See [references/api.md](references/api.md) for the complete parameter tables: `LineInput`, `SettingsInput` (palette, lineStyle, lineThickness, lineOpacity, background, theme, showNumbers, colorTokensByLink, tokenSplitChars, tokenMergeChar), and `PairInput` (gapPx, showConnectors).
diff --git a/word-aligner-skill/references/api.md b/word-aligner-skill/references/api.md
@@ -50,6 +50,8 @@ Global visual overrides. All fields optional; unset fields use defaults.
 | `theme`             | `light` `dark`              | `light`   | UI theme (affects token chip color) |
 | `showNumbers`       | boolean                     | `false`   | Show line numbers next to each line |
 | `colorTokensByLink` | boolean                     | `true`    | Tint word tokens in the color of their connection |
+| `tokenSplitChars`   | string                      | `.-\|`    | Characters (besides whitespace) that split text into tokens. The split char is **not** rendered. Set to `-\|` to keep periods inside Leipzig gloss morphemes (`go.PST.IPFV` = one token) |
+| `tokenMergeChar`    | string (1 char)             | `+`       | Joins parts into one token while rendering as a space, e.g. `is+playing` → `is playing` (one word) |
 
 **Palette colors:**
 - `pastel` — soft pink, blue, green, yellow, purple, cyan (great for educational content)
@@ -135,44 +137,55 @@ Returns the same `{ "url": "..." }` response. Useful for opening the editor pre-
 }
 ```
 
-### Three lines — source + gloss + free translation
-Gloss is adjacent to source (lines 0–1), arcs hidden but colors shown. Translation is below.
-Dots in gloss are split chars: `"1SG.NOM PST.IPFV"` → tokens `1SG`[0] `NOM`[1] `PST`[2] `IPFV`[3].
+### Interlinear (Leipzig) gloss — gloss / source / free translation
+Gloss on top, source in the middle, free translation at the bottom. The gloss→source pair has
+its arcs hidden (`showConnectors: false`) and a tight 12 px gap; gloss tokens stay color-coded.
+The source→translation pair keeps its arcs.
+
+Set `tokenSplitChars` to `"-|"` (drop the dot) so Leipzig periods stay inside a morpheme:
+`"go.PST.IPFV"` is one token, not three.
 
 ```json
 {
   "lines": [
+    {"text": "1SG.NOM go.PST.IPFV", "sizePx": 22},
     {"text": "Я ходил", "sizePx": 40},
-    {"text": "1SG.NOM PST.IPFV", "sizePx": 22},
     {"text": "I have been going", "sizePx": 36}
   ],
   "alignments": [
-    [0,0,1,0], [0,0,1,1],
-    [0,1,1,2], [0,1,1,3]
+    [0,0,1,0], [0,1,1,1],
+    [1,0,2,0],
+    [1,1,2,1], [1,1,2,2], [1,1,2,3]
   ],
+  "settings": {"tokenSplitChars": "-|"},
   "pairs": [
-    {"upper": 0, "lower": 1, "gapPx": 12, "showConnectors": false},
-    {"upper": 1, "lower": 2, "gapPx": 80, "showConnectors": false}
+    {"upper": 0, "lower": 1, "gapPx": 12, "showConnectors": false}
   ]
 }
 ```
 
-### Four lines with per-line typography
+### Four tiers — gloss / IPA / source / translation
+A full Leipzig stack. The three top tiers (gloss, IPA, source) are tightly stacked with arcs
+hidden; only the source→translation pair draws arcs. `tokenSplitChars: "-|"` keeps the periods
+inside gloss morphemes.
+
 ```json
 {
   "lines": [
-    {"text": "Ich habe geschlafen", "font": "Noto Sans", "sizePx": 40},
-    {"text": "I have slept", "sizePx": 40},
-    {"text": "ich-hab-e geschlafen", "sizePx": 20},
-    {"text": "1SG-AUX-1SG slept.PTCP", "sizePx": 20}
+    {"text": "1SG eat.PRS.1SG INDF.F apple", "sizePx": 22},
+    {"text": "ʒə mɑ̃ʒ yn pɔm", "font": "Noto Sans", "sizePx": 26},
+    {"text": "Je mange une pomme", "sizePx": 40},
+    {"text": "I eat an apple", "sizePx": 30}
   ],
   "alignments": [
-    [0,0,1,0], [0,1,1,1], [0,2,1,2],
-    [2,0,3,0], [2,1,3,1]
+    [0,0,1,0], [0,1,1,1], [0,2,1,2], [0,3,1,3],
+    [1,0,2,0], [1,1,2,1], [1,2,2,2], [1,3,2,3],
+    [2,0,3,0], [2,1,3,1], [2,2,3,2], [2,3,3,3]
   ],
+  "settings": {"tokenSplitChars": "-|"},
   "pairs": [
-    {"upper": 1, "lower": 2, "gapPx": 40, "showConnectors": false},
-    {"upper": 2, "lower": 3, "gapPx": 60}
+    {"upper": 0, "lower": 1, "gapPx": 12, "showConnectors": false},
+    {"upper": 1, "lower": 2, "gapPx": 12, "showConnectors": false}
   ]
 }
 ```