Skip to content

Commit f37353f

Browse files
dani-polaniclaude
andcommitted
feat: expose tokenSplitChars/tokenMergeChar; fix Leipzig gloss examples
The gloss examples rendered "1SG.NOM" as "1SGNOM" because the default tokenSplitChars (".-|") consumes the period as a boundary and does not draw it. Leipzig glosses pack features into one morpheme with periods (go.PST.IPFV = one token), so the period must be preserved. - Add settings.tokenSplitChars and settings.tokenMergeChar to the API - Rework gloss examples to the verified interface config: gloss on top, source middle, translation bottom; tokenSplitChars "-|"; gloss-source pair arcs hidden with 12px gap; source-translation arcs shown - Replace broken German 4-line example with a clean 4-tier French stack (gloss / IPA / source / translation) - Document the "split char is not rendered" gotcha on /api - Update OpenAPI schema, SKILL.md, references/api.md - 3 new tests for tokenization settings (62 total) Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent a688f82 commit f37353f

7 files changed

Lines changed: 153 additions & 37 deletions

File tree

bitext/src/lib/api/align.test.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,43 @@ describe('buildAlignUrl', () => {
172172
expect(state.settings.lineOpacity).toBe(0.7);
173173
});
174174

175+
it('keeps periods inside gloss tokens when tokenSplitChars omits the dot', () => {
176+
// With default ".-|", "1SG.NOM" would split into two tokens. With "-|" it stays one,
177+
// so word index 1 of the gloss line is "go.PST.IPFV" and the connection resolves.
178+
const result = buildAlignUrl(ORIGIN, {
179+
lines: ['1SG.NOM go.PST.IPFV', 'Я ходил'],
180+
alignments: [
181+
[0, 0, 1, 0],
182+
[0, 1, 1, 1]
183+
],
184+
settings: { tokenSplitChars: '-|' }
185+
});
186+
if (!('url' in result)) throw new Error('expected url');
187+
const state = decodeState(new URL(result.url).searchParams.get('data'));
188+
expect(state.settings.tokenSplitChars).toBe('-|');
189+
// Connection from gloss word 1 must point at the single "go.PST.IPFV" token (l0-1).
190+
const c = state.project.connections.find((conn) => conn.lowerTokenId === 'l1-1');
191+
expect(c?.upperTokenId).toBe('l0-1');
192+
});
193+
194+
it('resolves a gloss word index that only exists under the default split chars', () => {
195+
// Under default ".-|", "1SG.NOM PST.IPFV" has 4 tokens, so word 3 exists.
196+
// We do NOT pass tokenSplitChars here, proving the dot still splits by default.
197+
const result = buildAlignUrl(ORIGIN, {
198+
lines: ['Я ходил', '1SG.NOM PST.IPFV'],
199+
alignments: [[0, 1, 1, 3]]
200+
});
201+
if (!('url' in result)) throw new Error('expected url');
202+
const state = decodeState(new URL(result.url).searchParams.get('data'));
203+
expect(state.project.connections[0]!.lowerTokenId).toBe('l1-3'); // "IPFV"
204+
});
205+
206+
it('rejects invalid tokenMergeChar (more than one character)', () => {
207+
expect(
208+
parseAlignBody({ lines: ['a', 'b'], settings: { tokenMergeChar: '++' } })
209+
).toMatchObject({ err: expect.stringContaining('tokenMergeChar') });
210+
});
211+
175212
it('applies per-line options (font, sizePx, rtl)', () => {
176213
const result = buildAlignUrl(ORIGIN, {
177214
lines: [

bitext/src/lib/api/align.ts

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,18 @@ export interface SettingsInput {
4848
showNumbers?: boolean;
4949
/** Tint word tokens in the color of their connection. */
5050
colorTokensByLink?: boolean;
51+
/**
52+
* Characters (besides whitespace) that split text into separate word tokens.
53+
* Default is ".-|". For Leipzig glosses, set "-|" so periods stay inside a token
54+
* (e.g. "go.PST.IPFV" is one token instead of three). The split character itself is
55+
* not drawn, so any character listed here disappears from the rendered text.
56+
*/
57+
tokenSplitChars?: string;
58+
/**
59+
* Single character that joins parts into one alignment token while rendering as a space
60+
* (e.g. "is+playing" shows "is playing" but counts as one word). Default is "+".
61+
*/
62+
tokenMergeChar?: string;
5163
}
5264

5365
/** Per-adjacent-pair controls. `upper` and `lower` are 0-based line indices (lower = upper + 1). */
@@ -119,6 +131,10 @@ function parseSettingsInput(val: unknown): { ok: SettingsInput } | { err: string
119131
return { err: 'settings.showNumbers must be a boolean' };
120132
if (v.colorTokensByLink !== undefined && typeof v.colorTokensByLink !== 'boolean')
121133
return { err: 'settings.colorTokensByLink must be a boolean' };
134+
if (v.tokenSplitChars !== undefined && typeof v.tokenSplitChars !== 'string')
135+
return { err: 'settings.tokenSplitChars must be a string' };
136+
if (v.tokenMergeChar !== undefined && (typeof v.tokenMergeChar !== 'string' || v.tokenMergeChar.length > 1))
137+
return { err: 'settings.tokenMergeChar must be a single character' };
122138

123139
return {
124140
ok: {
@@ -129,7 +145,9 @@ function parseSettingsInput(val: unknown): { ok: SettingsInput } | { err: string
129145
lineThickness: typeof v.lineThickness === 'number' ? v.lineThickness : undefined,
130146
lineOpacity: typeof v.lineOpacity === 'number' ? v.lineOpacity : undefined,
131147
showNumbers: typeof v.showNumbers === 'boolean' ? v.showNumbers : undefined,
132-
colorTokensByLink: typeof v.colorTokensByLink === 'boolean' ? v.colorTokensByLink : undefined
148+
colorTokensByLink: typeof v.colorTokensByLink === 'boolean' ? v.colorTokensByLink : undefined,
149+
tokenSplitChars: typeof v.tokenSplitChars === 'string' ? v.tokenSplitChars : undefined,
150+
tokenMergeChar: typeof v.tokenMergeChar === 'string' ? v.tokenMergeChar : undefined
133151
}
134152
};
135153
}

bitext/src/routes/api/+page.svelte

Lines changed: 41 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,27 @@
352352
<td class={tdTypeClass}>true</td>
353353
<td class={tdDescClass}>Tint word tokens in the color of their connection.</td>
354354
</tr>
355+
<tr>
356+
<td class={tdClass}>tokenSplitChars</td>
357+
<td class={tdTypeClass}>string</td>
358+
<td class={tdTypeClass}><span class={codeClass}>.-|</span></td>
359+
<td class={tdDescClass}
360+
>Characters (besides whitespace) that split text into tokens. The split character is
361+
<strong>not</strong> rendered. Set to <span class={codeClass}>-|</span> to keep periods
362+
inside Leipzig gloss morphemes (e.g. <span class={codeClass}>go.PST.IPFV</span> stays one
363+
token).</td
364+
>
365+
</tr>
366+
<tr class="bg-gray-50/50 dark:bg-gray-800/20">
367+
<td class={tdClass}>tokenMergeChar</td>
368+
<td class={tdTypeClass}>string (1 char)</td>
369+
<td class={tdTypeClass}><span class={codeClass}>+</span></td>
370+
<td class={tdDescClass}
371+
>Joins parts into one token while rendering as a space, e.g. <span class={codeClass}
372+
>is+playing</span
373+
> → "is playing" (one word).</td
374+
>
375+
</tr>
355376
</tbody>
356377
</table>
357378
</div>
@@ -422,29 +443,31 @@
422443
</table>
423444
</div>
424445

425-
<h3 class={subheadingClass}>Example — 3 lines, gloss row with tighter gap and no connectors</h3>
446+
<h3 class={subheadingClass}>Example — interlinear (Leipzig) gloss</h3>
426447
<pre class="{preClass} mt-3">{`curl -X POST ${apiBase}/api/align \\
427448
-H "Content-Type: application/json" \\
428449
-d '{
429450
"lines": [
451+
{ "text": "1SG.NOM go.PST.IPFV", "sizePx": 22 },
430452
{ "text": "Я ходил", "sizePx": 40 },
431-
{ "text": "1SG.NOM PST.IPFV", "sizePx": 22 },
432453
{ "text": "I have been going", "sizePx": 36 }
433454
],
434455
"alignments": [
435-
[0, 0, 1, 0], [0, 0, 1, 1],
436-
[0, 1, 1, 2], [0, 1, 1, 3]
456+
[0, 0, 1, 0], [0, 1, 1, 1],
457+
[1, 0, 2, 0],
458+
[1, 1, 2, 1], [1, 1, 2, 2], [1, 1, 2, 3]
437459
],
460+
"settings": { "tokenSplitChars": "-|" },
438461
"pairs": [
439-
{ "upper": 0, "lower": 1, "gapPx": 12, "showConnectors": false },
440-
{ "upper": 1, "lower": 2, "gapPx": 80, "showConnectors": false }
462+
{ "upper": 0, "lower": 1, "gapPx": 12, "showConnectors": false }
441463
]
442464
}'`}</pre>
443465
<p class="mt-2 text-sm text-gray-500 dark:text-gray-400">
444-
Gloss is adjacent to source (lines 0–1) with a 12 px gap and hidden arcs — tokens are colored
445-
but no lines are drawn. Dots in the gloss text are split characters: <span class={codeClass}>"1SG.NOM"</span>
446-
becomes two tokens (word 0 = <em>1SG</em>, word 1 = <em>NOM</em>). The free translation sits
447-
below with a larger gap.
466+
The gloss tier sits directly above the source (lines 0–1) with a tight 12 px gap and hidden arcs
467+
— its tokens are color-matched to the source words but no lines are drawn. The source→translation
468+
pair keeps its arcs. Note <span class={codeClass}>"tokenSplitChars": "-|"</span>: this drops the
469+
period from the split set so a Leipzig morpheme like <span class={codeClass}>go.PST.IPFV</span>
470+
stays a single token instead of rendering as <span class={codeClass}>goPSTIPFV</span>.
448471
</p>
449472

450473
<!-- ── Word indices ────────────────────────────────────────── -->
@@ -458,6 +481,14 @@
458481
<span class={codeClass}>|</span> also create word boundaries. Punctuation is not split into separate
459482
tokens by default.
460483
</p>
484+
<p class="mt-3">
485+
The split character itself is <strong>not rendered</strong> — it is consumed as a boundary. So
486+
<span class={codeClass}>go.PST.IPFV</span> with the default split set displays as three tokens
487+
<span class={codeClass}>go</span> <span class={codeClass}>PST</span> <span class={codeClass}
488+
>IPFV</span
489+
> with the dots removed. If you need the period to stay (common in Leipzig glosses), override
490+
<span class={codeClass}>settings.tokenSplitChars</span> to <span class={codeClass}>"-|"</span>.
491+
</p>
461492
<p class="mt-3">Example — <em>"Bonjour le monde"</em>:</p>
462493
<div class={tableClass}>
463494
<table class="min-w-full divide-y divide-gray-200 text-sm dark:divide-gray-700">

bitext/src/routes/api/align/openapi.json/+server.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,17 @@ export const GET: RequestHandler = ({ url }) => {
205205
colorTokensByLink: {
206206
type: 'boolean',
207207
description: 'Tint word tokens in the color of their connection. Default: true.'
208+
},
209+
tokenSplitChars: {
210+
type: 'string',
211+
description:
212+
'Characters (besides whitespace) that split text into separate word tokens. Default: ".-|". For Leipzig glosses set "-|" so periods stay inside a token (e.g. "go.PST.IPFV" is one token). The split character itself is not rendered.'
213+
},
214+
tokenMergeChar: {
215+
type: 'string',
216+
maxLength: 1,
217+
description:
218+
'Single character that joins parts into one token while rendering as a space (e.g. "is+playing" → "is playing", one word). Default: "+".'
208219
}
209220
}
210221
},

word-aligner-skill.zip

619 Bytes
Binary file not shown.

word-aligner-skill/SKILL.md

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -67,30 +67,36 @@ If uncertain about tokenization, call `GET https://aligner.tinygods.dev/api/alig
6767
}
6868
```
6969

70-
**Interlinear gloss**: place the gloss line directly under the source (adjacent), connect source→gloss tokens, hide the arcs with `showConnectors: false`, use a small gap (12px). Put the free translation below the gloss with a larger gap and no connectors.
70+
**Interlinear (Leipzig) gloss** — three lines: gloss on top, source in the middle, free translation at the bottom.
7171

72-
`"1SG.NOM PST.IPFV"` — dots are default split chars, so this yields 4 tokens: `1SG`[0] `NOM`[1] `PST`[2] `IPFV`[3].
72+
Important: a Leipzig gloss uses periods to pack grammatical features into one morpheme (`go.PST.IPFV` = "go" + past + imperfective, **one** token). The default `tokenSplitChars` is `".-|"`, which would split on the period and hide it — rendering `goPSTIPFV`. To keep the periods, set `"tokenSplitChars": "-|"` (drop the dot).
73+
74+
Layout rules:
75+
- Gloss sits directly above the source: hide its connector arcs (`showConnectors: false`) and use a tight gap (`gapPx: 12`). The gloss tokens still get colors via their connections to the source.
76+
- The source→translation pair keeps its arcs (omit it from `pairs`).
77+
- Connect each gloss token to its source word, each source word to its translation word(s).
7378

7479
```json
7580
{
7681
"lines": [
82+
{"text": "1SG.NOM go.PST.IPFV", "sizePx": 22},
7783
{"text": "Я ходил", "sizePx": 40},
78-
{"text": "1SG.NOM PST.IPFV", "sizePx": 22},
7984
{"text": "I have been going", "sizePx": 36}
8085
],
8186
"alignments": [
82-
[0,0,1,0], [0,0,1,1],
83-
[0,1,1,2], [0,1,1,3]
87+
[0,0,1,0], [0,1,1,1],
88+
[1,0,2,0],
89+
[1,1,2,1], [1,1,2,2], [1,1,2,3]
8490
],
91+
"settings": {"tokenSplitChars": "-|"},
8592
"pairs": [
86-
{"upper": 0, "lower": 1, "gapPx": 12, "showConnectors": false},
87-
{"upper": 1, "lower": 2, "gapPx": 80, "showConnectors": false}
93+
{"upper": 0, "lower": 1, "gapPx": 12, "showConnectors": false}
8894
]
8995
}
9096
```
9197

92-
Gloss tokens inherit colors from their source-word group. Arcs are hidden on both pairs; only the color coding is visible.
98+
Line 0 (gloss) has 2 whitespace-separated tokens: `1SG.NOM`[0] and `go.PST.IPFV`[1]. "ходил" maps to "have been going" (one-to-many, shared color); the gloss above it is color-matched but arc-free.
9399

94100
## Full parameter reference
95101

96-
See [references/api.md](references/api.md) for the complete parameter tables: `LineInput`, `SettingsInput` (palette, lineStyle, lineThickness, lineOpacity, background, theme, showNumbers, colorTokensByLink), and `PairInput` (gapPx, showConnectors).
102+
See [references/api.md](references/api.md) for the complete parameter tables: `LineInput`, `SettingsInput` (palette, lineStyle, lineThickness, lineOpacity, background, theme, showNumbers, colorTokensByLink, tokenSplitChars, tokenMergeChar), and `PairInput` (gapPx, showConnectors).

word-aligner-skill/references/api.md

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ Global visual overrides. All fields optional; unset fields use defaults.
5050
| `theme` | `light` `dark` | `light` | UI theme (affects token chip color) |
5151
| `showNumbers` | boolean | `false` | Show line numbers next to each line |
5252
| `colorTokensByLink` | boolean | `true` | Tint word tokens in the color of their connection |
53+
| `tokenSplitChars` | string | `.-\|` | Characters (besides whitespace) that split text into tokens. The split char is **not** rendered. Set to `-\|` to keep periods inside Leipzig gloss morphemes (`go.PST.IPFV` = one token) |
54+
| `tokenMergeChar` | string (1 char) | `+` | Joins parts into one token while rendering as a space, e.g. `is+playing` → `is playing` (one word) |
5355

5456
**Palette colors:**
5557
- `pastel` — soft pink, blue, green, yellow, purple, cyan (great for educational content)
@@ -135,44 +137,55 @@ Returns the same `{ "url": "..." }` response. Useful for opening the editor pre-
135137
}
136138
```
137139

138-
### Three lines — source + gloss + free translation
139-
Gloss is adjacent to source (lines 0–1), arcs hidden but colors shown. Translation is below.
140-
Dots in gloss are split chars: `"1SG.NOM PST.IPFV"` → tokens `1SG`[0] `NOM`[1] `PST`[2] `IPFV`[3].
140+
### Interlinear (Leipzig) gloss — gloss / source / free translation
141+
Gloss on top, source in the middle, free translation at the bottom. The gloss→source pair has
142+
its arcs hidden (`showConnectors: false`) and a tight 12 px gap; gloss tokens stay color-coded.
143+
The source→translation pair keeps its arcs.
144+
145+
Set `tokenSplitChars` to `"-|"` (drop the dot) so Leipzig periods stay inside a morpheme:
146+
`"go.PST.IPFV"` is one token, not three.
141147

142148
```json
143149
{
144150
"lines": [
151+
{"text": "1SG.NOM go.PST.IPFV", "sizePx": 22},
145152
{"text": "Я ходил", "sizePx": 40},
146-
{"text": "1SG.NOM PST.IPFV", "sizePx": 22},
147153
{"text": "I have been going", "sizePx": 36}
148154
],
149155
"alignments": [
150-
[0,0,1,0], [0,0,1,1],
151-
[0,1,1,2], [0,1,1,3]
156+
[0,0,1,0], [0,1,1,1],
157+
[1,0,2,0],
158+
[1,1,2,1], [1,1,2,2], [1,1,2,3]
152159
],
160+
"settings": {"tokenSplitChars": "-|"},
153161
"pairs": [
154-
{"upper": 0, "lower": 1, "gapPx": 12, "showConnectors": false},
155-
{"upper": 1, "lower": 2, "gapPx": 80, "showConnectors": false}
162+
{"upper": 0, "lower": 1, "gapPx": 12, "showConnectors": false}
156163
]
157164
}
158165
```
159166

160-
### Four lines with per-line typography
167+
### Four tiers — gloss / IPA / source / translation
168+
A full Leipzig stack. The three top tiers (gloss, IPA, source) are tightly stacked with arcs
169+
hidden; only the source→translation pair draws arcs. `tokenSplitChars: "-|"` keeps the periods
170+
inside gloss morphemes.
171+
161172
```json
162173
{
163174
"lines": [
164-
{"text": "Ich habe geschlafen", "font": "Noto Sans", "sizePx": 40},
165-
{"text": "I have slept", "sizePx": 40},
166-
{"text": "ich-hab-e geschlafen", "sizePx": 20},
167-
{"text": "1SG-AUX-1SG slept.PTCP", "sizePx": 20}
175+
{"text": "1SG eat.PRS.1SG INDF.F apple", "sizePx": 22},
176+
{"text": "ʒə mɑ̃ʒ yn pɔm", "font": "Noto Sans", "sizePx": 26},
177+
{"text": "Je mange une pomme", "sizePx": 40},
178+
{"text": "I eat an apple", "sizePx": 30}
168179
],
169180
"alignments": [
170-
[0,0,1,0], [0,1,1,1], [0,2,1,2],
171-
[2,0,3,0], [2,1,3,1]
181+
[0,0,1,0], [0,1,1,1], [0,2,1,2], [0,3,1,3],
182+
[1,0,2,0], [1,1,2,1], [1,2,2,2], [1,3,2,3],
183+
[2,0,3,0], [2,1,3,1], [2,2,3,2], [2,3,3,3]
172184
],
185+
"settings": {"tokenSplitChars": "-|"},
173186
"pairs": [
174-
{"upper": 1, "lower": 2, "gapPx": 40, "showConnectors": false},
175-
{"upper": 2, "lower": 3, "gapPx": 60}
187+
{"upper": 0, "lower": 1, "gapPx": 12, "showConnectors": false},
188+
{"upper": 1, "lower": 2, "gapPx": 12, "showConnectors": false}
176189
]
177190
}
178191
```

0 commit comments

Comments
 (0)