Skip to content

Commit 6ad79d9

Browse files
committed
refactor(painter-dom): move run direction heuristic into features/inline-direction
The shouldAssignPerRunRtlDir / normalizeRtlDateTokenForWordParity helpers and their regexes (RTL_DATE_LIKE_TOKEN_RE, STRONG_RTL_CHAR_RE, LATIN_DIGIT_NEUTRAL_ONLY_RE) lived at the bottom of renderer.ts. After #3307 moved the rtl-paragraph feature folder to inline-direction with an explicit axis scope, the renderer was the wrong home: these helpers are paint-time decisions about how to project w:rPr/w:rtl onto a rendered span's dir attribute, which is exactly what features/inline-direction owns.

Extract into features/inline-direction/run-direction.ts:

- Combine the two-step decision (set dir=rtl? else set dir=ltr for date-like?) into a single resolveRunDirectionAttribute helper that returns 'rtl' | 'ltr' | null.
- Expose normalizeRtlDateTokenForWordParity alongside since it shares RTL_DATE_LIKE_TOKEN_RE.
- Inline the decision table as JSDoc, explicitly scoping the heuristic to current SD-3098 fixtures and pointing at the spec sections plus the known follow-up gaps (w:dir, w:bdo, w:lang/@bidi numeric, presentation forms).

Renderer collapses the per-span direction logic to one helper call. 22 new unit tests in run-direction.test.ts cover both branches of the rtl-tagged decision table, the date-like ltr fallback for non-tagged runs, and the regex coverage smoke tests.
1 parent f87cd08 commit 6ad79d9

4 files changed

Lines changed: 332 additions & 44 deletions

File tree

packages/layout-engine/painters/dom/src/features/inline-direction/index.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,11 @@
2020
*/
2121

2222
// RTL style helpers.
export { applyRtlStyles, shouldUseSegmentPositioning } from './rtl-styles.js';

// Run-level `dir` attribute resolution, the date-token normalizer, and the
// regexes they share.
export {
  LATIN_DIGIT_NEUTRAL_ONLY_RE,
  RTL_DATE_LIKE_TOKEN_RE,
  STRONG_RTL_CHAR_RE,
  normalizeRtlDateTokenForWordParity,
  resolveRunDirectionAttribute,
} from './run-direction.js';
export type { RunDirAttribute } from './run-direction.js';
Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
import { describe, expect, it } from 'vitest';
2+
import {
3+
resolveRunDirectionAttribute,
4+
normalizeRtlDateTokenForWordParity,
5+
RTL_DATE_LIKE_TOKEN_RE,
6+
STRONG_RTL_CHAR_RE,
7+
LATIN_DIGIT_NEUTRAL_ONLY_RE,
8+
} from './run-direction.js';
9+
10+
describe('resolveRunDirectionAttribute', () => {
  type Expected = ReturnType<typeof resolveRunDirectionAttribute>;

  /** One row of the decision table where `runText` and `effectiveText` are the same string. */
  interface DirectionCase {
    name: string;
    text: string;
    expected: Expected;
  }

  /** Registers one `it` per row, passing `text` as both `runText` and `effectiveText`. */
  const registerCases = (isRtlTagged: boolean, cases: readonly DirectionCase[]): void => {
    for (const { name, text, expected } of cases) {
      it(name, () => {
        expect(
          resolveRunDirectionAttribute({
            runText: text,
            effectiveText: text,
            isRtlTagged,
          }),
        ).toBe(expected);
      });
    }
  };

  describe('rtl-tagged runs', () => {
    registerCases(true, [
      { name: 'returns "rtl" for Hebrew text', text: 'שלום', expected: 'rtl' },
      { name: 'returns "rtl" for Arabic text', text: 'مرحبا', expected: 'rtl' },
      {
        name: 'returns null for Latin-only text (Word-parity: §17.3.2.30 unspecified)',
        text: 'Hello',
        expected: null,
      },
      { name: 'returns null for digit-only text', text: '2026', expected: null },
      {
        name: 'returns "rtl" for date-like numeric (isolates the date as RTL unit)',
        text: '2026-03-15',
        expected: 'rtl',
      },
      {
        name: 'returns "rtl" for mixed strong-RTL + Latin (Hebrew present)',
        text: 'first שלום',
        expected: 'rtl',
      },
      {
        name: 'returns "rtl" for empty text (honor source signal when no content)',
        text: '',
        expected: 'rtl',
      },
      { name: 'returns "rtl" for whitespace-only text', text: ' ', expected: 'rtl' },
      // Fail-safe: anything that doesn't match the Latin/digit/neutral set OR the
      // strong-RTL set still honors the source signal. East Asian, presentation
      // forms, symbols outside the neutral set all fall into this branch.
      {
        name: 'returns "rtl" for text that is neither Latin nor strong-RTL',
        text: '世界',
        expected: 'rtl',
      },
    ]);

    it('uses effectiveText when runText is undefined', () => {
      expect(
        resolveRunDirectionAttribute({
          runText: undefined,
          effectiveText: 'שלום',
          isRtlTagged: true,
        }),
      ).toBe('rtl');

      // The Hebrew input alone cannot prove the fallback: an empty sample also
      // resolves to 'rtl'. Latin-only effectiveText must resolve to null, which
      // is only possible if effectiveText was actually classified.
      expect(
        resolveRunDirectionAttribute({
          runText: undefined,
          effectiveText: 'Hello',
          isRtlTagged: true,
        }),
      ).toBe(null);
    });
  });

  describe('non-rtl-tagged runs', () => {
    registerCases(false, [
      {
        name: 'returns "ltr" for date-like numeric (Word-parity in RTL paragraph)',
        text: '2026-03-15',
        expected: 'ltr',
      },
      {
        name: 'returns null for plain Latin (let paragraph + UBA decide)',
        text: 'Hello',
        expected: null,
      },
      {
        name: 'returns null for Hebrew text without w:rtl (paragraph context resolves)',
        text: 'שלום',
        expected: null,
      },
    ]);

    // Unlike the rtl-tagged branch, the non-tagged branch looks at runText only:
    // a date-like effectiveText is not consulted when runText is undefined.
    it('returns null when runText is undefined (no date pattern to match)', () => {
      expect(
        resolveRunDirectionAttribute({
          runText: undefined,
          effectiveText: '2026-03-15',
          isRtlTagged: false,
        }),
      ).toBe(null);
    });
  });
});
158+
159+
describe('normalizeRtlDateTokenForWordParity', () => {
160+
const RLM = '\u200F';
161+
162+
it('wraps separators with RLM in date-like text', () => {
163+
expect(normalizeRtlDateTokenForWordParity('2026-03-15')).toBe(`2026${RLM}-${RLM}03${RLM}-${RLM}15`);
164+
});
165+
166+
it('handles slash separators', () => {
167+
expect(normalizeRtlDateTokenForWordParity('15/03/2026')).toBe(`15${RLM}/${RLM}03${RLM}/${RLM}2026`);
168+
});
169+
170+
it('handles dot separators', () => {
171+
expect(normalizeRtlDateTokenForWordParity('1.2.3')).toBe(`1${RLM}.${RLM}2${RLM}.${RLM}3`);
172+
});
173+
174+
it('wraps the leading sign too (no special-case for leading "-")', () => {
175+
// Implementation is text.replace(/[./-]/g, ...). The leading sign is also
176+
// a `-`, so it gets RLM-wrapped. This matches the pre-extraction behavior.
177+
expect(normalizeRtlDateTokenForWordParity('-2026-03')).toBe(`${RLM}-${RLM}2026${RLM}-${RLM}03`);
178+
});
179+
180+
it('returns unchanged for non-date text', () => {
181+
expect(normalizeRtlDateTokenForWordParity('Hello world')).toBe('Hello world');
182+
expect(normalizeRtlDateTokenForWordParity('2026')).toBe('2026'); // no separator
183+
expect(normalizeRtlDateTokenForWordParity('שלום')).toBe('שלום');
184+
});
185+
});
186+
187+
describe('regex coverage smoke tests', () => {
188+
it('RTL_DATE_LIKE_TOKEN_RE matches numeric dates', () => {
189+
expect(RTL_DATE_LIKE_TOKEN_RE.test('2026-03-15')).toBe(true);
190+
expect(RTL_DATE_LIKE_TOKEN_RE.test('15/03/2026')).toBe(true);
191+
expect(RTL_DATE_LIKE_TOKEN_RE.test('1.2.3')).toBe(true);
192+
expect(RTL_DATE_LIKE_TOKEN_RE.test('-2026-03')).toBe(true);
193+
expect(RTL_DATE_LIKE_TOKEN_RE.test('2026')).toBe(false); // no separator
194+
expect(RTL_DATE_LIKE_TOKEN_RE.test('a-b-c')).toBe(false);
195+
});
196+
197+
it('STRONG_RTL_CHAR_RE matches Hebrew and Arabic core blocks', () => {
198+
expect(STRONG_RTL_CHAR_RE.test('שלום')).toBe(true);
199+
expect(STRONG_RTL_CHAR_RE.test('مرحبا')).toBe(true);
200+
expect(STRONG_RTL_CHAR_RE.test('Hello')).toBe(false);
201+
expect(STRONG_RTL_CHAR_RE.test('2026')).toBe(false);
202+
});
203+
204+
it('LATIN_DIGIT_NEUTRAL_ONLY_RE matches Latin + digit + neutral chars', () => {
205+
expect(LATIN_DIGIT_NEUTRAL_ONLY_RE.test('Hello world')).toBe(true);
206+
expect(LATIN_DIGIT_NEUTRAL_ONLY_RE.test('copy 2')).toBe(true);
207+
expect(LATIN_DIGIT_NEUTRAL_ONLY_RE.test('a/b-c.d')).toBe(true);
208+
expect(LATIN_DIGIT_NEUTRAL_ONLY_RE.test('שלום')).toBe(false);
209+
expect(LATIN_DIGIT_NEUTRAL_ONLY_RE.test('Hello שלום')).toBe(false);
210+
});
211+
});
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/**
2+
* Run-level direction helpers for DomPainter.
3+
*
4+
* These helpers encode paint-time decisions about how to project the OOXML
5+
* `w:rPr/w:rtl` signal onto a rendered span's `dir` attribute, plus a narrow
6+
* Word-parity workaround for RTL-tagged date-like numeric runs.
7+
*
8+
* The heuristic is intentionally scoped to current Word-parity fixtures
9+
* (SD-3098 mixed-bidi date tokens). It is NOT a full implementation of
10+
* §17.3.2.30 semantics - notably absent: `w:dir` embedding (§17.3.2.8),
11+
* `w:bdo` override (§17.3.2.3), and `w:lang/@bidi` Hebrew vs Arabic numeric
12+
* differences. Those gaps are tracked separately; see SD-2767 follow-ups.
13+
*
14+
* @spec ECMA-376 §17.3.2.30 (rtl), §17.17.4 (boolean property)
15+
*/
16+
17+
/**
 * Matches numeric date-like tokens such as `2026-03-15`, `15/03/2026`, `1.2.3`:
 * an optional leading `-`, then two or more ASCII digit groups joined by `.`,
 * `/` or `-`. The pattern is anchored at both ends, so surrounding whitespace
 * or any other character prevents a match.
 *
 * Used by both the run direction resolver and the paint-time RLM injection
 * for Word parity on RTL date strings.
 */
export const RTL_DATE_LIKE_TOKEN_RE = /^-?\d+(?:[./-]\d+)+$/;

/**
 * Matches any character in the range U+0590-U+08FF. That range spans the
 * Hebrew, Arabic and Syriac blocks and also Thaana, NKo, Samaritan, Mandaic
 * and the Arabic Extended blocks in between.
 *
 * This is a code-point range test, not a Bidi_Class test: non-strong code
 * points inside the range (for example Arabic-Indic digits U+0660-U+0669)
 * match as well.
 *
 * Known gap: misses Hebrew presentation forms (FB1D-FB4F) and Arabic
 * presentation forms (FB50-FDFF, FE70-FEFF). Tracked under SD-2767 follow-ups.
 */
export const STRONG_RTL_CHAR_RE = /[\u0590-\u08FF]/;

/**
 * Matches non-empty text made up exclusively of whitespace, ASCII digits,
 * ASCII letters and the punctuation `. / - _ : , + ( )`. Used as the
 * "skip per-run dir=rtl" guard: per §17.3.2.30, behavior of w:rtl on
 * strongly LTR text is unspecified, and Word's empirical output for these
 * runs does not visually reorder.
 */
export const LATIN_DIGIT_NEUTRAL_ONLY_RE = /^[\s0-9A-Za-z./\-_:,+()]+$/;

/** U+200F RIGHT-TO-LEFT MARK. */
const RLM = '\u200F';

/** The separator characters accepted by {@link RTL_DATE_LIKE_TOKEN_RE}, matched globally. */
const DATE_SEPARATOR_RE = /[./-]/g;

/**
 * Word-parity workaround for RTL date-like tokens.
 *
 * Word internally injects RLM around numeric separators in RTL date strings,
 * preserving LTR order for the digits while keeping the run RTL. The browser's
 * UBA alone does not match this. We mirror Word by injecting RLM at paint
 * time only - the DOM text differs from the PM model and from the exported
 * OOXML, which both keep the original separators.
 *
 * Intentionally narrow: only matches numeric date-like patterns so other
 * numeric content is unaffected. Scope is current SD-3098 fixtures.
 *
 * @param text - Run text to normalize.
 * @returns `text` unchanged unless the whole string matches
 *   {@link RTL_DATE_LIKE_TOKEN_RE}; otherwise `text` with every `.`, `/` and
 *   `-` wrapped in RLM on both sides. A leading `-` sign is wrapped too.
 */
export const normalizeRtlDateTokenForWordParity = (text: string): string => {
  if (!RTL_DATE_LIKE_TOKEN_RE.test(text)) {
    return text;
  }
  return text.replace(DATE_SEPARATOR_RE, (separator) => `${RLM}${separator}${RLM}`);
};

/**
 * Value for a rendered run span's `dir` attribute, or `null` when no `dir`
 * attribute should be set on the span.
 */
export type RunDirAttribute = 'rtl' | 'ltr' | null;

/**
 * Compute the `dir` attribute (if any) to apply to a rendered run span.
 *
 * Decision table:
 * - rtl-tagged + empty text -> 'rtl' (no content to classify, honor source signal)
 * - rtl-tagged + date-like numeric -> 'rtl' (isolates the date as a unit)
 * - rtl-tagged + contains strong-RTL chars -> 'rtl' (standard case)
 * - rtl-tagged + only Latin/digit/neutral -> null (per §17.3.2.30, unspecified;
 *   Word does not visually reorder these, so omit dir to inherit paragraph)
 * - rtl-tagged + other (e.g. East Asian, presentation forms) -> 'rtl' (fail-safe)
 * - NOT rtl-tagged + date-like numeric text -> 'ltr' (Word-parity: keeps date
 *   LTR-classified within an RTL paragraph context so digits don't drift)
 * - NOT rtl-tagged + anything else -> null (let paragraph + UBA decide)
 *
 * The two branches classify different text. The rtl-tagged branch uses
 * `runText`, falling back to `effectiveText` when `runText` is undefined, and
 * trims it first. The non-tagged branch tests `runText` as-is and never
 * consults `effectiveText`.
 *
 * @param opts - The run's text and whether it carries the RTL signal.
 * @returns `'rtl'`, `'ltr'`, or `null` to leave `dir` unset.
 */
export const resolveRunDirectionAttribute = (opts: {
  /** Original run text from the model. */
  runText: string | undefined;
  /** Post-token-resolution text used for rendering (e.g. field token expansion). */
  effectiveText: string;
  /** True when the source OOXML carries `w:rPr/w:rtl`. */
  isRtlTagged: boolean;
}): RunDirAttribute => {
  if (opts.isRtlTagged) {
    const sample = (opts.runText ?? opts.effectiveText).trim();
    // Empty or whitespace-only: nothing to classify, so honor the source signal.
    if (!sample) return 'rtl';
    // Date-like must be checked before the Latin/digit guard, which it would also satisfy.
    if (RTL_DATE_LIKE_TOKEN_RE.test(sample)) return 'rtl';
    if (STRONG_RTL_CHAR_RE.test(sample)) return 'rtl';
    if (LATIN_DIGIT_NEUTRAL_ONLY_RE.test(sample)) return null;
    // Fail-safe for everything else (East Asian, presentation forms, other symbols).
    return 'rtl';
  }

  // NOTE(review): no trim and no effectiveText fallback here, unlike the tagged
  // branch. The unit tests pin the "runText undefined -> null" case; confirm the
  // untrimmed match is intended as well.
  if (typeof opts.runText === 'string' && RTL_DATE_LIKE_TOKEN_RE.test(opts.runText)) {
    return 'ltr';
  }

  return null;
};

0 commit comments

Comments
 (0)