Skip to content

Commit f86b762

Browse files
authored
Merge pull request #3352 from superdoc-dev/caio-pizzol/SD-3169-presentation-forms-classification
fix(direction): recognize Hebrew/Arabic presentation forms as strong-RTL (SD-3169)
2 parents df091a7 + 4b702f5 commit f86b762

4 files changed

Lines changed: 92 additions & 6 deletions

File tree

packages/layout-engine/painters/dom/src/features/inline-direction/run-direction.test.ts

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,31 @@ describe('regex coverage smoke tests', () => {
201201
expect(STRONG_RTL_CHAR_RE.test('2026')).toBe(false);
202202
});
203203

204+
// SD-3169: presentation forms used by legacy fonts must classify as strong-RTL
205+
// for mixed-bidi boundary detection to fire on them. Run-direction rendering
206+
// already fails safe for these (unknown text → 'rtl'), but the regex must
207+
// recognize them directly so the painter's helper stays consistent with the
208+
// mixed-bidi-backspace boundary detector.
209+
it('STRONG_RTL_CHAR_RE matches Hebrew/Arabic presentation forms', () => {
210+
// Hebrew Presentation Forms FB1D-FB4F
211+
expect(STRONG_RTL_CHAR_RE.test('\uFB21')).toBe(true); // Hebrew Letter Wide Alef
212+
expect(STRONG_RTL_CHAR_RE.test('\uFB4F')).toBe(true); // Hebrew Ligature Alef Lamed
213+
// Arabic Presentation Forms-A FB50-FDFF
214+
expect(STRONG_RTL_CHAR_RE.test('\uFB50')).toBe(true); // Arabic Letter Alef Wasla Isolated
215+
expect(STRONG_RTL_CHAR_RE.test('\uFDF2')).toBe(true); // Arabic Ligature Allah Isolated
216+
// Arabic Presentation Forms-B FE70-FEFF
217+
expect(STRONG_RTL_CHAR_RE.test('\uFE70')).toBe(true); // Arabic Fathatan Isolated
218+
expect(STRONG_RTL_CHAR_RE.test('\uFEFC')).toBe(true); // Arabic Ligature Lam With Alef Final
219+
});
220+
221+
it('STRONG_RTL_CHAR_RE excludes noncharacters and the BOM', () => {
222+
// FDD0-FDEF are Unicode noncharacters in the Arabic-A range.
223+
expect(STRONG_RTL_CHAR_RE.test('\uFDD0')).toBe(false);
224+
expect(STRONG_RTL_CHAR_RE.test('\uFDEF')).toBe(false);
225+
// FEFF is ZERO WIDTH NO-BREAK SPACE / BOM, not RTL.
226+
expect(STRONG_RTL_CHAR_RE.test('\uFEFF')).toBe(false);
227+
});
228+
204229
it('LATIN_DIGIT_NEUTRAL_ONLY_RE matches Latin + digit + neutral chars', () => {
205230
expect(LATIN_DIGIT_NEUTRAL_ONLY_RE.test('Hello world')).toBe(true);
206231
expect(LATIN_DIGIT_NEUTRAL_ONLY_RE.test('copy 2')).toBe(true);

packages/layout-engine/painters/dom/src/features/inline-direction/run-direction.ts

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,16 @@
2222
export const RTL_DATE_LIKE_TOKEN_RE = /^-?\d+(?:[./-]\d+)+$/;
2323

2424
/**
25-
* Matches strong-RTL characters in the Hebrew / Arabic / Syriac core blocks.
25+
* Matches strong-RTL characters across Hebrew, Arabic, and adjacent RTL scripts
26+
* including presentation forms (FB1D-FB4F Hebrew, FB50-FDFF Arabic-A,
27+
* FE70-FEFF Arabic-B). The \u0590-\u08FF block range covers Hebrew, Arabic, Syriac, NKo,
28+
* etc.; the Script properties add presentation forms without including
29+
* noncharacters (FDD0-FDEF) or the BOM (FEFF).
2630
*
27-
* Known gap: misses Hebrew presentation forms (FB1D-FB4F) and Arabic
28-
* presentation forms (FB50-FDFF, FE70-FEFF). Tracked under SD-2767 follow-ups.
31+
* AIDEV-NOTE: also duplicated in super-editor's mixed-bidi-backspace extension.
32+
* Consolidating crosses a layer boundary; tracked under SD-3169 follow-ups.
2933
*/
30-
export const STRONG_RTL_CHAR_RE = /[\u0590-\u08FF]/;
34+
export const STRONG_RTL_CHAR_RE = /[\u0590-\u08FF\p{Script=Hebrew}\p{Script=Arabic}]/u;
3135

3236
/**
3337
* Matches runs whose content is exclusively Latin / digit / neutral. Used as
@@ -67,7 +71,7 @@ export const normalizeRtlDateTokenForWordParity = (text: string): string => {
6771
* - rtl-tagged + contains strong-RTL chars -> 'rtl' (standard case)
6872
* - rtl-tagged + only Latin/digit/neutral -> null (per §17.3.2.30, unspecified;
6973
* Word does not visually reorder these, so omit dir to inherit paragraph)
70-
* - rtl-tagged + other (e.g. East Asian, presentation forms) -> 'rtl' (fail-safe)
74+
* - rtl-tagged + other (e.g. East Asian, symbols outside the neutral set) -> 'rtl' (fail-safe)
7175
* - NOT rtl-tagged + date-like numeric text -> 'ltr' (Word-parity: keeps date
7276
* LTR-classified within an RTL paragraph context so digits don't drift)
7377
* - NOT rtl-tagged + anything else -> null (let paragraph + UBA decide)

packages/super-editor/src/editors/v1/extensions/mixed-bidi-backspace/mixed-bidi-backspace.js

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,13 @@
11
// @ts-nocheck
22
import { Extension } from '@core/Extension.js';
33

4-
const STRONG_RTL_CHAR_RE = /[\u0590-\u08FF]/;
4+
// SD-3169: widen beyond Hebrew/Arabic core blocks to include Hebrew/Arabic
5+
// presentation forms (FB1D-FB4F, FB50-FDFF, FE70-FEFF) used by legacy fonts
6+
// and some authoring tools. The \p{Script=Hebrew} / \p{Script=Arabic} properties catch presentation
7+
// forms while excluding noncharacters (FDD0-FDEF) and the BOM (FEFF).
8+
// AIDEV-NOTE: also duplicated in painter-dom features/inline-direction/run-direction.ts.
9+
// Consolidating crosses a layer boundary; tracked under SD-3169 follow-ups.
10+
const STRONG_RTL_CHAR_RE = /[\u0590-\u08FF\p{Script=Hebrew}\p{Script=Arabic}]/u;
511
const STRONG_LTR_CHAR_RE = /[A-Za-z\u00C0-\u024F]/;
612

713
const isStrongRtl = (char) => STRONG_RTL_CHAR_RE.test(char);

packages/super-editor/src/editors/v1/extensions/mixed-bidi-backspace/mixed-bidi-backspace.test.js

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,4 +175,55 @@ describe('mixedBidiBackspace (chain command)', () => {
175175
expect(__TEST_ONLY__.hasMixedDirectionBoundary('A', 'B')).toBe(false);
176176
expect(__TEST_ONLY__.hasMixedDirectionBoundary('א', 'ש')).toBe(false);
177177
});
178+
179+
// SD-3169: Hebrew/Arabic presentation forms (legacy ligature codepoints used
180+
// by older fonts and some legacy systems) live outside the Hebrew/Arabic
181+
// core blocks. The Phase 6 STRONG_RTL_CHAR_RE = /[\u0590-\u08FF]/ missed
182+
// them, so a paragraph mixing presentation-form Hebrew/Arabic with Latin
183+
// would not have its boundary detected and mixed-bidi Backspace would not
184+
// fire. Pin via the helper and the end-to-end command path.
185+
describe('SD-3169 Hebrew/Arabic presentation forms', () => {
186+
it('hasMixedDirectionBoundary recognizes Hebrew presentation forms (FB1D-FB4F)', () => {
187+
// \uFB21 = Hebrew Letter Wide Alef. Boundary against Latin must register.
188+
expect(__TEST_ONLY__.hasMixedDirectionBoundary('\uFB21', 'A')).toBe(true);
189+
expect(__TEST_ONLY__.hasMixedDirectionBoundary('A', '\uFB21')).toBe(true);
190+
// \uFB4F = Hebrew Ligature Alef Lamed (last code point in the range).
191+
expect(__TEST_ONLY__.hasMixedDirectionBoundary('\uFB4F', 'B')).toBe(true);
192+
});
193+
194+
it('hasMixedDirectionBoundary recognizes Arabic Presentation Forms-A (FB50-FDFF)', () => {
195+
// \uFB50 = Arabic Letter Alef Wasla Isolated Form (first code point).
196+
expect(__TEST_ONLY__.hasMixedDirectionBoundary('\uFB50', 'A')).toBe(true);
197+
// \uFDF2 = Arabic Ligature Allah Isolated Form.
198+
expect(__TEST_ONLY__.hasMixedDirectionBoundary('\uFDF2', 'A')).toBe(true);
199+
});
200+
201+
it('hasMixedDirectionBoundary recognizes Arabic Presentation Forms-B (FE70-FEFF)', () => {
202+
// \uFE70 = Arabic Fathatan Isolated Form (first code point).
203+
expect(__TEST_ONLY__.hasMixedDirectionBoundary('\uFE70', 'A')).toBe(true);
204+
// \uFEFC = Arabic Ligature Lam With Alef Final Form (last letter form).
205+
expect(__TEST_ONLY__.hasMixedDirectionBoundary('\uFEFC', 'A')).toBe(true);
206+
});
207+
208+
it('hasMixedDirectionBoundary excludes noncharacters in the Arabic A range', () => {
209+
// FDD0-FDEF are Unicode noncharacters, not strong-RTL.
210+
// FEFF is ZERO WIDTH NO-BREAK SPACE (BOM), not strong-RTL.
211+
expect(__TEST_ONLY__.hasMixedDirectionBoundary('\uFDD0', 'A')).toBe(false);
212+
expect(__TEST_ONLY__.hasMixedDirectionBoundary('\uFEFF', 'A')).toBe(false);
213+
});
214+
215+
it('returns true and mutates tr on presentation-form-Hebrew + Latin boundary', () => {
216+
const { state, view, tr } = setupContext({
217+
text: '\uFB21A',
218+
charLefts: [10, 20],
219+
caretRect: makeRect(20, 10, 1, 12),
220+
selectionFrom: 11,
221+
pmBase: 10,
222+
});
223+
224+
const handled = mixedBidiBackspace()({ state, view, tr, dispatch: vi.fn() });
225+
expect(handled).toBe(true);
226+
expect(tr.delete).toHaveBeenCalledWith(10, 11);
227+
});
228+
});
178229
});

0 commit comments

Comments
 (0)