Skip to content

Commit c85cfec

Browse files
tupizzharbournick
authored andcommitted
fix(super-converter): bibliography/index/TOA field-code import fidelity (SD-3066)
Real Word documents surfaced several import defects in the block field-code family (BIBLIOGRAPHY, INDEX, XE, TOA). This addresses them and unifies the shared shape behind one set of helpers. Fixes: - Multi-run instruction aggregation joined fragments with an injected separator space, corrupting instructions Word splits across runs (e.g. `XE " Building Standard "`). Join verbatim; literal spacing is preserved. - Table of Authorities was dropped on import: the v2 importer had no `sd:tableOfAuthorities` handler, so the node was silently discarded. Register tableOfAuthoritiesImporter alongside index/bibliography. - A content control wrapping a block field imported as an inline `structuredContent` node; inside a block-only documentPartObject this threw "Invalid content for node type documentPartObject" and the editor failed to mount. Classify an SDT whose content is a block field as structuredContentBlock. - Bibliography neither captured nor replayed instructionTokens (unlike index/toa), so split BIBLIOGRAPHY instructions did not round-trip. Add the attribute and wire it through preprocessor, encode and decode. Refactors (DRY/KISS): - Extract buildBlockFieldNode (shared by the bibliography/index/toa preprocessors) and wrapParagraphsAsComplexField (shared by their translator decoders), mirroring the existing inline-field helpers. - Centralize BLOCK_FIELD_XML_NAMES so the paragraph importer and SDT classifier agree on which sd:* nodes are block content. Adds RED→GREEN unit tests for each fix and the new shared helpers. Full super-converter suite passes (2990). Linear: SD-3066 (parent of SD-3005)
1 parent de5553c commit c85cfec

24 files changed

Lines changed: 420 additions & 163 deletions
Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { normalizeFieldContentToParagraphs } from './normalize-field-content.js';
1+
import { buildBlockFieldNode } from './build-block-field-node.js';
22

33
/**
44
* Processes a BIBLIOGRAPHY instruction and creates an `sd:bibliography` node.
@@ -7,17 +7,10 @@ import { normalizeFieldContentToParagraphs } from './normalize-field-content.js'
77
*
88
* @param {import('../../v2/types/index.js').OpenXmlNode[]} nodesToCombine The nodes to combine.
99
* @param {string} instrText The instruction text.
10+
* @param {import('../../v2/docxHelper').ParsedDocx} [_docx] The docx object (unused).
11+
* @param {Array<{type: string, text?: string}>} [instructionTokens] Raw instruction tokens.
1012
* @returns {import('../../v2/types/index.js').OpenXmlNode[]}
1113
*/
12-
export function preProcessBibliographyInstruction(nodesToCombine, instrText) {
13-
return [
14-
{
15-
name: 'sd:bibliography',
16-
type: 'element',
17-
attributes: {
18-
instruction: instrText,
19-
},
20-
elements: normalizeFieldContentToParagraphs(nodesToCombine),
21-
},
22-
];
14+
export function preProcessBibliographyInstruction(nodesToCombine, instrText, _docx, instructionTokens = null) {
15+
return buildBlockFieldNode('sd:bibliography', nodesToCombine, instrText, instructionTokens);
2316
}

packages/super-editor/src/editors/v1/core/super-converter/field-references/fld-preprocessors/bibliography-preprocessor.test.js

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,18 @@ describe('preProcessBibliographyInstruction', () => {
2929
// bibliography PM node declares `content: 'paragraph+'`, so emitting loose
3030
// runs as direct children crashes the schema. The preprocessor must group
3131
// adjacent inline nodes into a synthesized <w:p>.
32-
const r1 = { name: 'w:r', type: 'element', elements: [{ name: 'w:t', elements: [{ text: 'Smith, J. (2024). ' }] }] };
32+
const r1 = {
33+
name: 'w:r',
34+
type: 'element',
35+
elements: [{ name: 'w:t', elements: [{ text: 'Smith, J. (2024). ' }] }],
36+
};
3337
const r2 = { name: 'w:r', type: 'element', elements: [{ name: 'w:t', elements: [{ text: 'Document Formats.' }] }] };
3438

3539
const result = preProcessBibliographyInstruction([r1, r2], 'BIBLIOGRAPHY \\l 1033 ');
3640

3741
expect(result).toHaveLength(1);
3842
expect(result[0].name).toBe('sd:bibliography');
39-
expect(result[0].elements).toEqual([
40-
{ name: 'w:p', type: 'element', elements: [r1, r2] },
41-
]);
43+
expect(result[0].elements).toEqual([{ name: 'w:p', type: 'element', elements: [r1, r2] }]);
4244
});
4345

4446
it('preserves existing w:p children as-is (multi-paragraph field)', () => {
@@ -63,4 +65,24 @@ describe('preProcessBibliographyInstruction', () => {
6365
{ name: 'w:p', type: 'element', elements: [trailingRun] },
6466
]);
6567
});
68+
69+
it('preserves instructionTokens so split instructions round-trip (SD-3066)', () => {
70+
// Parity with index/toa: a BIBLIOGRAPHY instruction split across runs
71+
// (e.g. 'BIBLIOGRAPHY ' + '\\l 1033 ') must keep its raw fragments so the
72+
// exporter can rebuild the original runs instead of collapsing to one.
73+
const instructionTokens = [
74+
{ type: 'text', text: 'BIBLIOGRAPHY ' },
75+
{ type: 'text', text: '\\l 1033 ' },
76+
];
77+
78+
const result = preProcessBibliographyInstruction([], 'BIBLIOGRAPHY \\l 1033', null, instructionTokens);
79+
80+
expect(result[0].attributes.instructionTokens).toEqual(instructionTokens);
81+
});
82+
83+
it('omits instructionTokens when none are provided', () => {
84+
const result = preProcessBibliographyInstruction([], 'BIBLIOGRAPHY');
85+
86+
expect(result[0].attributes).not.toHaveProperty('instructionTokens');
87+
});
6688
});
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import { normalizeFieldContentToParagraphs } from './normalize-field-content.js';
2+
3+
/**
4+
* Build a block-level field node (`sd:bibliography`, `sd:index`,
5+
* `sd:tableOfAuthorities`) from the runs a complex field collected between its
6+
* `separate` and `end` fldChars.
7+
*
8+
* These three fields share one shape: an `sd:*` element carrying the raw
9+
* instruction (plus its token fragments, when the instruction was split across
10+
* runs) whose children are the field's generated paragraphs. The result is
11+
* normalized so loose inline runs are wrapped into paragraphs, satisfying the
12+
* `paragraph+` PM schema (see normalize-field-content / SD-3005).
13+
*
14+
* @param {string} xmlName The `sd:*` element name to emit.
15+
* @param {import('../../v2/types/index.js').OpenXmlNode[]} nodesToCombine The collected result nodes.
16+
* @param {string} instrText The field instruction text.
17+
* @param {Array<{type: string, text?: string}> | null} [instructionTokens] Raw instruction-run fragments.
18+
* @returns {import('../../v2/types/index.js').OpenXmlNode[]}
19+
*/
20+
export function buildBlockFieldNode(xmlName, nodesToCombine, instrText, instructionTokens = null) {
21+
return [
22+
{
23+
name: xmlName,
24+
type: 'element',
25+
attributes: {
26+
instruction: instrText,
27+
...(instructionTokens ? { instructionTokens } : {}),
28+
},
29+
elements: normalizeFieldContentToParagraphs(nodesToCombine),
30+
},
31+
];
32+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import { describe, expect, it } from 'vitest';
2+
import { buildBlockFieldNode } from './build-block-field-node.js';
3+
4+
describe('buildBlockFieldNode', () => {
5+
it('emits the given sd:* element with the instruction and normalized paragraphs', () => {
6+
const run = { name: 'w:r', type: 'element', elements: [] };
7+
8+
const result = buildBlockFieldNode('sd:index', [run], 'INDEX \\c 2');
9+
10+
expect(result).toEqual([
11+
{
12+
name: 'sd:index',
13+
type: 'element',
14+
attributes: { instruction: 'INDEX \\c 2' },
15+
elements: [{ name: 'w:p', type: 'element', elements: [run] }],
16+
},
17+
]);
18+
});
19+
20+
it('includes instructionTokens only when provided', () => {
21+
const tokens = [{ type: 'text', text: 'INDEX ' }, { type: 'tab' }];
22+
23+
const withTokens = buildBlockFieldNode('sd:tableOfAuthorities', [], 'INDEX', tokens);
24+
expect(withTokens[0].attributes.instructionTokens).toEqual(tokens);
25+
26+
const withoutTokens = buildBlockFieldNode('sd:tableOfAuthorities', [], 'INDEX');
27+
expect(withoutTokens[0].attributes).not.toHaveProperty('instructionTokens');
28+
});
29+
30+
it('synthesizes an empty paragraph when there is no content', () => {
31+
const result = buildBlockFieldNode('sd:bibliography', [], 'BIBLIOGRAPHY');
32+
33+
expect(result[0].elements).toEqual([{ name: 'w:p', type: 'element', elements: [] }]);
34+
});
35+
});
Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { normalizeFieldContentToParagraphs } from './normalize-field-content.js';
1+
import { buildBlockFieldNode } from './build-block-field-node.js';
22

33
/**
44
* Processes an INDEX instruction and creates an `sd:index` node.
@@ -9,15 +9,5 @@ import { normalizeFieldContentToParagraphs } from './normalize-field-content.js'
99
* @returns {import('../../v2/types/index.js').OpenXmlNode[]}
1010
*/
1111
export function preProcessIndexInstruction(nodesToCombine, instrText, _docx, instructionTokens = null) {
12-
return [
13-
{
14-
name: 'sd:index',
15-
type: 'element',
16-
attributes: {
17-
instruction: instrText,
18-
...(instructionTokens ? { instructionTokens } : {}),
19-
},
20-
elements: normalizeFieldContentToParagraphs(nodesToCombine),
21-
},
22-
];
12+
return buildBlockFieldNode('sd:index', nodesToCombine, instrText, instructionTokens);
2313
}
Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { normalizeFieldContentToParagraphs } from './normalize-field-content.js';
1+
import { buildBlockFieldNode } from './build-block-field-node.js';
22

33
/**
44
* Processes a TOA (Table of Authorities) instruction and creates an `sd:tableOfAuthorities` node.
@@ -12,15 +12,5 @@ import { normalizeFieldContentToParagraphs } from './normalize-field-content.js'
1212
* @returns {import('../../v2/types/index.js').OpenXmlNode[]}
1313
*/
1414
export function preProcessToaInstruction(nodesToCombine, instrText, _docx, instructionTokens = null) {
15-
return [
16-
{
17-
name: 'sd:tableOfAuthorities',
18-
type: 'element',
19-
attributes: {
20-
instruction: instrText,
21-
...(instructionTokens ? { instructionTokens } : {}),
22-
},
23-
elements: normalizeFieldContentToParagraphs(nodesToCombine),
24-
},
25-
];
15+
return buildBlockFieldNode('sd:tableOfAuthorities', nodesToCombine, instrText, instructionTokens);
2616
}

packages/super-editor/src/editors/v1/core/super-converter/field-references/preProcessNodesForFldChar.js

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,13 @@ export const preProcessNodesForFldChar = (nodes = [], docx) => {
178178
currentField.instructionTokens.push(...instructionTokens);
179179
const instrTextValue = instrTextEl?.elements?.[0]?.text;
180180
if (instrTextValue != null) {
181-
currentField.instrText += `${instrTextValue} `;
181+
// SD-3066: join instrText fragments verbatim. Word preserves the
182+
// literal spaces inside each run, so an instruction split across
183+
// runs (e.g. ' XE "' + 'Building Standard' + '" ') already carries
184+
// its own separators. Injecting a space per fragment corrupted the
185+
// entry text to 'XE " Building Standard "'. The leading/trailing
186+
// whitespace is trimmed by finalizeField via `instrText.trim()`.
187+
currentField.instrText += `${instrTextValue}`;
182188
}
183189
if (instructionTokens.some((token) => token.type === 'tab')) {
184190
currentField.instrText += '\t';

packages/super-editor/src/editors/v1/core/super-converter/field-references/preProcessNodesForFldChar.test.js

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -683,7 +683,11 @@ describe('preProcessNodesForFldChar', () => {
683683
{
684684
nodes: [{ name: 'w:r', elements: [{ name: 'w:t', elements: [{ type: 'text', text: 'link text' }] }] }],
685685
fieldInfo: {
686-
instrText: 'HYPERLINK "http://example.com" ',
686+
// SD-3066: verbatim concatenation of the two instrText runs
687+
// ('HYPERLINK "http://example.com"' + ' ') is a single trailing
688+
// space. The previous expectation of three spaces reflected the
689+
// old per-fragment injected separator, not the literal source text.
690+
instrText: 'HYPERLINK "http://example.com" ',
687691
instructionTokens: [
688692
{ type: 'text', text: 'HYPERLINK "http://example.com"' },
689693
{ type: 'text', text: ' ' },
@@ -746,6 +750,63 @@ describe('preProcessNodesForFldChar', () => {
746750
expect(processedNodes[0].attributes.instruction).toBe('XE "Term"');
747751
});
748752

753+
it('processes fldSimple INDEX fields, wrapping loose result runs in a paragraph (SD-3066)', () => {
754+
// The ticket flags w:fldSimple as a primary INDEX signal. A fldSimple INDEX
755+
// carries its generated entries as loose runs; the index PM node requires
756+
// `paragraph+`, so the preprocessor must wrap them (normalizeFieldContentToParagraphs,
757+
// the SD-3005 fix). This guards both the fldSimple dispatch and that wrapping.
758+
const nodes = [
759+
{
760+
name: 'w:fldSimple',
761+
attributes: { 'w:instr': 'INDEX \\c 2' },
762+
elements: [
763+
{ name: 'w:r', elements: [{ name: 'w:t', elements: [{ type: 'text', text: 'apple, 3' }] }] },
764+
{ name: 'w:r', elements: [{ name: 'w:t', elements: [{ type: 'text', text: 'banana, 5' }] }] },
765+
],
766+
},
767+
];
768+
769+
const { processedNodes } = preProcessNodesForFldChar(nodes, mockDocx);
770+
expect(processedNodes).toHaveLength(1);
771+
expect(processedNodes[0].name).toBe('sd:index');
772+
expect(processedNodes[0].attributes.instruction).toBe('INDEX \\c 2');
773+
// Loose runs wrapped into a single paragraph so the PM `paragraph+` schema holds.
774+
expect(processedNodes[0].elements).toHaveLength(1);
775+
expect(processedNodes[0].elements[0].name).toBe('w:p');
776+
expect(processedNodes[0].elements[0].elements).toHaveLength(2);
777+
});
778+
779+
it('joins instruction text split across multiple instrText runs verbatim (SD-3066)', () => {
780+
// Word commonly splits an XE instruction across runs, with the literal
781+
// spaces preserved inside each run: ' XE "' + 'Building Standard' + '" '.
782+
// The aggregated instruction must reconstruct the literal string, not
783+
// inject a separator space per fragment (which produced
784+
// 'XE " Building Standard "' with spurious internal spaces).
785+
const nodes = [
786+
{ name: 'w:r', elements: [{ name: 'w:fldChar', attributes: { 'w:fldCharType': 'begin' } }] },
787+
{
788+
name: 'w:r',
789+
elements: [
790+
{ name: 'w:instrText', attributes: { 'xml:space': 'preserve' }, elements: [{ type: 'text', text: ' XE "' }] },
791+
],
792+
},
793+
{ name: 'w:r', elements: [{ name: 'w:instrText', elements: [{ type: 'text', text: 'Building Standard' }] }] },
794+
{
795+
name: 'w:r',
796+
elements: [
797+
{ name: 'w:instrText', attributes: { 'xml:space': 'preserve' }, elements: [{ type: 'text', text: '" ' }] },
798+
],
799+
},
800+
{ name: 'w:r', elements: [{ name: 'w:fldChar', attributes: { 'w:fldCharType': 'separate' } }] },
801+
{ name: 'w:r', elements: [{ name: 'w:fldChar', attributes: { 'w:fldCharType': 'end' } }] },
802+
];
803+
804+
const { processedNodes } = preProcessNodesForFldChar(nodes, mockDocx);
805+
expect(processedNodes).toHaveLength(1);
806+
expect(processedNodes[0].name).toBe('sd:indexEntry');
807+
expect(processedNodes[0].attributes.instruction).toBe('XE "Building Standard"');
808+
});
809+
749810
it('passes field-sequence rPr into body NUMWORDS fields when cached-result runs have no styling', () => {
750811
const nodes = [
751812
{

packages/super-editor/src/editors/v1/core/super-converter/v2/importer/docxImporter.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import { tableNodeHandlerEntity } from './tableImporter.js';
3333
import { tableOfContentsHandlerEntity } from './tableOfContentsImporter.js';
3434
import { indexHandlerEntity, indexEntryHandlerEntity } from './indexImporter.js';
3535
import { bibliographyHandlerEntity } from './bibliographyImporter.js';
36+
import { tableOfAuthoritiesHandlerEntity } from './tableOfAuthoritiesImporter.js';
3637
import { preProcessNodesForFldChar } from '../../field-references';
3738
import { preProcessPageFieldsOnly } from '../../field-references/preProcessPageFieldsOnly.js';
3839
import { ensureNumberingCache } from './numberingCache.js';
@@ -344,6 +345,7 @@ export const defaultNodeListHandler = () => {
344345
tableOfContentsHandlerEntity,
345346
indexHandlerEntity,
346347
bibliographyHandlerEntity,
348+
tableOfAuthoritiesHandlerEntity,
347349
indexEntryHandlerEntity,
348350
autoPageHandlerEntity,
349351
autoTotalPageCountEntity,

packages/super-editor/src/editors/v1/core/super-converter/v2/importer/paragraphNodeImporter.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
// @ts-check
22
import { translator as wPNodeTranslator } from '../../v3/handlers/w/p/index.js';
3+
import { BLOCK_FIELD_XML_NAMES } from '../../v3/handlers/sd/shared/block-field-xml-names.js';
34

45
const PARAGRAPH_PROPERTIES_XML_NAME = 'w:pPr';
5-
const BLOCK_FIELD_XML_NAMES = new Set(['sd:tableOfContents', 'sd:index', 'sd:bibliography', 'sd:tableOfAuthorities']);
66

77
const hasMeaningfulParagraphContent = (elements = []) =>
88
elements.some((element) => element?.name && element.name !== PARAGRAPH_PROPERTIES_XML_NAME);

0 commit comments

Comments
 (0)