Skip to content

Commit 7f7ff93

Browse files
committed
fix: extract duplicate block identity normalization from docxImporter
1 parent 6c9c7a3 commit 7f7ff93

3 files changed

Lines changed: 202 additions & 0 deletions

File tree

packages/super-editor/src/core/super-converter/v2/importer/docxImporter.js

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import { ensureNumberingCache } from './numberingCache.js';
3232
import { commentRangeStartHandlerEntity, commentRangeEndHandlerEntity } from './commentRangeImporter.js';
3333
import { permStartHandlerEntity } from './permStartImporter.js';
3434
import { permEndHandlerEntity } from './permEndImporter.js';
35+
import { normalizeDuplicateBlockIdentitiesInContent } from './normalizeDuplicateBlockIdentitiesInContent.js';
3536
import bookmarkStartAttrConfigs from '@converter/v3/handlers/w/bookmark-start/attributes/index.js';
3637
import bookmarkEndAttrConfigs from '@converter/v3/handlers/w/bookmark-end/attributes/index.js';
3738
import { translator as wStylesTranslator } from '@converter/v3/handlers/w/styles/index.js';
@@ -169,6 +170,7 @@ export const createDocumentJson = (docx, converter, editor) => {
169170
parsedContent = filterOutRootInlineNodes(parsedContent);
170171
parsedContent = normalizeTableBookmarksInContent(parsedContent, editor);
171172
collapseWhitespaceNextToInlinePassthrough(parsedContent);
173+
parsedContent = normalizeDuplicateBlockIdentitiesInContent(parsedContent);
172174

173175
const result = {
174176
type: 'doc',
@@ -688,6 +690,7 @@ const importHeadersFooters = (docx, converter, mainEditor, numbering, translated
688690

689691
// Safety: drop inline-only nodes at the root of header docs
690692
schema = filterOutRootInlineNodes(schema);
693+
schema = normalizeDuplicateBlockIdentitiesInContent(schema);
691694

692695
if (!converter.headerIds.ids) converter.headerIds.ids = [];
693696
converter.headerIds.ids.push(rId);
@@ -727,6 +730,7 @@ const importHeadersFooters = (docx, converter, mainEditor, numbering, translated
727730

728731
// Safety: drop inline-only nodes at the root of footer docs
729732
schema = filterOutRootInlineNodes(schema);
733+
schema = normalizeDuplicateBlockIdentitiesInContent(schema);
730734

731735
if (!converter.footerIds.ids) converter.footerIds.ids = [];
732736
converter.footerIds.ids.push(rId);
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import { generateDocxRandomId } from '@core/helpers/generateDocxRandomId.js';
2+
3+
// Attribute lookup orders. Each list names the attrs inspected for a block
// identity, from highest to lowest priority.
const PARAGRAPH_IDENTITY_ATTRS = ['sdBlockId', 'paraId'];
const TABLE_IDENTITY_ATTRS = ['sdBlockId', 'paraId', 'blockId'];
const DEFAULT_BLOCK_IDENTITY_ATTRS = ['sdBlockId', 'blockId', 'paraId'];

/**
 * Block node type -> ordered list of identity attributes to inspect.
 * Node types that are not listed here have no block identity to resolve.
 */
const BLOCK_IDENTITY_ATTRS = {
  // Paragraphs: sdBlockId, then paraId.
  paragraph: PARAGRAPH_IDENTITY_ATTRS,

  // Table family: sdBlockId, then paraId, then blockId.
  table: TABLE_IDENTITY_ATTRS,
  tableRow: TABLE_IDENTITY_ATTRS,
  tableCell: TABLE_IDENTITY_ATTRS,
  tableHeader: TABLE_IDENTITY_ATTRS,

  // Remaining block types: sdBlockId, then blockId, then paraId.
  heading: DEFAULT_BLOCK_IDENTITY_ATTRS,
  listItem: DEFAULT_BLOCK_IDENTITY_ATTRS,
  sdt: DEFAULT_BLOCK_IDENTITY_ATTRS,
  structuredContentBlock: DEFAULT_BLOCK_IDENTITY_ATTRS,
};
19+
20+
/**
 * Coerces a raw attribute value into a string usable as an identity.
 *
 * @param {unknown} value - Raw attribute value.
 * @returns {string | undefined} The value itself for a non-empty string, the
 *   string form of a finite number, otherwise `undefined`.
 */
function toIdentityValue(value) {
  switch (typeof value) {
    case 'string':
      // Empty strings carry no identity.
      return value.length > 0 ? value : undefined;
    case 'number':
      // NaN and +/-Infinity are rejected.
      return Number.isFinite(value) ? String(value) : undefined;
    default:
      return undefined;
  }
}
25+
26+
/**
 * Finds the first usable identity attribute on a block node.
 *
 * The node's `type` selects an attribute priority list from
 * BLOCK_IDENTITY_ATTRS; the first listed attribute whose value passes
 * toIdentityValue is returned together with the attribute name it came from.
 *
 * @param {{type?: string, attrs?: Record<string, unknown>}} node
 * @returns {{id: string, source: string} | undefined} The resolved identity, or
 *   `undefined` when the node is not an object, its type has no priority list,
 *   or none of the listed attributes holds a usable value.
 */
function resolvePrimaryBlockIdentity(node) {
  if (!node || typeof node !== 'object') return undefined;

  const attrPriority = BLOCK_IDENTITY_ATTRS[node.type];
  // A type such as 'constructor', 'toString' or '__proto__' resolves to an
  // inherited Object.prototype member instead of a priority list. Accept only
  // real arrays so such a type is treated as unknown rather than throwing
  // when iterated below.
  if (!Array.isArray(attrPriority)) return undefined;

  const attrs = typeof node.attrs === 'object' && node.attrs ? node.attrs : {};
  for (const attr of attrPriority) {
    const value = toIdentityValue(attrs[attr]);
    if (value) return { id: value, source: attr };
  }
  return undefined;
}
39+
40+
/**
 * Draws ids from generateDocxRandomId until one is not already in `usedIds`.
 * The returned id is not added to the set; the caller registers it.
 *
 * @param {Set<string>} usedIds - Ids that must not be returned.
 * @returns {string} A generated id that `usedIds` does not contain.
 */
function nextUniqueDocxId(usedIds) {
  for (;;) {
    const candidate = generateDocxRandomId();
    if (!usedIds.has(candidate)) return candidate;
  }
}
47+
48+
/**
 * Walks `node` and its descendants depth-first, registering each block's
 * primary identity in `usedIds`. When an identity has already been seen, the
 * attribute that supplied it is rewritten with a newly generated id.
 *
 * The node is updated in place: `node.attrs` is replaced by a fresh object, so
 * the previous attrs object itself is left untouched.
 *
 * @param {{type?: string, attrs?: Record<string, unknown>, content?: unknown[]}} node
 * @param {Set<string>} usedIds - Identities seen so far; extended during the walk.
 * @returns {void}
 */
function dedupeBlockIdentitiesInNode(node, usedIds) {
  if (node === null || typeof node !== 'object') return;

  const identity = resolvePrimaryBlockIdentity(node);
  if (identity) {
    const { id, source } = identity;
    if (!usedIds.has(id)) {
      // First occurrence keeps its id.
      usedIds.add(id);
    } else {
      // Later occurrences get a replacement written to the same attribute.
      const freshId = nextUniqueDocxId(usedIds);
      node.attrs = Object.assign({}, node.attrs, { [source]: freshId });
      usedIds.add(freshId);
    }
  }

  const children = node.content;
  if (!Array.isArray(children)) return;
  for (const child of children) {
    dedupeBlockIdentitiesInNode(child, usedIds);
  }
}
66+
67+
/**
 * Rewrites repeated block identities in imported content so that every block
 * identity is unique within the given node list.
 *
 * Word files can occasionally contain the same stable block ID on more than
 * one block. Stable IDs are used for deterministic targeting in the adapters,
 * so duplicates break targeting and mutations.
 *
 * Only the block identity attributes sdBlockId, paraId and blockId are ever
 * rewritten. Nodes are updated in place and the same array is returned.
 *
 * @param {Array<{type?: string, attrs?: Record<string, unknown>, content?: unknown[]}>} content
 * @returns {Array<{type?: string, attrs?: Record<string, unknown>, content?: unknown[]}>}
 */
export function normalizeDuplicateBlockIdentitiesInContent(content = []) {
  const hasNodes = Array.isArray(content) && content.length > 0;
  if (hasNodes) {
    // One registry is shared by every top-level node and its descendants.
    const seenIds = new Set();
    for (const node of content) {
      dedupeBlockIdentitiesInNode(node, seenIds);
    }
  }
  return content;
}
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import { describe, it, expect } from 'vitest';
2+
import { normalizeDuplicateBlockIdentitiesInContent } from './normalizeDuplicateBlockIdentitiesInContent.js';
3+
4+
describe('normalizeDuplicateBlockIdentitiesInContent', () => {
  // Replacement ids are asserted to be eight upper-case hexadecimal characters.
  const GENERATED_ID = /^[0-9A-F]{8}$/;

  // Fixture builders: each returns a minimal node of a single type.
  const container =
    (type) =>
    (content = [], attrs = {}) => ({ type, attrs, marks: [], content });
  const table = container('table');
  const row = container('tableRow');
  const cell = container('tableCell');

  function paragraph(attrs = {}, text = 'text') {
    return {
      type: 'paragraph',
      attrs,
      marks: [],
      content: [{ type: 'text', text, marks: [] }],
    };
  }

  function image(attrs = {}) {
    return { type: 'image', attrs, marks: [] };
  }

  it('deduplicates duplicate paraId values while keeping the first occurrence unchanged', () => {
    const nodes = ['A', 'B'].map((text) => paragraph({ paraId: 'DUPLICATE' }, text));

    normalizeDuplicateBlockIdentitiesInContent(nodes);

    const [kept, rewritten] = nodes;
    expect(kept.attrs.paraId).toBe('DUPLICATE');
    expect(rewritten.attrs.paraId).not.toBe('DUPLICATE');
    expect(rewritten.attrs.paraId).toMatch(GENERATED_ID);
  });

  it('rewrites the field that actually provided the identity (sdBlockId fallback for paragraph)', () => {
    const nodes = ['A', 'B'].map((text) => paragraph({ sdBlockId: 'SAME' }, text));

    normalizeDuplicateBlockIdentitiesInContent(nodes);

    const [kept, rewritten] = nodes;
    expect(kept.attrs.sdBlockId).toBe('SAME');
    expect(rewritten.attrs.sdBlockId).not.toBe('SAME');
    expect(rewritten.attrs.sdBlockId).toMatch(GENERATED_ID);
    // The attribute that was absent must stay absent.
    expect(rewritten.attrs.paraId).toBeUndefined();
  });

  it('prioritizes sdBlockId over paraId when both are present on paragraphs', () => {
    const kept = paragraph({ paraId: 'P1', sdBlockId: 'SAME' }, 'A');
    const rewritten = paragraph({ paraId: 'P2', sdBlockId: 'SAME' }, 'B');

    normalizeDuplicateBlockIdentitiesInContent([kept, rewritten]);

    expect(kept.attrs.sdBlockId).toBe('SAME');
    expect(rewritten.attrs.sdBlockId).not.toBe('SAME');
    expect(rewritten.attrs.sdBlockId).toMatch(GENERATED_ID);
    // paraId is untouched on both nodes.
    expect(kept.attrs.paraId).toBe('P1');
    expect(rewritten.attrs.paraId).toBe('P2');
  });

  it('deduplicates table blockId when paraId/sdBlockId are not present', () => {
    const kept = table([], { blockId: 'TABLE-ID' });
    const rewritten = table([], { blockId: 'TABLE-ID' });

    normalizeDuplicateBlockIdentitiesInContent([kept, rewritten]);

    expect(kept.attrs.blockId).toBe('TABLE-ID');
    expect(rewritten.attrs.blockId).not.toBe('TABLE-ID');
    expect(rewritten.attrs.blockId).toMatch(GENERATED_ID);
  });

  it('does not rewrite non-block identity fields (e.g. image attrs.id)', () => {
    const nodes = ['a.png', 'b.png'].map((src) => image({ id: '42', src }));

    normalizeDuplicateBlockIdentitiesInContent(nodes);

    for (const node of nodes) {
      expect(node.attrs.id).toBe('42');
    }
  });

  it('deduplicates identities across nested table block nodes', () => {
    // Ids repeat between sibling cells, between rows, and between a row and a
    // paragraph nested inside another row.
    const firstRow = row(
      [
        cell([paragraph({ paraId: 'CELLPARA' }, 'R1C1')], { paraId: 'CELLID' }),
        cell([paragraph({ paraId: 'CELLPARA' }, 'R1C2')], { paraId: 'CELLID' }),
      ],
      { paraId: 'ROWID' },
    );
    const secondRow = row([cell([paragraph({ paraId: 'ROWID' }, 'R2C1')], { paraId: 'CELLID' })], {
      paraId: 'ROWID',
    });
    const nodes = [table([firstRow, secondRow], { paraId: 'TABLEID' })];

    normalizeDuplicateBlockIdentitiesInContent(nodes);

    const identityKeys = ['paraId', 'sdBlockId', 'blockId', 'id', 'uuid'];
    const seen = new Set();
    const repeated = new Set();

    const visit = (node) => {
      if (!node || typeof node !== 'object') return;

      const attrs = node.attrs ?? {};
      // The first key holding a non-empty string is the node's identity.
      const key = identityKeys.find((name) => typeof attrs[name] === 'string' && attrs[name]);
      if (key) {
        const id = attrs[key];
        if (seen.has(id)) repeated.add(id);
        seen.add(id);
      }

      if (Array.isArray(node.content)) {
        for (const child of node.content) visit(child);
      }
    };

    for (const node of nodes) visit(node);
    expect(repeated.size).toBe(0);
  });
});

0 commit comments

Comments
 (0)