Skip to content

Commit 4a5f585

Browse files
authored
Merge pull request #5897 from rtibbles/tiptap-paste-strip-images
Strip <img> tags from pasted HTML in TipTap editor
2 parents 00660e1 + ebc5e87 commit 4a5f585

5 files changed

Lines changed: 223 additions & 63 deletions

File tree

contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useEditor.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import { CodeBlockSyntaxHighlight } from '../extensions/CodeBlockSyntaxHighlight
1010
import { CustomLink } from '../extensions/Link';
1111
import { Math } from '../extensions/Math';
1212
import { createCustomMarkdownSerializer } from '../utils/markdownSerializer';
13+
import { transformPastedHTML } from '../utils/pasteTransform';
1314

1415
export function useEditor() {
1516
const editor = ref(null);
@@ -42,6 +43,7 @@ export function useEditor() {
4243
class: 'prose prose-sm sm:prose lg:prose-lg xl:prose-2xl focus:outline-none',
4344
dir: 'auto',
4445
},
46+
transformPastedHTML: html => transformPastedHTML(html),
4547
},
4648
onCreate: () => {
4749
isReady.value = true;

contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/composables/useToolbarActions.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { computed, inject } from 'vue';
22
import { getTipTapEditorStrings } from '../TipTapEditorStrings';
3-
import { sanitizePastedHTML } from '../utils/markdown';
3+
import { transformPastedHTML } from '../utils/pasteTransform';
44

55
export function useToolbarActions(emit) {
66
const editor = inject('editor', null);
@@ -165,7 +165,7 @@ export function useToolbarActions(emit) {
165165
if (item.types.includes('text/html')) {
166166
const htmlBlob = await item.getType('text/html');
167167
const html = await htmlBlob.text();
168-
const cleaned = sanitizePastedHTML(html);
168+
const cleaned = transformPastedHTML(html);
169169

170170
editor.value.chain().focus().insertContent(cleaned).run();
171171
return;

contentcuration/contentcuration/frontend/shared/views/TipTapEditor/TipTapEditor/utils/markdown.js

Lines changed: 0 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -58,67 +58,6 @@ export const paramsToMathMd = ({ latex }) => {
5858
return `$$${latex || ''}$$`;
5959
};
6060

61-
export function sanitizePastedHTML(html) {
62-
if (!html) return '';
63-
// This code (lines 55 to 66) was generated with the help of an LLM, using the prompt
64-
// "Create a function that sanitizes HTML pasted from Microsoft
65-
// Word by removing Word-specific tags, styles, and classes while preserving other formatting."
66-
let cleaned = html;
67-
cleaned = cleaned.replace(/<!--\[if.*?endif\]-->/gis, '');
68-
cleaned = cleaned.replace(/<\/?(w|m|o|v):[^>]*>/gis, '');
69-
const parser = new DOMParser();
70-
const doc = parser.parseFromString(cleaned, 'text/html');
71-
doc.querySelectorAll('*').forEach(el => {
72-
if (el.hasAttribute('style')) {
73-
const style = el.getAttribute('style') || '';
74-
const filtered = style
75-
.split(';')
76-
.map(s => s.trim())
77-
.filter(s => s && !s.toLowerCase().startsWith('mso-'))
78-
.join('; ');
79-
if (filtered) {
80-
el.setAttribute('style', filtered);
81-
} else {
82-
el.removeAttribute('style');
83-
}
84-
}
85-
if (el.hasAttribute('class')) {
86-
const cls = el
87-
.getAttribute('class')
88-
.split(/\s+/)
89-
.filter(c => c && !/^Mso/i.test(c))
90-
.join(' ');
91-
if (cls) {
92-
el.setAttribute('class', cls);
93-
} else {
94-
el.removeAttribute('class');
95-
}
96-
}
97-
});
98-
const strikeElements = doc.querySelectorAll('s, strike, del');
99-
strikeElements.forEach(el => {
100-
const nestedLists = el.querySelectorAll('ul, ol');
101-
if (nestedLists.length > 0) {
102-
nestedLists.forEach(list => {
103-
el.parentNode.insertBefore(list, el.nextSibling);
104-
});
105-
}
106-
});
107-
const lists = doc.querySelectorAll('ul, ol');
108-
lists.forEach(list => {
109-
const items = list.querySelectorAll(':scope > li');
110-
items.forEach(item => {
111-
const nestedLists = Array.from(item.children).filter(
112-
child => child.tagName === 'UL' || child.tagName === 'OL',
113-
);
114-
nestedLists.forEach(nestedList => {
115-
item.appendChild(nestedList);
116-
});
117-
});
118-
});
119-
return doc.body.innerHTML;
120-
}
121-
12261
/**
12362
* Pre-processes a raw Markdown string to convert custom syntax into HTML tags
12463
* that Tiptap's extensions can understand. This is our custom "loader".
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
/**
 * Deletes Microsoft Office conditional-comment blocks
 * (`<!--[if ...]> ... <![endif]-->`) from a raw HTML string.
 *
 * @param {string} html - Raw HTML, before it is parsed into a DOM.
 * @returns {string} The HTML with every conditional-comment block removed.
 */
function stripMsoConditionalComments(html) {
  // Lazy + dotAll: a block may span several lines, and two separate blocks
  // must not be merged into one match that swallows the content between them.
  const conditionalComment = /<!--\[if.*?endif\]-->/gis;
  return html.replace(conditionalComment, '');
}
4+
5+
/**
 * Deletes opening and closing tags in the `w:`, `m:`, `o:` and `v:`
 * namespaces from a raw HTML string. Only the tags themselves are removed;
 * any text between an opening and closing tag is left in place.
 *
 * @param {string} html - Raw HTML, before it is parsed into a DOM.
 * @returns {string} The HTML without the namespaced tags.
 */
function stripOfficeNamespacedTags(html) {
  const officeNamespacedTag = /<\/?(w|m|o|v):[^>]*>/gis;
  return html.replace(officeNamespacedTag, '');
}
8+
9+
/**
 * Removes every `mso-*` declaration from inline `style` attributes, in place.
 *
 * The attribute value is split on `;`, each declaration is trimmed, empty and
 * `mso-`-prefixed (case-insensitive) declarations are dropped, and the rest
 * are re-joined with `'; '`. When nothing is left the attribute is removed.
 *
 * @param {Document} doc - Parsed document to mutate.
 * @returns {void}
 */
function filterMsoStyleDeclarations(doc) {
  const isKept = declaration =>
    declaration !== '' && !declaration.toLowerCase().startsWith('mso-');

  doc.querySelectorAll('[style]').forEach(el => {
    const declarations = el.getAttribute('style').split(';');
    const rebuilt = declarations
      .map(declaration => declaration.trim())
      .filter(isKept)
      .join('; ');

    if (rebuilt === '') {
      el.removeAttribute('style');
      return;
    }
    el.setAttribute('style', rebuilt);
  });
}
24+
25+
/**
 * Removes every class name starting with `Mso` (case-insensitive) from
 * `class` attributes, in place. Remaining names are re-joined with a single
 * space; when none remain the attribute is removed.
 *
 * @param {Document} doc - Parsed document to mutate.
 * @returns {void}
 */
function filterMsoClasses(doc) {
  const msoPrefix = /^Mso/i;

  doc.querySelectorAll('[class]').forEach(el => {
    const classNames = el.getAttribute('class').split(/\s+/);
    // Splitting on whitespace yields '' for leading/trailing runs; drop those too.
    const kept = classNames.filter(name => name !== '' && !msoPrefix.test(name));

    if (kept.length === 0) {
      el.removeAttribute('class');
    } else {
      el.setAttribute('class', kept.join(' '));
    }
  });
}
39+
40+
/**
 * Moves lists that sit inside a strike-through wrapper (`<s>`, `<strike>`,
 * `<del>`) out of it, placing them immediately after the wrapper, in place.
 *
 * Only the outermost lists inside each wrapper are moved: a list nested inside
 * another list travels with its ancestor, so the list hierarchy is preserved.
 * When a wrapper holds several lists they keep their original relative order.
 *
 * @param {Document} doc - Parsed document to mutate.
 * @returns {void}
 */
function hoistListsOutOfStrike(doc) {
  doc.querySelectorAll('s, strike, del').forEach(el => {
    const parent = el.parentNode;
    if (!parent) return;

    // Decide which lists to move BEFORE moving anything: once an outer list has
    // left the wrapper, its nested lists can no longer be told apart from
    // top-level ones by looking at their ancestors.
    const outermostLists = Array.from(el.querySelectorAll('ul, ol')).filter(list => {
      const enclosingList = list.parentNode.closest('ul, ol');
      return !(enclosingList && el.contains(enclosingList));
    });

    // Insert each list after the previously inserted one (not always right
    // after the wrapper), otherwise the lists would end up in reverse order.
    let anchor = el;
    outermostLists.forEach(list => {
      parent.insertBefore(list, anchor.nextSibling);
      anchor = list;
    });
  });
}
47+
48+
/**
 * For every `<li>` that is a direct child of a `<ul>`/`<ol>`, moves the
 * `<ul>`/`<ol>` elements that are direct children of that `<li>` to the end
 * of the `<li>`, keeping their relative order. Mutates the document in place.
 *
 * @param {Document} doc - Parsed document to mutate.
 * @returns {void}
 */
function reparentNestedListsInLi(doc) {
  const isListElement = node => node.tagName === 'UL' || node.tagName === 'OL';

  doc.querySelectorAll('ul, ol').forEach(list => {
    list.querySelectorAll(':scope > li').forEach(item => {
      // Snapshot the matching children first: appendChild re-orders the live
      // `children` collection while we are moving nodes.
      const nestedLists = Array.from(item.children).filter(isListElement);
      for (const nestedList of nestedLists) {
        item.appendChild(nestedList);
      }
    });
  });
}
57+
58+
/**
 * Removes every `<img>` element from the document, in place. No attribute is
 * inspected, so images are removed regardless of their `src`.
 *
 * @param {Document} doc - Parsed document to mutate.
 * @returns {void}
 */
function stripImages(doc) {
  const images = doc.querySelectorAll('img');
  images.forEach(image => {
    image.remove();
  });
}
61+
62+
// Passes that run on the raw HTML string, before it is parsed. Each takes a
// string and returns a string; they are applied in array order.
const STRING_TRANSFORMS = [stripMsoConditionalComments, stripOfficeNamespacedTags];

// Passes that run on the parsed document and mutate it in place; they are
// applied in array order.
const DOM_TRANSFORMS = [
  filterMsoStyleDeclarations,
  filterMsoClasses,
  hoistListsOutOfStrike,
  reparentNestedListsInLi,
  stripImages,
];

/**
 * Cleans up pasted HTML by running it through the string-level passes, parsing
 * the result with `DOMParser` as `text/html`, running the DOM-level passes, and
 * serializing the resulting `<body>` back to a string.
 *
 * @param {string|null|undefined} html - Pasted HTML.
 * @returns {string} The transformed body HTML, or `''` when `html` is falsy.
 */
export function transformPastedHTML(html) {
  if (!html) {
    return '';
  }

  const preCleaned = STRING_TRANSFORMS.reduce(
    (markup, applyTransform) => applyTransform(markup),
    html,
  );

  const doc = new DOMParser().parseFromString(preCleaned, 'text/html');
  DOM_TRANSFORMS.forEach(applyTransform => {
    applyTransform(doc);
  });

  return doc.body.innerHTML;
}
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
import { transformPastedHTML } from '../TipTapEditor/utils/pasteTransform';
2+
3+
// Unit tests for transformPastedHTML: falsy-input handling, <img> stripping,
// Word/Office markup cleanup, and idempotency of the whole pipeline.
describe('transformPastedHTML', () => {
  describe('empty inputs', () => {
    it('returns empty string for empty input', () => {
      expect(transformPastedHTML('')).toBe('');
    });

    it('returns empty string for null', () => {
      expect(transformPastedHTML(null)).toBe('');
    });

    it('returns empty string for undefined', () => {
      expect(transformPastedHTML(undefined)).toBe('');
    });
  });

  describe('image stripping', () => {
    it('strips a single remote img', () => {
      const input = '<p>before <img src="https://example.com/x.png"> after</p>';
      // Only the <img> is removed; the space on each side of it stays, so the
      // serialized text has two consecutive spaces.
      expect(transformPastedHTML(input)).toBe('<p>before  after</p>');
    });

    it('strips a data: URI img', () => {
      const input = '<p><img src="data:image/png;base64,iVBORw0KGgo="></p>';
      expect(transformPastedHTML(input)).toBe('<p></p>');
    });

    it('strips img with no src', () => {
      const input = '<p><img></p>';
      expect(transformPastedHTML(input)).toBe('<p></p>');
    });

    it.each([
      ['http', '<img src="http://x.test/a.png">'],
      ['blob', '<img src="blob:https://x.test/abc">'],
      ['file', '<img src="file:///tmp/a.png">'],
      ['relative', '<img src="../a.png">'],
    ])('strips img with %s scheme', (_scheme, imgTag) => {
      expect(transformPastedHTML(`<p>${imgTag}</p>`)).toBe('<p></p>');
    });

    it('strips multiple imgs in different parents', () => {
      const input = [
        '<p>top <img src="a"></p>',
        '<img src="b">',
        '<ul><li><img src="c"> item</li></ul>',
      ].join('');
      const output = transformPastedHTML(input);
      expect(output).not.toContain('<img');
      expect(output).toContain('<p>top </p>');
      expect(output).toContain('<ul><li> item</li></ul>');
    });

    it('preserves surrounding marks when stripping mixed imgs', () => {
      const input =
        '<p><strong>bold</strong> <img src="a"> <em>italic</em> <a href="https://x">link</a></p>';
      const output = transformPastedHTML(input);
      expect(output).not.toContain('<img');
      expect(output).toContain('<strong>bold</strong>');
      expect(output).toContain('<em>italic</em>');
      expect(output).toContain('<a href="https://x">link</a>');
    });
  });

  describe('Word/Office cleanup', () => {
    it('removes MSO conditional comments', () => {
      const input = '<p>before <!--[if gte mso 9]><xml>junk</xml><![endif]--> after</p>';
      // Only the comment block is removed; the space on each side of it stays,
      // so the serialized text has two consecutive spaces.
      expect(transformPastedHTML(input)).toBe('<p>before  after</p>');
    });

    it('removes Office-namespaced tags (w:, m:, o:, v:)', () => {
      const input =
        '<p>before<w:hint val="x"></w:hint><o:p></o:p><m:r></m:r><v:rect></v:rect>after</p>';
      const output = transformPastedHTML(input);
      expect(output).not.toMatch(/<\/?[wmov]:/);
      expect(output).toContain('before');
      expect(output).toContain('after');
    });

    it('strips mso-* style declarations while keeping other styles', () => {
      const input =
        '<p style="mso-list:l0 level1; color: red; mso-bidi-font-size: 11pt; font-size: 12pt">x</p>';
      const output = transformPastedHTML(input);
      expect(output).not.toMatch(/mso-/);
      expect(output).toContain('color: red');
      expect(output).toContain('font-size: 12pt');
    });

    it('removes the style attribute entirely when all declarations were mso-*', () => {
      const input = '<p style="mso-list:l0 level1;mso-bidi-font-size: 11pt">x</p>';
      expect(transformPastedHTML(input)).toBe('<p>x</p>');
    });

    it('strips Mso* classes (case-insensitive) while keeping other classes', () => {
      const input = '<p class="MsoNormal kept-class MSOPlain">x</p>';
      const output = transformPastedHTML(input);
      expect(output).toContain('class="kept-class"');
      expect(output).not.toMatch(/Mso/i);
    });

    it('removes the class attribute entirely when all classes were Mso*', () => {
      const input = '<p class="MsoNormal MsoListParagraph">x</p>';
      expect(transformPastedHTML(input)).toBe('<p>x</p>');
    });

    it('hoists nested lists out of strike/s/del wrappers', () => {
      const input = '<s><ul><li>a</li></ul></s>';
      const output = transformPastedHTML(input);
      expect(output).toContain('<ul><li>a</li></ul>');
      // The list must come after the (now empty) wrapper, not inside it.
      expect(output.indexOf('</s>')).toBeLessThan(output.indexOf('<ul>'));
    });

    it('re-parents nested lists inside <li> to the end of the <li>', () => {
      const input = '<ul><li>text<ul><li>nested</li></ul>more text</li></ul>';
      const output = transformPastedHTML(input);
      expect(output).toMatch(/<li>textmore text<ul><li>nested<\/li><\/ul><\/li>/);
    });
  });

  describe('idempotency', () => {
    // Running the transform on its own output must not change it any further.
    it.each([
      ['<p>plain text</p>'],
      ['<p>before <img src="x"> after</p>'],
      ['<p style="mso-bidi-font-size:11pt;color:red">x</p>'],
      ['<p class="MsoNormal kept">x</p>'],
      ['<s><ul><li>a</li></ul></s>'],
      ['<ul><li>text<ul><li>n</li></ul>more</li></ul>'],
      ['<!--[if gte mso 9]>x<![endif]--><p>y</p>'],
    ])('is idempotent: f(f(x)) === f(x) for %s', input => {
      const once = transformPastedHTML(input);
      const twice = transformPastedHTML(once);
      expect(twice).toBe(once);
    });
  });
});

0 commit comments

Comments
 (0)