Skip to content

Commit 5d52db3

Browse files
feat(word-diff): implement word-level diffing for granular text changes (#2817)
1 parent 8075858 commit 5d52db3

File tree

5 files changed

+730
-2
lines changed

5 files changed

+730
-2
lines changed
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import { describe, it, expect } from 'vitest';
2+
import { tokenizeWords, computeWordDiff, getWordChanges } from '../../editor/word-diff';
3+
4+
/**
 * Specs for `tokenizeWords`.
 *
 * The fixtures expect each token to carry its `text` and its character
 * `offset` in the input, with every run of whitespace reported as a single
 * token of its own and punctuation staying attached to the adjacent word.
 */
describe('tokenizeWords', () => {
  it('should tokenize a basic sentence', () => {
    const tokens = tokenizeWords('The quick fox');
    expect(tokens).toEqual([
      { text: 'The', offset: 0 },
      { text: ' ', offset: 3 },
      { text: 'quick', offset: 4 },
      { text: ' ', offset: 9 },
      { text: 'fox', offset: 10 },
    ]);
  });

  it('should handle multiple spaces between words', () => {
    // Two spaces between the words: the run is one whitespace token of
    // length 2 starting at offset 5, which is why 'world' starts at offset 7.
    const tokens = tokenizeWords('hello  world');
    expect(tokens).toEqual([
      { text: 'hello', offset: 0 },
      { text: '  ', offset: 5 },
      { text: 'world', offset: 7 },
    ]);
  });

  it('should handle leading and trailing whitespace', () => {
    // Two leading and two trailing spaces: 'hello' spans offsets 2..7.
    const tokens = tokenizeWords('  hello  ');
    expect(tokens).toEqual([
      { text: '  ', offset: 0 },
      { text: 'hello', offset: 2 },
      { text: '  ', offset: 7 },
    ]);
  });

  it('should return empty array for empty string', () => {
    expect(tokenizeWords('')).toEqual([]);
  });

  it('should handle a single word', () => {
    expect(tokenizeWords('hello')).toEqual([{ text: 'hello', offset: 0 }]);
  });

  it('should handle punctuation attached to words', () => {
    const tokens = tokenizeWords('Hello, world!');
    expect(tokens).toEqual([
      { text: 'Hello,', offset: 0 },
      { text: ' ', offset: 6 },
      { text: 'world!', offset: 7 },
    ]);
  });
});
51+
52+
/**
 * Specs for the word-level diff helpers.
 *
 * `computeWordDiff` is exercised directly for the empty-input and whitespace
 * cases; the remaining cases go through `getWordChanges`. All offsets in the
 * expectations are character offsets into the OLD text.
 */
describe('computeWordDiff', () => {
  it('should return empty array for identical strings', () => {
    const diff = computeWordDiff('hello world', 'hello world');
    expect(diff).toEqual([]);
  });

  it('should detect a single word replacement', () => {
    // 'quick' occupies offsets 4..9 of the old text.
    const result = getWordChanges('The quick fox', 'The fast fox');
    expect(result).toEqual([
      { type: 'replace', oldFrom: 4, oldTo: 9, newText: 'fast' },
    ]);
  });

  it('should detect multiple word replacements', () => {
    const before = 'The quick brown fox jumps over the lazy dog';
    const after = 'The fast brown fox leaps over the lazy cat';

    // quick -> fast, jumps -> leaps, dog -> cat; everything else is untouched.
    expect(getWordChanges(before, after)).toEqual([
      { type: 'replace', oldFrom: 4, oldTo: 9, newText: 'fast' },
      { type: 'replace', oldFrom: 20, oldTo: 25, newText: 'leaps' },
      { type: 'replace', oldFrom: 40, oldTo: 43, newText: 'cat' },
    ]);
  });

  it('should detect word insertion', () => {
    const result = getWordChanges('The fox', 'The quick fox');
    expect(result).toHaveLength(1);

    const [inserted] = result;
    expect(inserted.type).toBe('insert');
    // The inserted text is the new word together with the space that precedes "fox".
    expect(inserted).toHaveProperty('newText', 'quick ');
  });

  it('should detect word deletion', () => {
    const result = getWordChanges('The quick fox', 'The fox');
    expect(result).toHaveLength(1);

    const [removed] = result;
    expect(removed.type).toBe('delete');
    // The word and its following space (offsets 4..10) go away as one block.
    expect(removed).toHaveProperty('oldFrom', 4);
    expect(removed).toHaveProperty('oldTo', 10);
  });

  it('should handle complete rewrite', () => {
    const result = getWordChanges('hello world', 'goodbye earth');
    // Only asserts that at least one change is reported and that every
    // reported change is a replacement.
    expect(result.length).toBeGreaterThanOrEqual(1);
    expect(result.every((entry) => entry.type === 'replace')).toBe(true);
  });

  it('should handle empty old text', () => {
    expect(computeWordDiff('', 'hello')).toEqual([
      { type: 'insert', insertAt: 0, newText: 'hello' },
    ]);
  });

  it('should handle empty new text', () => {
    expect(computeWordDiff('hello', '')).toEqual([
      { type: 'delete', oldFrom: 0, oldTo: 5 },
    ]);
  });

  it('should handle both empty', () => {
    const diff = computeWordDiff('', '');
    expect(diff).toEqual([]);
  });

  it('should preserve whitespace tokens as equal', () => {
    const ops = computeWordDiff('a b c', 'a x c');
    // Drop any 'equal' entries so only the real change is compared.
    const nonEqual = ops.filter((op) => op.type !== 'equal');
    expect(nonEqual).toEqual([
      { type: 'replace', oldFrom: 2, oldTo: 3, newText: 'x' },
    ]);
  });

  it('should handle sentence with punctuation changes', () => {
    const before = 'The company shall provide services.';
    const after = 'The company must provide services.';

    // 'shall' spans offsets 12..17; the trailing period stays with 'services.'.
    expect(getWordChanges(before, after)).toEqual([
      { type: 'replace', oldFrom: 12, oldTo: 17, newText: 'must' },
    ]);
  });
});

packages/ai/src/ai-actions/editor/editor-adapter.ts

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import type { Editor, FoundMatch, MarkType } from '../../shared';
22
import type { Node as ProseMirrorNode, Mark } from 'prosemirror-model';
33
import { generateId, stripListPrefix } from '../../shared';
4+
import { getWordChanges } from './word-diff';
45

56
/**
67
* Default highlight color for text selections.
@@ -630,6 +631,17 @@ export class EditorAdapter {
630631
const replacementEnd = suggestedText.length - suffix;
631632
const replacementText = suggestedText.slice(prefix, replacementEnd);
632633

634+
// Try word-level diff for more granular tracked changes
635+
const wordChanges = getWordChanges(originalText.slice(prefix, originalTextLength - suffix), replacementText);
636+
637+
if (wordChanges.length > 1) {
638+
// Multiple word-level changes: apply each one separately within a single
639+
// transaction, remapping positions as earlier changes shift later ones.
640+
this.applyGranularChanges(changeFrom, changeTo, wordChanges);
641+
return;
642+
}
643+
644+
// 0 or 1 word change: use existing single-replacement logic (better mark handling)
633645
const segments = this.collectTextSegments(changeFrom, changeTo);
634646
const nodes = this.buildTextNodes(changeFrom, changeTo, replacementText, segments);
635647
const tr = state.tr.delete(changeFrom, changeTo);
@@ -642,6 +654,95 @@ export class EditorAdapter {
642654
this.editor.dispatch(tr);
643655
}
644656

657+
/**
 * Applies multiple word-level changes in a single transaction.
 *
 * Document positions for every change are computed up front against the
 * current (unmodified) editor state. The changes are then applied in the
 * order given, and each pre-computed position is remapped through the steps
 * this method has already added to the transaction, so that length changes
 * caused by earlier edits are accounted for.
 *
 * Mark lookups (`getMarksAtPosition`, `collectTextSegments`,
 * `buildTextNodes`) are made with the ORIGINAL positions, because the pending
 * transaction is not dispatched until the end of this method.
 * NOTE(review): this assumes those helpers read from `this.editor.state`
 * rather than from the pending transaction — confirm.
 *
 * @param rangeFrom - Start of the overall change range in document positions
 * @param rangeTo - End of the overall change range in document positions
 * @param changes - Word diff operations with character offsets relative to the
 *   range text; presumably non-overlapping — verify against the caller
 * @private
 */
private applyGranularChanges(
  rangeFrom: number,
  rangeTo: number,
  changes: Array<
    | { type: 'replace'; oldFrom: number; oldTo: number; newText: string }
    | { type: 'delete'; oldFrom: number; oldTo: number }
    | { type: 'insert'; insertAt: number; newText: string }
  >,
): void {
  const { state } = this.editor;
  if (!state) {
    return;
  }

  // Pre-compute all document positions from the current (unmodified) state.
  // Character offsets in changes are relative to the range text (rangeFrom..rangeTo).
  const mappedChanges = changes.map((change) => {
    if (change.type === 'insert') {
      return {
        ...change,
        docPos: this.mapCharOffsetToPosition(rangeFrom, rangeTo, change.insertAt),
      };
    }
    return {
      ...change,
      docFrom: this.mapCharOffsetToPosition(rangeFrom, rangeTo, change.oldFrom),
      docTo: this.mapCharOffsetToPosition(rangeFrom, rangeTo, change.oldTo),
    };
  });

  const tr = state.tr;
  const baseSteps = tr.steps.length;

  // Maps a position from the original document into the transaction's current
  // document by walking every step added since `baseSteps`. Steps without a
  // `getMap` function (e.g. in test mocks) are skipped.
  const remap = (pos: number): number => {
    let mapped = pos;
    for (let s = baseSteps; s < tr.steps.length; s++) {
      const step = tr.steps[s] as { getMap?: () => { map: (p: number) => number } } | undefined;
      if (step && typeof step.getMap === 'function') {
        mapped = step.getMap().map(mapped);
      }
    }
    return mapped;
  };

  // Apply changes in forward order; every position handed to the transaction
  // goes through `remap` first.
  for (const change of mappedChanges) {
    if (change.type === 'delete') {
      tr.delete(remap(change.docFrom), remap(change.docTo));
    } else if (change.type === 'insert') {
      // Marks are looked up at the original position (pre-transaction state).
      const marks = this.getMarksAtPosition(change.docPos);
      const node = state.schema.text(change.newText, marks);
      tr.insert(remap(change.docPos), node);
    } else {
      // replace: gather segments and build the replacement nodes from the
      // ORIGINAL positions, then write them at the remapped positions.
      const segments = this.collectTextSegments(change.docFrom, change.docTo);
      const nodes = this.buildTextNodes(change.docFrom, change.docTo, change.newText, segments);
      const from = remap(change.docFrom);
      const to = remap(change.docTo);

      // Use replaceWith for a single atomic step when available; fall back to
      // delete+insert for test mocks that lack replaceWith.
      if (typeof (tr as Record<string, unknown>).replaceWith === 'function') {
        (
          tr as unknown as { replaceWith: (from: number, to: number, content: ProseMirrorNode[]) => void }
        ).replaceWith(from, to, nodes);
      } else {
        tr.delete(from, to);
        let insertPos = from;
        for (const node of nodes) {
          tr.insert(insertPos, node);
          insertPos += node.nodeSize;
        }
      }
    }
  }

  this.editor.dispatch(tr);
}
745+
645746
/**
646747
* Replaces text in the document while intelligently preserving ProseMirror marks.
647748
* Uses a diffing algorithm to minimize document changes by only replacing changed portions.

0 commit comments

Comments
 (0)