Skip to content

Commit eb789ff

Browse files
committed
Improve word detection with grapheme-aware unicode logic
1 parent af1729b commit eb789ff

3 files changed

Lines changed: 179 additions & 1 deletion

File tree

anycode-base/src/code.ts

Lines changed: 91 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import {
1010
import Parser from 'web-tree-sitter';
1111
import History from './history';
1212
import { Selection } from './selection';
13-
import { getWasmPath } from './utils';
13+
import { getGraphemeAt, getNextGraphemeIndex, getPrevGraphemeIndex, getWasmPath, isWordGrapheme } from './utils';
1414
import type { Lang } from './lang';
1515

1616
import javascript from './langs/javascript';
@@ -61,6 +61,20 @@ export type Position = {
6161
column: number;
6262
}
6363

64+
/**
 * A word picked out of a line together with the name of the highlight
 * token it belongs to (`null` when no token name is known).
 */
export type WordHighlight = {
    text: string;
    token: string | null;
};

/**
 * Compares two optional word highlights by value.
 *
 * @param a First highlight, or `null`.
 * @param b Second highlight, or `null`.
 * @returns `true` when both are absent, or when both are present and agree
 *          on `text` and `token`; `false` otherwise.
 */
export function areWordHighlightsEqual(
    a: WordHighlight | null,
    b: WordHighlight | null
): boolean {
    // With at least one side missing, the two are equal only if both are missing.
    if (!a || !b) {
        return a === b;
    }
    return a.token === b.token && a.text === b.text;
}
77+
6478

6579
export interface HighlighedNode {
6680
name: string | null;
@@ -977,6 +991,82 @@ export class Code {
977991
return columns;
978992
}
979993

994+
/**
 * Looks up the word touching an absolute offset into the document.
 *
 * @param offset Offset to inspect; an offset equal to `this.length()` is accepted.
 * @returns The word found by `getWordAtPosition` at the corresponding
 *          line/column, or `null` when the offset is negative or greater
 *          than `this.length()`.
 */
public getWordAtOffset(offset: number): WordHighlight | null {
    if (offset < 0 || offset > this.length()) return null;

    const { line, column } = this.getPosition(offset);
    return this.getWordAtPosition(line, column);
}
1001+
1002+
/**
 * Finds the word that touches a (line, column) position.
 *
 * The word is anchored on the grapheme under the column when that grapheme
 * is a word grapheme, otherwise on the grapheme immediately to the left.
 * From the anchor the word is grown in both directions one grapheme cluster
 * at a time while `isWordGrapheme` keeps accepting the clusters.
 *
 * @param lineIndex   Index of the line to inspect.
 * @param columnIndex Column within the line; `lineText.length` (end of line)
 *                    is accepted.
 * @returns The word text plus the `name` of the first node from
 *          `getLineNodes` whose text span contains the word's first column
 *          (`null` when no node covers it). Returns `null` when the line or
 *          column is out of range or no word grapheme touches the column.
 */
public getWordAtPosition(lineIndex: number, columnIndex: number): WordHighlight | null {
    if (lineIndex < 0 || lineIndex >= this.linesLength()) return null;

    const lineText = this.line(lineIndex);
    if (columnIndex < 0 || columnIndex > lineText.length) return null;

    // True when the grapheme cluster starting at `index` is a word grapheme.
    const isWordAt = (index: number): boolean =>
        isWordGrapheme(getGraphemeAt(lineText, index));

    // Pick the anchor: the cluster under the cursor wins, the left neighbour is the fallback.
    let anchor = -1;
    if (columnIndex < lineText.length && isWordAt(columnIndex)) {
        anchor = columnIndex;
    } else if (columnIndex > 0) {
        const leftNeighbour = getPrevGraphemeIndex(lineText, columnIndex);
        if (isWordAt(leftNeighbour)) {
            anchor = leftNeighbour;
        }
    }
    if (anchor === -1) return null;

    // Grow leftwards; stop at a non-word cluster or if the helper makes no progress.
    let wordStart = anchor;
    while (wordStart > 0) {
        const candidate = getPrevGraphemeIndex(lineText, wordStart);
        if (candidate === wordStart || !isWordAt(candidate)) break;
        wordStart = candidate;
    }

    // Grow rightwards, starting just past the anchor cluster.
    let wordEnd = getNextGraphemeIndex(lineText, anchor);
    while (wordEnd < lineText.length && isWordAt(wordEnd)) {
        const candidate = getNextGraphemeIndex(lineText, wordEnd);
        if (candidate === wordEnd) break; // no progress: avoid spinning forever
        wordEnd = candidate;
    }

    if (wordStart === wordEnd) return null;

    // Resolve the token name: walk the line's nodes, tracking each node's
    // column span by cumulative text length, until one contains the word start.
    let tokenName: string | null = null;
    let spanStart = 0;
    for (const node of this.getLineNodes(lineIndex)) {
        const spanEnd = spanStart + node.text.length;
        if (spanStart <= wordStart && wordStart < spanEnd) {
            tokenName = node.name;
            break;
        }
        spanStart = spanEnd;
    }

    return { text: lineText.slice(wordStart, wordEnd), token: tokenName };
}
1069+
9801070
/**
 * Builds a new `Code` from this instance's current content, filename and
 * language.
 */
clone() {
    const content = this.getContent();
    return new Code(content, this.filename, this.language!);
}

anycode-base/src/utils.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,17 @@ export function getNextGraphemeIndex(line: string, fromColumn: number): number {
219219
return line.length;
220220
}
221221

222+
/**
 * Extracts the text between `index` and the next grapheme boundary reported
 * by `getNextGraphemeIndex`.
 *
 * @param line  Text to read from.
 * @param index Start index (in UTF-16 code units) of the wanted grapheme.
 * @returns The slice `[index, next)`, or `''` when `index` lies outside the
 *          line or the boundary helper does not advance past `index`.
 */
export function getGraphemeAt(line: string, index: number): string {
    if (index < 0 || index >= line.length) {
        return '';
    }
    const clusterEnd = getNextGraphemeIndex(line, index);
    return clusterEnd <= index ? '' : line.slice(index, clusterEnd);
}
228+
229+
/**
 * Reports whether a grapheme cluster counts as part of a word.
 *
 * A cluster is a word grapheme when it contains a Unicode letter (`\p{L}`),
 * a Unicode number (`\p{N}`) or an underscore. Emoji keycap sequences such
 * as "1" + U+FE0F + U+20E3 are rejected even though they contain a digit,
 * so that emoji clusters are never treated as word characters.
 *
 * @param g A single grapheme cluster (may be empty).
 * @returns `true` for word graphemes, `false` otherwise (including `''`).
 */
export function isWordGrapheme(g: string): boolean {
    // U+20E3 COMBINING ENCLOSING KEYCAP marks an emoji keycap; the digit
    // inside it must not make the whole cluster look like a word character.
    if (g.includes('\u20E3')) {
        return false;
    }
    return /[\p{L}\p{N}_]/u.test(g);
}
232+
222233
export function minimize(str: string, maxLength: number = 100): string {
223234
const newlineIndex = str.indexOf('\n');
224235
let result = str;

anycode-base/tests/word.test.ts

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import { describe, it, expect } from 'vitest';
2+
import { Code } from '../src/code';
3+
4+
// Covers Code.getWordAtPosition and Code.getWordAtOffset: ASCII words,
// boundary touches, non-Latin scripts and emoji clusters.
// All columns below are UTF-16 code unit indices.
describe('Code.getWordAtPosition', () => {
    it('should return correct words under cursor', () => {
        // Line 1 starts with two spaces so that columns 0 and 1 touch no word.
        const code = new Code('const find_tree = 123;\n  another_word\n', 'test.js', 'javascript');

        // Line 0: "const find_tree = 123;"
        // "const" occupies columns 0-4 and ends at column 5.
        expect(code.getWordAtPosition(0, 0)).toEqual({ text: 'const', token: null });
        expect(code.getWordAtPosition(0, 3)).toEqual({ text: 'const', token: null });

        // Column 5 is the space after "const": the grapheme under the cursor is
        // not a word character, but the one on its left ('t') is, so the cursor
        // still touches "const".
        expect(code.getWordAtPosition(0, 5)).toEqual({ text: 'const', token: null });

        // "find_tree" occupies columns 6-14 and ends at column 15.
        expect(code.getWordAtPosition(0, 6)).toEqual({ text: 'find_tree', token: null }); // at 'f'
        expect(code.getWordAtPosition(0, 10)).toEqual({ text: 'find_tree', token: null }); // at '_'
        expect(code.getWordAtPosition(0, 15)).toEqual({ text: 'find_tree', token: null }); // right after the last 'e'

        // Column 16 is '=' and column 15 (to its left) is a space: neither is a word character.
        expect(code.getWordAtPosition(0, 16)).toBe(null);

        // "123" occupies columns 18-20 and ends at column 21.
        expect(code.getWordAtPosition(0, 18)).toEqual({ text: '123', token: null });
        expect(code.getWordAtPosition(0, 21)).toEqual({ text: '123', token: null }); // right after '3'

        // Line 1: "  another_word"
        expect(code.getWordAtPosition(1, 0)).toBe(null); // space under cursor, nothing on the left
        expect(code.getWordAtPosition(1, 1)).toBe(null); // spaces under cursor and on the left
        expect(code.getWordAtPosition(1, 2)).toEqual({ text: 'another_word', token: null }); // at 'a'
    });

    it('should retrieve words by offset', () => {
        const code = new Code('abc def', 'test.js', 'javascript');
        // 'a'(0), 'b'(1), 'c'(2), ' '(3), 'd'(4), 'e'(5), 'f'(6)
        expect(code.getWordAtOffset(0)).toEqual({ text: 'abc', token: null });
        expect(code.getWordAtOffset(1)).toEqual({ text: 'abc', token: null });
        expect(code.getWordAtOffset(3)).toEqual({ text: 'abc', token: null }); // touches 'c'
        expect(code.getWordAtOffset(4)).toEqual({ text: 'def', token: null });
        expect(code.getWordAtOffset(7)).toEqual({ text: 'def', token: null }); // touches 'f'
        expect(code.getWordAtOffset(8)).toBe(null); // out of range
    });

    it('should detect unicode words (cyrillic and cjk)', () => {
        const code = new Code('привет 你好 _x42', 'test.js', 'javascript');

        // Cyrillic: columns 0-5, ends at column 6.
        expect(code.getWordAtPosition(0, 0)).toEqual({ text: 'привет', token: null });
        expect(code.getWordAtPosition(0, 3)).toEqual({ text: 'привет', token: null });
        expect(code.getWordAtPosition(0, 6)).toEqual({ text: 'привет', token: null }); // right boundary touch

        // CJK: columns 7-8, ends at column 9.
        expect(code.getWordAtPosition(0, 7)).toEqual({ text: '你好', token: null });
        expect(code.getWordAtPosition(0, 8)).toEqual({ text: '你好', token: null });
        expect(code.getWordAtPosition(0, 9)).toEqual({ text: '你好', token: null }); // right boundary touch

        // Underscore + digits still works: columns 10-13, ends at column 14.
        expect(code.getWordAtPosition(0, 10)).toEqual({ text: '_x42', token: null });
        expect(code.getWordAtPosition(0, 14)).toEqual({ text: '_x42', token: null }); // right boundary touch
    });

    it('should not treat emoji grapheme clusters as word characters', () => {
        // Family emoji written with explicit escapes so the invisible zero-width
        // joiners are visible: four 2-unit code points joined by three ZWJs,
        // i.e. 11 code units spanning columns 4-14.
        const family = '\u{1F468}\u200D\u{1F469}\u200D\u{1F467}\u200D\u{1F466}';
        const code = new Code(`foo ${family} bar`, 'test.js', 'javascript');

        expect(code.getWordAtPosition(0, 1)).toEqual({ text: 'foo', token: null });
        expect(code.getWordAtPosition(0, 3)).toEqual({ text: 'foo', token: null }); // boundary touch
        expect(code.getWordAtPosition(0, 5)).toBe(null); // just inside the emoji cluster
        expect(code.getWordAtPosition(0, 10)).toBe(null); // deeper inside the emoji cluster
        expect(code.getWordAtPosition(0, 15)).toBe(null); // whitespace after emoji cluster
        expect(code.getWordAtPosition(0, 16)).toEqual({ text: 'bar', token: null });
    });
});

0 commit comments

Comments
 (0)