Skip to content

Commit eea635b

Browse files
committed
Adding Russian and Ukrainian V4 optimized stress-marking lexicons
1 parent 69b81c9 commit eea635b

3 files changed

Lines changed: 47 additions & 14 deletions

File tree

wiktionary_pron/scripts/lexicon.js

Lines changed: 47 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ const LEXICON_LANGUAGES = {
99
Czech: "czech_lexicon.zip",
1010
French: "french_lexicon_v3.zip",
1111
Lithuanian: "lt_lexicon.zip",
12-
Ukrainian: "uk_lexicon.zip",
13-
Russian: "ru_lexicon.zip",
12+
Ukrainian: "uk_lexicon_v4.zip",
13+
Russian: "ru_lexicon_v4.zip",
1414
Icelandic: "is_lexicon.zip",
1515
};
1616

@@ -29,7 +29,7 @@ class OptimizedV3Lexicon {
2929
};
3030
}
3131

32-
async loadFromBlob(blob) {
32+
async loadFromBlob(blob, language) {
3333
const startTime = performance.now();
3434

3535
try {
@@ -40,7 +40,7 @@ class OptimizedV3Lexicon {
4040
updateLoadingText("", "", "Parsing lexicon data");
4141

4242
const parseStart = performance.now();
43-
await this.parseV3Data(jsonStr);
43+
await this.parseV3Data(jsonStr, language);
4444
this.stats.parseTime = performance.now() - parseStart;
4545

4646
this.calculateMemoryUsage();
@@ -62,21 +62,49 @@ class OptimizedV3Lexicon {
6262
}
6363
}
6464

65-
async parseV3Data(jsonStr) {
65+
async parseV3Data(jsonStr, language) {
6666
const data = JSON.parse(jsonStr);
6767

6868
if (Array.isArray(data)) {
6969
// V3 format with prefix compression: [[prefix_len, suffix, ipa], ...]
70-
console.log("📂 Processing V3 prefix compression format");
70+
const isV4Format = language === "Russian" || language === "Ukrainian";
71+
72+
if (isV4Format) {
73+
console.log("📂 Processing V4 prefix/value compression format");
74+
} else {
75+
console.log("📂 Processing V3 prefix compression format");
76+
}
7177

7278
let currentKey = "";
7379
const totalEntries = data.length;
7480
const progressInterval = Math.floor(totalEntries / 50); // Update every 2%
81+
const STRESS_MARK = "\u0301";
7582

7683
for (let i = 0; i < data.length; i++) {
77-
const [prefixLen, suffix, ipa] = data[i];
78-
currentKey = currentKey.substring(0, prefixLen) + suffix;
79-
this.entries.set(currentKey, ipa);
84+
if (isV4Format) {
85+
// V4 DECODING LOGIC
86+
const [prefixLen, suffix, valueEncoding] = data[i];
87+
currentKey = currentKey.substring(0, prefixLen) + suffix;
88+
89+
let finalValue;
90+
if (typeof valueEncoding === "number") {
91+
// It's an integer: the index of the stressed vowel.
92+
const stressPos = valueEncoding;
93+
finalValue =
94+
currentKey.slice(0, stressPos + 1) +
95+
STRESS_MARK +
96+
currentKey.slice(stressPos + 1);
97+
} else {
98+
// It's a string: an exception (e.g., multi-form). Use it directly.
99+
finalValue = valueEncoding;
100+
}
101+
this.entries.set(currentKey, finalValue);
102+
} else {
103+
// V3 DECODING LOGIC (original code)
104+
const [prefixLen, suffix, ipa] = data[i];
105+
currentKey = currentKey.substring(0, prefixLen) + suffix;
106+
this.entries.set(currentKey, ipa);
107+
}
80108

81109
// Progress update with yielding for responsiveness
82110
if (i % progressInterval === 0) {
@@ -89,8 +117,8 @@ class OptimizedV3Lexicon {
89117
)}%)`,
90118
);
91119

92-
// Yield control every 4th progress update to prevent blocking
93-
if (i % (progressInterval * 4) === 0) {
120+
// Yield control every 2nd progress update to prevent blocking
121+
if (i % (progressInterval * 2) === 0) {
94122
await new Promise((resolve) => setTimeout(resolve, 0));
95123
}
96124
}
@@ -280,8 +308,13 @@ async function loadLexicon(language) {
280308
let worker;
281309

282310
try {
283-
// Special handling for French optimized format
284-
if (language === "French" || language === "German") {
311+
// Special handling for optimized format
312+
if (
313+
language === "French" ||
314+
language === "German" ||
315+
language === "Ukrainian" ||
316+
language === "Russian"
317+
) {
285318
return await loadOptimizedLexicon(language);
286319
}
287320

@@ -348,7 +381,7 @@ async function loadOptimizedLexicon(language) {
348381

349382
const optimizedLexicon = new OptimizedV3Lexicon();
350383
optimizedLexicon.stats.downloadTime = downloadTime;
351-
const success = await optimizedLexicon.loadFromBlob(blob);
384+
const success = await optimizedLexicon.loadFromBlob(blob, language);
352385

353386
if (!success) {
354387
throw new Error("Failed to load optimized lexicon");
4.62 MB
Binary file not shown.
690 KB
Binary file not shown.

0 commit comments

Comments
 (0)