Skip to content

Commit fb89cc7

Browse files
committed
Clean up translation scripts
* Strip whitespace
* Remove unused imports
* Import lib modules directly to play nice with IDE
1 parent 09066c3 commit fb89cc7

2 files changed

Lines changed: 70 additions & 80 deletions

File tree

scripts/lib/translation_filler.py

Lines changed: 30 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
import os
2-
import sys
32
import json
43
import logging
54
from anthropic import Anthropic
6-
from translation import Translation
75
from iso639 import Lang
86
from iso639.exceptions import InvalidLanguageValue
97

@@ -28,78 +26,78 @@ def narrow_down_source_and_target(source_content, target_content, missing_keys):
2826
"""
2927
# Find common keys between source and target
3028
common_keys = [k for k in source_content if k in target_content]
31-
29+
3230
# Find the maximum number of parts in any missing key
3331
max_parts = 0
3432
for key in missing_keys:
3533
parts = key.split('.')
3634
max_parts = max(max_parts, len(parts))
37-
35+
3836
# Start with an empty set of related keys
3937
related_keys = set()
40-
38+
4139
# Iterate from max_parts-1 down to 0
4240
for num_matching_parts in range(max_parts-1, -1, -1):
4341
# For each common key, check if it matches any missing key with at least num_matching_parts
4442
for common_key in common_keys:
4543
# Skip keys we've already added
4644
if common_key in related_keys:
4745
continue
48-
46+
4947
common_parts = common_key.split('.')
50-
48+
5149
# Check against each missing key
5250
for missing_key in missing_keys:
5351
missing_parts = missing_key.split('.')
54-
52+
5553
# Check if we have at least num_matching_parts matching parts
5654
matches = 0
5755
for i in range(min(len(common_parts), len(missing_parts))):
5856
if common_parts[i] == missing_parts[i]:
5957
matches += 1
6058
else:
6159
break
62-
60+
6361
# If we have enough matching parts, add this key to our context
6462
if matches >= num_matching_parts:
6563
related_keys.add(common_key)
6664
break
67-
65+
6866
# If we have enough context keys, we can stop
6967
if len(related_keys) >= 5:
7068
break
71-
69+
7270
# Convert to list and limit to 5 keys if needed
7371
related_keys = list(related_keys)
7472
if len(related_keys) > 5:
7573
related_keys = related_keys[:5]
76-
74+
7775
# Create narrowed down dictionaries with only the keys to keep
7876
narrowed_source = {k: source_content[k] for k in related_keys if k in source_content}
7977
narrowed_target = {k: target_content[k] for k in related_keys if k in target_content}
80-
78+
8179
# Add missing keys to the source dictionary
8280
for key in missing_keys:
8381
if key in source_content:
8482
narrowed_source[key] = source_content[key]
85-
83+
8684
return narrowed_source, narrowed_target, missing_keys
8785

8886
def fill_up_gaps_in_content(source_content, target_content, missing_keys_list, language_code):
8987
# Get API key from environment if not provided
9088
api_key = os.getenv('ANTHROPIC_API_KEY')
9189
if not api_key:
9290
raise ValueError("API key must be set in ANTHROPIC_API_KEY environment variable")
93-
91+
9492
# Create client
9593
client = Anthropic(api_key=api_key)
96-
94+
9795
# Get the full language name
9896
language_name = get_language_name(language_code)
99-
97+
10098
# Check if target has any existing translations
10199
has_existing_translations = len(target_content) > 0
102-
100+
103101
if has_existing_translations:
104102
# Original prompt when we have existing translations to learn from
105103
prompt = f"""Here are two translation files:
@@ -140,7 +138,7 @@ def fill_up_gaps_in_content(source_content, target_content, missing_keys_list, l
140138
schema_properties = {}
141139
for key in missing_keys_list:
142140
schema_properties[key] = {"type": "string"}
143-
141+
144142
# Build the required keys list
145143
required_keys = missing_keys_list
146144

@@ -163,61 +161,61 @@ def fill_up_gaps_in_content(source_content, target_content, missing_keys_list, l
163161
}
164162
)
165163
logger.debug(message.content)
166-
164+
167165
return json.loads(message.content[0].text.strip())
168166

169167
def fill_up_translation(source_translation, target_translation, language_code, output_path, batch_size=10):
    """Fill the keys missing from ``target_translation`` in batches.

    Missing keys are determined by ``find_missing_keys`` over the ``entries``
    of both translations. Each batch is narrowed with
    ``narrow_down_source_and_target``, translated with
    ``fill_up_gaps_in_content``, written into ``target_translation`` via
    ``set`` and saved to ``output_path`` before the next batch starts, so
    completed batches are already on disk if a later batch fails.

    Args:
        source_translation: Translation object whose ``entries`` are the
            reference content.
        target_translation: Translation object to complete; it is modified
            in place through ``set`` and through its ``entries`` mapping.
        language_code: Code of the target language, passed on to
            ``get_language_name`` and ``fill_up_gaps_in_content``.
        output_path: Destination handed to ``save_as_json`` after each batch.
        batch_size: Maximum number of missing keys handled per batch.
            Must be at least 1.

    Returns:
        The same ``target_translation`` object that was passed in.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A zero batch size divides by zero and a negative one never advances
    # towards the end of the key list, so reject both before doing any work.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    source_content = source_translation.entries
    target_content = target_translation.entries

    all_missing_keys = find_missing_keys(source_content, target_content)

    if not all_missing_keys:
        logger.debug("No missing keys found. Translation is complete.")
        return target_translation

    total_missing = len(all_missing_keys)
    language_name = get_language_name(language_code)
    logger.info(f"Found {total_missing} missing keys for {language_name} ({language_code}). Processing in batches of {batch_size}.")

    # Ceiling division; constant for the whole run, so computed once.
    total_batches = (total_missing + batch_size - 1) // batch_size

    # Process missing keys in batches
    for batch_num, batch_start in enumerate(range(0, total_missing, batch_size), start=1):
        batch_missing_keys = all_missing_keys[batch_start:batch_start + batch_size]
        logger.info(f"Processing batch {batch_num}/{total_batches} ({len(batch_missing_keys)} keys)")

        # Narrow down source and target content for this batch
        narrowed_source, narrowed_target, _ = narrow_down_source_and_target(
            source_content, target_content, batch_missing_keys
        )

        # Fill up translations using Claude for this batch
        partial_json = fill_up_gaps_in_content(narrowed_source, narrowed_target, batch_missing_keys, language_code)

        # Update target translation with new translations; keys that were
        # not requested in this batch are ignored.
        for key, value in partial_json.items():
            if key in batch_missing_keys:
                logger.info(f"Filling up key: {key}")
                target_translation.set(key, value)
                # Update target_content so subsequent batches see this translation
                target_content[key] = value

        # Save to disk after each batch
        target_translation.save_as_json(output_path)
        logger.info(f"Batch {batch_num}/{total_batches} completed and saved to {output_path}")

    # Final summary
    remaining_missing = find_missing_keys(source_content, target_translation.entries)
    if remaining_missing:
        logger.warning(f"Translation completed with {len(remaining_missing)} keys still missing")
    else:
        logger.info("All translations completed successfully!")

    return target_translation

0 commit comments

Comments
 (0)