-
Notifications
You must be signed in to change notification settings - Fork 4.2k
Expand file tree
/
Copy pathtranslate_docs.py
More file actions
531 lines (480 loc) · 21.1 KB
/
Copy pathtranslate_docs.py
File metadata and controls
531 lines (480 loc) · 21.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
# ruff: noqa
import os
import sys
import argparse
import subprocess
from pathlib import Path
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor
# import logging
# logging.basicConfig(level=logging.INFO)
# logging.getLogger("openai").setLevel(logging.DEBUG)
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-5.5")
ENABLE_CODE_SNIPPET_EXCLUSION = True
# gpt-4.5 needed this for better quality
ENABLE_SMALL_CHUNK_TRANSLATION = False
SEARCH_EXCLUSION = """---
search:
exclude: true
---
"""
# Define the source and target directories
source_dir = "docs"
REPO_ROOT = Path(__file__).resolve().parents[2]
languages = {
"ja": "Japanese",
"ko": "Korean",
"zh": "Chinese",
# Add more languages here, e.g., "fr": "French"
}
# Initialize OpenAI client
api_key = os.getenv("PROD_OPENAI_API_KEY") or os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=api_key)
# Define dictionaries for translation control
do_not_translate = [
"OpenAI",
"Agents SDK",
"Hello World",
"Model context protocol",
"MCP",
"structured outputs",
"Chain-of-Thought",
"Chat Completions",
"Computer-Using Agent",
"Code Interpreter",
"Function Calling",
"LLM",
"Operator",
"Playground",
"Realtime API",
"Sora",
"Agents as tools",
"Agents-as-tools",
# Add more terms here
]
eng_to_non_eng_mapping = {
"ja": {
"agents": "エージェント",
"agent orchestration": "エージェントオーケストレーション",
"orchestrating multiple agents": "エージェントオーケストレーション",
"computer use": "コンピュータ操作",
"OAI hosted tools": "OpenAI がホストするツール",
"well formed data": "適切な形式のデータ",
"guardrail": "ガードレール",
"handoffs": "ハンドオフ",
"function tools": "関数ツール",
"tracing": "トレーシング",
"code examples": "コード例",
"vector store": "ベクトルストア",
"deep research": "ディープリサーチ",
"category": "カテゴリー",
"user": "ユーザー",
"parameter": "パラメーター",
"processor": "プロセッサー",
"server": "サーバー",
"web search": "Web 検索",
"file search": "ファイル検索",
"streaming": "ストリーミング",
"system prompt": "システムプロンプト",
"Python first": "Python ファースト",
# Add more Japanese mappings here
},
"ko": {
"agents": "에이전트",
"agent orchestration": "에이전트 오케스트레이션",
"computer use": "컴퓨터 사용",
"OAI hosted tools": "OpenAI 호스트하는 도구",
"well formed data": "적절한 형식의 데이터",
"guardrail": "가드레일",
"orchestrating multiple agents": "에이전트 오케스트레이션",
"handoffs": "핸드오프",
"function tools": "함수 도구",
"function calling": "함수 호출",
"tracing": "트레이싱",
"code examples": "코드 예제",
"vector store": "벡터 스토어",
"deep research": "딥 리서치",
"category": "카테고리",
"user": "사용자",
"parameter": "매개변수",
"processor": "프로세서",
"server": "서버",
"web search": "웹 검색",
"file search": "파일 검색",
"streaming": "스트리밍",
"system prompt": "시스템 프롬프트",
"Python-first": "파이썬 우선",
"interruption": "인터럽션(중단 처리)",
"TypeScript-first": "TypeScript 우선",
"Human in the loop": "휴먼인더루프 (HITL)",
"Hosted tool": "호스티드 툴",
"Hosted MCP server tools": "호스티드 MCP 서버 도구",
"raw": "원문",
"Realtime Agents": "실시간 에이전트",
"Build your first agent in minutes.": "단 몇 분 만에 첫 에이전트를 만들 수 있습니다",
"Let's build": "시작하기",
},
"zh": {
"agents": "智能体",
"agent orchestration": "智能体编排",
"orchestrating multiple agents": "智能体编排",
"computer use": "计算机操作",
"OAI hosted tools": "由OpenAI托管的工具",
"well formed data": "格式良好的数据",
"guardrail": "安全防护措施",
"handoffs": "任务转移",
"function tools": "工具调用",
"tracing": "追踪",
"code examples": "代码示例",
"vector store": "向量存储",
"deep research": "深度研究",
"category": "目录",
"user": "用户",
"parameter": "参数",
"processor": "进程",
"server": "服务",
"web search": "网络检索",
"file search": "文件检索",
"streaming": "流式传输",
"system prompt": "系统提示词",
"Python first": "Python 优先",
# Add more mappings here
},
# Add more languages here
}
eng_to_non_eng_instructions = {
"common": [
"* The term 'examples' must be code examples when the page mentions the code examples in the repo, it can be translated as either 'code examples' or 'sample code'.",
"* The term 'primitives' can be translated as basic components.",
"* When the terms 'instructions' and 'tools' are mentioned as API parameter names, they must be kept as is.",
"* The terms 'temperature', 'top_p', 'max_tokens', 'presence_penalty', 'frequency_penalty' as parameter names must be kept as is.",
"* Keep the original structure like `* **The thing**: foo`; this needs to be translated as `* **(translation)**: (translation)`",
],
"ja": [
"* The term 'result' in the Runner guide context must be translated like 'execution results'",
"* The term 'raw' in 'raw response events' must be kept as is",
"* You must consistently use polite wording such as です/ます rather than である/なのだ.",
# Add more Japanese mappings here
],
"ko": [
"* 공손하고 중립적인 문체(합니다/입니다체)를 일관되게 사용하세요.",
"* 개발자 문서이므로 자연스러운 의역을 허용하되 정확성을 유지하세요.",
"* 'instructions', 'tools' 같은 API 매개변수와 temperature, top_p, max_tokens, presence_penalty, frequency_penalty 등은 영문 그대로 유지하세요.",
"* 문장이 아닌 불릿 항목 끝에는 마침표를 찍지 마세요.",
],
"zh": [
"* The term 'examples' must be code examples when the page mentions the code examples in the repo, it can be translated as either 'code examples' or 'sample code'.",
"* The term 'primitives' can be translated as basic components.",
"* When the terms 'instructions' and 'tools' are mentioned as API parameter names, they must be kept as is.",
"* The terms 'temperature', 'top_p', 'max_tokens', 'presence_penalty', 'frequency_penalty' as parameter names must be kept as is.",
"* Keep the original structure like `* **The thing**: foo`; this needs to be translated as `* **(translation)**: (translation)`",
],
# Add more languages here
}
def built_instructions(target_language: str, lang_code: str) -> str:
do_not_translate_terms = "\n".join(do_not_translate)
specific_terms = "\n".join(
[f"* {k} -> {v}" for k, v in eng_to_non_eng_mapping.get(lang_code, {}).items()]
)
specific_instructions = "\n".join(
eng_to_non_eng_instructions.get("common", [])
+ eng_to_non_eng_instructions.get(lang_code, [])
)
return f"""You are an expert technical translator.
Your task: translate the markdown passed as a user input from English into {target_language}.
The inputs are the official OpenAI Agents SDK framework documentation, and your translation outputs'll be used for serving the official {target_language} version of them. Thus, accuracy, clarity, and fidelity to the original are critical.
############################
## OUTPUT REQUIREMENTS ##
############################
You must return **only** the translated markdown. Do not include any commentary, metadata, or explanations. The original markdown structure must be strictly preserved.
#########################
## GENERAL RULES ##
#########################
- Be professional and polite.
- Keep the tone **natural** and concise.
- Do not omit any content. If a segment should stay in English, copy it verbatim.
- Do not change the markdown data structure, including the indentations.
- Section titles starting with # or ## must be a noun form rather than a sentence.
- Section titles must be translated except for the Do-Not-Translate list.
- Keep all placeholders such as `CODE_BLOCK_*` and `CODE_LINE_PREFIX` unchanged.
- Convert asset paths: `./assets/…` → `../assets/…`.
*Example:* `` → ``
- Treat the **Do‑Not‑Translate list** and **Term‑Specific list** as case‑insensitive; preserve the original casing you see.
- Skip translation for:
- Inline code surrounded by single back‑ticks ( `like_this` ).
- Fenced code blocks delimited by ``` or ~~~, including all comments inside them.
- Link URLs inside `[label](URL)` – translate the label, never the URL.
#########################
## HARD CONSTRAINTS ##
#########################
- Never insert spaces immediately inside emphasis markers. Use `**bold**`, not `** bold **`.
- Preserve the number of emphasis markers from the source: if the source uses `**` or `__`, keep the same pair count.
- Ensure one space after heading markers: `##Heading` -> `## Heading`.
- Ensure one space after list markers: `-Item` -> `- Item`, `*Item` -> `* Item` (does not apply to `**`).
- Trim spaces inside link/image labels: `[ Label ](url)` -> `[Label](url)`.
###########################
## GOOD / BAD EXAMPLES ##
###########################
- Good: This is **bold** text.
- Bad: This is ** bold ** text.
- Good: ## Heading
- Bad: ##Heading
- Good: - Item
- Bad: -Item
- Good: [Label](https://example.com)
- Bad: [ Label ](https://example.com)
#########################
## LANGUAGE‑SPECIFIC ##
#########################
*(applies only when {target_language} = Japanese)*
- Insert a half‑width space before and after all alphanumeric terms.
- Add a half‑width space just outside markdown emphasis markers: ` **太字** ` (good) vs `** 太字 **` (bad).
*(applies only when {target_language} = Korean)*
- Do not alter spaces around code/identifiers; keep them as in the original.
- Do not add stray spaces around markdown emphasis: `**굵게**` (good) vs `** 굵게 **` (bad).
#########################
## DO NOT TRANSLATE ##
#########################
When replacing the following terms, do not have extra spaces before/after them:
{do_not_translate_terms}
#########################
## TERM‑SPECIFIC ##
#########################
Translate these terms exactly as provided (no extra spaces):
{specific_terms}
#########################
## EXTRA GUIDELINES ##
#########################
{specific_instructions}
- When translating Markdown tables, preserve the exact table structure, including all delimiters (|), header separators (---), and row/column counts. Only translate the cell contents. Do not add, remove, or reorder columns or rows.
#########################
## IF UNSURE ##
#########################
If you are uncertain about a term, leave the original English term in parentheses after your translation.
#########################
## WORKFLOW ##
#########################
Follow the following workflow to translate the given markdown text data:
1. Read the input markdown text given by the user.
2. Translate the markdown file into {target_language}, carefully following the requirements above.
3. Perform a self-review to check for the following common issues:
- Naturalness, accuracy, and consistency throughout the text.
- Spacing inside markdown syntax such as `*` or `_`; `**bold**` is correct whereas `** bold **` is not.
- Unwanted spaces inside link or image labels, such as `[ Label ](url)`.
- Headings or list markers missing a space after their marker.
4. If improvements are necessary, refine the content without changing the original meaning.
5. Continue improving the translation until you are fully satisfied with the result.
6. Once the final output is ready, return **only** the translated markdown text. No extra commentary.
"""
# Function to translate and save files
def translate_file(file_path: str, target_path: str, lang_code: str) -> None:
print(f"Translating {file_path} into a different language: {lang_code}")
with open(file_path, encoding="utf-8") as f:
content = f.read()
# Split content into lines
lines: list[str] = content.splitlines()
chunks: list[str] = []
current_chunk: list[str] = []
# Split content into chunks of up to 120 lines, ensuring splits occur before section titles
in_code_block = False
code_blocks: list[str] = []
code_block_chunks: list[str] = []
for line in lines:
if (
ENABLE_SMALL_CHUNK_TRANSLATION is True
and len(current_chunk) >= 120 # required for gpt-4.5
and not in_code_block
and line.startswith("#")
):
chunks.append("\n".join(current_chunk))
current_chunk = []
if ENABLE_CODE_SNIPPET_EXCLUSION is True and line.strip().startswith("```"):
code_block_chunks.append(line)
if in_code_block is True:
code_blocks.append("\n".join(code_block_chunks))
current_chunk.append(f"CODE_BLOCK_{(len(code_blocks) - 1):03}")
code_block_chunks.clear()
in_code_block = not in_code_block
continue
if in_code_block is True:
code_block_chunks.append(line)
else:
current_chunk.append(line)
if current_chunk:
chunks.append("\n".join(current_chunk))
# Translate each chunk separately and combine results
translated_content: list[str] = []
for chunk in chunks:
instructions = built_instructions(languages[lang_code], lang_code)
if OPENAI_MODEL.startswith("gpt-5"):
response = openai_client.responses.create(
model=OPENAI_MODEL,
instructions=instructions,
input=chunk,
reasoning={"effort": "none"},
text={"verbosity": "low"},
)
translated_content.append(response.output_text)
elif OPENAI_MODEL.startswith("o"):
response = openai_client.responses.create(
model=OPENAI_MODEL,
instructions=instructions,
input=chunk,
)
translated_content.append(response.output_text)
else:
response = openai_client.responses.create(
model=OPENAI_MODEL,
instructions=instructions,
input=chunk,
temperature=0.0,
)
translated_content.append(response.output_text)
translated_text = "\n".join(translated_content)
for idx, code_block in enumerate(code_blocks):
translated_text = translated_text.replace(f"CODE_BLOCK_{idx:03}", code_block)
# FIXME: enable mkdocs search plugin to seamlessly work with i18n plugin
translated_text = SEARCH_EXCLUSION + translated_text
# Save the combined translated content
with open(target_path, "w", encoding="utf-8") as f:
f.write(translated_text)
def git_last_commit_timestamp(path: str) -> int:
try:
relative_path = os.path.relpath(path, REPO_ROOT)
result = subprocess.run(
["git", "-C", str(REPO_ROOT), "log", "-1", "--format=%ct", "--", relative_path],
capture_output=True,
text=True,
check=False,
)
if result.returncode != 0:
return 0
output = result.stdout.strip()
if not output:
return 0
return int(output)
except Exception:
return 0
def should_translate_based_on_translation(file_path: str) -> bool:
relative_path = os.path.relpath(file_path, source_dir)
ja_path = os.path.join(source_dir, "ja", relative_path)
en_timestamp = git_last_commit_timestamp(file_path)
if en_timestamp == 0:
return True
ja_timestamp = git_last_commit_timestamp(ja_path)
if ja_timestamp == 0:
return True
return ja_timestamp < en_timestamp
def translate_single_source_file(
file_path: str, *, check_translation_outdated: bool = True
) -> None:
relative_path = os.path.relpath(file_path, source_dir)
if "ref/" in relative_path or not file_path.endswith(".md"):
return
if check_translation_outdated and not should_translate_based_on_translation(file_path):
print(f"Skipping {file_path}: The translated one is up-to-date.")
return
for lang_code in languages:
target_dir = os.path.join(source_dir, lang_code)
target_path = os.path.join(target_dir, relative_path)
# Ensure the target directory exists
os.makedirs(os.path.dirname(target_path), exist_ok=True)
# Translate and save the file
translate_file(file_path, target_path, lang_code)
def normalize_source_file_arg(file_arg: str) -> str:
if file_arg.startswith(f"{source_dir}/"):
return file_arg[len(source_dir) + 1 :]
if os.path.isabs(file_arg):
return os.path.relpath(file_arg, source_dir)
return file_arg
def translate_source_files(
file_paths: list[str], *, check_translation_outdated: bool = True
) -> None:
unique_paths = list(dict.fromkeys(file_paths))
if not unique_paths:
return
concurrency = min(6, len(unique_paths))
if concurrency <= 1:
translate_single_source_file(
unique_paths[0], check_translation_outdated=check_translation_outdated
)
return
with ThreadPoolExecutor(max_workers=concurrency) as executor:
futures = [
executor.submit(
translate_single_source_file,
path,
check_translation_outdated=check_translation_outdated,
)
for path in unique_paths
]
for future in futures:
future.result()
def main():
parser = argparse.ArgumentParser(description="Translate documentation files")
parser.add_argument(
"--file",
action="append",
type=str,
help="Specific file to translate (relative to docs directory).",
)
parser.add_argument(
"--file-list",
type=str,
help="Path to a newline-delimited file list to translate.",
)
parser.add_argument(
"--mode",
choices=["only-changes", "full"],
default="only-changes",
help="Translation mode. 'only-changes' translates only when the Japanese file is older than the English source.",
)
args = parser.parse_args()
check_translation_outdated = args.mode == "only-changes"
if args.file or args.file_list:
file_args: list[str] = []
if args.file:
file_args.extend(args.file)
if args.file_list:
with open(args.file_list, encoding="utf-8") as f:
file_args.extend([line.strip() for line in f.read().splitlines() if line.strip()])
file_paths: list[str] = []
for file_arg in file_args:
relative_file = normalize_source_file_arg(file_arg)
file_path = os.path.join(source_dir, relative_file)
if os.path.exists(file_path):
file_paths.append(file_path)
else:
print(f"Warning: File {file_path} does not exist; skipping.")
if not file_paths:
print("Error: No valid files found to translate")
sys.exit(1)
translate_source_files(file_paths, check_translation_outdated=check_translation_outdated)
print("Translation completed for requested file(s)")
else:
# Traverse the source directory (original behavior)
for root, _, file_names in os.walk(source_dir):
# Skip the target directories
if any(lang in root for lang in languages):
continue
# Increasing this will make the translation faster; you can decide considering the model's capacity
concurrency = 6
with ThreadPoolExecutor(max_workers=concurrency) as executor:
futures = []
for file_name in file_names:
filepath = os.path.join(root, file_name)
futures.append(
executor.submit(
translate_single_source_file,
filepath,
check_translation_outdated=check_translation_outdated,
)
)
if len(futures) >= concurrency:
for future in futures:
future.result()
futures.clear()
print("Translation completed.")
if __name__ == "__main__":
# translate_single_source_file("docs/index.md")
main()