From b4bca6176c6d9df20aecd2668c0ccebc1cf8d681 Mon Sep 17 00:00:00 2001 From: Alter-xyz <88554920+alterxyz@users.noreply.github.com> Date: Mon, 21 Jul 2025 15:43:23 +0800 Subject: [PATCH 1/2] feat: auto translate Powered By Dify --- .github/workflows/translate.yml | 72 ++++++ tools/translate/.env.example | 1 + tools/translate/.gitignore | 1 + tools/translate/README.md | 125 +++++++++ tools/translate/README_CN.md | 131 ++++++++++ tools/translate/README_JA.md | 125 +++++++++ tools/translate/main.py | 429 +++++++++++++++++++++++++++++++ tools/translate/requirements.txt | 3 + tools/translate/termbase_i18n.md | 64 +++++ 9 files changed, 951 insertions(+) create mode 100644 .github/workflows/translate.yml create mode 100644 tools/translate/.env.example create mode 100644 tools/translate/.gitignore create mode 100644 tools/translate/README.md create mode 100644 tools/translate/README_CN.md create mode 100644 tools/translate/README_JA.md create mode 100644 tools/translate/main.py create mode 100644 tools/translate/requirements.txt create mode 100644 tools/translate/termbase_i18n.md diff --git a/.github/workflows/translate.yml b/.github/workflows/translate.yml new file mode 100644 index 000000000..549bfaeab --- /dev/null +++ b/.github/workflows/translate.yml @@ -0,0 +1,72 @@ +name: Auto Translate Docs + +on: + push: + branches-ignore: + - 'main' + +jobs: + translate: + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetches all history for git diff + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Install dependencies + run: pip install httpx aiofiles python-dotenv + + - name: Get changed markdown files + id: changed-files + run: | + # Get the list of changed files between the current and previous commit + # We filter for .md and .mdx files that are inside the language directories + 
files=$(git diff --name-only ${{ github.event.before }} ${{ github.sha }} | grep -E '^(en|zh-hans|ja-jp|plugin-dev-en|plugin-dev-zh|plugin-dev-ja)/.*(\.md|\.mdx)$' || true) + if [[ -z "$files" ]]; then + echo "No markdown files to translate." + echo "files=" >> $GITHUB_OUTPUT + else + # The script expects absolute paths, but we run it from the root, so relative is fine. + echo "files<<EOF" >> $GITHUB_OUTPUT + echo "$files" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + fi + + - name: Run translation script + if: steps.changed-files.outputs.files + env: + DIFY_API_KEY: ${{ secrets.DIFY_API_KEY }} + run: | + echo "Files to translate:" + echo "${{ steps.changed-files.outputs.files }}" + + echo "${{ steps.changed-files.outputs.files }}" | while IFS= read -r file; do + if [[ -n "$file" ]]; then + echo "Translating $file..." + python tools/translate/main.py "$file" "$DIFY_API_KEY" + fi + done + + - name: Commit and push changes + run: | + git config --global user.name 'github-actions[bot]' + git config --global user.email 'github-actions[bot]@users.noreply.github.com' + # Check if there are any changes to commit + if [[ -n $(git status --porcelain) ]]; then + git add . + git commit -m "docs: auto-translate documentation" + # Push to the same branch the workflow was triggered from + git push origin HEAD:${{ github.ref_name }} + echo "Translated files have been pushed to the branch." + else + echo "No new translations to commit." 
+ fi diff --git a/tools/translate/.env.example b/tools/translate/.env.example new file mode 100644 index 000000000..3e36dd65b --- /dev/null +++ b/tools/translate/.env.example @@ -0,0 +1 @@ +dify_api_key=your_dify_api_key_here \ No newline at end of file diff --git a/tools/translate/.gitignore b/tools/translate/.gitignore new file mode 100644 index 000000000..2eea525d8 --- /dev/null +++ b/tools/translate/.gitignore @@ -0,0 +1 @@ +.env \ No newline at end of file diff --git a/tools/translate/README.md b/tools/translate/README.md new file mode 100644 index 000000000..169cc72d4 --- /dev/null +++ b/tools/translate/README.md @@ -0,0 +1,125 @@ +# Automatic Document Translation + +Multi-language document auto-translation system based on GitHub Actions and Dify AI, supporting English, Chinese, and Japanese trilingual translation. + +> **Other Languages**: [中文](README_CN.md) | [日本語](README_JA.md) + +## How It Works + +1. **Trigger Condition**: Automatically runs when pushing to non-main branches +2. **Smart Detection**: Automatically identifies modified `.md/.mdx` files and determines source language +3. **Translation Logic**: + - ✅ Translates new documents to other languages + - ❌ Skips existing translation files (avoids overwriting manual edits) +4. 
**Auto Commit**: Translation results are automatically pushed to the current branch + +## System Features + +- 🌐 **Multi-language Support**: Configuration-based language mapping, theoretically supports any language extension +- 📚 **Terminology Consistency**: Built-in professional terminology database, LLM intelligently follows terminology to ensure unified technical vocabulary translation +- 🔄 **Concurrent Processing**: Smart concurrency control, translates multiple target languages simultaneously +- 🛡️ **Fault Tolerance**: 3-retry mechanism with exponential backoff strategy +- ⚡ **Incremental Translation**: Only processes changed files, avoids redundant work +- 🧠 **High-Performance Models**: Uses high-performance LLM models to ensure translation quality + +## Usage + +### For Document Writers + +1. Write/modify documents in any language directory +2. Push to branch (non-main) +3. Wait 0.5-1 minute for automatic translation completion +4. **View Translation Results**: + - Create Pull Request for local viewing and subsequent editing + - Or view Actions push commit details on GitHub to directly review translation quality + +### Supported Language Directories + +- **General Documentation**: `en/` ↔ `zh-hans/` ↔ `ja-jp/` +- **Plugin Development Documentation**: `plugin-dev-en/` ↔ `plugin-dev-zh/` ↔ `plugin-dev-ja/` + +Note: System architecture supports extending more languages, just modify configuration files + +## Important Notes + +- System only translates new documents, won't overwrite existing translations +- To update existing translations, manually delete target files then retrigger +- Terminology translation follows professional vocabulary in `termbase_i18n.md`, LLM has intelligent terminology recognition capabilities +- Translation quality depends on configured high-performance models, recommend using high-performance base models in Dify Studio + +### System Configuration + +#### Terminology Database + +Edit `tools/translate/termbase_i18n.md` to update 
professional terminology translation reference table. + +#### Translation Model + +Visit Dify Studio to adjust translation prompts or change base models. + +--- + +## 🔧 Development and Deployment Configuration + +### Local Development Environment + +#### 1. Create Virtual Environment + +```bash +# Create virtual environment +python -m venv venv + +# Activate virtual environment +# macOS/Linux: +source venv/bin/activate +# Windows: +# venv\Scripts\activate +``` + +#### 2. Install Dependencies + +```bash +pip install -r tools/translate/requirements.txt +``` + +#### 3. Configure API Key + +Create `.env` file in `tools/translate/` directory: + +```bash +DIFY_API_KEY=your_dify_api_key_here +``` + +#### 4. Run Translation + +```bash +# Interactive mode (recommended for beginners) +python tools/translate/main.py + +# Command line mode (specify file) +python tools/translate/main.py path/to/file.mdx [DIFY_API_KEY] +``` + +> **Tip**: Right-click in IDE and select "Copy Relative Path" to use as parameter + +### Deploy to Other Repositories + +1. **Copy Files**: + - `.github/workflows/translate.yml` + - `tools/translate/` entire directory + +2. **Configure GitHub Secrets**: + - Repository Settings → Secrets and variables → Actions + - Add `DIFY_API_KEY` secret + +3. **Test**: Modify documents in branch to verify automatic translation functionality + +### Technical Details + +- Concurrent translation limited to 2 tasks to avoid excessive API pressure +- Supports `.md` and `.mdx` file formats +- Based on Dify API workflow mode + +## TODO + +- [ ] Support updating existing translations diff --git a/tools/translate/README_CN.md b/tools/translate/README_CN.md new file mode 100644 index 000000000..b7ff23521 --- /dev/null +++ b/tools/translate/README_CN.md @@ -0,0 +1,131 @@ +# 自动翻译文档 + +基于 GitHub Actions 和 Dify AI 的文档自动翻译系统,支持英文、中文、日文三语互译。 + +> **其他语言**: [English](README.md) | [日本語](README_JA.md) + +## 工作原理 + +1. **触发条件**: 推送到非 main 分支时自动运行 +2. 
**智能检测**: 自动识别修改的 `.md/.mdx` 文件并判断源语言 +3. **翻译逻辑**: + - ✅ 翻译新增文档到其他语言 + - ❌ 跳过已存在的翻译文件(避免覆盖手动修改) +4. **自动提交**: 翻译结果自动推送到当前分支 + +## 系统特性 + +- 🌐 **多语言支持**: 基于配置的语言映射,理论上支持任意语言扩展 +- 📚 **术语表一致性**: 内置专业术语库,LLM 智能遵循术语表确保技术词汇翻译统一 +- 🔄 **并发处理**: 智能并发控制,同时翻译多个目标语言 +- 🛡️ **容错机制**: 3 次重试机制,指数退避策略 +- ⚡ **增量翻译**: 只处理变更文件,避免重复工作 +- 🧠 **高性能模型**: 使用性能较强的 LLM 模型确保翻译质量 + +## 使用方法 + +### 文档编写者 + +1. 在任意语言目录下编写/修改文档 +2. 推送到分支(非 main) +3. 等待 0.5-1 分钟自动翻译完成 +4. **查看翻译结果**: + - 创建 Pull Request 进行本地查看和后续编辑 + - 或在 GitHub 查看 Actions 推送的 commit 详情,直接审查翻译质量 + +### 支持的语言目录 + +- **通用文档**: `en/` ↔ `zh-hans/` ↔ `ja-jp/` +- **插件开发文档**: `plugin-dev-en/` ↔ `plugin-dev-zh/` ↔ `plugin-dev-ja/` + +注:系统架构支持扩展更多语言,只需修改配置文件 + +## 注意事项 + +- 系统只翻译新文档,不会覆盖已存在的翻译 +- 如需更新现有翻译,请手动删除目标文件后重新触发 +- 术语翻译遵循 `termbase_i18n.md` 中的专业词汇表,LLM 具备智能术语识别能力 +- 翻译质量依赖于配置的高性能模型,建议在 Dify Studio 中使用性能较强的基座模型 + +### 系统配置 + +#### 术语表 + +编辑 `tools/translate/termbase_i18n.md` 更新专业术语翻译对照表。 + +#### 翻译模型 + +访问 Dify Studio 调整翻译 prompt 或更换基座模型。 + +--- + +--- + +--- + +## 🔧 开发和部署配置 + +### 本地开发环境 + +#### 1. 创建虚拟环境 + +```bash +# 创建虚拟环境 +python -m venv venv + +# 激活虚拟环境 +# macOS/Linux: +source venv/bin/activate +# Windows: +# venv\Scripts\activate +``` + +#### 2. 安装依赖 + +```bash +pip install -r tools/translate/requirements.txt +``` + +#### 3. 配置 API 密钥 + +在 `tools/translate/` 目录下创建 `.env` 文件: + +```bash +DIFY_API_KEY=your_dify_api_key_here +``` + +#### 4. 运行翻译 + +```bash +# 交互模式(推荐新手使用) +python tools/translate/main.py + +# 命令行模式(指定文件) +python tools/translate/main.py path/to/file.mdx [DIFY_API_KEY] +``` + +> **提示**: 在 IDE 中右键选择"复制相对路径"作为参数传入 + +### 部署到其他仓库 + +1. **复制文件**: + + - `.github/workflows/translate.yml` + - `tools/translate/` 整个目录 + +2. **配置 GitHub Secrets**: + + - Repository Settings → Secrets and variables → Actions + - 添加 `DIFY_API_KEY` 密钥 + +3. 
**测试**: 在分支中修改文档,验证自动翻译功能 + +### 技术细节 + +- 并发翻译限制为 2 个任务,避免 API 压力过大 +- 支持 `.md` 和 `.mdx` 文件格式 +- 基于 Dify API 的 workflow 模式 + +## TODO + +- [ ] 支持更新现有翻译 diff --git a/tools/translate/README_JA.md b/tools/translate/README_JA.md new file mode 100644 index 000000000..92a2595b7 --- /dev/null +++ b/tools/translate/README_JA.md @@ -0,0 +1,125 @@ +# 自動ドキュメント翻訳 + +GitHub Actions と Dify AI に基づく多言語ドキュメント自動翻訳システム。英語、中国語、日本語の三言語相互翻訳をサポートします。 + +> **他の言語**: [中文](README_CN.md) | [English](README.md) + +## 動作原理 + +1. **トリガー条件**: main以外のブランチにプッシュすると自動実行 +2. **スマート検出**: 変更された `.md/.mdx` ファイルを自動識別し、元言語を判定 +3. **翻訳ロジック**: + - ✅ 新規ドキュメントを他の言語に翻訳 + - ❌ 既存の翻訳ファイルはスキップ(手動編集の上書きを回避) +4. **自動コミット**: 翻訳結果を現在のブランチに自動プッシュ + +## システム特徴 + +- 🌐 **多言語サポート**: 設定ベースの言語マッピング、理論的には任意の言語拡張をサポート +- 📚 **用語の一貫性**: 内蔵専門用語データベース、LLMが用語表を賢く遵守し技術用語翻訳の統一を確保 +- 🔄 **並行処理**: スマート並行制御、複数のターゲット言語を同時翻訳 +- 🛡️ **フォールトトレランス**: 3回再試行メカニズム、指数バックオフ戦略 +- ⚡ **増分翻訳**: 変更ファイルのみ処理、重複作業を回避 +- 🧠 **高性能モデル**: 高性能LLMモデルを使用し翻訳品質を確保 + +## 使用方法 + +### ドキュメント作成者向け + +1. 任意の言語ディレクトリでドキュメントを作成/修正 +2. ブランチ(main以外)にプッシュ +3. 0.5-1分待って自動翻訳完了 +4. **翻訳結果の確認**: + - Pull Requestを作成してローカル確認と後続編集 + - またはGitHubでActionsプッシュコミット詳細を確認し、翻訳品質を直接レビュー + +### サポート言語ディレクトリ + +- **一般ドキュメント**: `en/` ↔ `zh-hans/` ↔ `ja-jp/` +- **プラグイン開発ドキュメント**: `plugin-dev-en/` ↔ `plugin-dev-zh/` ↔ `plugin-dev-ja/` + +注:システムアーキテクチャはより多くの言語拡張をサポート、設定ファイルの修正のみで可能 + +## 注意事項 + +- システムは新規ドキュメントのみ翻訳、既存翻訳は上書きしません +- 既存翻訳を更新する場合、対象ファイルを手動削除してから再トリガー +- 用語翻訳は `termbase_i18n.md` の専門用語表に従い、LLMはスマート用語認識機能を持つ +- 翻訳品質は設定された高性能モデルに依存、Dify Studioで高性能ベースモデル使用を推奨 + +### システム設定 + +#### 用語データベース + +`tools/translate/termbase_i18n.md` を編集して専門用語翻訳対照表を更新。 + +#### 翻訳モデル + +Dify Studio で翻訳プロンプトの調整やベースモデルの変更。 + +--- + +## 🔧 開発とデプロイ設定 + +### ローカル開発環境 + +#### 1. 仮想環境作成 + +```bash +# 仮想環境作成 +python -m venv venv + +# 仮想環境アクティベート +# macOS/Linux: +source venv/bin/activate +# Windows: +# venv\Scripts\activate +``` + +#### 2. 
依存関係インストール + +```bash +pip install -r tools/translate/requirements.txt +``` + +#### 3. APIキー設定 + +`tools/translate/` ディレクトリに `.env` ファイルを作成: + +```bash +DIFY_API_KEY=your_dify_api_key_here +``` + +#### 4. 翻訳実行 + +```bash +# インタラクティブモード(初心者推奨) +python tools/translate/main.py + +# コマンドラインモード(ファイル指定) +python tools/translate/main.py path/to/file.mdx [DIFY_API_KEY] +``` + +> **ヒント**: IDEで右クリック「相対パスをコピー」をパラメータとして使用 + +### 他のリポジトリへのデプロイ + +1. **ファイルコピー**: + - `.github/workflows/translate.yml` + - `tools/translate/` ディレクトリ全体 + +2. **GitHub Secrets設定**: + - Repository Settings → Secrets and variables → Actions + - `DIFY_API_KEY` シークレットを追加 + +3. **テスト**: ブランチでドキュメント修正し、自動翻訳機能を検証 + +### 技術詳細 + +- 並行翻訳は2タスクに制限、過度なAPI圧迫を回避 +- `.md` と `.mdx` ファイル形式をサポート +- Dify API ワークフローモードベース + +## TODO + +- [ ] 既存翻訳の更新サポート diff --git a/tools/translate/main.py b/tools/translate/main.py new file mode 100644 index 000000000..05ddb21fd --- /dev/null +++ b/tools/translate/main.py @@ -0,0 +1,429 @@ +import httpx +import os +import sys +import asyncio +import aiofiles + +docs_structure = { + "general_help": { + "English": "en", + "Chinese": "zh-hans", + "Japanese": "ja-jp" + }, + "plugin_dev": { + "English": "plugin-dev-en", + "Chinese": "plugin-dev-zh", + "Japanese": "plugin-dev-ja" + } +} + + +async def translate_text(file_path, dify_api_key, original_language, target_language1, termbase_path=None, max_retries=3): + """ + Translate text using Dify API with termbase from `tools/translate/termbase_i18n.md` + """ + if termbase_path is None: + # Get project root directory + script_dir = os.path.dirname(os.path.abspath(__file__)) + base_dir = os.path.dirname(os.path.dirname(script_dir)) # Two levels up + termbase_path = os.path.join(base_dir, "tools", "translate", "termbase_i18n.md") + + url = "https://api.dify.ai/v1/workflows/run" + + termbase = await load_md_mdx(termbase_path) + the_doc = await load_md_mdx(file_path) + payload = { + "response_mode": "blocking", + "user": "Dify", + "inputs": { + 
"original_language": original_language, + "output_language1": target_language1, + "the_doc": the_doc, + "termbase": termbase + } + } + + headers = { + "Authorization": "Bearer " + dify_api_key, + "Content-Type": "application/json" + } + + # Retry mechanism + for attempt in range(max_retries): + try: + # Add delay to avoid concurrent pressure + if attempt > 0: + delay = attempt * 2 # Incremental delay: 2s, 4s, 6s + print(f"Retrying in {delay} seconds... (attempt {attempt + 1}/{max_retries})") + await asyncio.sleep(delay) + + async with httpx.AsyncClient(timeout=120.0) as client: # Increase timeout to 120 seconds + response = await client.post(url, json=payload, headers=headers) + + # Check HTTP status code + if response.status_code != 200: + print(f"HTTP Error: {response.status_code}") + print(f"Response: {response.text}") + if attempt == max_retries - 1: # Last attempt + return "" + continue + + try: + response_data = response.json() + print(f"API Response: {response_data}") # Debug info + + # Extract output1 + output1 = response_data.get("data", {}).get("outputs", {}).get("output1", "") + if not output1: + print("Warning: No output1 found in response") + print(f"Full response: {response_data}") + return output1 + except Exception as e: + print(f"Error parsing response: {e}") + print(f"Response text: {response.text}") + if attempt == max_retries - 1: # Last attempt + return "" + continue + + except httpx.ReadTimeout as e: + print(f"Request timeout (attempt {attempt + 1}/{max_retries}): {str(e)}") + if attempt == max_retries - 1: # Last attempt + print(f"All {max_retries} attempts failed due to timeout") + return "" + except Exception as e: + print(f"Unexpected error (attempt {attempt + 1}/{max_retries}): {str(e)}") + if attempt == max_retries - 1: # Last attempt + return "" + + return "" + + +async def load_md_mdx(file_path): + async with aiofiles.open(file_path, "r", encoding="utf-8") as f: + content = await f.read() + return content + + +def 
determine_doc_type_and_language(file_path): + """ + Determine document type and current language based on file path + Returns (doc_type, current_language, language_name) + """ + path_parts = file_path.split(os.sep) + + for doc_type, languages in docs_structure.items(): + for lang_name, lang_code in languages.items(): + if lang_code in path_parts: + return doc_type, lang_code, lang_name + + return None, None, None + + +def get_language_code_name_map(doc_type): + """ + Get mapping from language code to language name + """ + code_to_name = {} + for lang_name, lang_code in docs_structure[doc_type].items(): + code_to_name[lang_code] = lang_name + return code_to_name + + +def generate_target_path(file_path, current_lang_code, target_lang_code): + """ + Generate target language file path + """ + return file_path.replace(current_lang_code, target_lang_code) + + +async def save_translated_content(content, file_path): + """ + Save translated content to file + """ + try: + print(f"Attempting to save to: {file_path}") + print(f"Content length: {len(content)} characters") + + # Ensure directory exists + os.makedirs(os.path.dirname(file_path), exist_ok=True) + + # Save file + async with aiofiles.open(file_path, "w", encoding="utf-8") as f: + await f.write(content) + + # Verify file was saved successfully + if os.path.exists(file_path): + file_size = os.path.getsize(file_path) + print(f"✓ Translated content saved to {file_path} (size: {file_size} bytes)") + else: + print(f"✗ Failed to save file: {file_path}") + except Exception as e: + print(f"Error saving file {file_path}: {str(e)}") + + +async def translate_single_file(file_path, dify_api_key, current_lang_name, target_lang_code, target_lang_name, current_lang_code, semaphore): + """ + Async translate single file (using semaphore to control concurrency) + """ + async with semaphore: # Control concurrency + # Generate target file path + target_file_path = generate_target_path(file_path, current_lang_code, target_lang_code) + + 
print(f"Source: {file_path}") + print(f"Target: {target_file_path}") + + # Check if target file exists + if os.path.exists(target_file_path): + print(f"Target file already exists: {target_file_path}") + return + + print(f"Translating from {current_lang_name} to {target_lang_name}...") + + try: + # Call translation function + translated_content = await translate_text( + file_path, + dify_api_key, + current_lang_name, + target_lang_name + ) + + print(f"Translation result length: {len(translated_content)} characters") + + if translated_content and translated_content.strip(): + # Save translation result + await save_translated_content(translated_content, target_file_path) + else: + print(f"Error: Translation failed for {target_lang_name} - empty or no content returned") + except Exception as e: + print(f"Error translating to {target_lang_name}: {str(e)}") + import traceback + traceback.print_exc() + + +async def main_async(file_path, dify_api_key=None): + """ + Async main function + """ + # Get script directory + script_dir = os.path.dirname(os.path.abspath(__file__)) + + # Try to load API key from .env file + env_path = os.path.join(script_dir, '.env') + if os.path.exists(env_path) and dify_api_key is None: + try: + # Import dotenv only when needed + import importlib.util + dotenv_spec = importlib.util.find_spec("dotenv") + if dotenv_spec is not None: + from dotenv import load_dotenv + load_dotenv(env_path) + dify_api_key = os.getenv('DIFY_API_KEY') or os.getenv('dify_api_key') + else: + raise ImportError + except ImportError: + # Manual parsing of .env file if dotenv is not available + with open(env_path, 'r') as f: + for line in f: + if line.strip().startswith('DIFY_API_KEY=') or line.strip().startswith('dify_api_key='): + dify_api_key = line.strip().split('=', 1)[1].strip('"\'') + break + + if not dify_api_key: + print("Error: DIFY_API_KEY not found. 
Please provide it as parameter or in .env file.") + return + + # Determine document type and current language + doc_type, current_lang_code, current_lang_name = determine_doc_type_and_language(file_path) + + if not doc_type: + print(f"Error: Unable to determine document type and language for {file_path}") + return + + print(f"Document type: {doc_type}, Current language: {current_lang_name} ({current_lang_code})") + + # Get all languages for current document type + code_to_name = get_language_code_name_map(doc_type) + + # Create semaphore to limit concurrency (avoid excessive API pressure) + semaphore = asyncio.Semaphore(2) + + # Create all translation tasks + tasks = [] + for target_lang_code, target_lang_name in code_to_name.items(): + # Skip current language + if target_lang_code == current_lang_code: + continue + + task = translate_single_file( + file_path, + dify_api_key, + current_lang_name, + target_lang_code, + target_lang_name, + current_lang_code, + semaphore + ) + tasks.append(task) + + # Execute all translation tasks + if tasks: + print("Running translations concurrently...") + await asyncio.gather(*tasks) + print("All translations completed!") + else: + print("No translations needed.") + + +def get_file_path_interactive(): + """ + Interactive file path input + """ + while True: + print("Please enter the file path to translate:") + print("请输入要翻译的文件路径:") + print("翻訳するファイルパスを入力してください:") + file_path = input("File path / 文件路径 / ファイルパス: ").strip() + + if not file_path: + print("File path cannot be empty. 
Please try again.") + print("文件路径不能为空,请重新输入。") + print("ファイルパスは空にできません。再度入力してください。") + continue + + # Remove quotes if user copy-pasted with quotes + file_path = file_path.strip('\'"') + + # Check if file exists + if not os.path.exists(file_path): + print(f"File does not exist: {file_path}") + print(f"文件不存在: {file_path}") + print(f"ファイルが存在しません: {file_path}") + print("Please check if the path is correct.") + print("请检查路径是否正确。") + print("パスが正しいか確認してください。") + continue + + # Check if it's a file + if not os.path.isfile(file_path): + print(f"The specified path is not a file: {file_path}") + print(f"指定的路径不是文件: {file_path}") + print(f"指定されたパスはファイルではありません: {file_path}") + continue + + # Check file extension + if not (file_path.endswith('.md') or file_path.endswith('.mdx')): + print(f"Warning: File is not .md or .mdx format: {file_path}") + print(f"警告: 文件不是 .md 或 .mdx 格式: {file_path}") + print(f"警告: ファイルは .md または .mdx 形式ではありません: {file_path}") + confirm = input("Continue anyway? (y/n) / 是否继续? (y/n) / 続行しますか? 
(y/n): ").strip().lower() + if confirm not in ['y', 'yes', 'Y', 'YES']: + continue + + return file_path + + +def load_local_api_key(): + """ + Load API key from local .env file + """ + script_dir = os.path.dirname(os.path.abspath(__file__)) + env_path = os.path.join(script_dir, '.env') + + if not os.path.exists(env_path): + print(f"Error: .env file not found: {env_path}") + print(f"错误: 未找到 .env 文件: {env_path}") + print(f"エラー: .env ファイルが見つかりません: {env_path}") + print("Please create .env file and add: DIFY_API_KEY=your_api_key") + print("请在当前目录创建 .env 文件并添加: DIFY_API_KEY=your_api_key") + print(".env ファイルを作成し、DIFY_API_KEY=your_api_key を追加してください") + return None + + try: + # Try using dotenv + import importlib.util + dotenv_spec = importlib.util.find_spec("dotenv") + if dotenv_spec is not None: + from dotenv import load_dotenv + load_dotenv(env_path) + api_key = os.getenv('DIFY_API_KEY') or os.getenv('dify_api_key') + else: + # Manual parsing of .env file + api_key = None + with open(env_path, 'r') as f: + for line in f: + line = line.strip() + if line.startswith('DIFY_API_KEY=') or line.startswith('dify_api_key='): + api_key = line.split('=', 1)[1].strip('"\'') + break + except Exception as e: + print(f"Error reading .env file: {e}") + print(f"读取 .env 文件时出错: {e}") + print(f".env ファイルの読み取りエラー: {e}") + return None + + if not api_key: + print("Error: DIFY_API_KEY not found in .env file") + print("错误: 在 .env 文件中未找到 DIFY_API_KEY") + print("エラー: .env ファイルに DIFY_API_KEY が見つかりません") + print("Please ensure .env file contains: DIFY_API_KEY=your_api_key") + print("请确保 .env 文件包含: DIFY_API_KEY=your_api_key") + print(".env ファイルに DIFY_API_KEY=your_api_key が含まれていることを確認してください") + return None + + print("✓ Successfully loaded local API key") + print("✓ 成功加载本地 API key") + print("✓ ローカル API キーの読み込みに成功しました") + return api_key + + +def main(file_path, dify_api_key=None): + """ + Sync wrapper function to run async main function + """ + asyncio.run(main_async(file_path, dify_api_key)) + + +if 
__name__ == "__main__": + # If no parameters provided, enter interactive mode + if len(sys.argv) == 1: + print("=== Dify Documentation Translation Tool ===") + print("=== Dify 文档翻译工具 ===") + print("=== Dify ドキュメント翻訳ツール ===") + print() + + # Interactive file path input + file_path = get_file_path_interactive() + + # Load local API key + dify_api_key = load_local_api_key() + if not dify_api_key: + sys.exit(1) + + print() + print(f"Starting translation for file: {file_path}") + print(f"开始翻译文件: {file_path}") + print(f"ファイルの翻訳を開始: {file_path}") + main(file_path, dify_api_key) + + # Command line argument mode + elif len(sys.argv) >= 2: + file_path = sys.argv[1] + dify_api_key = None + + # Parse command line arguments + for i, arg in enumerate(sys.argv[2:], 2): + if dify_api_key is None: + dify_api_key = arg + + main(file_path, dify_api_key) + + else: + print("Usage: python main.py [file_path] [dify_api_key]") + print(" No arguments: Enter interactive mode") + print(" file_path: File path to translate") + print(" dify_api_key: (Optional) Dify API key") + sys.exit(1) + + + diff --git a/tools/translate/requirements.txt b/tools/translate/requirements.txt new file mode 100644 index 000000000..2bcd4dc95 --- /dev/null +++ b/tools/translate/requirements.txt @@ -0,0 +1,3 @@ +python-dotenv>=1.0.0 +httpx>=0.25.0 +aiofiles>=23.0.0 \ No newline at end of file diff --git a/tools/translate/termbase_i18n.md b/tools/translate/termbase_i18n.md new file mode 100644 index 000000000..07d45aa97 --- /dev/null +++ b/tools/translate/termbase_i18n.md @@ -0,0 +1,64 @@ +# Dify Termbase (EN/CN/JP) + +| English Term | 中文 (简体) | 日本語 | Definition | +| :--------------------------------------- | :--------------- | :----------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Agent** | 智能代理 | エージェント | An autonomous AI system capable of making decisions and executing tasks 
based on environmental information. | +| **Agentic Workflow** | 智能体工作流 | エージェンティックワークフロー | A task orchestration method that allows AI systems to autonomously solve complex problems through multiple steps. | +| **Automatic Speech Recognition (ASR)** | 自动语音识别 | 自動音声認識 | Technology that converts human speech into text and serves as the foundation for voice interaction applications. | +| **Backbone of Thought (BoT)** | 思维骨架 | 思考の骨格 | A structured thinking framework that provides the main structure for reasoning in large language models. | +| **Chain of Thought (CoT)** | 思维链 | 思考の連鎖 | A prompting technique that guides large language models to display their step-by-step thinking process. | +| **Chatflow** | 对话流 | チャットフロー | 一种面向对话场景的工作流编排模式,专为需要多步逻辑处理的交互式应用设计。(CN) | +| **Chunking** | 分段 | チャンキング | A processing technique that splits long text into smaller content blocks, enabling retrieval systems to find relevant information more precisely. | +| **Citation and Attribution** | 引用与归属 | 引用と帰属 | Features that allow AI systems to clearly indicate the sources of information, increasing the credibility and transparency of responses. | +| **Domain-Specific Language (DSL)** | 领域特定语言 | ドメイン固有言語 | A programming language or configuration format designed for a specific application domain. | +| **Extract, Transform, Load (ETL)** | 提取、转换、加载 | 抽出・変換・読み込み | A classic data processing workflow: extracting raw data, transforming it into a format suitable for analysis, and then loading it into the target system. | +| **Frequency Penalty** | 频率惩罚 | 頻度ペナルティ | A text generation control parameter that increases output diversity by reducing the probability of generating frequently occurring vocabulary. | +| **Full-text Search** | 全文检索 | 全文検索 | 索引文档中的所有词汇,从而允许用户查询任意词汇,并返回包含这些词汇的文本片段。(CN) | +| **Function Calling** | 函数调用 | 関数呼び出し | The capability of large language models to recognize when to call specific functions and provide the required parameters. 
| +| **General Chunking Pattern** | 通用分段模式 | 一般的なチャンキングパターン | A simple text splitting strategy that divides documents into mutually independent content blocks. | +| **Graph of Thought (GoT)** | 思维图 | 思考のグラフ | A method of representing the thinking process as a network structure, capturing complex relationships between concepts. | +| **Hybrid Search** | 混合检索 | ハイブリッド検索 | A search method that combines the advantages of keyword matching and semantic search to provide more comprehensive retrieval results. | +| **Inverted Index** | 倒排索引 | 転置インデックス | A core data structure of search engines that records which documents each word appears in. | +| **Keyword Search** | 关键词检索 | キーワード検索 | A search method based on exact matching that finds documents containing specific vocabulary. | +| **Knowledge Base** | 知识库 | 知識ベース | A database that stores structured information in AI applications, providing a source of professional knowledge for models. | +| **Knowledge Retrieval** | 知识检索 | 知識検索 | The process of finding information from a knowledge base that is most relevant to a user's question. | +| **Large Language Model (LLM)** | 大型语言模型 | 大規模言語モデル | An AI model trained on massive amounts of text that can understand and generate human language. | +| **Local Model Inference** | 本地模型推理 | ローカルモデル推論 | The process of running AI models on a user's own device rather than relying on cloud services. | +| **Max_tokens** | 最大标记数 | 最大トークン数 | A parameter that controls the maximum number of tokens the model generates in a single response. | +| **Memory** | 记忆 | メモリ | The ability of AI systems to save and use historical interaction information, keeping multi-turn conversations coherent. | +| **Metadata** | 元数据 | メタデータ | 描述数据的数据,提供关于内容的结构化信息,如文档的创建时间、作者、标题、标签、文件格式等属性信息等。(CN) | +| **Metadata Filtering** | 元数据筛选 | メタデータフィルタリング | A technique that utilizes document attribute information for content filtering. 
| +| **Model-as-a-Service (MaaS)** | 模型即服务 | サービスとしてのモデル | A cloud service model where providers offer access to pre-trained models through APIs. | +| **Multimodal Model** | 多模态模型 | マルチモーダルモデル | A model capable of processing multiple types of input data, such as text, images, audio, etc. | +| **Multi-path Retrieval** | 多路召回 | マルチパス検索 | A strategy for obtaining information in parallel through multiple retrieval methods. | +| **Multi-tool-call** | 多工具调用 | マルチツール呼び出し | The ability of a model to call multiple different tools in a single response. | +| **Parent-Child Chunking** | 父子分段模式 | 親子チャンキング | An advanced text splitting strategy that creates two levels of content blocks. | +| **Predefined Model** | 预定义模型 | 事前定義モデル | A ready-made model trained and provided by AI vendors that users can directly call without training themselves. | +| **Presence Penalty** | 存在惩罚 | 存在ペナルティ | A parameter setting that prevents language models from repeating content. | +| **Prompt** | 提示词 | プロンプト | Input text that guides AI models to generate specific responses. | +| **Q\&A Mode** | 问答模式 | Q\&A モード | A special indexing strategy that automatically generates question-answer pairs for document content. | +| **Reasoning and Acting (ReAct)** | 推理与行动 | 推論と行動 | An AI agent framework that enables models to alternate between thinking and executing operations. | +| **ReRank** | 重排序 | 再ランキング | A technique for secondary sorting of preliminary retrieval results to improve the relevance of final results. | +| **Rerank Model** | 重新排序模型 | 再ランキングモデル | A model specifically designed to evaluate the relevance of retrieval results to queries and reorder them. | +| **Response_format** | 响应格式 | レスポンス形式 | A specification of the structure type for model output, such as plain text, JSON, or HTML. | +| **Retrieval Test** | 召回测试 | 検索テスト | A functionality for verifying the effectiveness of knowledge base retrieval. 
| +| **Retrieval-Augmented Generation (RAG)** | 检索增强生成 | 検索拡張生成 | A technical architecture that combines external knowledge retrieval and language generation. | +| **Reverse Calling** | 反向调用 | リバースコーリング | A bidirectional mechanism for plugins to interact with platforms, allowing plugins to actively call platform functionality. | +| **Score Threshold** | 分数阈值 | スコア閾値 | A similarity threshold for filtering retrieval results. | +| **Software Development Kit (SDK)** | 软件开发工具包 | (N/A) | 一组用于开发特定平台或服务应用程序的工具集合。(CN) | +| **Semantic Search** | 语义检索 | セマンティック検索 | A retrieval method based on understanding and matching text meaning rather than simple keyword matching. | +| **Session Variables** | 会话变量 | セッション変数 | A mechanism for storing multi-turn dialogue context information. | +| **Speech-to-Text (STT)** | 语音转文字 | 音声からテキスト変換 | Technology that converts users' voice input into text data. | +| **Stream-tool-call** | 流式工具调用 | ストリームツール呼び出し | A real-time processing mode that allows AI systems to call external tools while generating responses. | +| **Streaming Response** | 流式结果返回 | ストリーミングレスポンス | A real-time response mechanism where AI systems return content to users as it is generated. | +| **Temperature** | 温度 | 温度 | A parameter controlling the randomness of language model output. | +| **Text Embedding** | 文本嵌入 | テキスト埋め込み | The process of converting text into numerical vectors. | +| **Text-to-Speech (TTS)** | 文本转语音 | テキスト音声変換 | Technology that converts written text into natural speech. | +| **Tool Calling** | 工具调用 | ツール呼び出し | The ability of AI systems to identify and use external functionality. | +| **TopK** | TopK | TopK | A parameter controlling the number of retrieval results returned. | +| **TopP (Nucleus Sampling)** | 核采样 | 核サンプリング | A text generation control method that selects the next word from a probability-weighted subset of vocabulary. | +| **Tree of Thought (ToT)** | 思维树 | 思考の木 | A thinking method for exploring multiple reasoning paths. 
| +| **Vector Database** | 向量数据库 | ベクトルデータベース | A database system specialized in storing and searching vector embeddings. | +| **Vector Retrieval** | 向量检索 | ベクトル検索 | A search method based on text vector embedding similarity. | +| **Vision** | 视觉能力 | ビジョン機能 | The functionality of multimodal LLMs to understand and process images. | +| **Workflow** | 工作流 | ワークフロー | A task orchestration method that breaks down complex AI applications into multiple independent nodes. | From 5b4f88147e49766d04b4c0befd31d700a72ff743 Mon Sep 17 00:00:00 2001 From: Alter-xyz <88554920+alterxyz@users.noreply.github.com> Date: Thu, 24 Jul 2025 22:03:06 +0800 Subject: [PATCH 2/2] chore: update --- .github/workflows/translate.yml | 105 +++++++++++++++++++++++++++++--- tools/translate/main.py | 32 +++++++++- 2 files changed, 124 insertions(+), 13 deletions(-) diff --git a/.github/workflows/translate.yml b/.github/workflows/translate.yml index 549bfaeab..37dac25fb 100644 --- a/.github/workflows/translate.yml +++ b/.github/workflows/translate.yml @@ -1,13 +1,23 @@ name: Auto Translate Docs on: + workflow_run: + workflows: ["Process Documentation"] + types: + - completed + branches-ignore: + - 'main' push: branches-ignore: - 'main' + paths-ignore: + - '.github/workflows/**' jobs: translate: runs-on: ubuntu-latest + # Only run if the workflow_run event was successful, or if it's a direct push + if: github.event_name == 'push' || github.event.workflow_run.conclusion == 'success' permissions: contents: write steps: @@ -16,6 +26,8 @@ jobs: with: fetch-depth: 0 # Fetches all history for git diff token: ${{ secrets.GITHUB_TOKEN }} + # For workflow_run events, checkout the head of the triggering workflow + ref: ${{ github.event_name == 'workflow_run' && github.event.workflow_run.head_sha || github.sha }} - name: Set up Python uses: actions/setup-python@v4 @@ -28,16 +40,57 @@ jobs: - name: Get changed markdown files id: changed-files run: | - # Get the list of changed files between the current and previous 
commit + # Get the list of newly added files between the current and previous commit # We filter for .md and .mdx files that are inside the language directories - files=$(git diff --name-only ${{ github.event.before }} ${{ github.sha }} | grep -E '^(en|zh-hans|ja-jp|plugin-dev-en|plugin-dev-zh|plugin-dev-ja)/.*(\.md|\.mdx)$' || true) - if [[ -z "$files" ]]; then - echo "No markdown files to translate." + # Only include added (A) files, skip modified (M) and deleted (D) files + + # Determine the commit SHA to use based on event type + if [[ "${{ github.event_name }}" == "workflow_run" ]]; then + current_sha="${{ github.event.workflow_run.head_sha }}" + echo "Using workflow_run head_sha: $current_sha" + else + current_sha="${{ github.sha }}" + echo "Using github.sha: $current_sha" + fi + + # Try different approaches to get the diff + if [[ -n "${{ github.event.before }}" && "${{ github.event_name }}" == "push" ]]; then + echo "Using github.event.before: ${{ github.event.before }}" + files=$(git diff --name-status ${{ github.event.before }} $current_sha | grep -E '^A\s+' | cut -f2 | grep -E '^(en|en-us|zh-hans|ja-jp|plugin-dev-en|plugin-dev-zh|plugin-dev-ja|versions)/.*(\.md|\.mdx)$' || true) + else + echo "Using HEAD~1 for comparison" + files=$(git diff --name-status HEAD~1 $current_sha | grep -E '^A\s+' | cut -f2 | grep -E '^(en|en-us|zh-hans|ja-jp|plugin-dev-en|plugin-dev-zh|plugin-dev-ja|versions)/.*(\.md|\.mdx)$' || true) + fi + + echo "Detected files (Added only):" + echo "$files" + + # Filter out files that don't actually exist + existing_files="" + if [[ -n "$files" ]]; then + while IFS= read -r file; do + if [[ -n "$file" && -f "$file" ]]; then + if [[ -z "$existing_files" ]]; then + existing_files="$file" + else + existing_files="$existing_files"$'\n'"$file" + fi + else + echo "Skipping non-existent file: $file" + fi + done <<< "$files" + fi + + echo "Final files to translate:" + echo "$existing_files" + + if [[ -z "$existing_files" ]]; then + echo "No new 
markdown files to translate." echo "files=" >> $GITHUB_OUTPUT else # The script expects absolute paths, but we run it from the root, so relative is fine. echo "files<<EOF" >> $GITHUB_OUTPUT - echo "$files" >> $GITHUB_OUTPUT + echo "$existing_files" >> $GITHUB_OUTPUT echo "EOF" >> $GITHUB_OUTPUT fi @@ -49,12 +102,35 @@ jobs: echo "Files to translate:" echo "${{ steps.changed-files.outputs.files }}" - echo "${{ steps.changed-files.outputs.files }}" | while IFS= read -r file; do + # Create temporary file list + echo "${{ steps.changed-files.outputs.files }}" > /tmp/files_to_translate.txt + + # Start all translation processes in parallel + pids=() + while IFS= read -r file; do if [[ -n "$file" ]]; then - echo "Translating $file..." - python tools/translate/main.py "$file" "$DIFY_API_KEY" + echo "Starting translation for $file..." + python tools/translate/main.py "$file" "$DIFY_API_KEY" & + pids+=($!) + fi + done < /tmp/files_to_translate.txt + + # Wait for all background processes to complete + echo "Waiting for ${#pids[@]} translation processes to complete..." + failed=0 + for pid in "${pids[@]}"; do + if ! wait "$pid"; then + echo "Translation process $pid failed" + failed=1 fi done + + if [ $failed -eq 1 ]; then + echo "Some translations failed" + exit 1 + fi + + echo "All translations completed successfully" - name: Commit and push changes run: | @@ -64,8 +140,17 @@ jobs: if [[ -n $(git status --porcelain) ]]; then git add . 
git commit -m "docs: auto-translate documentation" - # Push to the same branch the workflow was triggered from - git push origin HEAD:${{ github.ref_name }} + # Push to the appropriate branch based on event type + if [[ "${{ github.event_name }}" == "workflow_run" ]]; then + # For workflow_run events, push to the head branch of the triggering workflow + branch_ref="${{ github.event.workflow_run.head_branch }}" + echo "Pushing to workflow_run head branch: $branch_ref" + git push origin HEAD:$branch_ref + else + # For push events, push to the same branch the workflow was triggered from + echo "Pushing to current branch: ${{ github.ref_name }}" + git push origin HEAD:${{ github.ref_name }} + fi echo "Translated files have been pushed to the branch." else echo "No new translations to commit." diff --git a/tools/translate/main.py b/tools/translate/main.py index 05ddb21fd..e1b990a63 100644 --- a/tools/translate/main.py +++ b/tools/translate/main.py @@ -14,6 +14,21 @@ "English": "plugin-dev-en", "Chinese": "plugin-dev-zh", "Japanese": "plugin-dev-ja" + }, + "version_28x": { + "English": "versions/2-8-x/en-us", + "Chinese": "versions/2-8-x/zh-cn", + "Japanese": "versions/2-8-x/ja-jp" + }, + "version_30x": { + "English": "versions/3-0-x/en-us", + "Chinese": "versions/3-0-x/zh-cn", + "Japanese": "versions/3-0-x/ja-jp" + }, + "version_31x": { + "English": "versions/3-1-x/en-us", + "Chinese": "versions/3-1-x/zh-cn", + "Japanese": "versions/3-1-x/ja-jp" } } @@ -109,12 +124,23 @@ def determine_doc_type_and_language(file_path): Determine document type and current language based on file path Returns (doc_type, current_language, language_name) """ - path_parts = file_path.split(os.sep) + # Normalize path separators + normalized_path = file_path.replace(os.sep, '/') + # Collect all possible matches and find the longest one + matches = [] for doc_type, languages in docs_structure.items(): for lang_name, lang_code in languages.items(): - if lang_code in path_parts: - return doc_type, 
lang_code, lang_name + # Normalize lang_code path separators too + normalized_lang_code = lang_code.replace(os.sep, '/') + if normalized_lang_code in normalized_path: + matches.append((len(normalized_lang_code), doc_type, lang_code, lang_name)) + + # Return the match with the longest lang_code (most specific) + if matches: + matches.sort(reverse=True) # Sort by length descending + _, doc_type, lang_code, lang_name = matches[0] + return doc_type, lang_code, lang_name return None, None, None