@@ -338,36 +338,65 @@ def fetch_contents_for_results(results, max_fetch=None,
338338 len (to_fetch ), target_ok , max_chars )
339339 t0 = time .time ()
340340 ok_count = 0
341+ url_timings = [] # list of (url, elapsed, ok, chars, method)
341342 def _do (r ):
342- return r , fetch_page_content (r ['url' ], max_chars = max_chars ,
343+ url = r ['url' ]
344+ fetch_t0 = time .time ()
345+ content = fetch_page_content (url , max_chars = max_chars ,
343346 pdf_max_chars = _lib .FETCH_MAX_CHARS_PDF )
344- with ThreadPoolExecutor (max_workers = 8 ) as pool :
347+ fetch_elapsed = time .time () - fetch_t0
348+ return r , content , fetch_elapsed
349+ with ThreadPoolExecutor (max_workers = 16 ) as pool :
345350 futs = {pool .submit (_do , r ): r for r in to_fetch }
346351 pending = set (futs .keys ())
352+ cancelled_urls = []
347353 try :
348354 for fut in as_completed (futs , timeout = 90 ):
349355 pending .discard (fut )
350356 try :
351- result , content = fut .result ()
352- if content and len (content ) > 50 :
357+ result , content , fetch_elapsed = fut .result ()
358+ url = result ['url' ]
359+ ok = bool (content and len (content ) > 50 )
360+ chars = len (content ) if content else 0
361+ url_timings .append ((url , fetch_elapsed , ok , chars ))
362+ if ok :
353363 result ['full_content' ] = content
354364 ok_count += 1
365+ # Log individual slow fetches (>5s)
366+ if fetch_elapsed > 5 :
367+ logger .info ('[Fetch] ⚠ SLOW url=%.80s %.1fs ok=%s chars=%d' ,
368+ url , fetch_elapsed , ok , chars )
355369 except Exception as e :
356370 logger .warning ('[Fetch] fetch_contents thread error: %s' , e , exc_info = True )
357371 # Race-to-N: once we have enough content, stop waiting
358372 if ok_count >= target_ok and pending :
359373 elapsed_so_far = time .time () - t0
374+ cancelled_urls = [futs [p ]['url' ][:60 ] for p in pending ]
360375 logger .info ('[Fetch] Race-to-N: got %d/%d pages in %.1fs, '
361- 'cancelling %d slow fetches' ,
376+ 'cancelling %d slow fetches: %s ' ,
362377 ok_count , len (to_fetch ), elapsed_so_far ,
363- len (pending ))
378+ len (pending ),
379+ ', ' .join (cancelled_urls [:5 ]))
364380 for p in pending :
365381 p .cancel ()
366382 break
367383 except TimeoutError :
368384 logger .warning ('[Fetch] fetch_contents: as_completed timeout (90s)' , exc_info = True )
369385 elapsed = time .time () - t0
370- logger .info ('[Fetch] fetch_contents done: %d/%d got content in %.1fs' , ok_count , len (to_fetch ), elapsed )
386+
387+ # Summarize URL timing breakdown
388+ if url_timings :
389+ url_timings .sort (key = lambda x : - x [1 ]) # slowest first
390+ slow_summary = ' ' .join (
391+ f'[{ "✓" if ok else "✗" } ]{ url [:50 ]} ={ et :.1f} s'
392+ for url , et , ok , _chars in url_timings [:8 ]
393+ )
394+ logger .info ('[Fetch] fetch_contents done: %d/%d got content in %.1fs '
395+ 'slowest: %s' ,
396+ ok_count , len (to_fetch ), elapsed , slow_summary )
397+ else :
398+ logger .info ('[Fetch] fetch_contents done: %d/%d got content in %.1fs' ,
399+ ok_count , len (to_fetch ), elapsed )
371400 return results
372401
373402
@@ -385,7 +414,7 @@ def _do(u):
385414 pdf_max_chars = pdf_max_chars , timeout = timeout )
386415 # Total deadline = per-request timeout + generous buffer for download + parsing
387416 deadline = max (timeout * 4 , 120 )
388- with ThreadPoolExecutor (max_workers = 4 ) as pool :
417+ with ThreadPoolExecutor (max_workers = 8 ) as pool :
389418 futs = {pool .submit (_do , u ): u for u in urls }
390419 done_count = 0
391420 try :
@@ -418,3 +447,17 @@ def extract_urls_from_text(text):
418447 u = u .rstrip ('.,;:!?' )
419448 if u not in seen and len (u ) > 10 : seen .add (u ); unique .append (u )
420449 return unique [:5 ]
450+
451+
452+
def get_fetch_cache_stats() -> dict:
    """Build a diagnostics snapshot covering every fetch-layer cache.

    Intended for admin endpoints and periodic health checks; the snapshot
    exposes hit rates, eviction counts, and current sizes.

    Returns:
        dict: keys 'fetch_cache', 'html_head_cache', and 'circuit_breaker'
        mapping to each component's stats/status object.
    """
    snapshot = {}
    # Cache objects expose a `.stats` attribute; the circuit breaker
    # reports through its `get_status()` accessor instead.
    snapshot['fetch_cache'] = _fetch_cache.stats
    snapshot['html_head_cache'] = _html_head_cache.stats
    snapshot['circuit_breaker'] = _circuit.get_status()
    return snapshot
0 commit comments