
Commit 34332e1

introduce machine translation interface for enhanced translation (NiuTrans default); folder behavior fixes
1 parent: 8e8fb87

33 files changed: 5081 additions & 1002 deletions

README.md

Lines changed: 335 additions & 314 deletions
Large diffs are not rendered by default.

README_CN.md

Lines changed: 339 additions & 311 deletions
Large diffs are not rendered by default.

index.html

Lines changed: 464 additions & 192 deletions
Large diffs are not rendered by default.

lib/__init__.py

Lines changed: 27 additions & 0 deletions
```diff
@@ -18,6 +18,7 @@
     'FETCH_MAX_CHARS_PDF', 'FETCH_MAX_BYTES',
     'SKIP_DOMAINS', 'MODEL_PRICING',
     'QWEN_PRICING_CNY', 'DEFAULT_USD_CNY_RATE',
+    'MT_PROVIDER_CONFIG',
 ]
 
 # ══════════════════════════════════════════════════════════
@@ -139,6 +140,29 @@ def _resolve_base_url():
 ]
 
 
+# ── Machine Translation Provider (optional, for faster/cheaper translation) ──
+# When configured, translation uses a dedicated MT API (e.g. NiuTrans) instead
+# of the cheap LLM model. Config stored in server_config.json under 'mt_provider'.
+# Priority: server_config.json > default (disabled)
+def _resolve_mt_provider_config():
+    """Resolve MT provider config from server_config.json.
+
+    Returns dict with: provider, api_url, api_key, app_id, enabled
+    """
+    mt = _SAVED_CONFIG.get('mt_provider', {})
+    if not isinstance(mt, dict):
+        return {}
+    return {
+        'provider': mt.get('provider', 'niutrans'),
+        'api_url': mt.get('api_url', ''),
+        'api_key': mt.get('api_key', ''),
+        'app_id': mt.get('app_id', ''),
+        'enabled': bool(mt.get('enabled', False)),
+    }
+
+MT_PROVIDER_CONFIG = _resolve_mt_provider_config()
+
+
 # ── Trading module (default OFF — enable in Settings or with TRADING_ENABLED=1) ──
 # Priority: env-var > features.json > default(off)
 def _resolve_trading_enabled():
@@ -562,6 +586,9 @@ def _rcfg(env_key, saved_key, default):
 # Debug mode flag
 _mod.DEBUG_MODE = _resolve_debug_mode()
 
+# Machine translation provider
+_mod.MT_PROVIDER_CONFIG = _resolve_mt_provider_config()
+
 # Model defaults (from model_defaults section)
 _md = _SAVED_CONFIG.get('model_defaults', {})
 if _md.get('fallback_model') is not None:
```
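For orientation, a hedged sketch of the `mt_provider` block that `_resolve_mt_provider_config()` expects in `server_config.json`, written as the equivalent Python dict. The key names come from the diff above; every value is a placeholder, and the endpoint URL in particular is not taken from this commit:

```python
# Hypothetical 'mt_provider' section of server_config.json (as a Python dict).
# Keys mirror _resolve_mt_provider_config(); all values are placeholders.
example_mt_provider = {
    'provider': 'niutrans',                       # default when the key is absent
    'api_url': 'https://example.com/translate',   # placeholder endpoint
    'api_key': '<your-api-key>',
    'app_id': '',                                 # some providers require an app id
    'enabled': True,                              # defaults to False (disabled)
}
```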

lib/fetch/core.py

Lines changed: 51 additions & 8 deletions
```diff
@@ -338,36 +338,65 @@ def fetch_contents_for_results(results, max_fetch=None,
                 len(to_fetch), target_ok, max_chars)
     t0 = time.time()
     ok_count = 0
+    url_timings = []  # list of (url, elapsed, ok, chars)
     def _do(r):
-        return r, fetch_page_content(r['url'], max_chars=max_chars,
+        url = r['url']
+        fetch_t0 = time.time()
+        content = fetch_page_content(url, max_chars=max_chars,
                                      pdf_max_chars=_lib.FETCH_MAX_CHARS_PDF)
-    with ThreadPoolExecutor(max_workers=8) as pool:
+        fetch_elapsed = time.time() - fetch_t0
+        return r, content, fetch_elapsed
+    with ThreadPoolExecutor(max_workers=16) as pool:
         futs = {pool.submit(_do, r): r for r in to_fetch}
         pending = set(futs.keys())
+        cancelled_urls = []
         try:
             for fut in as_completed(futs, timeout=90):
                 pending.discard(fut)
                 try:
-                    result, content = fut.result()
-                    if content and len(content) > 50:
+                    result, content, fetch_elapsed = fut.result()
+                    url = result['url']
+                    ok = bool(content and len(content) > 50)
+                    chars = len(content) if content else 0
+                    url_timings.append((url, fetch_elapsed, ok, chars))
+                    if ok:
                         result['full_content'] = content
                         ok_count += 1
+                    # Log individual slow fetches (>5s)
+                    if fetch_elapsed > 5:
+                        logger.info('[Fetch] ⚠ SLOW url=%.80s %.1fs ok=%s chars=%d',
+                                    url, fetch_elapsed, ok, chars)
                 except Exception as e:
                     logger.warning('[Fetch] fetch_contents thread error: %s', e, exc_info=True)
                 # Race-to-N: once we have enough content, stop waiting
                 if ok_count >= target_ok and pending:
                     elapsed_so_far = time.time() - t0
+                    cancelled_urls = [futs[p]['url'][:60] for p in pending]
                     logger.info('[Fetch] Race-to-N: got %d/%d pages in %.1fs, '
-                                'cancelling %d slow fetches',
+                                'cancelling %d slow fetches: %s',
                                 ok_count, len(to_fetch), elapsed_so_far,
-                                len(pending))
+                                len(pending),
+                                ', '.join(cancelled_urls[:5]))
                     for p in pending:
                         p.cancel()
                     break
         except TimeoutError:
             logger.warning('[Fetch] fetch_contents: as_completed timeout (90s)', exc_info=True)
     elapsed = time.time() - t0
-    logger.info('[Fetch] fetch_contents done: %d/%d got content in %.1fs', ok_count, len(to_fetch), elapsed)
+
+    # Summarize URL timing breakdown
+    if url_timings:
+        url_timings.sort(key=lambda x: -x[1])  # slowest first
+        slow_summary = ' '.join(
+            f'[{"✓" if ok else "✗"}]{url[:50]}={et:.1f}s'
+            for url, et, ok, _chars in url_timings[:8]
+        )
+        logger.info('[Fetch] fetch_contents done: %d/%d got content in %.1fs '
+                    'slowest: %s',
+                    ok_count, len(to_fetch), elapsed, slow_summary)
+    else:
+        logger.info('[Fetch] fetch_contents done: %d/%d got content in %.1fs',
+                    ok_count, len(to_fetch), elapsed)
     return results
 
 
@@ -385,7 +414,7 @@ def _do(u):
             pdf_max_chars=pdf_max_chars, timeout=timeout)
     # Total deadline = per-request timeout + generous buffer for download + parsing
     deadline = max(timeout * 4, 120)
-    with ThreadPoolExecutor(max_workers=4) as pool:
+    with ThreadPoolExecutor(max_workers=8) as pool:
         futs = {pool.submit(_do, u): u for u in urls}
         done_count = 0
         try:
@@ -418,3 +447,17 @@ def extract_urls_from_text(text):
         u = u.rstrip('.,;:!?')
         if u not in seen and len(u) > 10: seen.add(u); unique.append(u)
     return unique[:5]
+
+
+
+def get_fetch_cache_stats() -> dict:
+    """Return diagnostic stats for all fetch caches.
+
+    Useful for admin endpoints and periodic health checks.
+    Includes hit rates, eviction counts, and current sizes.
+    """
+    return {
+        'fetch_cache': _fetch_cache.stats,
+        'html_head_cache': _html_head_cache.stats,
+        'circuit_breaker': _circuit.get_status(),
+    }
```
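One possible way to consume the new `get_fetch_cache_stats()` hook; the periodic-logging wrapper below is illustrative, not part of the commit:

```python
import json
import threading

from lib.fetch.core import get_fetch_cache_stats

def log_fetch_cache_stats_periodically(interval_sec=300):
    """Hypothetical helper: log cache hit rates every few minutes."""
    def _tick():
        stats = get_fetch_cache_stats()
        # e.g. stats['fetch_cache']['hit_rate_pct'] is 0-100
        print('[CacheStats]', json.dumps(stats, default=str))
        t = threading.Timer(interval_sec, _tick)
        t.daemon = True  # don't block interpreter shutdown
        t.start()
    _tick()
```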

lib/fetch/http.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -174,7 +174,9 @@ def try_browser_fetch(url, max_chars, reason='unknown'):
         attempt_num = _browser_fallback_stats['attempts']
         logger.info('[Fetch] Browser fallback ATTEMPT #%d reason=%s — %s',
                     attempt_num, reason, url[:100])
+        bf_t0 = time.time()
         text = fetch_url_via_browser(url, max_chars=max_chars, timeout=25)
+        bf_elapsed = time.time() - bf_t0
         if text:
             # ── Guard: browser extension may also return bot-protection pages ──
             from lib.fetch.utils import _is_bot_extracted_text
@@ -187,12 +189,15 @@
             with _browser_fallback_lock:
                 _browser_fallback_stats['success'] += 1
             _fetch_cache.put(url, text)
+            logger.info('[Fetch] Browser fallback OK in %.1fs — %s (%d chars)',
+                        bf_elapsed, url[:80], len(text))
             if max_chars and len(text) > max_chars:
                 return text[:max_chars] + '\n[…truncated]'
             return text
         with _browser_fallback_lock:
             _browser_fallback_stats['fail'] += 1
-        logger.info('[Fetch] Browser fallback returned empty — %s', url[:80])
+        logger.info('[Fetch] Browser fallback returned empty in %.1fs — %s',
+                    bf_elapsed, url[:80])
         _log_browser_fallback_stats()
         return None
     except Exception as e:
```
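The `bf_t0` / `bf_elapsed` bookkeeping could equally be factored into a small context manager; a sketch of that alternative pattern (not what the commit does):

```python
import time
from contextlib import contextmanager

@contextmanager
def timed():
    """Yield a zero-arg callable returning seconds elapsed since entry."""
    t0 = time.time()
    yield lambda: time.time() - t0

# Usage, mirroring the pattern in try_browser_fetch:
#   with timed() as elapsed:
#       text = fetch_url_via_browser(url, max_chars=max_chars, timeout=25)
#   logger.info('[Fetch] Browser fallback took %.1fs', elapsed())
```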

lib/fetch/utils.py

Lines changed: 58 additions & 8 deletions
```diff
@@ -238,27 +238,77 @@ def get_status(self):
 
 
 class _FetchCache:
-    def __init__(self, ttl=600, max_size=200):
+    """TTL-based URL content cache with LRU eviction.
+
+    Tracks hits, misses, TTL expirations, and capacity evictions for
+    diagnostic visibility. Stats accessible via ``stats`` property.
+    """
+    def __init__(self, ttl=600, max_size=200, name='fetch'):
         self._data, self._lock = {}, threading.Lock()
         self._ttl, self._max = ttl, max_size
+        self._name = name
+        # Diagnostic counters
+        self._hits = 0
+        self._misses = 0
+        self._ttl_expirations = 0
+        self._capacity_evictions = 0
+        self._puts = 0
+
     def get(self, url):
         with self._lock:
             e = self._data.get(url)
-            if e and e[1] > time.time(): return e[0]
-            self._data.pop(url, None); return None
+            if e and e[1] > time.time():
+                self._hits += 1
+                return e[0]
+            if e:
+                # Entry exists but TTL expired
+                self._ttl_expirations += 1
+                self._data.pop(url, None)
+                logger.debug('[%sCache] TTL expired for %.80s (ttl=%ds, size=%d)',
+                             self._name, url, self._ttl, len(self._data))
+            self._misses += 1
+            return None
+
     def put(self, url, content):
-        if not content: return
+        if not content:
+            return
         with self._lock:
+            self._puts += 1
             if len(self._data) >= self._max:
-                del self._data[min(self._data, key=lambda k: self._data[k][1])]
+                evicted_url = min(self._data, key=lambda k: self._data[k][1])
+                del self._data[evicted_url]
+                self._capacity_evictions += 1
+                logger.debug('[%sCache] Capacity eviction (%d/%d): %.80s',
+                             self._name, len(self._data), self._max,
+                             evicted_url)
             self._data[url] = (content, time.time() + self._ttl)
+
     @property
     def size(self):
-        with self._lock: return len(self._data)
+        with self._lock:
+            return len(self._data)
 
-_fetch_cache = _FetchCache(ttl=600, max_size=200)
+    @property
+    def stats(self):
+        """Return diagnostic stats dict."""
+        with self._lock:
+            total = self._hits + self._misses
+            return {
+                'name': self._name,
+                'size': len(self._data),
+                'max_size': self._max,
+                'ttl': self._ttl,
+                'hits': self._hits,
+                'misses': self._misses,
+                'hit_rate_pct': round(self._hits / max(total, 1) * 100),
+                'ttl_expirations': self._ttl_expirations,
+                'capacity_evictions': self._capacity_evictions,
+                'puts': self._puts,
+            }
+
+_fetch_cache = _FetchCache(ttl=600, max_size=200, name='Fetch')
 # Light cache: store raw HTML head (first 20KB) for publish-date extraction
-_html_head_cache = _FetchCache(ttl=600, max_size=300)
+_html_head_cache = _FetchCache(ttl=600, max_size=300, name='HtmlHead')
 
 
 # ═══════════════════════════════════════════════════════
```
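Based on the class above, a quick illustration of what the new counters report after one hit and one miss (the URLs and cache name are placeholders):

```python
cache = _FetchCache(ttl=600, max_size=200, name='Demo')
cache.put('https://example.com/a', '<html>page content</html>')
cache.get('https://example.com/a')    # hit
cache.get('https://example.com/b')    # miss (no entry)
print(cache.stats)
# {'name': 'Demo', 'size': 1, 'max_size': 200, 'ttl': 600,
#  'hits': 1, 'misses': 1, 'hit_rate_pct': 50,
#  'ttl_expirations': 0, 'capacity_evictions': 0, 'puts': 1}
```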

lib/js_bundler.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -1,7 +1,7 @@
 """JS bundler — concatenate app scripts into a single bundle at startup.
 
 Eliminates the HTTP/1.1 waterfall problem where browsers limit to 6 concurrent
-connections per host, causing 16 JS files to download in 3-4 serial waves.
+connections per host, causing 18 JS files to download in 3-4 serial waves.
 With the bundle, the browser fetches 1 file (gzip ~250KB) in a single request.
 
 The bundle is rebuilt at startup and whenever any source file changes.
@@ -20,6 +20,7 @@
 
 # ── Load order MUST match index.html (dependencies flow top → bottom) ──
 _BUNDLE_FILES = [
+    'i18n.js',        # MUST be first — t() is used by all other modules
     'idb-cache.js',
     'core.js',
     'export-images.js',
@@ -30,6 +31,7 @@
     'translation.js',
     'upload.js',
     'image-gen.js',
+    'paper-reader.js',
     'project.js',
     'memory.js',
     'scheduler.js',
```
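For context, order-preserving concatenation is the core of what the `js_bundler.py` docstring describes; a minimal self-contained sketch (the real module also rebuilds on change and serves the bundle gzip-compressed; the function name and paths below are assumptions):

```python
from pathlib import Path

def build_bundle(src_dir: Path, bundle_files: list[str], out_path: Path) -> None:
    """Concatenate JS files in dependency order into one bundle file."""
    parts = []
    for name in bundle_files:          # order matters: i18n.js must come first
        src = (src_dir / name).read_text(encoding='utf-8')
        parts.append(f'/* ── {name} ── */\n{src}')
    # ';' between files guards against a missing trailing semicolon upstream
    out_path.write_text('\n;\n'.join(parts), encoding='utf-8')
```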
