chore(deploy): download urls in parallel in config-processor (#6351)

corneliusroemer · web-flow · commit 3947ca806bc0 · 2026-05-26T16:00:53.000+02:00
Resolves #6350. Reduces the time to process the PPX config from 2m to 6s: ``` ./generate_local_test_config.sh --values ~/code/pathoplexus/loculus_values/values.yaml --from-live --live-host preview-main.pathoplexus.org ``` Some of the savings also carry over to faster pod startup of backend/website (via a shorter init container run), where this script also does lots of downloading. For example, the backend init container on Loculus takes only 6s instead of 18s. On PPX the effect is even more marked: we might save almost a minute. Before (on Loculus main): ``` ../generate_local_test_config.sh --from-live 6.46s user 0.72s system 39% cpu 18.193 total ``` After: ``` ../generate_local_test_config.sh --from-live 3.99s user 0.44s system 100% cpu 4.403 total ``` ### Screenshot <img width="1087" height="849" alt="image" src="https://github.com/user-attachments/assets/57ea838b-13bf-45e8-ba5f-a161abb8e92d" /> Also works in the init container on ArgoCD: <img width="1319" height="396" alt="image" src="https://github.com/user-attachments/assets/dfe5edd4-44a3-4084-8b93-72da38a849be" /> 🚀 Preview: https://speed-up-deploy.loculus.org
diff --git a/kubernetes/config-processor/config-processor.py b/kubernetes/config-processor/config-processor.py
@@ -1,9 +1,14 @@
 import os
 import re
 import shutil
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 import requests
 
+DEFAULT_MAX_WORKERS = 16
+thread_local = threading.local()
+
 
 def copy_structure(input_dir, output_dir):
     for root, dirs, files in os.walk(input_dir):
@@ -16,30 +21,74 @@ def copy_structure(input_dir, output_dir):
             os.makedirs(os.path.dirname(file_path), exist_ok=True)
             shutil.copy(os.path.join(root, file), file_path)
 
-def replace_url_with_content(file_content):
+
+def download_urls(urls):
+    if not urls:
+        return {}
+
+    max_workers = DEFAULT_MAX_WORKERS
+    while True:
+        try:
+            return download_urls_with_workers(urls, max_workers)
+        except requests.exceptions.RequestException as error:
+            if "Too many open files" not in str(error) or max_workers == 1:
+                raise
+            max_workers = max(max_workers // 2, 1)
+            print(f"Too many open files while downloading URLs, retrying with {max_workers} worker(s)")
+
+
+def download_urls_with_workers(urls, max_workers):
+    print(f"Downloading {len(urls)} unique URL(s) with {max_workers} worker(s)")
+
+    downloaded_content = {}
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_url = {executor.submit(download_url, url): url for url in urls}
+        for future in as_completed(future_to_url):
+            url = future_to_url[future]
+            response = future.result()
+            if response.status_code == 200:
+                downloaded_content[url] = response.text.strip()
+            else:
+                error_details = f"URL: {url}, Status Code: {response.status_code}, Reason: {response.reason}"
+                raise ValueError(f"Problem downloading {error_details}")
+    return downloaded_content
+
+
+def download_url(url):
+    if not hasattr(thread_local, "session"):
+        thread_local.session = requests.Session()
+    return thread_local.session.get(url)
+
+
+def replace_url_with_content(file_content, downloaded_content):
     urls = re.findall(r'\[\[URL:([^\]]*)\]\]', file_content)
     for url in set(urls):
-        response = requests.get(url)
-        if response.status_code == 200:
-            file_content = file_content.replace(f"[[URL:{url}]]", response.text.strip())
-        else:
-            error_details = f"URL: {url}, Status Code: {response.status_code}, Reason: {response.reason}"
-            raise ValueError(f"Problem downloading {error_details}")
+        file_content = file_content.replace(f"[[URL:{url}]]", downloaded_content[url])
     return file_content
 
 def make_substitutions(file_content, substitutions):
     for key, value in substitutions.items():
         file_content = file_content.replace(f"[[{key}]]", value)
     return file_content
 
+def collect_urls(output_dir):
+    urls = set()
+    for root, dirs, files in os.walk(output_dir):
+        for file in files:
+            file_path = os.path.join(root, file)
+            with open(file_path) as f:
+                urls.update(re.findall(r'\[\[URL:([^\]]*)\]\]', f.read()))
+    return urls
+
 def process_files(output_dir, substitutions):
+    downloaded_content = download_urls(collect_urls(output_dir))
     for root, dirs, files in os.walk(output_dir):
         for file in files:
             file_path = os.path.join(root, file)
             with open(file_path, 'r+') as f:
                 print(f"Processing {file_path}")
                 content = f.read()
-                new_content = replace_url_with_content(content)
+                new_content = replace_url_with_content(content, downloaded_content)
                 new_content = make_substitutions(new_content, substitutions)
                 if new_content != content:
                     f.seek(0)