chore(tests): use download_models.py in tests data preparation (#432)

tybulewicz · web-flow · commit 973001188d89 · 2025-11-06T06:43:38.000Z
* fix(tests): update dataset url in pre-commits test preparation

* Use download_models.py to fetch test data
diff --git a/.github/workflows/test_accuracy.yml b/.github/workflows/test_accuracy.yml
@@ -25,7 +25,7 @@ jobs:
           uv sync --locked --extra tests --extra-index-url https://download.pytorch.org/whl/cpu
       - name: Prepare test data
         run: |
-          uv run python tests/accuracy/prepare_data.py -d data
+          uv run python tests/accuracy/download_models.py -d data -j tests/accuracy/public_scope.json -l
       - name: Run Python Test
         run: |
           uv run pytest --data=./data tests/accuracy/test_accuracy.py
diff --git a/.github/workflows/test_precommit.yml b/.github/workflows/test_precommit.yml
@@ -27,7 +27,7 @@ jobs:
           uv sync --locked --extra tests --extra-index-url https://download.pytorch.org/whl/cpu
       - name: Prepare test data
         run: |
-          uv run python tests/precommit/prepare_data.py -d data -p tests/precommit/public_scope.json
+          uv run python tests/accuracy/download_models.py -d data -j tests/precommit/public_scope.json -l
       - name: Run test
         run: |
           uv run pytest --data=./data tests/functional
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -56,9 +56,9 @@ Set up your development environment to start contributing. This involves install
    ```bash
    uv run pre-commit run --all-files
    uv run pytest tests/unit
-   uv run python tests/precommit/prepare_data.py -d data -p tests/precommit/public_scope.json
+   uv run python tests/accuracy/download_models.py -d data -j tests/precommit/public_scope.json -l
    uv run pytest --data=./data tests/functional
-   uv run python tests/accuracy/prepare_data.py -d data
+   uv run python tests/accuracy/download_models.py -d data -j tests/accuracy/public_scope.json -l
    uv run pytest --data=./data tests/accuracy/test_accuracy.py
    ```
 
diff --git a/examples/visual_prompting/README.md b/examples/visual_prompting/README.md
@@ -25,7 +25,7 @@ To run the pipeline out-of-the box you can download the test data by running the
 
 ```bash
 pip install httpx
-python tests/accuracy/prepare_data.py -d data
+python tests/accuracy/download_models.py -d data -j tests/accuracy/public_scope.json -l
 ```
 
 and then run
diff --git a/examples/zsl_visual_prompting/README.md b/examples/zsl_visual_prompting/README.md
@@ -28,7 +28,7 @@ To run the pipeline out-of-the box you can download the test data by running the
 
 ```bash
 pip install httpx
-python tests/accuracy/prepare_data.py -d data
+python tests/accuracy/download_models.py -d data -j tests/accuracy/public_scope.json -l
 ```
 
 and then run
diff --git a/tests/accuracy/download_models.py b/tests/accuracy/download_models.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env -S uv run --script
 #
 # Copyright (C) 2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
@@ -6,12 +7,18 @@
 import asyncio
 import json
 import time
+from io import BytesIO
 from pathlib import Path
+from zipfile import ZipFile
 
 import httpx
 
 
 async def stream_file(client, url, filename, semaphore):
+    if Path(filename).exists():
+        print(f"Skipping already downloaded {filename}")
+        return
+
     async with semaphore:
         start_time = time.time()
         total_bytes = 0
@@ -28,6 +35,30 @@ async def stream_file(client, url, filename, semaphore):
         print(f"Downloaded {url} - {total_bytes:.2f} MB in {download_time:.2f}s ({speed_mbps:.2f} MB/s)")
 
 
+async def download_single_image(client, url, filename):
+    """Download a single image from ``url`` and write it to ``filename``.
+
+    Args:
+        client: ``httpx.AsyncClient`` used to issue the GET request.
+        url: Address of the image to fetch.
+        filename: Destination path; an existing file is overwritten.
+
+    Raises:
+        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
+    """
+    image = await client.get(url, follow_redirects=True)
+    # Fail loudly instead of saving an HTTP error page under an image name.
+    image.raise_for_status()
+    Path(filename).write_bytes(image.content)
+
+
+async def download_images(data_dir):
+    """Fetch the test images into ``data_dir``.
+
+    Downloads the COCO128 zip archive and extracts it into ``data_dir``,
+    then downloads two additional standalone images into the same directory.
+
+    Args:
+        data_dir: Target directory, given as ``str`` or ``pathlib.Path``.
+
+    Raises:
+        httpx.HTTPStatusError: If the archive request returns a 4xx/5xx status.
+    """
+    # Accept plain strings as well; the "/" joins below need a Path.
+    data_dir = Path(data_dir)
+
+    async with httpx.AsyncClient(timeout=20.0) as client:
+        COCO128_URL = "https://storage.geti.intel.com/geti_predict/test/images/coco128.zip"
+        archive = await client.get(COCO128_URL, follow_redirects=True)
+        # Surface HTTP failures directly rather than as an opaque BadZipFile.
+        archive.raise_for_status()
+        with ZipFile(BytesIO(archive.content)) as zfile:
+            zfile.extractall(data_dir)
+
+        image_downloads = [
+            (
+                "https://storage.geti.intel.com/geti_predict/test/images/BloodImage_00007.jpg",
+                data_dir / "BloodImage_00007.jpg",
+            ),
+            ("https://storage.geti.intel.com/geti_predict/test/images/cards.png", data_dir / "cards.png"),
+        ]
+
+        # The standalone images are independent of each other, so fetch them concurrently.
+        await asyncio.gather(*[download_single_image(client, url, filename) for url, filename in image_downloads])
+
+
 async def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -44,6 +75,12 @@ async def main():
         required=True,
         help="Path to the JSON file with model information",
     )
+    parser.add_argument(
+        "-l",
+        "--legacy",
+        action="store_true",
+        help="Download models using legacy directory structure (used in public_scope.json",
+    )
     args = parser.parse_args()
 
     with args.json_path.open("r") as f:
@@ -54,18 +91,34 @@ async def main():
     args.data_dir.mkdir(parents=True, exist_ok=True)
     async with httpx.AsyncClient(timeout=60.0) as client:
         tasks = []
-        for model_entry in models_data:
-            model_name = model_entry["name"]
+
+        model_names = []
+        for model_data in models_data:
+            model_names.append(model_data["name"])
+            if args.legacy and "encoder" in model_data:
+                model_names.append(model_data["encoder"])
+            if args.legacy and "extra_model" in model_data:
+                model_names.append(model_data["extra_model"])
+
+        for model_name in model_names:
             download_url = base_path + model_name
+            if args.legacy:
+                if model_name.endswith(".onnx"):
+                    download_url = base_path + model_name.replace(".", "/model.")
+                else:
+                    download_url = base_path + model_name.replace(".", "/openvino.")
             save_path = args.data_dir / model_name
             save_path.parent.mkdir(parents=True, exist_ok=True)
+
             tasks.append(stream_file(client, download_url, save_path, semaphore))
 
             if model_name.endswith(".xml"):
                 tasks.append(
                     stream_file(client, download_url.replace(".xml", ".bin"), save_path.with_suffix(".bin"), semaphore),
                 )
 
+        tasks.append(download_images(args.data_dir))
+
         print(f"Starting download of {len(tasks)} files with max 10 concurrent downloads...")
         await asyncio.gather(*tasks)
         print(f"All {len(tasks)} files downloaded successfully!")
diff --git a/tests/accuracy/prepare_data.py b/tests/accuracy/prepare_data.py
diff --git a/tests/precommit/prepare_data.py b/tests/precommit/prepare_data.py
diff --git a/tests/precommit/public_scope.json b/tests/precommit/public_scope.json