Skip to content

Commit 2b146ce

Browse files
authored
Merge pull request #12 from escapecloud/task/update_alternative_technology_dataset
Issue #9 – Fetch Alternative Technology data from remote storage
2 parents 0a37342 + e582020 commit 2b146ce

5 files changed

Lines changed: 138 additions & 4 deletions

File tree

assets/template/index.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ <h2>Resource Inventory ({{ total_resources }})</h2>
230230
{% for resource_type in resource_inventory %}
231231
<div class="col-md-3 col-sm-12 service-model-item" data-service-model="{{ resource_type.resource_type | trim }}">
232232
<div class="resource-box">
233-
<img src="{{ resource_type.icon | trim }}" alt="{{ resource_type.name | trim }}">
233+
<img src="assets/{{ resource_type.icon | trim }}" alt="{{ resource_type.name | trim }}">
234234
<h3>{{ resource_type.name | trim }}</h3>
235235
<h5>
236236
{{ resource_type.count }} Resource{% if resource_type.count != 1 %}s{% endif %} Available

core/utils_report.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def generate_html_report(report_path, metadata, resource_type_mapping, resource_
144144
str(item["resource_type"]): {
145145
**item,
146146
"name": resource_type_mapping.get(str(item["resource_type"]), {}).get("name", "Unknown Resource"),
147-
"icon": resource_type_mapping.get(str(item["resource_type"]), {}).get("icon", "assets/icons/default.png")
147+
"icon": "/assets" + resource_type_mapping.get(str(item["resource_type"]), {}).get("icon", "/icons/default.png")
148148
}
149149
for item in resource_inventory
150150
}
@@ -342,7 +342,8 @@ def transform_resource_inventory_for_pdf(resource_inventory, resource_type_mappi
342342

343343
resource_name = resource_info.get("name", "Unknown Resource")
344344
# Construct icon_url from the resource_info, default if not found
345-
icon_path = resource_info.get("icon", "/assets/icons/default.png")
345+
icon_path = "/assets" + resource_info.get("icon", "/icons/default.png")
346+
346347
# Prepend report_storage to form the full path to the icon
347348
icon_url = f"{report_path}{icon_path}"
348349

@@ -439,7 +440,7 @@ def transform_alt_tech_for_pdf(resource_inventory, resource_type_mapping, altern
439440
rtype_info = resource_type_mapping.get(rtype_str, {})
440441
resource_name = rtype_info.get("name", "Unknown Resource")
441442

442-
icon_path = rtype_info.get("icon", "/assets/icons/default.png")
443+
icon_path = "/assets" + rtype_info.get("icon", "/icons/default.png")
443444
icon_url = f"{report_path}{icon_path}"
444445

445446
count = alt_counts.get(rtype_str, 0)

datasets/data.db

-688 KB
Binary file not shown.

main.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from utils.constants import REGION_CHOICES, REQUIRED_FIELDS_AZURE, REQUIRED_FIELDS_AWS
2828
from utils.validate import validate_region, validate_config
2929
from utils.azure import select_subscription, select_resource_group, is_azure_cli_logged_in
30+
from utils.data import initialize_dataset
3031

3132
# Configure the root logger to ensure logs propagate from all modules
3233
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
@@ -417,6 +418,9 @@ def main():
417418
# Print ASCII art
418419
console.print(ascii_art, style="bold cyan")
419420

421+
# Ensure latest dataset is available before proceeding
422+
initialize_dataset()
423+
420424
args = parse_arguments()
421425

422426
# Check if the cloud provider is specified

utils/data.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
import os
2+
import gzip
3+
import shutil
4+
import hashlib
5+
import time
6+
import requests
7+
from datetime import datetime, timedelta
8+
from pathlib import Path
9+
from requests.exceptions import RequestException, ConnectionError, Timeout
10+
11+
# Constants
12+
DATASET_FOLDER = Path("datasets")
13+
REMOTE_STORAGE_URL = "https://cloudexit-oss-data-eu.fsn1.your-objectstorage.com"
14+
15+
def get_monday_date(now=None):
    """Return the remote filename of the most recent published weekly dataset.

    Dataset archives are date-stamped by the Monday of their week
    ("cloudexit-YYYY-MM-DD.db.gz").  On a Monday before 08:00 UTC the
    current week's archive is presumably not published yet, so the previous
    week's name is returned instead.

    Args:
        now: Optional datetime to use as "current time".  Defaults to the
            current UTC time; exposed as a parameter mainly for testability.

    Returns:
        The remote object name, e.g. "cloudexit-2024-01-08.db.gz".
    """
    if now is None:
        now = datetime.utcnow()
    # Roll back to the Monday of the current week (weekday(): Monday == 0).
    monday = now - timedelta(days=now.weekday())
    # Before the Monday 08:00 UTC cutoff, fall back to last week's file.
    if now.weekday() == 0 and now.hour < 8:
        monday -= timedelta(days=7)
    return monday.strftime("cloudexit-%Y-%m-%d.db.gz")
24+
25+
def compute_file_hash(filepath):
    """Return the hexadecimal SHA-256 digest of the file at *filepath*."""
    digest = hashlib.sha256()
    with open(filepath, "rb") as fh:
        # Feed the file through in fixed-size chunks to keep memory flat
        # even for large archives.
        while chunk := fh.read(4096):
            digest.update(chunk)
    return digest.hexdigest()
31+
32+
def download_file(url, destination, retries=3, delay=5):
    """Download *url* to *destination*, retrying transient network failures.

    Args:
        url: Remote URL to fetch.
        destination: Local path to write the downloaded bytes to.
        retries: Maximum number of attempts for transient failures.
        delay: Seconds to sleep between attempts.

    Returns:
        True on success, False when every attempt failed.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, stream=True, timeout=30)
            response.raise_for_status()

            with open(destination, "wb") as f:
                # iter_content honours any Content-Encoding the server
                # applied; reading response.raw would hand back the
                # still-encoded bytes.
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            print(f"[INFO] Download successful: {destination}")
            return True

        except ConnectionError:
            print(f"[ERROR] Connection failed while downloading {url}. Retrying ({attempt + 1}/{retries})...")
        except Timeout:
            print(f"[ERROR] Request timed out while downloading {url}. Retrying ({attempt + 1}/{retries})...")
        except RequestException as e:
            # Non-transient HTTP/protocol error: give up immediately.
            print(f"[ERROR] Failed to download {url}: {e}")
            break

        # A failed attempt may have left a truncated file behind; remove it
        # so later checksum comparisons never see partial data.
        if os.path.exists(destination):
            os.remove(destination)
        # Back off before retrying, but not after the final attempt.
        if attempt < retries - 1:
            time.sleep(delay)

    print(f"[ERROR] Unable to download file after {retries} attempts: {url}")
    return False
56+
57+
def fetch_remote_checksum(checksum_url, retries=3, delay=5):
    """Fetch a SHA-256 checksum published at *checksum_url*.

    The remote file is expected to be in "sha256sum" format
    ("<hex digest>  <filename>"); only the first whitespace-separated
    token is returned.

    Args:
        checksum_url: URL of the .sha256 file.
        retries: Maximum number of attempts for transient failures.
        delay: Seconds to sleep between attempts.

    Returns:
        The hex digest string, or None when it cannot be retrieved.
    """
    for attempt in range(retries):
        try:
            response = requests.get(checksum_url, timeout=10)
            response.raise_for_status()
            tokens = response.text.strip().split()
            if tokens:
                return tokens[0]
            # An empty body previously raised an unhandled IndexError;
            # treat it as a failed attempt instead.
            print(f"[ERROR] Empty checksum file at {checksum_url}. Retrying ({attempt + 1}/{retries})...")

        except ConnectionError:
            print(f"[ERROR] Connection failed when fetching {checksum_url}. Retrying ({attempt + 1}/{retries})...")
        except Timeout:
            print(f"[ERROR] Request timed out when fetching {checksum_url}. Retrying ({attempt + 1}/{retries})...")
        except RequestException as e:
            # Non-transient HTTP/protocol error: give up immediately.
            print(f"[ERROR] Failed to fetch {checksum_url}: {e}")
            break

        # Back off before retrying, but not after the final attempt.
        if attempt < retries - 1:
            time.sleep(delay)

    print(f"[ERROR] Unable to fetch remote checksum after {retries} attempts.")
    return None
76+
77+
def initialize_dataset():
    """Ensure the local dataset (datasets/data.db) matches the latest remote build.

    Looks up this week's date-stamped archive on remote storage, falling back
    to the rolling "cloudexit-latest" object when the dated checksum is not
    available.  The archive is downloaded to a temporary file and verified
    against the published SHA-256 checksum *before* any existing local data
    is removed, so a failed or corrupt download never destroys a working
    dataset.  Terminates the process when no dataset exists at all.

    Raises:
        SystemExit: if the dataset folder is empty after the update attempt.
    """
    DATASET_FOLDER.mkdir(exist_ok=True)

    latest_file = get_monday_date()
    latest_file_url = f"{REMOTE_STORAGE_URL}/{latest_file}"
    latest_checksum_url = f"{REMOTE_STORAGE_URL}/{latest_file}.sha256"
    latest_symlink_file = f"{REMOTE_STORAGE_URL}/cloudexit-latest.db.gz"
    latest_symlink_checksum_url = f"{REMOTE_STORAGE_URL}/cloudexit-latest.db.gz.sha256"

    local_db_path = DATASET_FOLDER / "data.db"
    local_compressed_path = DATASET_FOLDER / latest_file

    # Fetch checksum for the date-based file; fall back to the rolling
    # "latest" object when this week's archive is not published yet.
    remote_checksum = fetch_remote_checksum(latest_checksum_url)
    if not remote_checksum:
        print(f"[INFO] Unable to fetch remote checksum from {latest_checksum_url}.")
        print(f"[INFO] Trying latest symlink from {latest_symlink_checksum_url}...")
        remote_checksum = fetch_remote_checksum(latest_symlink_checksum_url)
        latest_file_url = latest_symlink_file
        latest_file = "cloudexit-latest.db.gz"
        local_compressed_path = DATASET_FOLDER / latest_file

    if not remote_checksum:
        print("[ERROR] Unable to fetch any remote checksum. Skipping update.")
    else:
        # Already current? The compressed archive on disk hashing to the
        # remote checksum means data.db was extracted from it previously.
        if local_compressed_path.exists() and compute_file_hash(local_compressed_path) == remote_checksum:
            print("[INFO] Local dataset is up-to-date. No download needed.")
            return

        print("[INFO] Local dataset is outdated. Removing old files and downloading new dataset...")

        # Download to a temporary name first: the previous implementation
        # deleted the working dataset before downloading, so a failed
        # download left the user with no data at all.
        tmp_path = local_compressed_path.with_suffix(".tmp")
        if download_file(latest_file_url, tmp_path):
            # Verify the download against the published checksum before
            # trusting it (the checksum was previously fetched but never
            # used to validate the downloaded bytes).
            if compute_file_hash(tmp_path) != remote_checksum:
                print("[ERROR] Downloaded file failed checksum verification. Keeping existing dataset.")
                os.remove(tmp_path)
            else:
                # Safe to drop stale artifacts now that a good archive exists.
                for old in DATASET_FOLDER.glob("cloudexit-*.db.gz"):
                    os.remove(old)
                os.replace(tmp_path, local_compressed_path)

                print(f"[INFO] Download successful. Extracting dataset from {latest_file}...")
                with gzip.open(local_compressed_path, "rb") as f_in, open(local_db_path, "wb") as f_out:
                    shutil.copyfileobj(f_in, f_out)

                print("[INFO] Dataset updated successfully.")

    # Hard failure: nothing local and nothing fetchable.
    if not any(DATASET_FOLDER.iterdir()):
        print("[ERROR] Dataset folder is empty! Cannot proceed without data.")
        raise SystemExit(1)

0 commit comments

Comments
 (0)