BLAST-AI-ML
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/README.md b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/dashboard/README.md b/dashboard/README.md
Lines changed: 4 additions & 3 deletions
diff --git a/dashboard/app.py b/dashboard/app.py
Lines changed: 11 additions & 5 deletions
diff --git a/dashboard/error_manager.py b/dashboard/error_manager.py
Lines changed: 6 additions & 1 deletion
diff --git a/dashboard/model_manager.py b/dashboard/model_manager.py
Lines changed: 82 additions & 104 deletions
diff --git a/dashboard/utils.py b/dashboard/utils.py
Lines changed: 5 additions & 30 deletions
@@ -1,7 +1,7 @@
 repos:
 - repo: https://github.com/astral-sh/ruff-pre-commit
   # Ruff version
-  rev: v0.15.2
+  rev: v0.15.7
   hooks:
     # Run the linter
     - id: ruff-check
 
@@ -25,7 +25,7 @@ To display ML predictions, the application requires the following:
 - **Simulation and experimental data points**: Each data point consists of values for the scalar inputs and outputs defined in the experiment configuration file.
 Data points are stored in a [MongoDB](https://www.mongodb.com/) database, where each experiment is represented by a separate collection.
 Experimental and simulation data points are stored in the same collection and are distinguished by the `experimental_flag` attribute.
-- **ML models**: Machine learning models that interpolate between data points and are stored in a separate MongoDB collection named `models`.
+- **ML models**: Machine learning models that interpolate between data points and are stored in [MLflow](https://mlflow.org/).
 - **Simulation movies** (optional): For certain experiments, users can click on simulation data points to visualize simulation movies.
 The corresponding MP4 files are stored in the Perlmutter shared file system at `/global/cfs/cdirs/m558/superfacility/simulation_data`.
 This directory is mounted on the container image running on Spin.
 
@@ -54,10 +54,11 @@ conda-lock install --name synapse-gui environment-lock.yml
 
 2. Move to the [dashboard/](./) directory.
 
-3. Set up the database settings (read-only):
+3. Set up the database settings (read-only) and the AmSC MLflow API key:
    ```bash
    export SF_DB_HOST='127.0.0.1'
    export SF_DB_READONLY_PASSWORD='your_password_here'  # Use SINGLE quotes around the password!
+   export AM_SC_API_KEY='your_amsc_api_key_here'        # Required when the MLflow tracking_uri points to the AmSC server
    ```
 
 4. Activate the conda environment `synapse-gui`:
@@ -85,11 +86,11 @@ conda-lock install --name synapse-gui environment-lock.yml
 
 4. Run the Docker container:
    ```bash
-   docker run --network=host -v /etc/localtime:/etc/localtime -v $PWD/ml:/app/ml -e SF_DB_HOST='127.0.0.1' -e SF_DB_READONLY_PASSWORD='your_password_here' synapse-gui
+   docker run --network=host -v /etc/localtime:/etc/localtime -v $PWD/ml:/app/ml -e SF_DB_HOST='127.0.0.1' -e SF_DB_READONLY_PASSWORD='your_password_here' -e AM_SC_API_KEY='your_amsc_api_key_here' synapse-gui
    ```
    For debugging, you can enter the container without starting the app:
    ```bash
-   docker run --network=host -v /etc/localtime:/etc/localtime -v $PWD/ml:/app/ml -e SF_DB_HOST='127.0.0.1' -e SF_DB_READONLY_PASSWORD='your_password_here' -it synapse-gui bash
+   docker run --network=host -v /etc/localtime:/etc/localtime -v $PWD/ml:/app/ml -e SF_DB_HOST='127.0.0.1' -e SF_DB_READONLY_PASSWORD='your_password_here' -e AM_SC_API_KEY='your_amsc_api_key_here' -it synapse-gui bash
    ```
    Note that `-v /etc/localtime:/etc/localtime` is necessary to synchronize the time zone in the container with the host machine.
 
 
@@ -6,7 +6,7 @@
 from trame.ui.vuetify3 import SinglePageWithDrawerLayout
 from trame.widgets import plotly, router, vuetify3 as vuetify, html
 
-from model_manager import ModelManager
+from model_manager import ModelManager, model_type_tag_dict
 from outputs_manager import OutputManager
 from optimization_manager import OptimizationManager
 from parameters_manager import ParametersManager
@@ -16,6 +16,7 @@
 from error_manager import error_panel, add_error
 from utils import (
     data_depth_panel,
+    load_config_dict,
     load_experiments,
     load_database,
     load_data,
@@ -64,14 +65,18 @@ def update(
         state.experiment
     )
     # load data
-    db = load_database(state.experiment)
-    exp_data, sim_data = load_data(db)
+    config_dict = load_config_dict(state.experiment)
+    db = load_database(config_dict)
+    exp_data, sim_data = load_data(db, state.experiment, state.experiment_date_range)
     # reset output
     if reset_output:
         out_manager = OutputManager(output_variables)
     # reset model
     if reset_model:
-        mod_manager = ModelManager(db)
+        mod_manager = ModelManager(
+            config_dict=config_dict,
+            model_type_tag=model_type_tag_dict[state.model_type],
+        )
         opt_manager = OptimizationManager(mod_manager)
     # reset parameters
     if reset_parameters:
@@ -257,7 +262,8 @@ def find_simulation(event, db):
 
 
 def open_simulation_dialog(event):
-    db = load_database(state.experiment)
+    config_dict = load_config_dict(state.experiment)
+    db = load_database(config_dict)
     try:
         data_directory, file_path = find_simulation(event, db)
         state.simulation_video = file_path.endswith(".mp4")
 
@@ -1,8 +1,13 @@
 from trame.widgets import vuetify3 as vuetify, html
-from state_manager import state
+from state_manager import state, server
 
 
 def add_error(title, msg):
+    if not server.running:
+        # Outside of a Trame app (e.g. check_model.py), raise a Python error
+        # to surface the error to the caller.
+        raise RuntimeError(f"{title}: {msg}")
+    # Otherwise: Inside a Trame app, add the error to the state.
     state.errors.append(
         {
             "id": state.error_counter,
 
@@ -5,13 +5,11 @@
 import os
 import yaml
 import re
+import mlflow
 from sfapi_client import AsyncClient
 from sfapi_client.compute import Machine
-from lume_model.models.torch_model import TorchModel
-from lume_model.models.ensemble import NNEnsemble
-from lume_model.models.gp_model import GPModel
 from trame.widgets import vuetify3 as vuetify
-from utils import verify_input_variables, timer, load_config_dict, create_date_filter
+from utils import timer, load_config_dict, create_date_filter
 from error_manager import add_error
 from sfapi_manager import monitor_sfapi_job
 from state_manager import state
@@ -23,107 +21,96 @@
 }
 
 
+def enable_amsc_x_api_key(config_dict):
+    """
+    MLflow authentication helper for the AmSC MLflow server.
+    Standard MLflow does not automatically inject custom headers like 'X-Api-Key'.
+    This patches the http_request function to ensure every request to the server
+    includes the AmSC API key.
+
+    See https://gitlab.com/amsc2/ai-services/model-services/intro-to-mlflow-pytorch for more details.
+    """
+    import mlflow.utils.rest_utils as rest_utils
+
+    mlflow_cfg = config_dict.get("mlflow") or {}
+    api_key_env = mlflow_cfg.get("api_key_env")
+    if not api_key_env:
+        title = "Unable to enable AmSC X-Api-Key authentication"
+        msg = "MLFlow configuration is missing 'mlflow.api_key_env'"
+        add_error(title, msg)
+        print(msg)
+        return
+
+    api_key = os.environ.get(api_key_env)
+    if not api_key:
+        title = "Unable to enable AmSC X-Api-Key authentication"
+        msg = f"Environment variable '{api_key_env}' in 'mlflow.api_key_env' is not set"
+        add_error(title, msg)
+        print(msg)
+        return
+    _orig = rest_utils.http_request
+
+    def patched(host_creds, endpoint, method, *args, **kwargs):
+        if "headers" in kwargs and kwargs["headers"] is not None:
+            h = dict(kwargs["headers"])
+            h["X-Api-Key"] = api_key
+            kwargs["headers"] = h
+        else:
+            h = dict(kwargs.get("extra_headers") or {})
+            h["X-Api-Key"] = api_key
+            kwargs["extra_headers"] = h
+        return _orig(host_creds, endpoint, method, *args, **kwargs)
+
+    rest_utils.http_request = patched
+
+
 class ModelManager:
-    def __init__(self, db):
+    def __init__(self, config_dict, model_type_tag):
         print("Initializing model manager...")
-        # Set initial default values
         self.__model = None
         self.__is_neural_network = False
         self.__is_gaussian_process = False
         self.__is_neural_network_ensemble = False
+        self.__model_type_tag = model_type_tag
 
-        # Download model information from the database
-        collection = db["models"]
-        model_type_tag = model_type_tag_dict[state.model_type]
-        query = {"experiment": state.experiment, "model_type": model_type_tag}
-        count = collection.count_documents(query)
-
-        if count == 0:
-            print(
-                f"No model found for experiment: {state.experiment} and model type: {model_type_tag}"
-            )
-            return
-        elif count > 1:
+        if "mlflow" not in config_dict or not config_dict["mlflow"].get("tracking_uri"):
             print(
-                f"Multiple models found ({count}) for experiment: {state.experiment} and model type: {model_type_tag}!"
+                f"No mlflow.tracking_uri in configuration file for {config_dict['experiment']}; cannot load model from MLflow."
             )
             return
 
-        # Load model information from the database
-        document = collection.find_one(query)
-        # Save model files in a temporary directory,
-        # so that it can then be loaded with lume_model
-        with tempfile.TemporaryDirectory() as temp_dir:
-            # Open content of the top-level YAML file
-            yaml_file_content = document["yaml_file_content"]
-            model_filename = f"{state.experiment}.yml"
-            with open(os.path.join(temp_dir, model_filename), "w") as f:
-                f.write(yaml_file_content)
+        mlflow.set_tracking_uri(config_dict["mlflow"]["tracking_uri"])
+        # When using the AmSC MLflow: inject the X-Api-Key into the requests to authenticate with the MLflow server
+        # (See https://gitlab.com/amsc2/ai-services/model-services/intro-to-mlflow-pytorch)
+        if (
+            config_dict["mlflow"]["tracking_uri"]
+            == "https://mlflow.american-science-cloud.org"
+        ):
+            enable_amsc_x_api_key(config_dict)
 
-            # Extract list of files to download
-            files_to_download = []
-            if state.model_type == "Neural Network (ensemble)":
-                models_info = yaml.safe_load(yaml_file_content)
-                # Download yaml file for each model within the ensemble
-                for model in models_info["models"]:
-                    yaml_file_name = model.replace("_model.jit", ".yml")
-                    with open(os.path.join(temp_dir, yaml_file_name), "wb") as f:
-                        f.write(document[yaml_file_name])
-                    model_info = yaml.safe_load(document[yaml_file_name])
-                    # Extract files to download
-                    files_to_download += (
-                        [model_info["model"]]
-                        + model_info["input_transformers"]
-                        + model_info["output_transformers"]
-                    )
-            else:
-                # Extract files to download
-                model_info = yaml.safe_load(yaml_file_content)
-                files_to_download = (
-                    [model_info["model"]]
-                    + model_info["input_transformers"]
-                    + model_info["output_transformers"]
-                )
-
-            # Download all the files that define the model(s)
-            for filename in files_to_download:
-                with open(os.path.join(temp_dir, filename), "wb") as f:
-                    f.write(document[filename])
+        experiment = config_dict["experiment"]
+        model_name = f"{experiment}_{model_type_tag}"
 
-            # Check consistency of the model file
-            print("Reading model file...")
-            model_file = os.path.join(temp_dir, f"{state.experiment}.yml")
-            if not os.path.isfile(model_file):
-                title = f"Model file {model_file} not found"
-                msg = f"Unable to find the model file for {state.experiment}"
-                add_error(title, msg)
-                print(msg)
-                return
-            elif not verify_input_variables(model_file, state.experiment):
-                title = "Model file input variable mismatch"
-                msg = f"Model file {model_file} has different input variables than the configuration file for {state.experiment}"
-                add_error(title, msg)
-                print(msg)
-                return
-
-            # Load model with lume_model
-            try:
-                if state.model_type == "Neural Network (single)":
-                    self.__is_neural_network = True
-                    self.__model = TorchModel(model_file)
-                elif state.model_type == "Neural Network (ensemble)":
-                    self.__is_neural_network_ensemble = True
-                    self.__model = NNEnsemble(model_file)
-                elif state.model_type == "Gaussian Process":
-                    self.__is_gaussian_process = True
-                    self.__model = GPModel.from_yaml(model_file)
-                else:
-                    raise ValueError(f"Unsupported model type: {state.model_type}")
-            except Exception as e:
-                title = f"Unable to load model {state.model_type}"
-                msg = f"Error occurred when loading model: {e}"
-                add_error(title, msg)
-                print(msg)
+        try:
+            # Download model from MLflow server
+            self.__model = (
+                mlflow.pyfunc.load_model(f"models:/{model_name}/latest")
+                .unwrap_python_model()
+                .model
+            )
+            if model_type_tag == "NN":
+                self.__is_neural_network = True
+            elif model_type_tag == "ensemble_NN":
+                self.__is_neural_network_ensemble = True
+            elif model_type_tag == "GP":
+                self.__is_gaussian_process = True
+            else:
+                raise ValueError(f"Unsupported model type: {model_type_tag}")
+        except Exception as e:
+            title = f"Unable to load model {model_type_tag}"
+            msg = f"Error occurred when loading model from MLflow: {e}"
+            add_error(title, msg)
+            print(msg)
 
     def avail(self):
         print("Checking model availability...")
@@ -153,22 +140,13 @@ def evaluate(self, parameters, output):
                 mean = output_dict[output]
                 mean_error = 0.0  # trick to collapse error range when lower/upper bounds are not predicted
             elif self.__is_gaussian_process or self.__is_neural_network_ensemble:
-                if self.__is_gaussian_process:
-                    # TODO use "exp" only once experimental data is available for all experiments
-                    task_tag = "exp" if state.experiment == "bella-ip2" else "sim"
-                    output_key = [key for key in output_dict.keys() if task_tag in key][
-                        0
-                    ]
-                elif self.__is_neural_network_ensemble:
-                    output_key = list(output_dict.keys())[0]
-
                 # compute mean, standard deviation and mean error
                 # (call detach method to detach gradients from tensors)
-                mean = output_dict[output_key].mean.detach()
-                std_dev = output_dict[output_key].variance.sqrt().detach()
+                mean = output_dict[output].mean.detach()
+                std_dev = output_dict[output].variance.sqrt().detach()
                 mean_error = 2.0 * std_dev
             else:
-                raise ValueError(f"Unsupported model type: {state.model_type}")
+                raise ValueError(f"Unsupported model type: {self.__model_type_tag}")
             # compute lower/upper bounds for error range
             lower = mean - mean_error
             upper = mean + mean_error
 
@@ -103,15 +103,13 @@ def create_date_filter(experiment_date_range):
 
 
 @timer
-def load_data(db):
+def load_data(db, experiment, date_range=None):
     print("Loading data from database...")
     # create date filter if date range is set
-    date_filter = create_date_filter(state.experiment_date_range)
+    date_filter = create_date_filter(date_range)
     # load experiment and simulation data points in dataframes
-    exp_data = pd.DataFrame(
-        db[state.experiment].find({"experiment_flag": 1, **date_filter})
-    )
-    sim_data = pd.DataFrame(db[state.experiment].find({"experiment_flag": 0}))
+    exp_data = pd.DataFrame(db[experiment].find({"experiment_flag": 1, **date_filter}))
+    sim_data = pd.DataFrame(db[experiment].find({"experiment_flag": 0}))
     # Store '_id', 'date' as string
     for key in ["_id", "date"]:
         if key in exp_data.columns:
@@ -121,32 +119,9 @@ def load_data(db):
     return (exp_data, sim_data)
 
 
-def verify_input_variables(model_file, experiment):
-    print("Checking model consistency...")
-    # read configuration file
-    input_vars, _, _ = load_variables(experiment)
-    config_vars = [input_var["name"] for input_var in input_vars.values()]
-    config_vars.sort()
-    # read model file
-    with open(model_file) as f:
-        model_str = f.read()
-    # load model dictionary
-    model_dict = yaml.safe_load(model_str)
-    # load model input variables list
-    model_vars = list(model_dict["input_variables"].keys())
-    model_vars.sort()
-    # check if configuration list and model list match
-    match = config_vars == model_vars
-    if not match:
-        print("Input variables in configuration file and model file do not match")
-    return match
-
-
 @timer
-def load_database(experiment):
+def load_database(config_dict):
     print("Loading database...")
-    # load configuration dictionary
-    config_dict = load_config_dict(experiment)
     # read database information from configuration dictionary
     db_host = config_dict["database"]["host"]
     db_port = config_dict["database"]["port"]