Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ dependencies:
- python=3.8.15
- pip<=21.2.4
- pip:
- mlflow
- mlflow==2.4.1
- cloudpickle==2.2.0
- psutil==5.8.0
- scikit-learn==0.24.2
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
mlflow
mlflow==2.4.1
cloudpickle==2.2.0
psutil==5.8.0
scikit-learn==0.24.2
scikit-learn==0.24.2
92 changes: 42 additions & 50 deletions tutorials/get-started-notebooks/pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@
" description=\"Custom environment for Credit Card Defaults pipeline\",\n",
" tags={\"scikit-learn\": \"0.24.2\"},\n",
" conda_file=os.path.join(dependencies_dir, \"conda.yaml\"),\n",
" image=\"mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest\",\n",
" image=\"mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04:latest\",\n",
" version=\"0.2.0\",\n",
")\n",
"pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)\n",
Expand Down Expand Up @@ -536,97 +536,89 @@
},
"outputs": [],
"source": [
"\n",
"%%writefile {train_src_dir}/train.py\n",
"import argparse\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.metrics import classification_report\n",
"import os\n",
"import pandas as pd\n",
"import numpy as np\n",
"import mlflow\n",
"\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.metrics import classification_report, accuracy_score, f1_score\n",
"\n",
"\n",
"def select_first_file(path):\n",
" \"\"\"Selects first file in folder, use under assumption there is only one file in folder\n",
" Args:\n",
" path (str): path to directory or file to choose\n",
" Returns:\n",
" str: full path of selected file\n",
" \"\"\"\n",
" \"\"\"Selects first file in folder (assumes single file).\"\"\"\n",
" files = os.listdir(path)\n",
" return os.path.join(path, files[0])\n",
"\n",
"\n",
"# Start Logging\n",
"mlflow.start_run()\n",
"\n",
"# enable autologging\n",
"mlflow.sklearn.autolog()\n",
"\n",
"os.makedirs(\"./outputs\", exist_ok=True)\n",
"\n",
"\n",
"def main():\n",
" \"\"\"Main function of the script.\"\"\"\n",
"\n",
" # input and output arguments\n",
" parser = argparse.ArgumentParser()\n",
" parser.add_argument(\"--train_data\", type=str, help=\"path to train data\")\n",
" parser.add_argument(\"--test_data\", type=str, help=\"path to test data\")\n",
" parser.add_argument(\"--n_estimators\", required=False, default=100, type=int)\n",
" parser.add_argument(\"--learning_rate\", required=False, default=0.1, type=float)\n",
" parser.add_argument(\"--registered_model_name\", type=str, help=\"model name\")\n",
" parser.add_argument(\"--model\", type=str, help=\"path to model file\")\n",
"\n",
" # ✅ Keep this arg to avoid breaking existing job/pipeline definitions\n",
" # Option A will NOT use it to register via MLflow.\n",
" parser.add_argument(\"--registered_model_name\", type=str, required=False, help=\"model name (ignored in Option A)\")\n",
"\n",
" parser.add_argument(\"--model\", type=str, help=\"path to model output folder\")\n",
" args = parser.parse_args()\n",
"\n",
" # paths are mounted as folder, therefore, we are selecting the file from folder\n",
" train_df = pd.read_csv(select_first_file(args.train_data))\n",
" # ✅ Option A: Disable autologging to avoid AML backend unsupported endpoints\n",
" mlflow.sklearn.autolog(disable=True)\n",
"\n",
" # Extracting the label column\n",
" y_train = train_df.pop(\"default payment next month\")\n",
" os.makedirs(\"./outputs\", exist_ok=True)\n",
"\n",
" # convert the dataframe values to array\n",
" X_train = train_df.values\n",
" # Start run (Azure ML handles tracking URI)\n",
" mlflow.start_run()\n",
"\n",
" # paths are mounted as folder, therefore, we are selecting the file from folder\n",
" test_df = pd.read_csv(select_first_file(args.test_data))\n",
" # Optional: record the intended model name as a tag for traceability\n",
" if args.registered_model_name:\n",
" mlflow.set_tag(\"registered_model_name_requested\", args.registered_model_name)\n",
"\n",
" # Extracting the label column\n",
" y_test = test_df.pop(\"default payment next month\")\n",
" # Load data\n",
" train_df = pd.read_csv(select_first_file(args.train_data))\n",
" y_train = train_df.pop(\"default payment next month\").to_numpy()\n",
" X_train = train_df.values\n",
"\n",
" # convert the dataframe values to array\n",
" test_df = pd.read_csv(select_first_file(args.test_data))\n",
" y_test = test_df.pop(\"default payment next month\").to_numpy()\n",
" X_test = test_df.values\n",
"\n",
" print(f\"Training with data of shape {X_train.shape}\")\n",
"\n",
" # Train\n",
" clf = GradientBoostingClassifier(\n",
" n_estimators=args.n_estimators, learning_rate=args.learning_rate\n",
" n_estimators=args.n_estimators,\n",
" learning_rate=args.learning_rate\n",
" )\n",
" clf.fit(X_train, y_train)\n",
"\n",
" # Evaluate\n",
" y_pred = clf.predict(X_test)\n",
"\n",
" print(classification_report(y_test, y_pred))\n",
"\n",
" # Registering the model to the workspace\n",
" print(\"Registering the model via MLFlow\")\n",
" mlflow.sklearn.log_model(\n",
" sk_model=clf,\n",
" registered_model_name=args.registered_model_name,\n",
" artifact_path=args.registered_model_name,\n",
" )\n",
" accuracy = accuracy_score(y_test, y_pred)\n",
" f1 = f1_score(y_test, y_pred)\n",
"\n",
" # Saving the model to a file\n",
" mlflow.sklearn.save_model(\n",
" sk_model=clf,\n",
" path=os.path.join(args.model, \"trained_model\"),\n",
" )\n",
" # Log metrics only (safe)\n",
" mlflow.log_metric(\"accuracy\", accuracy)\n",
" mlflow.log_metric(\"f1_score\", f1)\n",
"\n",
" # Stop Logging\n",
" # ✅ Save model artifact (still using MLflow serialization, but NOT registering)\n",
" model_output_path = os.path.join(args.model, \"trained_model\")\n",
" mlflow.sklearn.save_model(sk_model=clf, path=model_output_path)\n",
"\n",
" print(f\"Model saved to {model_output_path}\")\n",
" mlflow.end_run()\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" main()"
" main()\n"
]
},
{
Expand Down
Loading