diff --git a/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/LICENSE b/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/LICENSE new file mode 100644 index 000000000..4c17d2626 --- /dev/null +++ b/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/LICENSE @@ -0,0 +1,35 @@ +Copyright (c) 2026 Oracle and/or its affiliates. + +The Universal Permissive License (UPL), Version 1.0 + +Subject to the condition set forth below, permission is hereby granted to any +person obtaining a copy of this software, associated documentation and/or data +(collectively the "Software"), free of charge and under any and all copyright +rights in the Software, and any and all patent rights owned or freely +licensable by each licensor hereunder covering either (i) the unmodified +Software as contributed to or provided by such licensor, or (ii) the Larger +Works (as defined below), to deal in both + +(a) the Software, and +(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +one is included with the Software (each a "Larger Work" to which the Software +is contributed by such licensors), + +without restriction, including without limitation the rights to copy, create +derivative works of, display, perform, and distribute the Software and make, +use, sell, offer for sale, import, export, have made, and have sold the +Software and the Larger Work(s), and to sublicense the foregoing rights on +either these or other terms. + +This license is subject to the following condition: +The above copyright notice and either this complete permission notice or at +a minimum a reference to the UPL must be included in all copies or +substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/README.md b/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/README.md new file mode 100644 index 000000000..59cf24acb --- /dev/null +++ b/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/README.md @@ -0,0 +1,29 @@ +# Overview + +Reviewed: 2026.04.16 + +This project demonstrates how to deploy a machine learning model using the ADS SDK while customizing the default generated model artifacts, which is often required in production scenarios. + +While ADS provides a standard template for model artifacts, real-world use cases frequently require additional logic. In this notebook, we focus on modifying the generated artifacts to incorporate feature engineering directly into the deployment pipeline. + +The main advantage of this approach is that feature engineering is executed as part of the model inference process, eliminating the need to repeat these steps each time the model is invoked. + +Specifically, the notebook covers: +1. Building a model to predict Titanic survival using a Scikit-learn pipeline +2. Preparing model artifacts using the ADS SDK +3. Customizing the generated artifact to include feature engineering in score.py +4. 
Registering, deploying, and invoking the model + +# Environment + +Conda environment: generalml_p311_cpu_x86_64_v1 +Created: April 2026 + +# Prerequisites +- Access to OCI Data Science +- Required IAM permissions for model registration and deployment +- Basic familiarity with Python, Pandas, and Scikit-learn + +# License +Copyright (c) 2026 Oracle and/or its affiliates. +Licensed under the Universal Permissive License (UPL), Version 1.0. diff --git a/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/example for customized artifacts files/runtime(example).yaml b/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/example for customized artifacts files/runtime(example).yaml new file mode 100644 index 000000000..fde374a4d --- /dev/null +++ b/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/example for customized artifacts files/runtime(example).yaml @@ -0,0 +1,23 @@ + +MODEL_ARTIFACT_VERSION: '3.0' +MODEL_DEPLOYMENT: + INFERENCE_CONDA_ENV: + INFERENCE_ENV_PATH: 'oci://pub-conda-env@/conda_environments/cpu/General Machine Learning for CPUs on Python 3.11/1.0/generalml_p311_cpu_x86_64_v1' + INFERENCE_ENV_SLUG: '' + INFERENCE_ENV_TYPE: published + INFERENCE_PYTHON_VERSION: '3.11' +MODEL_PROVENANCE: + PROJECT_OCID: '' + TENANCY_OCID: '' + TRAINING_CODE: + ARTIFACT_DIRECTORY: /home/datascience/custom_ads/sklearn_artifact_dir5 + TRAINING_COMPARTMENT_OCID: '' + TRAINING_CONDA_ENV: + TRAINING_ENV_PATH: '' + TRAINING_ENV_SLUG: '' + TRAINING_ENV_TYPE: '' + TRAINING_PYTHON_VERSION: '' + TRAINING_REGION: '' + TRAINING_RESOURCE_OCID: '' + USER_OCID: '' + VM_IMAGE_INTERNAL_ID: '' diff --git a/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/example for customized artifacts files/score(example).py b/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/example 
for customized artifacts files/score(example).py new file mode 100644 index 000000000..467c7ea5c --- /dev/null +++ b/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/example for customized artifacts files/score(example).py @@ -0,0 +1,163 @@ +# score.py 1.0 generated by ADS 2.11.19 on 20260415_092331 +import json +import os +import cloudpickle +import pandas as pd +import numpy as np +from functools import lru_cache + + +model_name = 'model.pkl' + + +""" + Inference script. This script is used for prediction by scoring server when schema is known. +""" + +def create_features(df: pd.DataFrame) -> pd.DataFrame: + data = df.copy() + if {"sibsp", "parch"}.issubset(data.columns): + data["family_size"] = data["sibsp"].fillna(0) + data["parch"].fillna(0) + 1 + data["is_alone"] = (data["family_size"] == 1).astype(int) + for col in ["age", "fare", "embarked"]: + if col in data.columns: + data[f"{col}_missing"] = data[col].isna().astype(int) + return data + + +@lru_cache(maxsize=10) +def load_model(model_file_name=model_name): + """ + Loads model from the serialized format + + Returns + ------- + model: a model instance on which predict API can be invoked + """ + model_dir = os.path.dirname(os.path.realpath(__file__)) + contents = os.listdir(model_dir) + if model_file_name in contents: + print(f'Start loading {model_file_name} from model directory {model_dir} ...') + with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), model_file_name), "rb") as file: + loaded_model = cloudpickle.load(file) + + print("Model is successfully loaded.") + return loaded_model + else: + raise Exception(f'{model_file_name} is not found in model directory {model_dir}') + +@lru_cache(maxsize=1) +def fetch_data_type_from_schema(input_schema_path=os.path.join(os.path.dirname(os.path.realpath(__file__)), "input_schema.json")): + """ + Returns data type information fetch from input_schema.json. 
+ + Parameters + ---------- + input_schema_path: path of input schema. + + Returns + ------- + data_type: data type fetch from input_schema.json. + + """ + data_type = {} + if os.path.exists(input_schema_path): + schema = json.load(open(input_schema_path)) + for col in schema['schema']: + data_type[col['name']] = col['dtype'] + else: + print("input_schema has to be passed in in order to recover the same data type. pass `X_sample` in `ads.model.framework.sklearn_model.SklearnModel.prepare` function to generate the input_schema. Otherwise, the data type might be changed after serialization/deserialization.") + return data_type + +def deserialize(data, input_schema_path): + """ + Deserialize json serialization data to data in original type when sent to predict. + + Parameters + ---------- + data: serialized input data. + input_schema_path: path of input schema. + + Returns + ------- + data: deserialized input data. + + """ + + import base64 + from io import BytesIO, StringIO # added StringIO + if isinstance(data, bytes): + return data + + data_type = data.get('data_type', '') if isinstance(data, dict) else '' + json_data = data.get('data', data) if isinstance(data, dict) else data + + if "numpy.ndarray" in data_type: + load_bytes = BytesIO(base64.b64decode(json_data.encode('utf-8'))) + return np.load(load_bytes, allow_pickle=True) + if "pandas.core.series.Series" in data_type: + return pd.Series(json_data) + if "pandas.core.frame.DataFrame" in data_type or isinstance(json_data, str): + return pd.read_json(StringIO(json_data), dtype=fetch_data_type_from_schema(input_schema_path)) # add StringIO for better practice + if isinstance(json_data, dict): + return pd.DataFrame.from_dict(json_data) + return json_data + + +def pre_inference(data, input_schema_path): + """ + Preprocess data + + Parameters + ---------- + data: Data format as expected by the predict API of the core estimator. + input_schema_path: path of input schema. 
+ + Returns + ------- + data: Data format after any processing. + + """ + return deserialize(data, input_schema_path) + +def post_inference(yhat): + """ + Post-process the model results + + Parameters + ---------- + yhat: Data format after calling model.predict. + + Returns + ------- + yhat: Data format after any processing. + + """ + if isinstance(yhat, pd.core.frame.DataFrame): + yhat = yhat.values + if isinstance(yhat, np.ndarray): + yhat = yhat.tolist() + return yhat + +def predict(data, model=load_model(), input_schema_path=os.path.join(os.path.dirname(os.path.realpath(__file__)), "input_schema.json")): + """ + Returns prediction given the model and data to predict + + Parameters + ---------- + model: Model instance returned by load_model API. + data: Data format as expected by the predict API of the core estimator. For eg. in case of sckit models it could be numpy array/List of list/Pandas DataFrame. + input_schema_path: path of input schema. + + Returns + ------- + predictions: Output from scoring server + Format: {'prediction': output from model.predict method} + + """ + features = pre_inference(data, input_schema_path) + features = create_features(features) # added to apply the customization + yhat = post_inference( + model.predict(features) + ) + return {'prediction': yhat} \ No newline at end of file diff --git a/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/models.py b/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/models.py new file mode 100644 index 000000000..7a7919aba --- /dev/null +++ b/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/models.py @@ -0,0 +1,80 @@ +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, make_scorer, f1_score +from sklearn.pipeline import Pipeline +from sklearn.model_selection import 
RandomizedSearchCV +import numpy as np + + +def build_model(random_state: int = 42): + """Random Forest classifier - base estimator for HPO.""" + return RandomForestClassifier( + random_state=random_state, + n_jobs=-1 + ) + + +def build_pipeline(preprocessor, model): + """Attach preprocessing and model in one sklearn Pipeline.""" + return Pipeline( + steps=[ + ("preprocess", preprocessor), + ("model", model), + ] + ) + + +def optimize_hyperparameters(pipeline, X_train, y_train, random_state: int = 42): + """ + Run RandomizedSearchCV optimizing for F1 on the minority class (label=1). + Returns the best fitted pipeline. + """ + param_dist = { + "model__n_estimators": [100, 200, 300, 500], + "model__max_depth": [10, 15, 20, 30, None], + "model__min_samples_split": [2, 5, 10, 20], + "model__min_samples_leaf": [1, 2, 4, 8], + "model__max_features": ["sqrt", "log2", 0.3, 0.5], + "model__class_weight": [ + "balanced", + "balanced_subsample", + {0: 1, 1: 2}, # penalize missing class 1 twice as much + {0: 1, 1: 3}, # penalize missing class 1 three times as much + ], + } + + # Optimize for F1 on minority class specifically + scorer = make_scorer(f1_score, pos_label=1) + + search = RandomizedSearchCV( + estimator=pipeline, + param_distributions=param_dist, + n_iter=30, # number of parameter combinations to try + scoring=scorer, + cv=5, # 5-fold cross validation + verbose=2, + random_state=random_state, + n_jobs=-1 + ) + + search.fit(X_train, y_train) + + print(f"\nBest F1 (class 1): {search.best_score_:.4f}") + print(f"Best params: {search.best_params_}") + + return search.best_estimator_ + + +def evaluate_model(pipeline, X_test, y_test): + """Return key evaluation artifacts for test data.""" + y_pred = pipeline.predict(X_test) + y_prob = pipeline.predict_proba(X_test)[:, 1] + + report = classification_report(y_test, y_pred) + cm = confusion_matrix(y_test, y_pred) + auc = roc_auc_score(y_test, y_prob) + + return { + "classification_report": report, + "confusion_matrix": cm, + 
"roc_auc": auc, + } \ No newline at end of file diff --git a/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/pre_processing.py b/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/pre_processing.py new file mode 100644 index 000000000..26382654e --- /dev/null +++ b/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/pre_processing.py @@ -0,0 +1,45 @@ +import pandas as pd +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import OneHotEncoder +from category_encoders import TargetEncoder + + +def create_features(df: pd.DataFrame) -> pd.DataFrame: + """Create a few simple Titanic features for demo purposes.""" + data = df.copy() + + if {"sibsp", "parch"}.issubset(data.columns): + data["family_size"] = data["sibsp"].fillna(0) + data["parch"].fillna(0) + 1 + data["is_alone"] = (data["family_size"] == 1).astype(int) + + for col in ["age", "fare", "embarked"]: + if col in data.columns: + data[f"{col}_missing"] = data[col].isna().astype(int) + + return data + + +def split_column_types( + X: pd.DataFrame, + low_cardinality_threshold: int = 10, +): + """Split columns to numeric / low-card categorical / high-card categorical.""" + numeric_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist() + categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist() + + low_card = [c for c in categorical_cols if X[c].nunique(dropna=True) <= low_cardinality_threshold] + high_card = [c for c in categorical_cols if c not in low_card] + + return numeric_cols, low_card, high_card + + +def build_preprocessor(numeric_cols, low_card_cols, high_card_cols): + """Build ColumnTransformer with OneHot + TargetEncoder + numeric passthrough.""" + return ColumnTransformer( + transformers=[ + ("onehot", OneHotEncoder(handle_unknown="ignore"), low_card_cols), + ("target", TargetEncoder(handle_unknown="value", 
handle_missing="value"), high_card_cols), + ("num", "passthrough", numeric_cols), + ], + remainder="drop", + ) diff --git a/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/train.ipynb b/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/train.ipynb new file mode 100644 index 000000000..916e4a63e --- /dev/null +++ b/data-platform/data-science/oracle-data-science/customizing-ads-model-deployment-artifacts/files/train.ipynb @@ -0,0 +1,427 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c866127d-6ceb-42be-b78d-ac13bfcdea45", + "metadata": {}, + "source": [ + "# Custom ADS SDK Deployment" + ] + }, + { + "cell_type": "markdown", + "id": "cd8d8e0b", + "metadata": {}, + "source": [ + "Author: Assaf Rabinowicz\\\n", + "Date: April 2026" + ] + }, + { + "cell_type": "markdown", + "id": "edf947d5-c698-498e-a10b-b90b5f06b780", + "metadata": {}, + "source": [ + "### The following notebook covers:\n", + "1. Building a model to predict Titanic survival using a Scikit-learn pipeline and validating the model\n", + "2. Preparing model artifacts using the ADS SDK\n", + "3. Customizing the generated model artifact to incorporate feature engineering into the deployment script\n", + "4. 
Registering, deploying, and invoking the model" + ] + }, + { + "cell_type": "markdown", + "id": "1b73720c-427a-4efd-ab30-01b834e81630", + "metadata": {}, + "source": [ + "### Runtime:\n", + "- generalml_p311_cpu_x86_64_v1 \n", + "- Additional libraries were installed, including category_encoders and seaborn" + ] + }, + { + "cell_type": "markdown", + "id": "cba7068b", + "metadata": {}, + "source": [ + "## Packages and Data Import" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d7198a0", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "from sklearn.datasets import fetch_openml\n", + "from sklearn.model_selection import train_test_split\n", + "import requests\n", + "\n", + "import oci\n", + "from oci.auth.signers import get_resource_principals_signer\n", + "import ads\n", + "from ads.model.generic_model import GenericModel\n", + "\n", + "from pre_processing import create_features, split_column_types, build_preprocessor\n", + "from models import build_model, build_pipeline, evaluate_model,optimize_hyperparameters\n", + "\n", + "ads.set_auth(auth='resource_principal')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ea61076", + "metadata": {}, + "outputs": [], + "source": [ + "raw = fetch_openml(name=\"titanic\", version=1, as_frame=True)\n", + "df = raw.frame.copy()" + ] + }, + { + "cell_type": "markdown", + "id": "2aa23ea3", + "metadata": {}, + "source": [ + "## Data Processing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27d8184b", + "metadata": {}, + "outputs": [], + "source": [ + "df[\"survived\"] = pd.to_numeric(df[\"survived\"], errors=\"coerce\")\n", + "df = df.dropna(subset=[\"survived\"])\n", + "df[\"survived\"] = df[\"survived\"].astype(int)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88f2d1be", + "metadata": {}, + "outputs": [], + 
"source": [ + "df_processed = create_features(df)\n", + "\n", + "target_col = \"survived\"\n", + "X = df_processed.drop(columns=[target_col])\n", + "y = df_processed[target_col]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fc6997b", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y,\n", + " test_size=0.2,\n", + " random_state=42,\n", + " stratify=y\n", + ")\n", + "\n", + "numeric_cols, low_card_cols, high_card_cols = split_column_types(X_train)\n", + "\n", + "print(f\"Train shape: {X_train.shape}, Test shape: {X_test.shape}\")\n", + "print(f\"Low-card categorical columns: {low_card_cols}\")\n", + "print(f\"High-card categorical columns: {high_card_cols}\")" + ] + }, + { + "cell_type": "markdown", + "id": "a2b00dbe", + "metadata": {}, + "source": [ + "## Modeling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a348357", + "metadata": {}, + "outputs": [], + "source": [ + "preprocessor = build_preprocessor(numeric_cols, low_card_cols, high_card_cols) # build_preprocessor uses category_encoders library. 
You can install it, or alternatively use OneHotEncoder for all categorical variables\n", + "model = build_model(random_state=42) # using RandomForestClassifier model\n", + "pipeline = build_pipeline(preprocessor, model)\n", + "best_pipeline = optimize_hyperparameters(pipeline, X_train, y_train) #best model of hyperparameter optimization is selected" + ] + }, + { + "cell_type": "markdown", + "id": "7161af1e-6789-4a02-8713-49a87843671b", + "metadata": {}, + "source": [ + "## Validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cc052b6", + "metadata": {}, + "outputs": [], + "source": [ + "results = evaluate_model(best_pipeline, X_test, y_test)\n", + "\n", + "print(\"Classification report:\")\n", + "print(results[\"classification_report\"])\n", + "print(f\"ROC-AUC: {results['roc_auc']:.4f}\")\n", + "\n", + "cm = results[\"confusion_matrix\"]\n", + "plt.figure(figsize=(5, 4))\n", + "sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\")\n", + "plt.title(\"Confusion Matrix (Test Set)\")\n", + "plt.xlabel(\"Predicted\")\n", + "plt.ylabel(\"Actual\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "cba4ec45", + "metadata": {}, + "source": [ + "## Preparing Model Artifacts Using ADS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "923a7d59-1b3b-4f4b-b28b-167901149e02", + "metadata": {}, + "outputs": [], + "source": [ + "X_sample = X.iloc[[0]]\n", + "artifact_dir = ''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dce3811b-c3a1-44eb-9607-a46596004312", + "metadata": {}, + "outputs": [], + "source": [ + "# Here we use GenericModel for creating the model artifacts. 
\n", + "#Another alternative for sklearn models\\pipelines is: ads.model.framework.sklearn_model.SklearnModel\n", + "artifact_object = GenericModel(estimator=best_pipeline,\n", + " artifact_dir=artifact_dir)" + ] + }, + { + "cell_type": "markdown", + "id": "c3db7e53-b99b-463b-875e-26afc8aa71fc", + "metadata": {}, + "source": [ + "### Important Note:\n", + "- If you specify inference_conda_env=generalml_p311_cpu_x86_64_v1 in the prepare function below, the base version of this environment will be used, which does not include the category_encoders library.\n", + "- Therefore, if your scoring depends on category_encoders, you must publish an updated conda environment that includes this library and provide its URL, as shown in the example below.\n", + "- For more details on publishing a conda environment, see: https://docs.oracle.com/en-us/iaas/Content/data-science/using/conda_publishs_object.htm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0501418f-1406-4d25-8ff4-56a557ce9833", + "metadata": {}, + "outputs": [], + "source": [ + "artifact_object.prepare(\n", + " inference_conda_env='oci://pub-conda-env@/cpu/generalml_p311_cpu_x86_64_v1/1.0/generalml_p311_cpu_x86_64_v1',\n", + " inference_python_version=\"3.11\",\n", + " X_sample=X_sample,\n", + " force_overwrite=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7b83c740-e1bf-47db-80fa-39dcf41eeb82", + "metadata": {}, + "source": [ + "## Customizing the Model Artifacts" + ] + }, + { + "cell_type": "markdown", + "id": "e0286770-3077-4243-9a23-5901d6517127", + "metadata": {}, + "source": [ + "1. Add the create_features logic in the score.py:\n", + " 1. Copy the create_features function from pre_processing.py and paste it into score.py\n", + " 2. 
Modify the predict function to call create_features after pre_inference:\\\n", + " features = pre_inference(data, input_schema_path)\\\n", + " features = create_features(features) # added to apply the customization\n", + " yhat = post_inference(model.predict(features))\n", + "2. Update the deserialize function in score.py\n", + " 1. Add 'from io import StringIO'\n", + " 2. Wrap json_data in StringIO in the string branch:\\\n", + " if \"pandas.core.frame.DataFrame\" in data_type or isinstance(json_data, str):\\\n", + " return pd.read_json(StringIO(json_data), dtype=fetch_data_type_from_schema(input_schema_path)) # add StringIO for better practice\n", + "3. If scoring requires additional libraries (like category_encoders in our case):\n", + " 1. Publish your custom conda environment\n", + " 2. Update the runtime.yaml file by replacing the INFERENCE_ENV_PATH value with the path to your published custom conda environment. Example:\\\n", + " INFERENCE_ENV_PATH: 'oci://pub-conda-env@<'your_name_space'>/conda_environments/cpu/General Machine Learning for CPUs on Python 3.11/1.0/generalml_p311_cpu_x86_64_v1'\n", + "4. Another option to modify score.py is to add the score_py_uri argument to the prepare() method rather than adjusting it ad hoc. The score_py_uri argument points to a pre-created score.py file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e8e5a70-8874-4bd1-a779-7513eeba4e17", + "metadata": {}, + "outputs": [], + "source": [ + "# Reloading is required in order to update the model in memory for the next steps\n", + "artifact_object.reload()" + ] + }, + { + "cell_type": "markdown", + "id": "27d769a9-5868-4248-a424-9613297fb1ba", + "metadata": {}, + "source": [ + "## Testing the Model Locally" + ] + }, + { + "cell_type": "markdown", + "id": "384e5df1-ee41-4bcd-9ea2-b4a1b323b122", + "metadata": {}, + "source": [ + "Two tests are recommended:\n", + "1. 
Verify method – validates the logic in the score.py file by testing the model’s inference pipeline end-to-end.\n", + "2. Introspect – ensures that the model artifact directory is correctly structured and contains all required components.\n", + "\n", + "Both tests must pass before model registration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8040e868-f590-4814-9740-3f53e51495fd", + "metadata": {}, + "outputs": [], + "source": [ + "sample = raw.frame.iloc[[0]].copy()\n", + "\n", + "test_input = {\n", + " \"data\": sample.to_json(orient=\"records\")\n", + "}\n", + "artifact_object.verify(test_input)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71c9b2e9-8e8c-414b-8933-acccf558aff3", + "metadata": {}, + "outputs": [], + "source": [ + "artifact_object.introspect()" + ] + }, + { + "cell_type": "markdown", + "id": "c1d16ea8-5252-41e9-8fa4-add962b143a7", + "metadata": {}, + "source": [ + "## Saving and Deploying the Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1866b365-fd66-400d-9693-18ae69979f8e", + "metadata": {}, + "outputs": [], + "source": [ + "model_id = artifact_object.save(\n", + " display_name=\"titanic-survived-pipeline\", # All arguments are optional; if not provided, they will be replaced with default values.\n", + " description=\"sklearn classification pipeline with feature engineering\",\n", + " ignore_pending_changes=True)\n", + "\n", + "print(f\"Model saved: {model_id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adc1f0cd-0d8b-48ba-a738-284416249924", + "metadata": {}, + "outputs": [], + "source": [ + "deployed_model = artifact_object.deploy(\n", + " display_name=\"titanic-survived-pipeline deployment\", # All arguments are optional; if not provided, they will be replaced with default values.\n", + " deployment_log_group_id=\"\",\n", + " deployment_predict_log_id=\"\",\n", + " deployment_instance_shape=\"VM.Standard.E4.Flex\",\n", + " 
deployment_ocpus=1,\n", + " deployment_memory_in_gbs=16\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c1bd0486-5fca-45b0-abfd-1ac452f210df", + "metadata": {}, + "source": [ + "## Scoring" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd8b12d6-c9f4-498b-805d-7ad4587e7e8f", + "metadata": {}, + "outputs": [], + "source": [ + "endpoint = deployed_model.url + \"/predict\" # you can also find the endpoint in the UI Console\n", + "\n", + "signer = get_resource_principals_signer()\n", + "response = requests.post(\n", + " endpoint,\n", + " json=test_input,\n", + " auth=signer\n", + ")\n", + "print(response.json())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:generalml_p311_cpu_x86_64_v1]", + "language": "python", + "name": "conda-env-generalml_p311_cpu_x86_64_v1-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data-platform/data-science/oracle-data-science/your-first-data-science-project/files/Guide for Your First Data Science Project prerequisites.pdf b/data-platform/data-science/oracle-data-science/your-first-data-science-project/files/Guide for Your First Data Science Project prerequisites.pdf index 26650eeab..ab9dab655 100644 Binary files a/data-platform/data-science/oracle-data-science/your-first-data-science-project/files/Guide for Your First Data Science Project prerequisites.pdf and b/data-platform/data-science/oracle-data-science/your-first-data-science-project/files/Guide for Your First Data Science Project prerequisites.pdf differ diff --git a/data-platform/data-science/oracle-data-science/your-first-data-science-project/files/adult_income shared.ipynb 
b/data-platform/data-science/oracle-data-science/your-first-data-science-project/files/adult_income shared.ipynb index e2acb29c6..4aefeca28 100644 --- a/data-platform/data-science/oracle-data-science/your-first-data-science-project/files/adult_income shared.ipynb +++ b/data-platform/data-science/oracle-data-science/your-first-data-science-project/files/adult_income shared.ipynb @@ -963,6 +963,26 @@ "#automl_model.deploy(display_name=\"Demo Adults Income Model 1\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "deccf815-83fa-4b43-ac12-11a8dae5f956", + "metadata": {}, + "outputs": [], + "source": [ + "# # You can customize the deployment:\n", + "# deployed_model = automl_model.deploy(\n", + "# display_name=\"Demo Adults Income Model 1\",\n", + "# deployment_log_group_id=\"\",\n", + "# deployment_predict_log_id=\"\",\n", + "# deployment_instance_shape=\"VM.Standard.E4.Flex\", # the vm shape\n", + "# deployment_ocpus=1,\n", + "# deployment_memory_in_gbs=16,\n", + "# defined_tags={\"default_tags\": {}}, # please keep it\n", + "# freeform_tags={}\n", + "# )" + ] + }, { "cell_type": "markdown", "id": "8d4757f6-7509-4516-b5c0-9eb635965a6b",