Skip to content

Commit 6dacb34

Browse files
authored
Merge pull request #2787 from oracle-devrel/oci-sdk-deployment
Initial commit for customizing ads deployment and changes for your-fi…
2 parents c3405af + 4c74b28 commit 6dacb34

File tree

9 files changed

+822
-0
lines changed

9 files changed

+822
-0
lines changed
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
Copyright (c) 2026 Oracle and/or its affiliates.
2+
3+
The Universal Permissive License (UPL), Version 1.0
4+
5+
Subject to the condition set forth below, permission is hereby granted to any
6+
person obtaining a copy of this software, associated documentation and/or data
7+
(collectively the "Software"), free of charge and under any and all copyright
8+
rights in the Software, and any and all patent rights owned or freely
9+
licensable by each licensor hereunder covering either (i) the unmodified
10+
Software as contributed to or provided by such licensor, or (ii) the Larger
11+
Works (as defined below), to deal in both
12+
13+
(a) the Software, and
14+
(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
15+
one is included with the Software (each a "Larger Work" to which the Software
16+
is contributed by such licensors),
17+
18+
without restriction, including without limitation the rights to copy, create
19+
derivative works of, display, perform, and distribute the Software and make,
20+
use, sell, offer for sale, import, export, have made, and have sold the
21+
Software and the Larger Work(s), and to sublicense the foregoing rights on
22+
either these or other terms.
23+
24+
This license is subject to the following condition:
25+
The above copyright notice and either this complete permission notice or at
26+
a minimum a reference to the UPL must be included in all copies or
27+
substantial portions of the Software.
28+
29+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
34+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
35+
SOFTWARE.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Overview
2+
3+
Reviewed: 2026.04.16
4+
5+
This project demonstrates how to deploy a machine learning model using the ADS SDK while customizing the default generated model artifacts, which is often required in production scenarios.
6+
7+
While ADS provides a standard template for model artifacts, real-world use cases frequently require additional logic. In this notebook, we focus on modifying the generated artifacts to incorporate feature engineering directly into the deployment pipeline.
8+
9+
The main advantage of this approach is that feature engineering is executed as part of the model inference process, eliminating the need to repeat these steps each time the model is invoked.
10+
11+
Specifically, the notebook covers:
12+
1. Building a model to predict Titanic survival using a Scikit-learn pipeline
13+
2. Preparing model artifacts using the ADS SDK
14+
3. Customizing the generated artifact to include feature engineering in score.py
15+
4. Registering, deploying, and invoking the model
16+
17+
# Environment
18+
19+
Conda environment: generalml_p311_cpu_x86_64_v1
20+
Created: April 2026
21+
22+
# Prerequisites
23+
- Access to OCI Data Science
24+
- Required IAM permissions for model registration and deployment
25+
- Basic familiarity with Python, Pandas, and Scikit-learn
26+
27+
# License
28+
Copyright (c) 2026 Oracle and/or its affiliates.
29+
Licensed under the Universal Permissive License (UPL), Version 1.0.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
2+
MODEL_ARTIFACT_VERSION: '3.0'
3+
MODEL_DEPLOYMENT:
4+
INFERENCE_CONDA_ENV:
5+
INFERENCE_ENV_PATH: 'oci://pub-conda-env@<your_name_space>/conda_environments/cpu/General Machine Learning for CPUs on Python 3.11/1.0/generalml_p311_cpu_x86_64_v1'
6+
INFERENCE_ENV_SLUG: ''
7+
INFERENCE_ENV_TYPE: published
8+
INFERENCE_PYTHON_VERSION: '3.11'
9+
MODEL_PROVENANCE:
10+
PROJECT_OCID: ''
11+
TENANCY_OCID: ''
12+
TRAINING_CODE:
13+
ARTIFACT_DIRECTORY: /home/datascience/custom_ads/sklearn_artifact_dir5
14+
TRAINING_COMPARTMENT_OCID: ''
15+
TRAINING_CONDA_ENV:
16+
TRAINING_ENV_PATH: ''
17+
TRAINING_ENV_SLUG: ''
18+
TRAINING_ENV_TYPE: ''
19+
TRAINING_PYTHON_VERSION: ''
20+
TRAINING_REGION: ''
21+
TRAINING_RESOURCE_OCID: ''
22+
USER_OCID: ''
23+
VM_IMAGE_INTERNAL_ID: ''
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
# score.py 1.0 generated by ADS 2.11.19 on 20260415_092331
2+
import json
3+
import os
4+
import cloudpickle
5+
import pandas as pd
6+
import numpy as np
7+
from functools import lru_cache
8+
9+
10+
model_name = 'model.pkl'
11+
12+
13+
"""
14+
Inference script. This script is used for prediction by scoring server when schema is known.
15+
"""
16+
17+
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive engineered Titanic features on a copy of *df*.

    Adds ``family_size`` and ``is_alone`` when both ``sibsp`` and ``parch``
    columns are present, plus a ``<col>_missing`` indicator for each of
    ``age``, ``fare`` and ``embarked`` that exists in the frame.

    Parameters
    ----------
    df: raw input features; not mutated.

    Returns
    -------
    A new DataFrame with the additional engineered columns.
    """
    out = df.copy()

    # Family-derived features require both source columns.
    if {"sibsp", "parch"} <= set(out.columns):
        out["family_size"] = out["sibsp"].fillna(0) + out["parch"].fillna(0) + 1
        out["is_alone"] = (out["family_size"] == 1).astype(int)

    # Missingness indicators for columns the model may find informative.
    for name in ("age", "fare", "embarked"):
        if name in out.columns:
            out[f"{name}_missing"] = out[name].isna().astype(int)

    return out
26+
27+
28+
@lru_cache(maxsize=10)
def load_model(model_file_name=model_name):
    """
    Loads model from the serialized format.

    Parameters
    ----------
    model_file_name: artifact file name to load (defaults to the
        module-level ``model_name``). Cached per file name via lru_cache
        so the pickle is only deserialized once per process.

    Returns
    -------
    model: a model instance on which predict API can be invoked

    Raises
    ------
    Exception: if the artifact file is not present next to this script.
    """
    model_dir = os.path.dirname(os.path.realpath(__file__))
    contents = os.listdir(model_dir)
    # Guard clause: fail fast when the artifact is absent.
    if model_file_name not in contents:
        raise Exception(f'{model_file_name} is not found in model directory {model_dir}')
    print(f'Start loading {model_file_name} from model directory {model_dir} ...')
    # Reuse model_dir instead of recomputing the script directory a second time.
    with open(os.path.join(model_dir, model_file_name), "rb") as file:
        loaded_model = cloudpickle.load(file)
    print("Model is successfully loaded.")
    return loaded_model
48+
49+
@lru_cache(maxsize=1)
def fetch_data_type_from_schema(input_schema_path=os.path.join(os.path.dirname(os.path.realpath(__file__)), "input_schema.json")):
    """
    Returns data type information fetch from input_schema.json.

    Parameters
    ----------
    input_schema_path: path of input schema.

    Returns
    -------
    data_type: data type fetch from input_schema.json, mapping column
        name -> dtype string. Empty when the schema file does not exist.
    """
    data_type = {}
    if os.path.exists(input_schema_path):
        # Use a context manager so the schema file handle is closed
        # deterministically (the original left it to the garbage collector).
        with open(input_schema_path) as schema_file:
            schema = json.load(schema_file)
        for col in schema['schema']:
            data_type[col['name']] = col['dtype']
    else:
        print("input_schema has to be passed in in order to recover the same data type. pass `X_sample` in `ads.model.framework.sklearn_model.SklearnModel.prepare` function to generate the input_schema. Otherwise, the data type might be changed after serialization/deserialization.")
    return data_type
71+
72+
def deserialize(data, input_schema_path):
    """
    Reconstruct the original Python object from its JSON-serialized form.

    Parameters
    ----------
    data: payload sent to predict — raw bytes, a dict optionally carrying
        'data_type' / 'data' keys, or a plain JSON-compatible value.
    input_schema_path: path of the input schema used to recover dtypes.

    Returns
    -------
    data: the deserialized input (bytes, numpy array, Series, DataFrame,
        or the payload unchanged when no conversion applies).
    """
    import base64
    from io import BytesIO, StringIO

    # Bytes pass straight through untouched.
    if isinstance(data, bytes):
        return data

    if isinstance(data, dict):
        declared_type = data.get('data_type', '')
        payload = data.get('data', data)
    else:
        declared_type = ''
        payload = data

    if "numpy.ndarray" in declared_type:
        raw = base64.b64decode(payload.encode('utf-8'))
        return np.load(BytesIO(raw), allow_pickle=True)
    if "pandas.core.series.Series" in declared_type:
        return pd.Series(payload)
    if "pandas.core.frame.DataFrame" in declared_type or isinstance(payload, str):
        # Recover the original column dtypes from the saved schema.
        dtypes = fetch_data_type_from_schema(input_schema_path)
        return pd.read_json(StringIO(payload), dtype=dtypes)
    if isinstance(payload, dict):
        return pd.DataFrame.from_dict(payload)
    return payload
105+
106+
107+
def pre_inference(data, input_schema_path):
    """
    Preprocess the raw request payload before prediction.

    Parameters
    ----------
    data: Data format as expected by the predict API of the core estimator.
    input_schema_path: path of input schema.

    Returns
    -------
    data: Data format after any processing.
    """
    restored = deserialize(data, input_schema_path)
    return restored
122+
123+
def post_inference(yhat):
    """
    Convert model output into a JSON-serializable structure.

    Parameters
    ----------
    yhat: Data format after calling model.predict.

    Returns
    -------
    yhat: DataFrames and numpy arrays become (nested) Python lists;
        anything else passes through unchanged.
    """
    result = yhat
    if isinstance(result, pd.core.frame.DataFrame):
        result = result.values
    if isinstance(result, np.ndarray):
        result = result.tolist()
    return result
141+
142+
def predict(data, model=load_model(), input_schema_path=os.path.join(os.path.dirname(os.path.realpath(__file__)), "input_schema.json")):
    """
    Returns prediction given the model and data to predict.

    Parameters
    ----------
    model: Model instance returned by load_model API (loaded once at
        module import via the default argument).
    data: Data format as expected by the predict API of the core estimator.
        For eg. in case of sckit models it could be numpy array/List of
        list/Pandas DataFrame.
    input_schema_path: path of input schema.

    Returns
    -------
    predictions: Output from scoring server
        Format: {'prediction': output from model.predict method}
    """
    inputs = pre_inference(data, input_schema_path)
    # Apply the customized feature engineering before scoring.
    inputs = create_features(inputs)
    raw_predictions = model.predict(inputs)
    return {'prediction': post_inference(raw_predictions)}
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
from sklearn.ensemble import RandomForestClassifier
2+
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, make_scorer, f1_score
3+
from sklearn.pipeline import Pipeline
4+
from sklearn.model_selection import RandomizedSearchCV
5+
import numpy as np
6+
7+
8+
def build_model(random_state: int = 42):
    """Construct the base Random Forest estimator used for HPO.

    Parameters
    ----------
    random_state: seed for reproducible tree construction.
    """
    estimator = RandomForestClassifier(
        n_jobs=-1,  # use all available cores
        random_state=random_state,
    )
    return estimator
14+
15+
16+
def build_pipeline(preprocessor, model):
    """Chain the preprocessing transformer and the estimator into one Pipeline."""
    steps = [
        ("preprocess", preprocessor),
        ("model", model),
    ]
    return Pipeline(steps=steps)
24+
25+
26+
def optimize_hyperparameters(pipeline, X_train, y_train, random_state: int = 42):
    """
    Tune the pipeline with RandomizedSearchCV, optimizing F1 on the
    minority class (label=1), and return the best fitted pipeline.

    Parameters
    ----------
    pipeline: sklearn Pipeline whose estimator step is named "model".
    X_train, y_train: training features and labels.
    random_state: seed for reproducible sampling of the search space.

    Returns
    -------
    The best estimator found, already refit on the full training data.
    """
    # Search space over the forest's capacity / regularization knobs.
    search_space = {
        "model__n_estimators": [100, 200, 300, 500],
        "model__max_depth": [10, 15, 20, 30, None],
        "model__min_samples_split": [2, 5, 10, 20],
        "model__min_samples_leaf": [1, 2, 4, 8],
        "model__max_features": ["sqrt", "log2", 0.3, 0.5],
        "model__class_weight": [
            "balanced",
            "balanced_subsample",
            {0: 1, 1: 2},  # penalize missing class 1 twice as much
            {0: 1, 1: 3},  # penalize missing class 1 three times as much
        ],
    }

    # Optimize for F1 on minority class specifically
    minority_f1 = make_scorer(f1_score, pos_label=1)

    tuner = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=search_space,
        n_iter=30,  # number of parameter combinations to try
        scoring=minority_f1,
        cv=5,  # 5-fold cross validation
        verbose=2,
        random_state=random_state,
        n_jobs=-1,
    )
    tuner.fit(X_train, y_train)

    print(f"\nBest F1 (class 1): {tuner.best_score_:.4f}")
    print(f"Best params: {tuner.best_params_}")

    return tuner.best_estimator_
65+
66+
67+
def evaluate_model(pipeline, X_test, y_test):
    """Compute test-set evaluation artifacts for the fitted pipeline.

    Returns
    -------
    dict with keys "classification_report" (str), "confusion_matrix"
    (ndarray) and "roc_auc" (float, from the positive-class probability).
    """
    predictions = pipeline.predict(X_test)
    # Probability of the positive class (column 1) for the AUC.
    positive_scores = pipeline.predict_proba(X_test)[:, 1]

    return {
        "classification_report": classification_report(y_test, predictions),
        "confusion_matrix": confusion_matrix(y_test, predictions),
        "roc_auc": roc_auc_score(y_test, positive_scores),
    }
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import pandas as pd
2+
from sklearn.compose import ColumnTransformer
3+
from sklearn.preprocessing import OneHotEncoder
4+
from category_encoders import TargetEncoder
5+
6+
7+
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Create a few simple Titanic features; the input frame is not mutated."""
    result = df.copy()

    # family_size / is_alone need both of the family-count columns.
    if {"sibsp", "parch"}.issubset(result.columns):
        siblings = result["sibsp"].fillna(0)
        parents = result["parch"].fillna(0)
        result["family_size"] = siblings + parents + 1
        result["is_alone"] = (result["family_size"] == 1).astype(int)

    # Flag missing values in columns known to have gaps in the Titanic data.
    for column in ("age", "fare", "embarked"):
        if column in result.columns:
            result[f"{column}_missing"] = result[column].isna().astype(int)

    return result
20+
21+
22+
def split_column_types(
    X: pd.DataFrame,
    low_cardinality_threshold: int = 10,
):
    """Partition columns into numeric / low-cardinality / high-cardinality groups.

    Parameters
    ----------
    X: input feature frame.
    low_cardinality_threshold: maximum number of distinct values (NaN
        excluded) for a categorical column to count as low-cardinality.

    Returns
    -------
    (numeric_cols, low_card, high_card): three lists of column names,
    each preserving the frame's column order.
    """
    numeric_cols = list(X.select_dtypes(include=["number", "bool"]).columns)
    categorical_cols = list(X.select_dtypes(include=["object", "category"]).columns)

    low_card, high_card = [], []
    for column in categorical_cols:
        if X[column].nunique(dropna=True) <= low_cardinality_threshold:
            low_card.append(column)
        else:
            high_card.append(column)

    return numeric_cols, low_card, high_card
34+
35+
36+
def build_preprocessor(numeric_cols, low_card_cols, high_card_cols):
    """Assemble the ColumnTransformer: one-hot for low-cardinality columns,
    target encoding for high-cardinality ones, and numeric passthrough.
    Columns not listed in any group are dropped."""
    onehot = OneHotEncoder(handle_unknown="ignore")
    target = TargetEncoder(handle_unknown="value", handle_missing="value")
    transformers = [
        ("onehot", onehot, low_card_cols),
        ("target", target, high_card_cols),
        ("num", "passthrough", numeric_cols),
    ]
    return ColumnTransformer(transformers=transformers, remainder="drop")

0 commit comments

Comments
 (0)