Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
Copyright (c) 2026 Oracle and/or its affiliates.

The Universal Permissive License (UPL), Version 1.0

Subject to the condition set forth below, permission is hereby granted to any
person obtaining a copy of this software, associated documentation and/or data
(collectively the "Software"), free of charge and under any and all copyright
rights in the Software, and any and all patent rights owned or freely
licensable by each licensor hereunder covering either (i) the unmodified
Software as contributed to or provided by such licensor, or (ii) the Larger
Works (as defined below), to deal in both

(a) the Software, and
(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
one is included with the Software (each a "Larger Work" to which the Software
is contributed by such licensors),

without restriction, including without limitation the rights to copy, create
derivative works of, display, perform, and distribute the Software and make,
use, sell, offer for sale, import, export, have made, and have sold the
Software and the Larger Work(s), and to sublicense the foregoing rights on
either these or other terms.

This license is subject to the following condition:
The above copyright notice and either this complete permission notice or at
a minimum a reference to the UPL must be included in all copies or
substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Overview

Reviewed: 2026.04.16

This project demonstrates how to deploy a machine learning model using the ADS SDK while customizing the default generated model artifacts, which is often required in production scenarios.

While ADS provides a standard template for model artifacts, real-world use cases frequently require additional logic. In this notebook, we focus on modifying the generated artifacts to incorporate feature engineering directly into the deployment pipeline.

The main advantage of this approach is that feature engineering is executed automatically as part of the model inference process, so callers do not need to apply these steps themselves before each invocation.

Specifically, the notebook covers:
1. Building a model to predict Titanic survival using a Scikit-learn pipeline
2. Preparing model artifacts using the ADS SDK
3. Customizing the generated artifact to include feature engineering in score.py
4. Registering, deploying, and invoking the model

# Environment

Conda environment: generalml_p311_cpu_x86_64_v1
Created: April 2026

# Prerequisites
- Access to OCI Data Science
- Required IAM permissions for model registration and deployment
- Basic familiarity with Python, Pandas, and Scikit-learn

# License
Copyright (c) 2026 Oracle and/or its affiliates.
Licensed under the Universal Permissive License (UPL), Version 1.0.
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@

MODEL_ARTIFACT_VERSION: '3.0'
MODEL_DEPLOYMENT:
INFERENCE_CONDA_ENV:
INFERENCE_ENV_PATH: 'oci://pub-conda-env@<your_name_space>/conda_environments/cpu/General Machine Learning for CPUs on Python 3.11/1.0/generalml_p311_cpu_x86_64_v1'
INFERENCE_ENV_SLUG: ''
INFERENCE_ENV_TYPE: published
INFERENCE_PYTHON_VERSION: '3.11'
MODEL_PROVENANCE:
PROJECT_OCID: ''
TENANCY_OCID: ''
TRAINING_CODE:
ARTIFACT_DIRECTORY: /home/datascience/custom_ads/sklearn_artifact_dir5
TRAINING_COMPARTMENT_OCID: ''
TRAINING_CONDA_ENV:
TRAINING_ENV_PATH: ''
TRAINING_ENV_SLUG: ''
TRAINING_ENV_TYPE: ''
TRAINING_PYTHON_VERSION: ''
TRAINING_REGION: ''
TRAINING_RESOURCE_OCID: ''
USER_OCID: ''
VM_IMAGE_INTERNAL_ID: ''
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
# score.py 1.0 generated by ADS 2.11.19 on 20260415_092331
import json
import os
import cloudpickle
import pandas as pd
import numpy as np
from functools import lru_cache


# Name of the serialized model file expected inside the model artifact directory.
model_name = 'model.pkl'


"""
Inference script. This script is used for prediction by scoring server when schema is known.
"""

def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Re-apply the training-time Titanic feature engineering.

    Adds ``family_size``/``is_alone`` when both ``sibsp`` and ``parch`` are
    present, plus a ``<col>_missing`` indicator for ``age``, ``fare`` and
    ``embarked``. The input frame is not mutated; an augmented copy is returned.
    """
    out = df.copy()

    if {"sibsp", "parch"}.issubset(out.columns):
        out["family_size"] = out["sibsp"].fillna(0) + out["parch"].fillna(0) + 1
        out["is_alone"] = (out["family_size"] == 1).astype(int)

    for column in ("age", "fare", "embarked"):
        if column in out.columns:
            out[f"{column}_missing"] = out[column].isna().astype(int)

    return out


@lru_cache(maxsize=10)
def load_model(model_file_name=model_name):
    """
    Loads the serialized model from the model artifact directory.

    The result is cached per file name by ``lru_cache``, so the pickle is
    only deserialized once per process.

    Parameters
    ----------
    model_file_name: file name of the serialized model (defaults to ``model_name``).

    Returns
    -------
    model: a model instance on which predict API can be invoked

    Raises
    ------
    FileNotFoundError: if the model file is not present in the artifact directory.
    """
    model_dir = os.path.dirname(os.path.realpath(__file__))
    model_path = os.path.join(model_dir, model_file_name)
    # Check the target file directly instead of listing the whole directory.
    if not os.path.isfile(model_path):
        raise FileNotFoundError(f'{model_file_name} is not found in model directory {model_dir}')
    print(f'Start loading {model_file_name} from model directory {model_dir} ...')
    with open(model_path, "rb") as file:
        loaded_model = cloudpickle.load(file)
    print("Model is successfully loaded.")
    return loaded_model

@lru_cache(maxsize=1)
def fetch_data_type_from_schema(input_schema_path=os.path.join(os.path.dirname(os.path.realpath(__file__)), "input_schema.json")):
    """
    Returns data type information fetched from input_schema.json.

    Parameters
    ----------
    input_schema_path: path of input schema.

    Returns
    -------
    data_type: dict mapping column name -> dtype string; empty when the
        schema file does not exist.

    """
    data_type = {}
    if os.path.exists(input_schema_path):
        # Use a context manager so the schema file handle is always closed
        # (the previous json.load(open(...)) leaked the file descriptor).
        with open(input_schema_path) as schema_file:
            schema = json.load(schema_file)
        for col in schema['schema']:
            data_type[col['name']] = col['dtype']
    else:
        print("input_schema has to be passed in in order to recover the same data type. pass `X_sample` in `ads.model.framework.sklearn_model.SklearnModel.prepare` function to generate the input_schema. Otherwise, the data type might be changed after serialization/deserialization.")
    return data_type

def deserialize(data, input_schema_path):
    """
    Deserialize JSON-serialized payloads back to the type they had before
    being sent to predict (numpy array, pandas Series/DataFrame, or plain data).

    Parameters
    ----------
    data: serialized input data.
    input_schema_path: path of input schema.

    Returns
    -------
    data: deserialized input data.

    """

    import base64
    from io import BytesIO, StringIO  # StringIO feeds pd.read_json a file-like object

    # Raw bytes pass through untouched.
    if isinstance(data, bytes):
        return data

    if isinstance(data, dict):
        declared_type = data.get('data_type', '')
        payload = data.get('data', data)
    else:
        declared_type = ''
        payload = data

    if "numpy.ndarray" in declared_type:
        # ndarray payloads arrive as a base64-encoded .npy byte stream.
        buffer = BytesIO(base64.b64decode(payload.encode('utf-8')))
        return np.load(buffer, allow_pickle=True)
    if "pandas.core.series.Series" in declared_type:
        return pd.Series(payload)
    if "pandas.core.frame.DataFrame" in declared_type or isinstance(payload, str):
        # Recover the original column dtypes from the recorded input schema.
        return pd.read_json(StringIO(payload), dtype=fetch_data_type_from_schema(input_schema_path))
    if isinstance(payload, dict):
        return pd.DataFrame.from_dict(payload)
    return payload


def pre_inference(data, input_schema_path):
    """
    Preprocess raw request data into the format the estimator expects.

    Parameters
    ----------
    data: Data format as expected by the predict API of the core estimator.
    input_schema_path: path of input schema.

    Returns
    -------
    data: Data format after any processing.

    """
    # All request preprocessing currently consists of deserialization only.
    processed = deserialize(data, input_schema_path)
    return processed

def post_inference(yhat):
    """
    Normalize model output into JSON-serializable Python lists.

    Parameters
    ----------
    yhat: Data format after calling model.predict.

    Returns
    -------
    yhat: Data format after any processing.

    """
    result = yhat
    # Collapse a DataFrame to its underlying ndarray first, then any
    # ndarray (including the one just produced) to nested lists.
    if isinstance(result, pd.core.frame.DataFrame):
        result = result.values
    if isinstance(result, np.ndarray):
        result = result.tolist()
    return result

def predict(data, model=load_model(), input_schema_path=os.path.join(os.path.dirname(os.path.realpath(__file__)), "input_schema.json")):
    """
    Returns prediction given the model and data to predict

    Parameters
    ----------
    model: Model instance returned by load_model API.
    data: Data format as expected by the predict API of the core estimator. For eg. in case of sckit models it could be numpy array/List of list/Pandas DataFrame.
    input_schema_path: path of input schema.

    Returns
    -------
    predictions: Output from scoring server
        Format: {'prediction': output from model.predict method}

    """
    # Deserialize the payload, then re-apply the training-time feature
    # engineering so callers may send raw (unengineered) columns.
    frame = pre_inference(data, input_schema_path)
    frame = create_features(frame)
    raw_predictions = model.predict(frame)
    return {'prediction': post_inference(raw_predictions)}
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, make_scorer, f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
import numpy as np


def build_model(random_state: int = 42):
    """Create the base Random Forest estimator for hyperparameter search."""
    classifier = RandomForestClassifier(
        n_jobs=-1,  # use all available cores
        random_state=random_state,
    )
    return classifier


def build_pipeline(preprocessor, model):
    """Wire the preprocessing transformer and the estimator into one sklearn Pipeline."""
    steps = [
        ("preprocess", preprocessor),
        ("model", model),
    ]
    return Pipeline(steps=steps)


def optimize_hyperparameters(pipeline, X_train, y_train, random_state: int = 42):
    """
    Tune the Random Forest inside *pipeline* with RandomizedSearchCV,
    optimizing F1 on the minority class (label=1).
    Returns the best fitted pipeline.
    """
    search_space = {
        "model__n_estimators": [100, 200, 300, 500],
        "model__max_depth": [10, 15, 20, 30, None],
        "model__min_samples_split": [2, 5, 10, 20],
        "model__min_samples_leaf": [1, 2, 4, 8],
        "model__max_features": ["sqrt", "log2", 0.3, 0.5],
        "model__class_weight": [
            "balanced",
            "balanced_subsample",
            {0: 1, 1: 2},  # penalize missing class 1 twice as much
            {0: 1, 1: 3},  # penalize missing class 1 three times as much
        ],
    }

    # Score on F1 restricted to the positive (minority) class.
    minority_f1 = make_scorer(f1_score, pos_label=1)

    searcher = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=search_space,
        n_iter=30,            # number of parameter combinations to try
        scoring=minority_f1,
        cv=5,                 # 5-fold cross validation
        verbose=2,
        random_state=random_state,
        n_jobs=-1,
    )
    searcher.fit(X_train, y_train)

    print(f"\nBest F1 (class 1): {searcher.best_score_:.4f}")
    print(f"Best params: {searcher.best_params_}")

    return searcher.best_estimator_


def evaluate_model(pipeline, X_test, y_test):
    """Compute classification report, confusion matrix and ROC-AUC on test data."""
    predictions = pipeline.predict(X_test)
    # Probability of the positive class, needed for ROC-AUC.
    positive_probs = pipeline.predict_proba(X_test)[:, 1]

    return {
        "classification_report": classification_report(y_test, predictions),
        "confusion_matrix": confusion_matrix(y_test, predictions),
        "roc_auc": roc_auc_score(y_test, positive_probs),
    }
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder


def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive a few simple Titanic demo features without mutating the input frame."""
    engineered = df.copy()

    if "sibsp" in engineered.columns and "parch" in engineered.columns:
        engineered["family_size"] = (
            engineered["sibsp"].fillna(0) + engineered["parch"].fillna(0) + 1
        )
        engineered["is_alone"] = (engineered["family_size"] == 1).astype(int)

    # Flag missingness so the model can learn from absent values.
    for col in ["age", "fare", "embarked"]:
        if col in engineered.columns:
            engineered[f"{col}_missing"] = engineered[col].isna().astype(int)

    return engineered


def split_column_types(
    X: pd.DataFrame,
    low_cardinality_threshold: int = 10,
):
    """Partition columns into numeric / low-cardinality / high-cardinality categorical."""
    numeric_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
    categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

    low_card = []
    high_card = []
    for col in categorical_cols:
        # NaNs are excluded from the cardinality count (dropna=True).
        if X[col].nunique(dropna=True) <= low_cardinality_threshold:
            low_card.append(col)
        else:
            high_card.append(col)

    return numeric_cols, low_card, high_card


def build_preprocessor(numeric_cols, low_card_cols, high_card_cols):
    """ColumnTransformer: one-hot low-card, target-encode high-card, pass numerics through."""
    onehot = OneHotEncoder(handle_unknown="ignore")
    target = TargetEncoder(handle_unknown="value", handle_missing="value")
    return ColumnTransformer(
        transformers=[
            ("onehot", onehot, low_card_cols),
            ("target", target, high_card_cols),
            ("num", "passthrough", numeric_cols),
        ],
        remainder="drop",
    )
Loading