diff --git a/.github/workflows/cml.yaml b/.github/workflows/cml.yaml
new file mode 100644
index 0000000..3620d07
--- /dev/null
+++ b/.github/workflows/cml.yaml
@@ -0,0 +1,32 @@
+# CI pipeline: on every push, train the TensorFlow linear-regression model
+# and send its metrics and results plot as a CML markdown report.
+name: mlops-linear-regression
+on: [push]
+jobs:
+  run:
+    runs-on: [ubuntu-latest]
+    container: docker://dvcorg/cml-py3:latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: 'Train my linear model'
+        env:
+          # Repository token made available to the CML commands below
+          repo_token: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          # Install dependencies and train; the script writes metrics.txt
+          # and model_results.png
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          python tensorflow_linear_regression_model.py
+
+          # Assemble the markdown report
+          echo "## Model Metrics" > report.md
+          cat metrics.txt >> report.md
+
+          # printf, not echo: whether echo expands "\n" depends on the shell
+          printf '\n## Model Performance\n' >> report.md
+          echo "Model performance metrics are on the plot below." >> report.md
+
+          cml-publish model_results.png --md >> report.md
+
+          cml-send-comment report.md
diff --git a/main.py b/main.py
deleted file mode 100644
index a46baeb..0000000
--- a/main.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality
-# P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
-# Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.
- -import os -import warnings -import sys - -import pandas as pd -import numpy as np -from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score -from sklearn.model_selection import train_test_split -from sklearn.linear_model import ElasticNet -from urllib.parse import urlparse -import mlflow -import mlflow.sklearn - -import logging - -logging.basicConfig(level=logging.WARN) -logger = logging.getLogger(__name__) - - -def eval_metrics(actual, pred): - rmse = np.sqrt(mean_squared_error(actual, pred)) - mae = mean_absolute_error(actual, pred) - r2 = r2_score(actual, pred) - return rmse, mae, r2 - - -if __name__ == "__main__": - warnings.filterwarnings("ignore") - np.random.seed(40) - - # Read the wine-quality csv file from the URL - csv_url = ( - "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" - ) - try: - data = pd.read_csv(csv_url, sep=";") - except Exception as e: - logger.exception( - "Unable to download training & test CSV, check your internet connection. Error: %s", e - ) - - # Split the data into training and test sets. (0.75, 0.25) split. 
- train, test = train_test_split(data) - - # The predicted column is "quality" which is a scalar from [3, 9] - train_x = train.drop(["quality"], axis=1) - test_x = test.drop(["quality"], axis=1) - train_y = train[["quality"]] - test_y = test[["quality"]] - - alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5 - l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5 - - with mlflow.start_run(): - lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42) - lr.fit(train_x, train_y) - - predicted_qualities = lr.predict(test_x) - - (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities) - - print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio)) - print(" RMSE: %s" % rmse) - print(" MAE: %s" % mae) - print(" R2: %s" % r2) - - mlflow.log_param("alpha", alpha) - mlflow.log_param("l1_ratio", l1_ratio) - mlflow.log_metric("rmse", rmse) - mlflow.log_metric("r2", r2) - mlflow.log_metric("mae", mae) - - tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme - - # Model registry does not work with file store - if tracking_url_type_store != "file": - - # Register the model - # There are other ways to use the Model Registry, which depends on the use case, - # please refer to the doc for more information: - # https://mlflow.org/docs/latest/model-registry.html#api-workflow - mlflow.sklearn.log_model(lr, "model", registered_model_name="ElasticnetWineModel") - else: - mlflow.sklearn.log_model(lr, "model") diff --git a/requirements.txt b/requirements.txt index d8baf06..6bb6c73 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -mlflow -cloudpickle==2.2.0 -scikit-learn==1.0.2 +tensorflow +numpy +matplotlib diff --git a/tensorflow_linear_regression_model.py b/tensorflow_linear_regression_model.py new file mode 100644 index 0000000..31ef863 --- /dev/null +++ b/tensorflow_linear_regression_model.py @@ -0,0 +1,102 @@ +# Import modules and packages +import tensorflow as tf +import numpy as np +import matplotlib.pyplot as 
plt
+
+
+# Functions and procedures
+def plot_predictions(train_data, train_labels, test_data, test_labels, predictions):
+    """
+    Save a scatter plot of the training data, the test data and the model's
+    predictions for the test inputs to 'model_results.png'.
+    """
+    plt.figure(figsize=(6, 5))
+    # Plot training data in blue
+    plt.scatter(train_data, train_labels, c="b", label="Training data")
+    # Plot test data in green
+    plt.scatter(test_data, test_labels, c="g", label="Testing data")
+    # Plot the predictions in red (predictions were made on the test data)
+    plt.scatter(test_data, predictions, c="r", label="Predictions")
+    # Show the legend (shadow expects a bool, not the string 'True')
+    plt.legend(shadow=True)
+    # Set grids
+    plt.grid(which='major', c='#cccccc', linestyle='--', alpha=0.5)
+    # Title and axis labels
+    plt.title('Model Results', family='Arial', fontsize=14)
+    plt.xlabel('X axis values', family='Arial', fontsize=11)
+    plt.ylabel('Y axis values', family='Arial', fontsize=11)
+    # Write the figure to disk (picked up by the CI report) instead of showing it
+    plt.savefig('model_results.png', dpi=120)
+
+
+def mae(y_test, y_pred):
+    """
+    Calculates mean absolute error between y_test and y_pred.
+    """
+    return tf.metrics.mean_absolute_error(y_test, y_pred)
+
+
+def mse(y_test, y_pred):
+    """
+    Calculates mean squared error between y_test and y_pred.
+    """
+    return tf.metrics.mean_squared_error(y_test, y_pred)
+
+
+# Check Tensorflow version
+print(tf.__version__)
+
+
+# Create features
+X = np.arange(-100, 100, 4)
+
+# Create labels
+y = np.arange(-90, 110, 4)
+
+
+# Split data into train and test sets
+X_train = X[:40]  # first 40 examples (80% of data)
+y_train = y[:40]
+
+X_test = X[40:]  # last 10 examples (20% of data)
+y_test = y[40:]
+
+
+# Dense layers need input of shape (batch, features). Since TensorFlow 2.7,
+# fit()/predict() no longer turn a (batch,) array into (batch, 1) for a
+# Sequential model built without an input shape, so add the axis explicitly.
+X_train_2d = X_train.reshape(-1, 1)
+X_test_2d = X_test.reshape(-1, 1)
+
+
+# Set random seed
+tf.random.set_seed(42)
+
+# Create a model using the Sequential API
+model = tf.keras.Sequential([
+    tf.keras.layers.Dense(1),
+    tf.keras.layers.Dense(1)
+    ])
+
+# Compile the model
+model.compile(loss = tf.keras.losses.mae,
+              optimizer = tf.keras.optimizers.SGD(),
+              metrics = ['mae'])
+
+# Fit the model
+model.fit(X_train_2d, y_train, epochs=100)
+
+
+# Make and plot predictions for model_1
+y_preds = model.predict(X_test_2d)
+plot_predictions(train_data=X_train, train_labels=y_train, test_data=X_test, test_labels=y_test, predictions=y_preds)
+
+
+# Calculate model_1 metrics
+mae_1 = np.round(float(mae(y_test, y_preds.squeeze()).numpy()), 2)
+mse_1 = np.round(float(mse(y_test, y_preds.squeeze()).numpy()), 2)
+print(f'\nMean Absolute Error = {mae_1}, Mean Squared Error = {mse_1}.')
+
+# Write metrics to file
+with open('metrics.txt', 'w') as outfile:
+    outfile.write(f'\nMean Absolute Error = {mae_1}, Mean Squared Error = {mse_1}.')