diff --git a/NAMESPACE b/NAMESPACE index e2bcfc5..f1ae3b0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,8 @@ # Generated by roxygen2: do not edit by hand +S3method(modelStudio,dalex._explainer.object.Explainer) S3method(modelStudio,explainer) +S3method(modelStudio,python.builtin.object) export(modelStudio) export(modelStudioOptions) import(progress) diff --git a/R/modelStudio.R b/R/modelStudio.R index 592f6b9..168736f 100644 --- a/R/modelStudio.R +++ b/R/modelStudio.R @@ -384,6 +384,15 @@ modelStudio.explainer <- function(explainer, model_studio } +#:# alias for reticulate pickle/dalex Explainer +#' @noRd +#' @export +modelStudio.python.builtin.object <- modelStudio.explainer + +#' @noRd +#' @export +modelStudio.dalex._explainer.object.Explainer <- modelStudio.explainer + #' @noRd #' @title remove_file_paths #' diff --git a/README.md b/README.md index 6a9a264..b4718e2 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ The main `modelStudio()` function computes various (instance and dataset level) [**explain FIFA20**](https://pbiecek.github.io/explainFIFA20/)   [explain Lung Cancer](https://github.com/hbaniecki/transparent_xai/)   [**R & Python examples**](http://modelstudio.drwhy.ai/articles/vignette_examples.html)   -[More Resources](https://modeloriented.github.io/modelStudio/#more)   +[More Resources](http://modelstudio.drwhy.ai/#more-resources)   [**FAQ & Troubleshooting**](https://github.com/ModelOriented/modelStudio/issues/54) ![](man/figures/short.gif) @@ -73,7 +73,7 @@ install.packages("iBreakDown") # packages for explainer objects install.packages("DALEX") -devtools::install_github("ModelOriented/DALEXtra") +install.packages("DALEXtra") ``` ### mlr [dashboard](https://modeloriented.github.io/modelStudio/mlr.html) @@ -94,7 +94,7 @@ test <- data[-index, ] # mlr ClassifTask takes target as factor train$survived <- as.factor(train$survived) -# prepare the model +# fit a model task <- makeClassifTask(id = "titanic", data = train, target = "survived") @@ -137,7 +137,7 @@ test <- data[-index, ] train_matrix <- model.matrix(survived ~.-1, train) test_matrix <- model.matrix(survived ~.-1, test) -# prepare the model +# fit a model xgb_matrix <- xgb.DMatrix(train_matrix, label = train$survived) params <- list(eta = 0.01, subsample = 0.6, max_depth = 7, min_child_weight = 3, objective = "binary:logistic", eval_metric = "auc") @@ -161,96 +161,88 @@ modelStudio(explainer, ### scikit-learn [dashboard](https://modeloriented.github.io/modelStudio/scikit-learn.html) -Use `pickle` Python module and `reticulate` R package to easily produce modelStudio for scikit-learn model. +Use `pickle` Python module and `reticulate` R package to easily make a studio for a scikit-learn model. -In this example we fit a Pipeline MLPClassifier on the titanic data. First install the `dalex` package. +In this example we will fit a Pipeline MLPClassifier model on titanic data. + +Install the `dalex` package. ```bash pip3 install dalex --force ``` -Make an explainer object in Python: +First, use `dalex` in Python: ```python -# import modules +# load packages and data import dalex as dx -from dalex import datasets -from sklearn.neural_network import MLPClassifier -from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder -from sklearn.impute import SimpleImputer +from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline -from sklearn.tree import DecisionTreeRegressor +from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.impute import SimpleImputer from sklearn.compose import ColumnTransformer +from sklearn.neural_network import MLPClassifier -# load the data -data = datasets.load_titanic() +data = dx.datasets.load_titanic() X = data.drop(columns='survived') y = data.survived -# make a pipeline model +# split the data +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1) + +# fit a pipeline model numeric_features = ['age', 'fare', 'sibsp', 'parch'] -numeric_transformer = Pipeline(steps=[ +numeric_transformer = Pipeline( + steps=[ ('imputer', SimpleImputer(strategy='median')), - ('scaler', StandardScaler())]) - + ('scaler', StandardScaler()) + ] +) categorical_features = ['gender', 'class', 'embarked'] -categorical_transformer = Pipeline(steps=[ +categorical_transformer = Pipeline( + steps=[ ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), - ('onehot', OneHotEncoder(handle_unknown='ignore'))]) + ('onehot', OneHotEncoder(handle_unknown='ignore')) + ] +) preprocessor = ColumnTransformer( - transformers=[ - ('num', numeric_transformer, numeric_features), - ('cat', categorical_transformer, categorical_features)]) - - -clf = Pipeline(steps=[('preprocessor', preprocessor), - ('classifier', MLPClassifier(hidden_layer_sizes=(150,100,50), - max_iter=500, random_state=0))]) - -clf.fit(X, y) + transformers=[ + ('num', numeric_transformer, numeric_features), + ('cat', categorical_transformer, categorical_features) + ] +) + +model = Pipeline( + steps=[ + ('preprocessor', preprocessor), + ('classifier', MLPClassifier(hidden_layer_sizes=(150,100,50), max_iter=500, random_state=0)) + ] +) +model.fit(X_train, y_train) -# make an explainer -explainer = dx.Explainer(clf, X, y) +# create an explainer for the model +explainer = dx.Explainer(model, X_test, y_test, label = 'scikit-learn') -# remove these functions before dump +#! remove residual_function before dump ! explainer.residual_function = None -explainer.predict_function = None # pack the explainer into a pickle file -import pickle -pickle_out = open("explainer_titanic.pickle","wb") +import pickle +pickle_out = open("explainer_scikitlearn.pickle","wb") pickle.dump(explainer, pickle_out) -pickle_out.close() +pickle_out.close() ``` -Then use `modelStudio` in R: +Then, use `modelStudio` in R: ```r -# use reticulate to load the explainer from a pickle file +# load the explainer from the pickle file library(reticulate) -explainer <- py_load_object('explainer_titanic.pickle') - -# make a predict_function -predict_function <- function(model, data) { - if ("predict_proba" %in% names(model)) { - pred <- model$predict_proba(data) - if (ncol(pred) == 2) { - pred <- pred[,2] - } - } else { - pred <- model$predict(data) - } - pred -} - -# adjust the explainer -explainer$predict_function <- predict_function -explainer$label <- 'scikit-learn' -class(explainer) <- c(class(explainer), 'explainer') - -# make a modelStudio +explainer <- py_load_object('explainer_scikitlearn.pickle', pickle = "pickle") + +# make a studio for the model library(modelStudio) modelStudio(explainer) ``` diff --git a/inst/WORDLIST b/inst/WORDLIST index 1f89891..3401999 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -28,6 +28,7 @@ tensorflow Shapley cran CRAN -MLPCLassifier +MLPClassifier keras lightGBM +customizable diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index afdd162..c8e041b 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -3,4 +3,5 @@ template: default_assets: false params: ganalytics: UA-5650686-14 - noindex: true \ No newline at end of file + noindex: true + \ No newline at end of file diff --git a/pkgdown/favicon/h2o.html b/pkgdown/favicon/h2o.html new file mode 100644 index 0000000..396a5d2 --- /dev/null +++ b/pkgdown/favicon/h2o.html @@ -0,0 +1,1795 @@ + + + + + + + + + + + + + +
+
+
+ + + + diff --git a/pkgdown/favicon/scikit-learn.html b/pkgdown/favicon/scikit-learn.html index c87d20a..6873080 100644 --- a/pkgdown/favicon/scikit-learn.html +++ b/pkgdown/favicon/scikit-learn.html @@ -1787,9 +1787,9 @@
-
+
- - + + diff --git a/vignettes/vignette_examples.Rmd b/vignettes/vignette_examples.Rmd index 400cd4b..9b42e54 100644 --- a/vignettes/vignette_examples.Rmd +++ b/vignettes/vignette_examples.Rmd @@ -53,7 +53,7 @@ test <- data[-index, ] # mlr ClassifTask takes target as factor train$survived <- as.factor(train$survived) -# prepare the model +# fit a model task <- makeClassifTask(id = "titanic", data = train, target = "survived") @@ -97,7 +97,7 @@ test <- data[-index, ] # mlr3 TaskClassif takes target as factor train$survived <- as.factor(train$survived) -# prepare the model +# fit a model task <- TaskClassif$new(id = "titanic", backend = train, target = "survived") @@ -140,7 +140,7 @@ test <- data[-index, ] train_matrix <- model.matrix(survived ~.-1, train) test_matrix <- model.matrix(survived ~.-1, test) -# prepare the model +# fit a model xgb_matrix <- xgb.DMatrix(train_matrix, label = train$survived) params <- list(eta = 0.01, subsample = 0.6, max_depth = 7, min_child_weight = 3, objective = "binary:logistic", eval_metric = "auc") @@ -180,7 +180,7 @@ test <- data[-index, ] # caret train takes target as factor train$survived <- as.factor(train$survived) -# prepare the model +# fit a model cv <- trainControl(method = "repeatedcv", number = 3, repeats = 10) @@ -205,111 +205,142 @@ modelStudio(explainer, new_observation) ``` -### h2o - -TBD +### h2o [dashboard](https://modeloriented.github.io/modelStudio/h2o.html) ```{r eval = FALSE} +# load packages and data +library(h2o) library(DALEXtra) +library(modelStudio) + +data <- DALEX::titanic_imputed + +# init h2o +h2o::h2o.init() + +# split the data +h2o_split <- h2o.splitFrame(as.h2o(data)) +train <- h2o_split[[1]] +test <- as.data.frame(h2o_split[[2]]) + +# h2o automl takes target as factor +train$survived <- as.factor(train$survived) + +# fit a model +automl <- h2o.automl(y = "survived", + training_frame = train, + max_runtime_secs = 30) +model <- automl@leader -explain_h2o() +# stop h2o progress printing +h2o.no_progress() + +# create an explainer for the model +explainer <- explain_h2o(model, + data = test, + y = test$survived, + label = "h2o") + +# pick observations +new_observation <- test[1:2, ] +rownames(new_observation) <- c("id1", "id2") + +# make a studio for the model +modelStudio(explainer, + new_observation, + B = 5) +# shutdown h2o +h2o::h2o.shutdown(prompt = FALSE) ``` -## Python -### scikit-learn [dashboard](https://modeloriented.github.io/modelStudio/scikit-learn.html) +## Python -Use `pickle` Python module and `reticulate` R package to easily produce modelStudio for scikit-learn model. +Use `pickle` Python module and `reticulate` R package to easily make a studio for a model. -In this example we fit a Pipeline MLPClassifier on the titanic data. First install the `dalex` package. +Install the `dalex` package. ```{bash, eval=FALSE, engine="sh"} pip3 install dalex --force ``` -Make an explainer object in Python: +### scikit-learn [dashboard](https://modeloriented.github.io/modelStudio/scikit-learn.html) + +In this example we will fit a Pipeline MLPClassifier model on titanic data. + +First, use `dalex` in Python: ```{r eval = FALSE} -# import modules +# load packages and data import dalex as dx -from dalex import datasets -from sklearn.neural_network import MLPClassifier -from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder -from sklearn.impute import SimpleImputer +from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline -from sklearn.tree import DecisionTreeRegressor +from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.impute import SimpleImputer from sklearn.compose import ColumnTransformer +from sklearn.neural_network import MLPClassifier -# load the data -data = datasets.load_titanic() +data = dx.datasets.load_titanic() X = data.drop(columns='survived') y = data.survived -# make a pipeline model +# split the data +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1) + +# fit a pipeline model numeric_features = ['age', 'fare', 'sibsp', 'parch'] -numeric_transformer = Pipeline(steps=[ +numeric_transformer = Pipeline( + steps=[ ('imputer', SimpleImputer(strategy='median')), - ('scaler', StandardScaler())]) - + ('scaler', StandardScaler()) + ] +) categorical_features = ['gender', 'class', 'embarked'] -categorical_transformer = Pipeline(steps=[ +categorical_transformer = Pipeline( + steps=[ ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), - ('onehot', OneHotEncoder(handle_unknown='ignore'))]) + ('onehot', OneHotEncoder(handle_unknown='ignore')) + ] +) preprocessor = ColumnTransformer( - transformers=[ - ('num', numeric_transformer, numeric_features), - ('cat', categorical_transformer, categorical_features)]) - - -clf = Pipeline(steps=[('preprocessor', preprocessor), - ('classifier', MLPClassifier(hidden_layer_sizes=(150,100,50), - max_iter=500, random_state=0))]) + transformers=[ + ('num', numeric_transformer, numeric_features), + ('cat', categorical_transformer, categorical_features) + ] +) -clf.fit(X, y) +model = Pipeline( + steps=[ + ('preprocessor', preprocessor), + ('classifier', MLPClassifier(hidden_layer_sizes=(150,100,50), max_iter=500, random_state=0)) + ] +) +model.fit(X_train, y_train) -# make an explainer -explainer = dx.Explainer(clf, X, y) +# create an explainer for the model +explainer = dx.Explainer(model, X_test, y_test, label = 'scikit-learn') -# remove these functions before dump +#! remove residual_function before dump ! explainer.residual_function = None -explainer.predict_function = None # pack the explainer into a pickle file -import pickle -pickle_out = open("explainer_titanic.pickle","wb") +import pickle +pickle_out = open("explainer_scikitlearn.pickle","wb") pickle.dump(explainer, pickle_out) -pickle_out.close() +pickle_out.close() ``` -Then use modelStudio in R: +Then, use `modelStudio` in R: ```{r eval = FALSE} -# use reticulate to load the explainer from a pickle file +# load the explainer from the pickle file library(reticulate) -explainer <- py_load_object('explainer_titanic.pickle') - -# make a predict_function -predict_function <- function(model, data) { - if ("predict_proba" %in% names(model)) { - pred <- model$predict_proba(data) - if (ncol(pred) == 2) { - pred <- pred[,2] - } - } else { - pred <- model$predict(data) - } - pred -} - -# adjust the explainer -explainer$predict_function <- predict_function -explainer$label <- 'scikit-learn' -class(explainer) <- c(class(explainer), 'explainer') - -# make a modelStudio +explainer <- py_load_object('explainer_scikitlearn.pickle', pickle = "pickle") + +# make a studio for the model library(modelStudio) modelStudio(explainer) ``` diff --git a/vignettes/vignette_modelStudio.Rmd b/vignettes/vignette_modelStudio.Rmd index 5aa0ba7..d4d5fe5 100644 --- a/vignettes/vignette_modelStudio.Rmd +++ b/vignettes/vignette_modelStudio.Rmd @@ -18,7 +18,7 @@ knitr::opts_chunk$set( ) ``` -`modelStudio::modelStudio` computes various (instance and dataset level) model explanations and produces an interactive, customisable dashboard made with D3.js. It consists of multiple panels for plots with their short descriptions. Easily save and share the dashboard with others. Tools for model exploration unite with tools for EDA (Exploratory Data Analysis) to give a broad overview of the model behavior. +`modelStudio::modelStudio` computes various (instance and dataset level) model explanations and produces an interactive, customizable dashboard made with D3.js. It consists of multiple panels for plots with their short descriptions. Easily save and share the dashboard with others. Tools for model exploration unite with tools for EDA (Exploratory Data Analysis) to give a broad overview of the model behavior. Let's use `DALEX::HR` dataset to explore `modelStudio` parameters: