diff --git a/dianna/methods/kernelshap_tabular.py b/dianna/methods/kernelshap_tabular.py new file mode 100644 index 00000000..d2bf47e3 --- /dev/null +++ b/dianna/methods/kernelshap_tabular.py @@ -0,0 +1,84 @@ +from typing import List +from typing import Optional +from typing import Union +import numpy as np +import shap +from shap import KernelExplainer +from dianna import utils + + +class KERNELSHAPTabular: + """Wrapper around the SHAP Kernel explainer for tabular data.""" + + def __init__( + self, + training_data: np.array, + mode: str = "classification", + feature_names: List[int] = None, + training_data_kmeans: Optional[int] = None, + ) -> None: + """Initializer of KERNELSHAPTabular. + + Training data must be provided for the explainer to estimate the expected + values. + + More information can be found in the API guide: + https://github.com/shap/shap/blob/master/shap/explainers/_kernel.py + + Arguments: + training_data (np.array): training data, which should be numpy 2d array + mode (str, optional): "classification" or "regression" + feature_names (list(str), optional): list of names corresponding to the columns + in the training data. + training_data_kmeans(int, optional): summarize the whole training set with + weighted kmeans + """ + if training_data_kmeans: + self.training_data = shap.kmeans(training_data, training_data_kmeans) + else: + self.training_data = training_data + self.feature_names = feature_names + self.mode = mode + self.explainer: KernelExplainer + + def explain( + self, + model_or_function: Union[str, callable], + input_tabular: np.array, + link: str = "identity", + **kwargs, + ) -> np.array: + """Run the KernelSHAP explainer. + + Args: + model_or_function (callable or str): The function that runs the model to be explained + or the path to a ONNX model on disk. + input_tabular (np.ndarray): Data to be explained. + link (str): A generalized linear model link to connect the feature importance values + to the model. 
Must be either "identity" or "logit". + kwargs: These parameters are passed on + + Other keyword arguments: see the documentation for KernelExplainer: + https://github.com/shap/shap/blob/master/shap/explainers/_kernel.py + + Returns: + explanation: An Explanation object containing the KernelExplainer explanations + for each class. + """ + init_instance_kwargs = utils.get_kwargs_applicable_to_function( + KernelExplainer, kwargs + ) + self.explainer = KernelExplainer( + model_or_function, self.training_data, link, **init_instance_kwargs + ) + + explain_instance_kwargs = utils.get_kwargs_applicable_to_function( + self.explainer.shap_values, kwargs + ) + + saliency = self.explainer.shap_values(input_tabular, **explain_instance_kwargs) + + if self.mode == 'regression': + return saliency[0] + + return saliency diff --git a/dianna/methods/lime_tabular.py b/dianna/methods/lime_tabular.py index d72bbc22..59fe5c40 100644 --- a/dianna/methods/lime_tabular.py +++ b/dianna/methods/lime_tabular.py @@ -119,11 +119,11 @@ def explain( **explain_instance_kwargs, ) - if self.mode == "regression": + if self.mode == 'regression': local_exp = sorted(explanation.local_exp[1]) saliency = [i[1] for i in local_exp] - elif self.mode == "classification": + elif self.mode == 'classification': # extract scores from lime explainer saliency = [] for i in range(self.top_labels): diff --git a/tests/methods/test_shap_tabular.py b/tests/methods/test_shap_tabular.py new file mode 100644 index 00000000..f2ecc7fe --- /dev/null +++ b/tests/methods/test_shap_tabular.py @@ -0,0 +1,35 @@ +"""Test LIME tabular method.""" +from unittest import TestCase +import numpy as np +import dianna +from dianna.methods.kernelshap_tabular import KERNELSHAPTabular +from tests.utils import run_model + + +class LIMEOnTabular(TestCase): + """Suite of LIME tests for the tabular case.""" + + def test_shap_tabular_classification_correct_output_shape(self): + """Test whether the output of explainer has the correct shape.""" + 
training_data = np.random.random((10, 2)) + input_data = np.random.random(2) + feature_names = ["feature_1", "feature_2"] + explainer = KERNELSHAPTabular(training_data, + mode ='classification', + feature_names=feature_names,) + exp = explainer.explain( + run_model, + input_data, + ) + assert len(exp[0]) == len(feature_names) + + def test_shap_tabular_regression_correct_output_shape(self): + """Test whether the output of explainer has the correct length.""" + training_data = np.random.random((10, 2)) + input_data = np.random.random(2) + feature_names = ["feature_1", "feature_2"] + exp = dianna.explain_tabular(run_model, input_tabular=input_data, method='kernelshap', + mode ='regression', training_data = training_data, + training_data_kmeans = 2, feature_names=feature_names) + + assert len(exp) == len(feature_names) diff --git a/tutorials/kernelshap_tabular_penguin.ipynb b/tutorials/kernelshap_tabular_penguin.ipynb new file mode 100644 index 00000000..ffcc5ab7 --- /dev/null +++ b/tutorials/kernelshap_tabular_penguin.ipynb @@ -0,0 +1,434 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Logo_ER10\"\n", + "\n", + "### Model Interpretation using KernelSHAP for penguin dataset classifier\n", + "This notebook demonstrates the use of DIANNA with the SHAP Kernel explainer method for tabular data on the penguins dataset." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Colab setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "running_in_colab = 'google.colab' in str(get_ipython())\n", + "if running_in_colab:\n", + " # install dianna\n", + " !python3 -m pip install dianna[notebooks]\n", + " \n", + " # download data used in this demo\n", + " import os \n", + " base_url = 'https://raw.githubusercontent.com/dianna-ai/dianna/main/tutorials/'\n", + " paths_to_download = ['models/penguin_model.onnx']\n", + " for path in paths_to_download:\n", + " !wget {base_url + path} -P {os.path.dirname(path)}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import dianna\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "from sklearn.model_selection import train_test_split\n", + "from dianna.utils.onnx_runner import SimpleModelRunner\n", + "\n", + "from numba.core.errors import NumbaDeprecationWarning\n", + "import warnings\n", + "# silence the Numba deprecation warnings in shap\n", + "warnings.simplefilter('ignore', category=NumbaDeprecationWarning)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1 - Loading the data\n", + "Load penguins dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "penguins = sns.load_dataset('penguins')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Prepare the data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bill_length_mmbill_depth_mmflipper_length_mmbody_mass_g
039.118.7181.03750.0
139.517.4186.03800.0
240.318.0195.03250.0
436.719.3193.03450.0
539.320.6190.03650.0
...............
33847.213.7214.04925.0
34046.814.3215.04850.0
34150.415.7222.05750.0
34245.214.8212.05200.0
34349.916.1213.05400.0
\n", + "

342 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " bill_length_mm bill_depth_mm flipper_length_mm body_mass_g\n", + "0 39.1 18.7 181.0 3750.0\n", + "1 39.5 17.4 186.0 3800.0\n", + "2 40.3 18.0 195.0 3250.0\n", + "4 36.7 19.3 193.0 3450.0\n", + "5 39.3 20.6 190.0 3650.0\n", + ".. ... ... ... ...\n", + "338 47.2 13.7 214.0 4925.0\n", + "340 46.8 14.3 215.0 4850.0\n", + "341 50.4 15.7 222.0 5750.0\n", + "342 45.2 14.8 212.0 5200.0\n", + "343 49.9 16.1 213.0 5400.0\n", + "\n", + "[342 rows x 4 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Remove categorial columns and NaN values\n", + "penguins_filtered = penguins.drop(columns=['island', 'sex']).dropna()\n", + "\n", + "# Get the species\n", + "species = penguins['species'].unique()\n", + "\n", + "# Extract inputs and target\n", + "input_features = penguins_filtered.drop(columns=['species'])\n", + "target = pd.get_dummies(penguins_filtered['species'])\n", + "\n", + "# Let's explore the features of the dataset\n", + "input_features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data-set currently has four features that were used to train the model: bill length, bill depth, flipper length, and body mass. These features were used to classify the different species." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Training, validation, and test data split." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(input_features, target, test_size=0.2,\n", + " random_state=0, shuffle=True, stratify=target)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get an instance to explain." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# get an instance from test data\n", + "data_instance = X_test.iloc[10].to_numpy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2. Loading ONNX model\n", + "DIANNA supports ONNX models. Here we demonstrate the use of KernelSHAP explainer for tabular data with a pre-trained ONNX model, which is a MLP classifier for the penguins dataset.
\n", + "\n", + "The model is trained following this notebook:
\n", + "https://github.com/dianna-ai/dianna-exploration/blob/main/example_data/model_generation/penguin_species/generate_model.ipynb" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Gentoo'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# load onnx model and check the prediction with it\n", + "model_path = './models/penguin_model.onnx'\n", + "loaded_model = SimpleModelRunner(model_path)\n", + "predictions = loaded_model(data_instance.reshape(1,-1).astype(np.float32))\n", + "species[np.argmax(predictions)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A runner function is created to prepare data for the ONNX inference session." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "\n", + "def run_model(data):\n", + " # get ONNX predictions\n", + " sess = ort.InferenceSession(model_path)\n", + " input_name = sess.get_inputs()[0].name\n", + " output_name = sess.get_outputs()[0].name\n", + "\n", + " onnx_input = {input_name: data.astype(np.float32)}\n", + " pred_onnx = sess.run([output_name], onnx_input)[0]\n", + " \n", + " return pred_onnx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3. Applying KernelSHAP with DIANNA\n", + "The simplest way to run DIANNA on tabular data is with `dianna.explain_tabular`.\n", + "\n", + "DIANNA requires input in numpy format, so the input data is converted into a numpy array.\n", + "\n", + "Note that the training data is also required since KernelSHAP needs it to generate proper perturbations. But here we can summarize the whole training set with weighted Kmeans to reduce the computational cost. This has been implemented in `shap` and here we just need to set the number of clusters, for instance `training_data_kmeans = 5`."
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n" + ] + } + ], + "source": [ + "explanation = dianna.explain_tabular(run_model, input_tabular=data_instance, method='kernelshap',\n", + " mode ='classification', training_data = X_train,\n", + " training_data_kmeans = 5, feature_names=input_features.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4. Visualization\n", + "The output can be visualized with the DIANNA built-in visualization function. It shows the importance of each feature contributing to the prediction.\n", + "\n", + "The prediction is \"Gentoo\", so let's visualize the feature importance scores for \"Gentoo\".\n", + "\n", + "It can be noticed that the body mass feature has the biggest weight in the prediction." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from dianna.visualization import plot_tabular\n", + "\n", + "# get the scores for the target class\n", + "explanation = explanation[np.argmax(predictions)]\n", + "\n", + "_ = plot_tabular(explanation, X_test.columns, num_features=10)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dianna", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/kernelshap_tabular_weather.ipynb b/tutorials/kernelshap_tabular_weather.ipynb new file mode 100644 index 00000000..a239ee38 --- /dev/null +++ b/tutorials/kernelshap_tabular_weather.ipynb @@ -0,0 +1,280 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Logo_ER10\"\n", + "\n", + "### Model Interpretation using KernelSHAP for weather prediction regressor\n", + "This notebook demonstrates the use of DIANNA with the SHAP Kernel explainer method for tabular data on the weather dataset.\n", + "\n", + "https://shap.readthedocs.io/en/latest/example_notebooks/tabular_examples/model_agnostic/Diabetes%20regression.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Colab setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "running_in_colab = 'google.colab' in str(get_ipython())\n", + "if running_in_colab:\n", + " # install dianna\n", + " !python3 -m pip install dianna[notebooks]\n", + " \n", + " # download data used in this demo\n", + " import os\n", + " base_url = 'https://raw.githubusercontent.com/dianna-ai/dianna/main/tutorials/'\n", + " paths_to_download = 
['models/sunshine_hours_regression_model.onnx']\n", + " for path in paths_to_download:\n", + " !wget {base_url + path} -P {os.path.dirname(path)}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import dianna\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from dianna.utils.onnx_runner import SimpleModelRunner\n", + "\n", + "from numba.core.errors import NumbaDeprecationWarning\n", + "import warnings\n", + "# silence the Numba deprecation warnings in shap\n", + "warnings.simplefilter('ignore', category=NumbaDeprecationWarning)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1 - Loading the data\n", + "Load weather prediction dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv(\"https://zenodo.org/record/5071376/files/weather_prediction_dataset_light.csv?download=1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Prepare the data\n", + "As the target, the sunshine hours for the next day in the data-set will be used. Therefore, we will remove the last data point as this has no target. A tabular regression model will be trained which does not require time-based data, therefore DATE and MONTH can be removed." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "X_data = data.drop(columns=['DATE', 'MONTH'])[:-1]\n", + "y_data = data.loc[1:][\"BASEL_sunshine\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Training, validation, and test data split." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_holdout, y_train, y_holdout = train_test_split(X_data, y_data, test_size=0.3, random_state=0)\n", + "X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get an instance to explain." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# get an instance from test data\n", + "data_instance = X_test.iloc[10].to_numpy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2. Loading ONNX model\n", + "DIANNA supports ONNX models. Here we demonstrate the use of KernelSHAP explainer for tabular data with a pre-trained ONNX model, which is a MLP regressor for the weather dataset.
\n", + "\n", + "The model is trained following this notebook:
\n", + "https://github.com/dianna-ai/dianna-exploration/blob/main/example_data/model_generation/sunshine_prediction/generate_model.ipynb" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[3.0719438]], dtype=float32)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# load onnx model and check the prediction with it\n", + "model_path = './models/sunshine_hours_regression_model.onnx'\n", + "loaded_model = SimpleModelRunner(model_path)\n", + "predictions = loaded_model(data_instance.reshape(1,-1).astype(np.float32))\n", + "predictions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A runner function is created to prepare data for the ONNX inference session." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "\n", + "def run_model(data):\n", + " # get ONNX predictions\n", + " sess = ort.InferenceSession(model_path)\n", + " input_name = sess.get_inputs()[0].name\n", + " output_name = sess.get_outputs()[0].name\n", + "\n", + " onnx_input = {input_name: data.astype(np.float32)}\n", + " pred_onnx = sess.run([output_name], onnx_input)[0]\n", + " \n", + " return pred_onnx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3. Applying KernelSHAP with DIANNA\n", + "The simplest way to run DIANNA on tabular data is with `dianna.explain_tabular`.\n", + "\n", + "DIANNA requires input in numpy format, so the input data is converted into a numpy array.\n", + "\n", + "Note that the training data is also required since KernelSHAP needs it to generate proper perturbations."
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n" + ] + } + ], + "source": [ + "explanation = dianna.explain_tabular(run_model, input_tabular=data_instance, method='kernelshap',\n", + " mode ='regression', training_data = X_train, \n", + " training_data_kmeans = 5, feature_names=X_test.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4. Visualization\n", + "The output can be visualized with the DIANNA built-in visualization function. It shows the top 10 importance of each feature contributing to the prediction." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from dianna.visualization import plot_tabular\n", + "\n", + "_ = plot_tabular(explanation, X_test.columns, num_features=10)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dianna", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/lime_tabular_penguin.ipynb b/tutorials/lime_tabular_penguin.ipynb index f95730d6..09d4af1d 100644 --- a/tutorials/lime_tabular_penguin.ipynb +++ b/tutorials/lime_tabular_penguin.ipynb @@ -334,7 +334,6 @@ "\n", " onnx_input = {input_name: data.astype(np.float32)}\n", " pred_onnx = sess.run([output_name], onnx_input)[0]\n", - " pred_onnx\n", " \n", " return pred_onnx" ] diff --git a/tutorials/lime_tabular_weather.ipynb b/tutorials/lime_tabular_weather.ipynb index b7da342f..57f2a320 100644 --- a/tutorials/lime_tabular_weather.ipynb +++ b/tutorials/lime_tabular_weather.ipynb @@ -185,7 +185,6 @@ "\n", " onnx_input = {input_name: data.astype(np.float32)}\n", " pred_onnx = sess.run([output_name], onnx_input)[0]\n", - " pred_onnx\n", " \n", " return pred_onnx" ]