Original Feature Importance Explainer (Kernel SHAP) Demo
This example demonstrates how to interpret a Driverless AI MOJO model using the H2O Eval Studio library and retrieve the data and plot with original features importances.
[1]:
import os
import logging
import datatable
import daimojo
import webbrowser
from h2o_sonar import interpret
from h2o_sonar.lib.api import commons
from h2o_sonar.lib.api import explainers
from h2o_sonar.explainers import fi_kernel_shap_explainer as explainer
from h2o_sonar.lib.api.models import ModelApi
[2]:
# explainer description
interpret.describe_explainer(explainer.KernelShapFeatureImportanceExplainer)
[2]:
{'id': 'h2o_sonar.explainers.fi_kernel_shap_explainer.KernelShapFeatureImportanceExplainer',
'name': 'KernelShapFeatureImportanceExplainer',
'display_name': 'Shapley Values for Original Features (Kernel SHAP Method)',
'description': 'Shapley explanations are a technique with credible theoretical support that presents consistent global and local variable contributions. Local numeric Shapley values are calculated by tracing single rows of data through a trained tree ensemble and aggregating the contribution of each input variable as the row of data moves through the trained ensemble. For regression tasks, Shapley values sum to the prediction of the Driverless AI model. For classification problems, Shapley values sum to the prediction of the Driverless AI model before applying the link function. Global Shapley values are the average of the absolute Shapley values over every row of a dataset. Shapley values for original features are calculated with the Kernel Explainer method, which uses a special weighted linear regression to compute the importance of each feature. More information about Kernel SHAP is available at http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf.',
'model_types': ['iid'],
'can_explain': ['regression', 'binomial', 'multinomial'],
'explanation_scopes': ['global_scope', 'local_scope'],
'explanations': [{'explanation_type': 'global-feature-importance',
'name': 'GlobalFeatImpExplanation',
'category': None,
'scope': 'global',
'has_local': None,
'formats': []},
{'explanation_type': 'local-feature-importance',
'name': 'LocalFeatImpExplanation',
'category': None,
'scope': 'local',
'has_local': None,
'formats': []}],
'parameters': [{'name': 'sample_size',
'description': 'Sample size.',
'comment': '',
'type': 'int',
'val': 100000,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'sample',
'description': 'Sample Kernel Shapley.',
'comment': '',
'type': 'bool',
'val': True,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'nsample',
'description': "Number of times to re-evaluate the model when explaining each prediction with Kernel Explainer. Default is determined internally.'auto' or int. Number of times to re-evaluate the model when explaining each prediction. More samples lead to lower variance estimates of the SHAP values. The 'auto' setting uses nsamples = 2 * X.shape[1] + 2048. This setting is disabled by default and runtime determines the right number internally.",
'comment': '',
'type': 'int',
'val': '',
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'L1',
'description': "L1 regularization for Kernel Explainer. 'num_features(int)', 'auto' (default for now, but deprecated), 'aic', 'bic', or float. The L1 regularization to use for feature selection (the estimation procedure is based on a debiased lasso). The 'auto' option currently uses aic when less that 20% of the possible sample space is enumerated, otherwise it uses no regularization. The aic and bic options use the AIC and BIC rules for regularization. Using 'num_features(int)' selects a fix number of top features. Passing a float directly sets the alpha parameter of the sklearn.linear_model.Lasso model used for feature selection.",
'comment': '',
'type': 'str',
'val': 'auto',
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'max runtime',
'description': 'Max runtime for Kernel explainer in seconds.',
'comment': '',
'type': 'int',
'val': 900,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'fast_approx',
'description': 'Speed up predictions with fast predictions approximation.',
'comment': '',
'type': 'bool',
'val': True,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''}],
'keywords': ['explains-original_feature-importance', 'is_slow']}
Interpretation
[3]:
# dataset
dataset_path = "../../data/creditcard.csv"
target_col = "default payment next month"
# model
mojo_path = "../../data/models/creditcard-binomial.mojo"
mojo_model = daimojo.model(mojo_path)
model = ModelApi().create_model(
model_src=mojo_model,
target_col=target_col,
used_features=list(mojo_model.feature_names),
)
# results
results_location = "./results"
os.makedirs(results_location, exist_ok=True)
[4]:
interpretation = interpret.run_interpretation(
dataset=dataset_path,
model=model,
target_col=target_col,
results_location=results_location,
explainers=[explainer.KernelShapFeatureImportanceExplainer.explainer_id()],
log_level=logging.INFO,
)
h2o_sonar.explainers.fi_kernel_shap_explainer.KernelShapFeatureImportanceExplainer: progress 20.0%
h2o_sonar.explainers.fi_kernel_shap_explainer.KernelShapFeatureImportanceExplainer: progress 90.0%
h2o_sonar.explainers.fi_kernel_shap_explainer.KernelShapFeatureImportanceExplainer: progress 90.0%
Explainer Result
[5]:
# retrieve the result
result = interpretation.get_explainer_result(
explainer.KernelShapFeatureImportanceExplainer.explainer_id()
)
[6]:
# open interpretation HTML report in web browser
webbrowser.open(interpretation.result.get_html_report_location())
[6]:
True
[7]:
# summary
result.summary()
[7]:
{'id': 'h2o_sonar.explainers.fi_kernel_shap_explainer.KernelShapFeatureImportanceExplainer',
'name': 'KernelShapFeatureImportanceExplainer',
'display_name': 'Shapley Values for Original Features (Kernel SHAP Method)',
'description': 'Shapley explanations are a technique with credible theoretical support that presents consistent global and local variable contributions. Local numeric Shapley values are calculated by tracing single rows of data through a trained tree ensemble and aggregating the contribution of each input variable as the row of data moves through the trained ensemble. For regression tasks, Shapley values sum to the prediction of the Driverless AI model. For classification problems, Shapley values sum to the prediction of the Driverless AI model before applying the link function. Global Shapley values are the average of the absolute Shapley values over every row of a dataset. Shapley values for original features are calculated with the Kernel Explainer method, which uses a special weighted linear regression to compute the importance of each feature. More information about Kernel SHAP is available at http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf.',
'model_types': ['iid'],
'can_explain': ['regression', 'binomial', 'multinomial'],
'explanation_scopes': ['global_scope', 'local_scope'],
'explanations': [{'explanation_type': 'global-feature-importance',
'name': 'Shapley on Original Features (Kernel SHAP Method)',
'category': 'DAI MODEL',
'scope': 'global',
'has_local': 'local-feature-importance',
'formats': ['application/vnd.h2oai.json+datatable.jay',
'application/vnd.h2oai.json+csv',
'application/json']},
{'explanation_type': 'local-feature-importance',
'name': 'Shapley on Original Features (Kernel SHAP Method)',
'category': 'CUSTOM',
'scope': 'local',
'has_local': None,
'formats': ['application/vnd.h2oai.json+datatable.jay']},
{'explanation_type': 'global-html-fragment',
'name': 'Shapley on Original Features (Kernel SHAP Method)',
'category': 'MODEL',
'scope': 'global',
'has_local': None,
'formats': ['text/html']}],
'parameters': [{'name': 'sample_size',
'description': 'Sample size.',
'comment': '',
'type': 'int',
'val': 100000,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'sample',
'description': 'Sample Kernel Shapley.',
'comment': '',
'type': 'bool',
'val': True,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'nsample',
'description': "Number of times to re-evaluate the model when explaining each prediction with Kernel Explainer. Default is determined internally.'auto' or int. Number of times to re-evaluate the model when explaining each prediction. More samples lead to lower variance estimates of the SHAP values. The 'auto' setting uses nsamples = 2 * X.shape[1] + 2048. This setting is disabled by default and runtime determines the right number internally.",
'comment': '',
'type': 'int',
'val': '',
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'L1',
'description': "L1 regularization for Kernel Explainer. 'num_features(int)', 'auto' (default for now, but deprecated), 'aic', 'bic', or float. The L1 regularization to use for feature selection (the estimation procedure is based on a debiased lasso). The 'auto' option currently uses aic when less that 20% of the possible sample space is enumerated, otherwise it uses no regularization. The aic and bic options use the AIC and BIC rules for regularization. Using 'num_features(int)' selects a fix number of top features. Passing a float directly sets the alpha parameter of the sklearn.linear_model.Lasso model used for feature selection.",
'comment': '',
'type': 'str',
'val': 'auto',
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'max runtime',
'description': 'Max runtime for Kernel explainer in seconds.',
'comment': '',
'type': 'int',
'val': 900,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'fast_approx',
'description': 'Speed up predictions with fast predictions approximation.',
'comment': '',
'type': 'bool',
'val': True,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''}],
'keywords': ['explains-original_feature-importance', 'is_slow']}
[8]:
# parameter
result.params()
[8]:
{'sample_size': 100000,
'sample': True,
'nsample': '',
'L1': 'auto',
'max runtime': 900,
'fast_approx': True}
Display Data
[9]:
result.data()
[9]:
feature | importance | |
---|---|---|
▪▪▪▪ | ▪▪▪▪▪▪▪▪ | |
0 | PAY_0 | 0.484314 |
1 | PAY_2 | 0.177288 |
2 | LIMIT_BAL | 0.14293 |
3 | PAY_AMT4 | 0.125988 |
4 | PAY_AMT2 | 0.109839 |
5 | BILL_AMT1 | 0.0856685 |
6 | PAY_3 | 0.0460469 |
7 | PAY_AMT3 | 0.0405643 |
8 | PAY_6 | 0.035129 |
9 | BILL_AMT2 | 0.0307533 |
10 | PAY_4 | 0.0292232 |
11 | BILL_AMT6 | 0.0230935 |
12 | PAY_5 | 0.0203799 |
13 | PAY_AMT1 | 0.014774 |
14 | EDUCATION | 0.00983341 |
15 | AGE | 0.00831316 |
16 | MARRIAGE | 0.00725285 |
17 | PAY_AMT6 | 0.00588243 |
18 | PAY_AMT5 | 0.00541516 |
19 | BILL_AMT5 | 0.00464642 |
20 | BILL_AMT4 | 0.00144066 |
21 | BILL_AMT3 | 0.000818275 |
Plot Feature Importance Data
[10]:
result.plot()
Save Explainer Log and Data
[11]:
# save the explainer log
log_file_path = "./feature-importance-demo.log"
result.log(path=log_file_path)
[12]:
!cat $log_file_path
[13]:
# save the explainer data
result.zip(file_path="./feature-importance-demo-archive.zip")
[14]:
!unzip -l feature-importance-demo-archive.zip
Archive: feature-importance-demo-archive.zip
Length Date Time Name
--------- ---------- ----- ----
5673 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/result_descriptor.json
110 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_html_fragment/text_html.meta
370 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_html_fragment/text_html/explanation.html
24441 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_html_fragment/text_html/fi-class-0.png
0 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/log/explainer_run_904dcb48-d22c-4726-b0f7-6ec9da1a24d1.log
1842208 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/work/shapley.orig.feat.bin
1833209 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/work/shapley_formatted_orig_feat.zip
4883132 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/work/shapley.orig.feat.csv
40216 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/work/y_hat.bin
185 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_vnd_h2oai_json_datatable_jay.meta
143 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_json.meta
163 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_vnd_h2oai_json_csv.meta
1774 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_vnd_h2oai_json_datatable_jay/explanation.json
888 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_vnd_h2oai_json_datatable_jay/feature_importance_class_0.jay
1123 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_json/explanation.json
1623 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_json/feature_importance_class_0.json
1122 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_vnd_h2oai_json_csv/explanation.json
754 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_vnd_h2oai_json_csv/feature_importance_class_0.csv
2 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/model_problems/problems_and_actions.json
201 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/local_feature_importance/application_vnd_h2oai_json_datatable_jay.meta
815 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/local_feature_importance/application_vnd_h2oai_json_datatable_jay/explanation.json
40216 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/local_feature_importance/application_vnd_h2oai_json_datatable_jay/y_hat.bin
1842208 2022-10-10 23:09 explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/local_feature_importance/application_vnd_h2oai_json_datatable_jay/feature_importance_class_0.jay
--------- -------
10520576 23 files
[ ]: