Original Feature Importance Explainer (Kernel SHAP) Demo

This example demonstrates how to interpret a Driverless AI MOJO model using the H2O Eval Studio library (the `h2o_sonar` Python package), and how to retrieve the resulting original feature importance data and plot.

[1]:
import logging
import os
import webbrowser
from pathlib import Path

import datatable
import daimojo

from h2o_sonar import interpret
from h2o_sonar.lib.api import commons
from h2o_sonar.lib.api import explainers
from h2o_sonar.explainers import fi_kernel_shap_explainer as explainer
from h2o_sonar.lib.api.models import ModelApi
[2]:
# Show the explainer's metadata: its ID and description, the model and problem
# types it can explain, the explanation types it provides, and its parameters
# with their values.
interpret.describe_explainer(explainer.KernelShapFeatureImportanceExplainer)
[2]:
{'id': 'h2o_sonar.explainers.fi_kernel_shap_explainer.KernelShapFeatureImportanceExplainer',
 'name': 'KernelShapFeatureImportanceExplainer',
 'display_name': 'Shapley Values for Original Features (Kernel SHAP Method)',
 'description': 'Shapley explanations are a technique with credible theoretical support that presents consistent global and local variable contributions. Local numeric Shapley values are calculated by tracing single rows of data through a trained tree ensemble and aggregating the contribution of each input variable as the row of data moves through the trained ensemble. For regression tasks, Shapley values sum to the prediction of the Driverless AI model. For classification problems, Shapley values sum to the prediction of the Driverless AI model before applying the link function. Global Shapley values are the average of the absolute Shapley values over every row of a dataset. Shapley values for original features are calculated with the Kernel Explainer method, which uses a special weighted linear regression to compute the importance of each feature. More information about Kernel SHAP is available at http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf.',
 'model_types': ['iid'],
 'can_explain': ['regression', 'binomial', 'multinomial'],
 'explanation_scopes': ['global_scope', 'local_scope'],
 'explanations': [{'explanation_type': 'global-feature-importance',
   'name': 'GlobalFeatImpExplanation',
   'category': None,
   'scope': 'global',
   'has_local': None,
   'formats': []},
  {'explanation_type': 'local-feature-importance',
   'name': 'LocalFeatImpExplanation',
   'category': None,
   'scope': 'local',
   'has_local': None,
   'formats': []}],
 'parameters': [{'name': 'sample_size',
   'description': 'Sample size.',
   'comment': '',
   'type': 'int',
   'val': 100000,
   'predefined': [],
   'tags': [],
   'min_': 0.0,
   'max_': 0.0,
   'category': ''},
  {'name': 'sample',
   'description': 'Sample Kernel Shapley.',
   'comment': '',
   'type': 'bool',
   'val': True,
   'predefined': [],
   'tags': [],
   'min_': 0.0,
   'max_': 0.0,
   'category': ''},
  {'name': 'nsample',
   'description': "Number of times to re-evaluate the model when explaining each prediction with Kernel Explainer. Default is determined internally.'auto' or int. Number of times to re-evaluate the model when explaining each prediction. More samples lead to lower variance estimates of the SHAP values. The 'auto' setting uses nsamples = 2 * X.shape[1] + 2048. This setting is disabled by default and runtime determines the right number internally.",
   'comment': '',
   'type': 'int',
   'val': '',
   'predefined': [],
   'tags': [],
   'min_': 0.0,
   'max_': 0.0,
   'category': ''},
  {'name': 'L1',
   'description': "L1 regularization for Kernel Explainer. 'num_features(int)', 'auto' (default for now, but deprecated), 'aic', 'bic', or float. The L1 regularization to use for feature selection (the estimation procedure is based on a debiased lasso). The 'auto' option currently uses aic when less that 20% of the possible sample space is enumerated, otherwise it uses no regularization. The aic and bic options use the AIC and BIC rules for regularization. Using 'num_features(int)' selects a fix number of top features. Passing a float directly sets the alpha parameter of the sklearn.linear_model.Lasso model used for feature selection.",
   'comment': '',
   'type': 'str',
   'val': 'auto',
   'predefined': [],
   'tags': [],
   'min_': 0.0,
   'max_': 0.0,
   'category': ''},
  {'name': 'max runtime',
   'description': 'Max runtime for Kernel explainer in seconds.',
   'comment': '',
   'type': 'int',
   'val': 900,
   'predefined': [],
   'tags': [],
   'min_': 0.0,
   'max_': 0.0,
   'category': ''},
  {'name': 'fast_approx',
   'description': 'Speed up predictions with fast predictions approximation.',
   'comment': '',
   'type': 'bool',
   'val': True,
   'predefined': [],
   'tags': [],
   'min_': 0.0,
   'max_': 0.0,
   'category': ''}],
 'keywords': ['explains-original_feature-importance', 'is_slow']}

Interpretation

[3]:
# Input/output locations (relative paths) and the name of the target column.
dataset_path = "../../data/creditcard.csv"
mojo_path = "../../data/models/creditcard-binomial.mojo"
target_col = "default payment next month"
results_location = "./results"

# Load the MOJO scorer and wrap it as a model for the interpretation; the
# model is declared to use exactly the features the MOJO itself reports.
mojo_model = daimojo.model(mojo_path)
model_api = ModelApi()
mojo_features = list(mojo_model.feature_names)
model = model_api.create_model(
    model_src=mojo_model,
    target_col=target_col,
    used_features=mojo_features,
)

# Make sure the results directory exists (no error if it is already there).
os.makedirs(results_location, exist_ok=True)
[4]:
# Run the interpretation with a single explainer - Kernel SHAP feature
# importance - selected by its explainer ID.
kernel_shap_id = explainer.KernelShapFeatureImportanceExplainer.explainer_id()
interpretation = interpret.run_interpretation(
    dataset=dataset_path,
    model=model,
    target_col=target_col,
    results_location=results_location,
    explainers=[kernel_shap_id],
    log_level=logging.INFO,
)
h2o_sonar.explainers.fi_kernel_shap_explainer.KernelShapFeatureImportanceExplainer: progress 20.0%
h2o_sonar.explainers.fi_kernel_shap_explainer.KernelShapFeatureImportanceExplainer: progress 90.0%
h2o_sonar.explainers.fi_kernel_shap_explainer.KernelShapFeatureImportanceExplainer: progress 90.0%

Explainer Result

[5]:
# Look up the Kernel SHAP explainer's result in the interpretation by its ID.
kernel_shap_explainer = explainer.KernelShapFeatureImportanceExplainer
explainer_id = kernel_shap_explainer.explainer_id()
result = interpretation.get_explainer_result(explainer_id)
[6]:
# Open the interpretation HTML report in the default web browser.
# webbrowser.open() is specified for URLs only - handing it a bare filesystem
# path is neither supported nor portable - so a local path is converted to an
# absolute file:// URI first.
# NOTE(review): assumes get_html_report_location() returns a filesystem path;
# a value that already looks like a URL is passed through unchanged.
report_location = os.fspath(interpretation.result.get_html_report_location())
if "://" in report_location:
    report_url = report_location
else:
    report_url = Path(report_location).resolve().as_uri()
# Last expression of the cell, so the boolean returned by open() is displayed.
webbrowser.open(report_url)
[6]:
True
[7]:
# Summary of the explainer result: explainer metadata, the explanations it
# produced (with the formats each one is available in) and its parameters.
result.summary()
[7]:
{'id': 'h2o_sonar.explainers.fi_kernel_shap_explainer.KernelShapFeatureImportanceExplainer',
 'name': 'KernelShapFeatureImportanceExplainer',
 'display_name': 'Shapley Values for Original Features (Kernel SHAP Method)',
 'description': 'Shapley explanations are a technique with credible theoretical support that presents consistent global and local variable contributions. Local numeric Shapley values are calculated by tracing single rows of data through a trained tree ensemble and aggregating the contribution of each input variable as the row of data moves through the trained ensemble. For regression tasks, Shapley values sum to the prediction of the Driverless AI model. For classification problems, Shapley values sum to the prediction of the Driverless AI model before applying the link function. Global Shapley values are the average of the absolute Shapley values over every row of a dataset. Shapley values for original features are calculated with the Kernel Explainer method, which uses a special weighted linear regression to compute the importance of each feature. More information about Kernel SHAP is available at http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf.',
 'model_types': ['iid'],
 'can_explain': ['regression', 'binomial', 'multinomial'],
 'explanation_scopes': ['global_scope', 'local_scope'],
 'explanations': [{'explanation_type': 'global-feature-importance',
   'name': 'Shapley on Original Features (Kernel SHAP Method)',
   'category': 'DAI MODEL',
   'scope': 'global',
   'has_local': 'local-feature-importance',
   'formats': ['application/vnd.h2oai.json+datatable.jay',
    'application/vnd.h2oai.json+csv',
    'application/json']},
  {'explanation_type': 'local-feature-importance',
   'name': 'Shapley on Original Features (Kernel SHAP Method)',
   'category': 'CUSTOM',
   'scope': 'local',
   'has_local': None,
   'formats': ['application/vnd.h2oai.json+datatable.jay']},
  {'explanation_type': 'global-html-fragment',
   'name': 'Shapley on Original Features (Kernel SHAP Method)',
   'category': 'MODEL',
   'scope': 'global',
   'has_local': None,
   'formats': ['text/html']}],
 'parameters': [{'name': 'sample_size',
   'description': 'Sample size.',
   'comment': '',
   'type': 'int',
   'val': 100000,
   'predefined': [],
   'tags': [],
   'min_': 0.0,
   'max_': 0.0,
   'category': ''},
  {'name': 'sample',
   'description': 'Sample Kernel Shapley.',
   'comment': '',
   'type': 'bool',
   'val': True,
   'predefined': [],
   'tags': [],
   'min_': 0.0,
   'max_': 0.0,
   'category': ''},
  {'name': 'nsample',
   'description': "Number of times to re-evaluate the model when explaining each prediction with Kernel Explainer. Default is determined internally.'auto' or int. Number of times to re-evaluate the model when explaining each prediction. More samples lead to lower variance estimates of the SHAP values. The 'auto' setting uses nsamples = 2 * X.shape[1] + 2048. This setting is disabled by default and runtime determines the right number internally.",
   'comment': '',
   'type': 'int',
   'val': '',
   'predefined': [],
   'tags': [],
   'min_': 0.0,
   'max_': 0.0,
   'category': ''},
  {'name': 'L1',
   'description': "L1 regularization for Kernel Explainer. 'num_features(int)', 'auto' (default for now, but deprecated), 'aic', 'bic', or float. The L1 regularization to use for feature selection (the estimation procedure is based on a debiased lasso). The 'auto' option currently uses aic when less that 20% of the possible sample space is enumerated, otherwise it uses no regularization. The aic and bic options use the AIC and BIC rules for regularization. Using 'num_features(int)' selects a fix number of top features. Passing a float directly sets the alpha parameter of the sklearn.linear_model.Lasso model used for feature selection.",
   'comment': '',
   'type': 'str',
   'val': 'auto',
   'predefined': [],
   'tags': [],
   'min_': 0.0,
   'max_': 0.0,
   'category': ''},
  {'name': 'max runtime',
   'description': 'Max runtime for Kernel explainer in seconds.',
   'comment': '',
   'type': 'int',
   'val': 900,
   'predefined': [],
   'tags': [],
   'min_': 0.0,
   'max_': 0.0,
   'category': ''},
  {'name': 'fast_approx',
   'description': 'Speed up predictions with fast predictions approximation.',
   'comment': '',
   'type': 'bool',
   'val': True,
   'predefined': [],
   'tags': [],
   'min_': 0.0,
   'max_': 0.0,
   'category': ''}],
 'keywords': ['explains-original_feature-importance', 'is_slow']}
[8]:
# Explainer parameters as a parameter name -> value mapping.
result.params()
[8]:
{'sample_size': 100000,
 'sample': True,
 'nsample': '',
 'L1': 'auto',
 'max runtime': 900,
 'fast_approx': True}

Display Data

[9]:
# Feature importance data: one row per original feature with its importance.
result.data()
[9]:
featureimportance
▪▪▪▪▪▪▪▪▪▪▪▪
0PAY_00.484314
1PAY_20.177288
2LIMIT_BAL0.14293
3PAY_AMT40.125988
4PAY_AMT20.109839
5BILL_AMT10.0856685
6PAY_30.0460469
7PAY_AMT30.0405643
8PAY_60.035129
9BILL_AMT20.0307533
10PAY_40.0292232
11BILL_AMT60.0230935
12PAY_50.0203799
13PAY_AMT10.014774
14EDUCATION0.00983341
15AGE0.00831316
16MARRIAGE0.00725285
17PAY_AMT60.00588243
18PAY_AMT50.00541516
19BILL_AMT50.00464642
20BILL_AMT40.00144066
21BILL_AMT30.000818275

Plot Feature Importance Data

[10]:
# Plot the feature importance data.
result.plot()

Save Explainer Log and Data

[11]:
# Save the explainer log to a file; log_file_path is used again below to
# print the saved file.
log_file_path = "./feature-importance-demo.log"
result.log(path=log_file_path)
[12]:
# Print the saved log file (shell command; IPython expands $log_file_path
# from the Python variable of the same name).
!cat $log_file_path
[13]:
# Export the explainer's data and artifacts into a single ZIP archive.
archive_path = "./feature-importance-demo-archive.zip"
result.zip(file_path=archive_path)
[14]:
# List the contents of the ZIP archive written by result.zip() (shell command).
!unzip -l feature-importance-demo-archive.zip
Archive:  feature-importance-demo-archive.zip
  Length      Date    Time    Name
---------  ---------- -----   ----
     5673  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/result_descriptor.json
      110  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_html_fragment/text_html.meta
      370  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_html_fragment/text_html/explanation.html
    24441  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_html_fragment/text_html/fi-class-0.png
        0  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/log/explainer_run_904dcb48-d22c-4726-b0f7-6ec9da1a24d1.log
  1842208  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/work/shapley.orig.feat.bin
  1833209  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/work/shapley_formatted_orig_feat.zip
  4883132  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/work/shapley.orig.feat.csv
    40216  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/work/y_hat.bin
      185  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_vnd_h2oai_json_datatable_jay.meta
      143  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_json.meta
      163  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_vnd_h2oai_json_csv.meta
     1774  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_vnd_h2oai_json_datatable_jay/explanation.json
      888  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_vnd_h2oai_json_datatable_jay/feature_importance_class_0.jay
     1123  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_json/explanation.json
     1623  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_json/feature_importance_class_0.json
     1122  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_vnd_h2oai_json_csv/explanation.json
      754  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/global_feature_importance/application_vnd_h2oai_json_csv/feature_importance_class_0.csv
        2  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/model_problems/problems_and_actions.json
      201  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/local_feature_importance/application_vnd_h2oai_json_datatable_jay.meta
      815  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/local_feature_importance/application_vnd_h2oai_json_datatable_jay/explanation.json
    40216  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/local_feature_importance/application_vnd_h2oai_json_datatable_jay/y_hat.bin
  1842208  2022-10-10 23:09   explainer_h2o_sonar_explainers_fi_kernel_shap_explainer_KernelShapFeatureImportanceExplainer_904dcb48-d22c-4726-b0f7-6ec9da1a24d1/local_feature_importance/application_vnd_h2oai_json_datatable_jay/feature_importance_class_0.jay
---------                     -------
 10520576                     23 files
[ ]: