Residual Decision Tree Surrogate Explainer Demo
This example demonstrates how to interpret a Driverless AI MOJO model using the H2O Eval Studio library and how to plot the residual surrogate decision tree.
[1]:
import logging
import daimojo
import webbrowser
from h2o_sonar import interpret
from h2o_sonar.lib.api import commons, explainers
from h2o_sonar.explainers.residual_dt_surrogate_explainer import ResidualDecisionTreeSurrogateExplainer
from h2o_sonar.lib.api.models import ModelApi
from sklearn.ensemble import GradientBoostingClassifier
[2]:
# location passed to the interpretation run as `results_location`
results_location = "../../results"
# dataset: path to the credit card CSV file and the name of its target column
dataset_path = "../../data/creditcard.csv"
target_col = "default payment next month"
[3]:
# explainer descriptor: supported model/problem types, provided explanations,
# and the explainer parameters with their default values
interpret.describe_explainer(ResidualDecisionTreeSurrogateExplainer)
[3]:
{'id': 'h2o_sonar.explainers.residual_dt_surrogate_explainer.ResidualDecisionTreeSurrogateExplainer',
'name': 'ResidualDecisionTreeSurrogateExplainer',
'display_name': 'Residual Surrogate Decision Tree',
'description': 'The residual surrogate decision tree predicts which paths in the tree (paths explain approximate model behavior) lead to highest or lowest error. The residual surrogate decision tree is created by training a simple decision tree on the residuals of the predictions of the model. Residuals are differences between observed and predicted values which can be used as targets in surrogate models for the purpose of model debugging. The method used to calculate residuals varies depending on the type of problem. For classification problems, logloss residuals are calculated for a specified class (only one residual surrogate decision is created by the explainer and it is built for this class). For regression problems, residuals are determined by calculating the square of the difference between targeted and predicted values.',
'model_types': ['iid', 'time_series'],
'can_explain': ['regression', 'binomial', 'multinomial'],
'explanation_scopes': ['global_scope', 'local_scope'],
'explanations': [{'explanation_type': 'global-decision-tree',
'name': 'GlobalDtExplanation',
'category': None,
'scope': 'global',
'has_local': None,
'formats': []},
{'explanation_type': 'local-decision-tree',
'name': 'LocalDtExplanation',
'category': None,
'scope': 'local',
'has_local': None,
'formats': []}],
'parameters': [{'name': 'debug_residuals_class',
'description': 'Class for debugging classification model logloss residuals, empty string for debugging regression model residuals.',
'comment': '',
'type': 'str',
'val': '',
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'dt_tree_depth',
'description': 'Decision tree depth.',
'comment': '',
'type': 'int',
'val': 3,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'nfolds',
'description': 'Number of CV folds.',
'comment': '',
'type': 'int',
'val': 3,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'qbin_cols',
'description': 'Quantile binning columns.',
'comment': '',
'type': 'list',
'val': None,
'predefined': [],
'tags': ['SOURCE_DATASET_COLUMN_NAMES'],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'qbin_count',
'description': 'Quantile bins count.',
'comment': '',
'type': 'int',
'val': 0,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'categorical_encoding',
'description': 'Categorical encoding.',
'comment': 'Specify one of the following encoding schemes for handling of categorical features:\n\n_**AUTO**_: 1 column per categorical feature.\n\n_**Enum Limited**_: Automatically reduce categorical levels to the most prevalent ones during training and only keep the top 10 most frequent levels.\n\n_**One Hot Encoding**_: N+1 new columns for categorical features with N levels.\n\n_**Label Encoder**_: Convert every enum into the integer of its index (for example, level 0 -> 0, level 1 -> 1, etc.).\n\n_**Sort by Response**_: Reorders the levels by the mean response (for example, the level with lowest response -> 0, the level with second-lowest response -> 1, etc.).',
'type': 'str',
'val': 'onehotexplicit',
'predefined': ['AUTO',
'One Hot Encoding',
'Enum Limited',
'Sort by Response',
'Label Encoder'],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''}],
'keywords': ['run-by-default',
'requires-h2o3',
'explains-model-debugging',
'surrogate',
'h2o-sonar']}
Interpret
[4]:
# load the Driverless AI MOJO model from disk
mojo_path = "../../data/models/creditcard-binomial.mojo"
mojo_model = daimojo.model(mojo_path)
# features reported by the MOJO are the features the explainable model uses
mojo_features = list(mojo_model.feature_names)
# explainable model: wrap the MOJO so that it can be passed to the interpretation
model = ModelApi().create_model(
    model_src=mojo_model,
    target_col=target_col,
    used_features=mojo_features,
)
[5]:
# the only explainer to run; no parameter overrides are passed (`params=""`)
residual_dt_explainer = commons.ExplainerToRun(
    explainer_id=ResidualDecisionTreeSurrogateExplainer.explainer_id(),
    params="",
)

# interpret `model` on the dataset with INFO-level logging
interpretation = interpret.run_interpretation(
    dataset=dataset_path,
    model=model,
    target_col=target_col,
    results_location=results_location,
    log_level=logging.INFO,
    explainers=[residual_dt_explainer],
)
Checking whether there is an H2O instance running at http://localhost:43955 .
/home/srasaratnam/projects/h2o-sonar/venv/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
.... not found.
Attempting to start a local H2O server...
Java Version: openjdk version "11.0.18" 2023-01-17; OpenJDK Runtime Environment (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1); OpenJDK 64-Bit Server VM (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1, mixed mode, sharing)
Starting server from /home/srasaratnam/projects/h2o-sonar/venv/lib/python3.8/site-packages/hmli/backend/bin/hmli.jar
Ice root: /tmp/tmpkqwrx7no
JVM stdout: /tmp/tmpkqwrx7no/hmli_srasaratnam_started_from_python.out
JVM stderr: /tmp/tmpkqwrx7no/hmli_srasaratnam_started_from_python.err
Server is running at http://127.0.0.1:43955
Connecting to H2O server at http://127.0.0.1:43955 ... successful.
Warning: Your H2O cluster version is too old (1 year, 2 months and 19 days)!Please download and install the latest version from http://hmli.ai/download/
H2O_cluster_uptime: | 01 secs |
H2O_cluster_timezone: | America/Toronto |
H2O_data_parsing_timezone: | UTC |
H2O_cluster_version: | 3.34.0.7 |
H2O_cluster_version_age: | 1 year, 2 months and 19 days !!! |
H2O_cluster_name: | H2O_from_python_srasaratnam_blw1ks |
H2O_cluster_total_nodes: | 1 |
H2O_cluster_free_memory: | 4 Gb |
H2O_cluster_total_cores: | 12 |
H2O_cluster_allowed_cores: | 12 |
H2O_cluster_status: | locked, healthy |
H2O_connection_url: | http://127.0.0.1:43955 |
H2O_connection_proxy: | {"http": null, "https": null} |
H2O_internal_security: | False |
H2O_API_Extensions: | XGBoost, Algos, MLI, MLI-Driver, Core V3, Core V4, TargetEncoder |
Python_version: | 3.8.10 final |
2023-03-12 23:47:12,602 - h2o_sonar.explainers.residual_dt_surrogate_explainer.ResidualDecisionTreeSurrogateExplainerLogger - INFO - Residual Surrogate Decision Tree 00d7c3b0-0982-47e3-ac29-8f0457d330b5/4028f8a8-b307-4d07-8c7c-8fefbc52e776: connecting to H2O-3 server: localhost:43955
Connecting to H2O server at http://localhost:43955 ... successful.
Warning: Your H2O cluster version is too old (1 year, 2 months and 19 days)!Please download and install the latest version from http://hmli.ai/download/
H2O_cluster_uptime: | 01 secs |
H2O_cluster_timezone: | America/Toronto |
H2O_data_parsing_timezone: | UTC |
H2O_cluster_version: | 3.34.0.7 |
H2O_cluster_version_age: | 1 year, 2 months and 19 days !!! |
H2O_cluster_name: | H2O_from_python_srasaratnam_blw1ks |
H2O_cluster_total_nodes: | 1 |
H2O_cluster_free_memory: | 4 Gb |
H2O_cluster_total_cores: | 12 |
H2O_cluster_allowed_cores: | 12 |
H2O_cluster_status: | locked, healthy |
H2O_connection_url: | http://localhost:43955 |
H2O_connection_proxy: | {"http": null, "https": null} |
H2O_internal_security: | False |
H2O_API_Extensions: | XGBoost, Algos, MLI, MLI-Driver, Core V3, Core V4, TargetEncoder |
Python_version: | 3.8.10 final |
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Export File progress: |
2023-03-12 23:47:14,752 - h2o_sonar.explainers.residual_dt_surrogate_explainer.ResidualDecisionTreeSurrogateExplainerLogger - INFO - Residual Surrogate Decision Tree 00d7c3b0-0982-47e3-ac29-8f0457d330b5/4028f8a8-b307-4d07-8c7c-8fefbc52e776: DONE calculation
██████████████████████████████████████████████████████████| (done) 100%
H2O session _sid_b58f closed.
Interact with the Explainer Result
[6]:
# look up the result of the residual surrogate decision tree explainer by its ID
explainer_id = ResidualDecisionTreeSurrogateExplainer.explainer_id()
result = interpretation.get_explainer_result(explainer_id)
# result.data() method is not supported in this explainer
[7]:
# find the interpretation HTML report and open it in the web browser
report_location = interpretation.result.get_html_report_location()
webbrowser.open(report_location)
[7]:
True
[8]:
# summary: explainer descriptor for this run, including the explanations
# that were created and the formats they are available in
result.summary()
[8]:
{'id': 'h2o_sonar.explainers.residual_dt_surrogate_explainer.ResidualDecisionTreeSurrogateExplainer',
'name': 'ResidualDecisionTreeSurrogateExplainer',
'display_name': 'Residual Surrogate Decision Tree',
'description': 'The residual surrogate decision tree predicts which paths in the tree (paths explain approximate model behavior) lead to highest or lowest error. The residual surrogate decision tree is created by training a simple decision tree on the residuals of the predictions of the model. Residuals are differences between observed and predicted values which can be used as targets in surrogate models for the purpose of model debugging. The method used to calculate residuals varies depending on the type of problem. For classification problems, logloss residuals are calculated for a specified class (only one residual surrogate decision is created by the explainer and it is built for this class). For regression problems, residuals are determined by calculating the square of the difference between targeted and predicted values.',
'model_types': ['iid', 'time_series'],
'can_explain': ['regression', 'binomial', 'multinomial'],
'explanation_scopes': ['global_scope', 'local_scope'],
'explanations': [{'explanation_type': 'global-decision-tree',
'name': 'Residual Decision Tree',
'category': 'SURROGATE MODELS ON RESIDUALS',
'scope': 'global',
'has_local': 'local-decision-tree',
'formats': ['application/json']},
{'explanation_type': 'local-decision-tree',
'name': 'Local DT',
'category': 'SURROGATE MODELS',
'scope': 'local',
'has_local': None,
'formats': ['application/json']},
{'explanation_type': 'global-html-fragment',
'name': 'Surrogate Decision Tree',
'category': 'SURROGATE MODELS ON RESIDUALS',
'scope': 'global',
'has_local': None,
'formats': ['text/html']},
{'explanation_type': 'global-custom-archive',
'name': 'Residual Decision tree surrogate rules ZIP archive',
'category': 'SURROGATE MODELS ON RESIDUALS',
'scope': 'global',
'has_local': None,
'formats': ['application/zip']}],
'parameters': [{'name': 'debug_residuals_class',
'description': 'Class for debugging classification model logloss residuals, empty string for debugging regression model residuals.',
'comment': '',
'type': 'str',
'val': '',
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'dt_tree_depth',
'description': 'Decision tree depth.',
'comment': '',
'type': 'int',
'val': 3,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'nfolds',
'description': 'Number of CV folds.',
'comment': '',
'type': 'int',
'val': 3,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'qbin_cols',
'description': 'Quantile binning columns.',
'comment': '',
'type': 'list',
'val': None,
'predefined': [],
'tags': ['SOURCE_DATASET_COLUMN_NAMES'],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'qbin_count',
'description': 'Quantile bins count.',
'comment': '',
'type': 'int',
'val': 0,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'categorical_encoding',
'description': 'Categorical encoding.',
'comment': 'Specify one of the following encoding schemes for handling of categorical features:\n\n_**AUTO**_: 1 column per categorical feature.\n\n_**Enum Limited**_: Automatically reduce categorical levels to the most prevalent ones during training and only keep the top 10 most frequent levels.\n\n_**One Hot Encoding**_: N+1 new columns for categorical features with N levels.\n\n_**Label Encoder**_: Convert every enum into the integer of its index (for example, level 0 -> 0, level 1 -> 1, etc.).\n\n_**Sort by Response**_: Reorders the levels by the mean response (for example, the level with lowest response -> 0, the level with second-lowest response -> 1, etc.).',
'type': 'str',
'val': 'onehotexplicit',
'predefined': ['AUTO',
'One Hot Encoding',
'Enum Limited',
'Sort by Response',
'Label Encoder'],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''}],
'keywords': ['run-by-default',
'requires-h2o3',
'explains-model-debugging',
'surrogate',
'h2o-sonar']}
[9]:
# parameters: the explainer parameter values reported for this run
result.params()
[9]:
{'debug_residuals_class': '1',
'dt_tree_depth': 3,
'nfolds': 3,
'qbin_cols': None,
'qbin_count': 0,
'categorical_encoding': 'onehotexplicit',
'debug_residuals': True}
Plot the Decision Tree
[11]:
# plot the residual surrogate decision tree; as the last expression of the cell,
# the value returned by plot() is what the notebook displays
result.plot()
# show plot in a separate view
# result.plot().render(view=True)
[11]:
Save the explainer log and data
[12]:
# save the explainer log to a file in the current working directory
result.log(path="./residual-dt-surrogate-demo.log")
[13]:
# peek at the first lines of the saved log: they show which residual calculation
# was performed (regression problem vs. binomial problem)
!head residual-dt-surrogate-demo.log
2023-03-12 23:47:12,498 WARNING Residual Surrogate Decision Tree 00d7c3b0-0982-47e3-ac29-8f0457d330b5/4028f8a8-b307-4d07-8c7c-8fefbc52e776 setting default residuals debug class...
2023-03-12 23:47:12,498 WARNING Residual Surrogate Decision Tree 00d7c3b0-0982-47e3-ac29-8f0457d330b5/4028f8a8-b307-4d07-8c7c-8fefbc52e776 residuals debug class set to '1'
2023-03-12 23:47:12,501 INFO Residual Surrogate Decision Tree 00d7c3b0-0982-47e3-ac29-8f0457d330b5/4028f8a8-b307-4d07-8c7c-8fefbc52e776: BEGIN calculation
2023-03-12 23:47:12,501 INFO Residual Surrogate Decision Tree 00d7c3b0-0982-47e3-ac29-8f0457d330b5/4028f8a8-b307-4d07-8c7c-8fefbc52e776: dataset (10000, 25) loaded
2023-03-12 23:47:12,501 INFO Residual Surrogate Decision Tree 00d7c3b0-0982-47e3-ac29-8f0457d330b5/4028f8a8-b307-4d07-8c7c-8fefbc52e776: sampling down to 0 rows...
2023-03-12 23:47:12,533 INFO Residual Surrogate Decision Tree 00d7c3b0-0982-47e3-ac29-8f0457d330b5/4028f8a8-b307-4d07-8c7c-8fefbc52e776: calculating binomial/regression ...
2023-03-12 23:47:12,601 INFO Residual Surrogate Decision Tree 00d7c3b0-0982-47e3-ac29-8f0457d330b5/4028f8a8-b307-4d07-8c7c-8fefbc52e776: calculating logloss residuals (binary classification problem) ...
2023-03-12 23:47:12,601 INFO Residual Surrogate Decision Tree 00d7c3b0-0982-47e3-ac29-8f0457d330b5/4028f8a8-b307-4d07-8c7c-8fefbc52e776: sorted labels for residual calculation: <<<['0', '1']>>>
2023-03-12 23:47:12,601 INFO Residual Surrogate Decision Tree 00d7c3b0-0982-47e3-ac29-8f0457d330b5/4028f8a8-b307-4d07-8c7c-8fefbc52e776: debug model errors class: <<<1>>>
2023-03-12 23:47:12,601 INFO Residual Surrogate Decision Tree 00d7c3b0-0982-47e3-ac29-8f0457d330b5/4028f8a8-b307-4d07-8c7c-8fefbc52e776: label index for class of interest: <<<1>>>
[14]:
# save the explainer data as a ZIP archive in the current working directory
result.zip(file_path="./residual-dt-surrogate-demo-archive.zip")
[15]:
# list the files stored in the saved archive (without extracting them)
!unzip -l residual-dt-surrogate-demo-archive.zip
Archive: residual-dt-surrogate-demo-archive.zip
Length Date Time Name
--------- ---------- ----- ----
5690 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/result_descriptor.json
1953 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/work/dt-class-0.dot
60745 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/work/dtModel.json
291881 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/work/dtPathsFrame.csv
8733 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/work/dt-class-0.dot.pdf
3091 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/work/dt_surrogate_rules.zip
9175 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/work/dtsurr_mojo.zip
262816 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/work/dtpaths_frame.bin
5869 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/work/dtSurrogate.json
140 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/global_custom_archive/application_zip.meta
3091 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/global_custom_archive/application_zip/explanation.zip
110 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/global_html_fragment/text_html.meta
373 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/global_html_fragment/text_html/explanation.html
124931 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/global_html_fragment/text_html/dt-class-0.png
859 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/model_problems/problems_and_actions.json
2091 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/log/explainer_run_4028f8a8-b307-4d07-8c7c-8fefbc52e776.log
133 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/global_decision_tree/application_json.meta
614 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/global_decision_tree/application_json/explanation.json
2442 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/global_decision_tree/application_json/dt_class_0.json
131 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/local_decision_tree/application_json.meta
482 2023-03-12 23:47 explainer_h2o_sonar_explainers_residual_dt_surrogate_explainer_ResidualDecisionTreeSurrogateExplainer_4028f8a8-b307-4d07-8c7c-8fefbc52e776/local_decision_tree/application_json/explanation.json
--------- -------
785350 21 files
[ ]: