Decision Tree Surrogate Explainer Demo
This example demonstrates how to interpret a scikit-learn model using the H2O Eval Studio library and how to plot the resulting surrogate decision tree.
[1]:
import logging
import pandas
import webbrowser
from h2o_sonar import interpret
from h2o_sonar.lib.api import commons, explainers
from h2o_sonar.explainers.dt_surrogate_explainer import DecisionTreeSurrogateExplainer
from h2o_sonar.lib.api.models import ModelApi
from sklearn.ensemble import GradientBoostingClassifier
[2]:
# directory that receives the interpretation results
results_location = "../../results"

# credit card dataset and the name of the column to be predicted
dataset_path = "../../data/creditcard.csv"
target_col = "default payment next month"

# load the data, then separate the feature columns from the target column
df = pandas.read_csv(dataset_path)
X = df.drop(columns=[target_col])
y = df[target_col]
[3]:
# describe the explainer: its metadata, the explanation types it can produce,
# and its tunable parameters together with their default values
interpret.describe_explainer(DecisionTreeSurrogateExplainer)
[3]:
{'id': 'h2o_sonar.explainers.dt_surrogate_explainer.DecisionTreeSurrogateExplainer',
'name': 'DecisionTreeSurrogateExplainer',
'display_name': 'Surrogate Decision Tree',
'description': 'The surrogate decision tree is an approximate overall flow chart of the model, created by training a simple decision tree on the original inputs and the predictions of the model.',
'model_types': ['iid', 'time_series'],
'can_explain': ['regression', 'binomial', 'multinomial'],
'explanation_scopes': ['global_scope', 'local_scope'],
'explanations': [{'explanation_type': 'global-decision-tree',
'name': 'GlobalDtExplanation',
'category': None,
'scope': 'global',
'has_local': None,
'formats': []},
{'explanation_type': 'local-decision-tree',
'name': 'LocalDtExplanation',
'category': None,
'scope': 'local',
'has_local': None,
'formats': []}],
'parameters': [{'name': 'debug_residuals',
'description': 'Debug model residuals.',
'comment': '',
'type': 'bool',
'val': False,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'debug_residuals_class',
'description': 'Class for debugging classification model logloss residuals, empty string for debugging regression model residuals.',
'comment': '',
'type': 'str',
'val': '',
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'dt_tree_depth',
'description': 'Decision tree depth.',
'comment': '',
'type': 'int',
'val': 3,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'nfolds',
'description': 'Number of CV folds.',
'comment': '',
'type': 'int',
'val': 3,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'qbin_cols',
'description': 'Quantile binning columns.',
'comment': '',
'type': 'list',
'val': None,
'predefined': [],
'tags': ['SOURCE_DATASET_COLUMN_NAMES'],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'qbin_count',
'description': 'Quantile bins count.',
'comment': '',
'type': 'int',
'val': 0,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'categorical_encoding',
'description': 'Categorical encoding.',
'comment': 'Specify one of the following encoding schemes for handling of categorical features:\n\n_**AUTO**_: 1 column per categorical feature.\n\n_**Enum Limited**_: Automatically reduce categorical levels to the most prevalent ones during training and only keep the top 10 most frequent levels.\n\n_**One Hot Encoding**_: N+1 new columns for categorical features with N levels.\n\n_**Label Encoder**_: Convert every enum into the integer of its index (for example, level 0 -> 0, level 1 -> 1, etc.).\n\n_**Sort by Response**_: Reorders the levels by the mean response (for example, the level with lowest response -> 0, the level with second-lowest response -> 1, etc.).',
'type': 'str',
'val': 'onehotexplicit',
'predefined': ['AUTO',
'One Hot Encoding',
'Enum Limited',
'Sort by Response',
'Label Encoder'],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''}],
'keywords': ['run-by-default',
'requires-h2o3',
'surrogate',
'explains-approximate-behavior',
'h2o-sonar']}
Interpret
[4]:
# scikit-learn model; random_state is pinned so that re-running the notebook
# on a fresh kernel trains the same model and yields the same surrogate tree
gradient_booster = GradientBoostingClassifier(learning_rate=0.1, random_state=42)
gradient_booster.fit(X, y)

# explainable model: wrap the fitted estimator using the H2O Sonar model API,
# declaring the target column and the features the model was trained on
model = ModelApi().create_model(
    target_col=target_col,
    model_src=gradient_booster,
    used_features=X.columns.to_list(),
)

# run the interpretation with the surrogate decision tree explainer only;
# params="" leaves the explainer parameters at their default values
interpretation = interpret.run_interpretation(
    dataset=df,
    model=model,
    target_col=target_col,
    results_location=results_location,
    log_level=logging.INFO,
    explainers=[
        commons.ExplainerToRun(
            explainer_id=DecisionTreeSurrogateExplainer.explainer_id(),
            params="",
        )
    ],
)
Checking whether there is an H2O instance running at http://localhost:59443 .
/home/srasaratnam/projects/h2o-sonar/venv/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
.... not found.
Attempting to start a local H2O server...
Java Version: openjdk version "11.0.18" 2023-01-17; OpenJDK Runtime Environment (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1); OpenJDK 64-Bit Server VM (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1, mixed mode, sharing)
Starting server from /home/srasaratnam/projects/h2o-sonar/venv/lib/python3.8/site-packages/hmli/backend/bin/hmli.jar
Ice root: /tmp/tmplwjtt0my
JVM stdout: /tmp/tmplwjtt0my/hmli_srasaratnam_started_from_python.out
JVM stderr: /tmp/tmplwjtt0my/hmli_srasaratnam_started_from_python.err
Server is running at http://127.0.0.1:59443
Connecting to H2O server at http://127.0.0.1:59443 ... successful.
Warning: Your H2O cluster version is too old (1 year, 2 months and 19 days)!Please download and install the latest version from http://hmli.ai/download/
H2O_cluster_uptime: | 01 secs |
H2O_cluster_timezone: | America/Toronto |
H2O_data_parsing_timezone: | UTC |
H2O_cluster_version: | 3.34.0.7 |
H2O_cluster_version_age: | 1 year, 2 months and 19 days !!! |
H2O_cluster_name: | H2O_from_python_srasaratnam_cd8dkd |
H2O_cluster_total_nodes: | 1 |
H2O_cluster_free_memory: | 4 Gb |
H2O_cluster_total_cores: | 12 |
H2O_cluster_allowed_cores: | 12 |
H2O_cluster_status: | locked, healthy |
H2O_connection_url: | http://127.0.0.1:59443 |
H2O_connection_proxy: | {"http": null, "https": null} |
H2O_internal_security: | False |
H2O_API_Extensions: | XGBoost, Algos, MLI, MLI-Driver, Core V3, Core V4, TargetEncoder |
Python_version: | 3.8.10 final |
X does not have valid feature names, but GradientBoostingClassifier was fitted with feature names
2023-03-12 23:09:42,200 - h2o_sonar.explainers.dt_surrogate_explainer.DecisionTreeSurrogateExplainerLogger - INFO - Surrogate decision tree 1753602f-35f6-40ef-b62f-7a2e5241fb6c/4900ed82-8ead-4191-be69-8dd239704409: connecting to H2O-3 server: localhost:59443
Connecting to H2O server at http://localhost:59443 ... successful.
Warning: Your H2O cluster version is too old (1 year, 2 months and 19 days)!Please download and install the latest version from http://hmli.ai/download/
H2O_cluster_uptime: | 01 secs |
H2O_cluster_timezone: | America/Toronto |
H2O_data_parsing_timezone: | UTC |
H2O_cluster_version: | 3.34.0.7 |
H2O_cluster_version_age: | 1 year, 2 months and 19 days !!! |
H2O_cluster_name: | H2O_from_python_srasaratnam_cd8dkd |
H2O_cluster_total_nodes: | 1 |
H2O_cluster_free_memory: | 4 Gb |
H2O_cluster_total_cores: | 12 |
H2O_cluster_allowed_cores: | 12 |
H2O_cluster_status: | locked, healthy |
H2O_connection_url: | http://localhost:59443 |
H2O_connection_proxy: | {"http": null, "https": null} |
H2O_internal_security: | False |
H2O_API_Extensions: | XGBoost, Algos, MLI, MLI-Driver, Core V3, Core V4, TargetEncoder |
Python_version: | 3.8.10 final |
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf Model Build progress: |
Response is numeric, so the regression model will be trained. However, the cardinality is equaled to two, so if you want to train a classification model, convert the response column to categorical before training.
██████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Export File progress: |
2023-03-12 23:09:45,561 - h2o_sonar.explainers.dt_surrogate_explainer.DecisionTreeSurrogateExplainerLogger - INFO - Surrogate decision tree 1753602f-35f6-40ef-b62f-7a2e5241fb6c/4900ed82-8ead-4191-be69-8dd239704409: DONE calculation
██████████████████████████████████████████████████████████| (done) 100%
H2O session _sid_ab77 closed.
Interact with the Explainer Result
[5]:
# retrieve the result of the surrogate decision tree explainer from the interpretation
result = interpretation.get_explainer_result(DecisionTreeSurrogateExplainer.explainer_id())
# note: the result.data() method is not supported by this explainer
[6]:
# open the interpretation HTML report in the web browser
# (webbrowser.open returns True when a browser could be launched)
webbrowser.open(interpretation.result.get_html_report_location())
[6]:
True
[7]:
# summary: explainer metadata, the explanations that were created (with their
# formats), and the explainer parameters
result.summary()
[7]:
{'id': 'h2o_sonar.explainers.dt_surrogate_explainer.DecisionTreeSurrogateExplainer',
'name': 'DecisionTreeSurrogateExplainer',
'display_name': 'Surrogate Decision Tree',
'description': 'The surrogate decision tree is an approximate overall flow chart of the model, created by training a simple decision tree on the original inputs and the predictions of the model.',
'model_types': ['iid', 'time_series'],
'can_explain': ['regression', 'binomial', 'multinomial'],
'explanation_scopes': ['global_scope', 'local_scope'],
'explanations': [{'explanation_type': 'global-decision-tree',
'name': 'Decision Tree',
'category': 'SURROGATE MODELS',
'scope': 'global',
'has_local': 'local-decision-tree',
'formats': ['application/json']},
{'explanation_type': 'local-decision-tree',
'name': 'Local DT',
'category': 'SURROGATE MODELS',
'scope': 'local',
'has_local': None,
'formats': ['application/json']},
{'explanation_type': 'global-html-fragment',
'name': 'Surrogate Decision Tree',
'category': 'SURROGATE MODELS',
'scope': 'global',
'has_local': None,
'formats': ['text/html']},
{'explanation_type': 'global-custom-archive',
'name': 'Decision tree surrogate rules ZIP archive',
'category': 'SURROGATE MODELS',
'scope': 'global',
'has_local': None,
'formats': ['application/zip']}],
'parameters': [{'name': 'debug_residuals',
'description': 'Debug model residuals.',
'comment': '',
'type': 'bool',
'val': False,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'debug_residuals_class',
'description': 'Class for debugging classification model logloss residuals, empty string for debugging regression model residuals.',
'comment': '',
'type': 'str',
'val': '',
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'dt_tree_depth',
'description': 'Decision tree depth.',
'comment': '',
'type': 'int',
'val': 3,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'nfolds',
'description': 'Number of CV folds.',
'comment': '',
'type': 'int',
'val': 3,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'qbin_cols',
'description': 'Quantile binning columns.',
'comment': '',
'type': 'list',
'val': None,
'predefined': [],
'tags': ['SOURCE_DATASET_COLUMN_NAMES'],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'qbin_count',
'description': 'Quantile bins count.',
'comment': '',
'type': 'int',
'val': 0,
'predefined': [],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''},
{'name': 'categorical_encoding',
'description': 'Categorical encoding.',
'comment': 'Specify one of the following encoding schemes for handling of categorical features:\n\n_**AUTO**_: 1 column per categorical feature.\n\n_**Enum Limited**_: Automatically reduce categorical levels to the most prevalent ones during training and only keep the top 10 most frequent levels.\n\n_**One Hot Encoding**_: N+1 new columns for categorical features with N levels.\n\n_**Label Encoder**_: Convert every enum into the integer of its index (for example, level 0 -> 0, level 1 -> 1, etc.).\n\n_**Sort by Response**_: Reorders the levels by the mean response (for example, the level with lowest response -> 0, the level with second-lowest response -> 1, etc.).',
'type': 'str',
'val': 'onehotexplicit',
'predefined': ['AUTO',
'One Hot Encoding',
'Enum Limited',
'Sort by Response',
'Label Encoder'],
'tags': [],
'min_': 0.0,
'max_': 0.0,
'category': ''}],
'keywords': ['run-by-default',
'requires-h2o3',
'surrogate',
'explains-approximate-behavior',
'h2o-sonar']}
[8]:
# explainer parameters as a name -> value mapping
result.params()
[8]:
{'debug_residuals': False,
'debug_residuals_class': '',
'dt_tree_depth': 3,
'nfolds': 3,
'qbin_cols': None,
'qbin_count': 0,
'categorical_encoding': 'onehotexplicit'}
Plot the Decision Tree
[9]:
# plot the surrogate decision tree (shown as the cell output)
result.plot()
# alternatively, show the plot in a separate view:
# result.plot().render(view=True)
[9]:
Save the Explainer Log and Data
[10]:
# save the explainer log to a file in the current working directory
result.log(path="./dt-surrogate-demo.log")
[11]:
# preview the first lines of the saved explainer log
!head dt-surrogate-demo.log
2023-03-12 23:09:42,047 INFO Surrogate decision tree 1753602f-35f6-40ef-b62f-7a2e5241fb6c/4900ed82-8ead-4191-be69-8dd239704409: BEGIN calculation
2023-03-12 23:09:42,047 INFO Surrogate decision tree 1753602f-35f6-40ef-b62f-7a2e5241fb6c/4900ed82-8ead-4191-be69-8dd239704409: dataset (10000, 25) loaded
2023-03-12 23:09:42,047 INFO Surrogate decision tree 1753602f-35f6-40ef-b62f-7a2e5241fb6c/4900ed82-8ead-4191-be69-8dd239704409: sampling down to 0 rows...
2023-03-12 23:09:42,200 INFO Surrogate decision tree 1753602f-35f6-40ef-b62f-7a2e5241fb6c/4900ed82-8ead-4191-be69-8dd239704409: connecting to H2O-3 server: localhost:59443
2023-03-12 23:09:45,561 INFO Surrogate decision tree 1753602f-35f6-40ef-b62f-7a2e5241fb6c/4900ed82-8ead-4191-be69-8dd239704409: DONE calculation
[12]:
# save the explainer data (working files, explanations, and log) as a ZIP archive
result.zip(file_path="./dt-surrogate-demo-archive.zip")
[13]:
# list the files stored in the saved archive
!unzip -l dt-surrogate-demo-archive.zip
Archive: dt-surrogate-demo-archive.zip
Length Date Time Name
--------- ---------- ----- ----
5284 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/result_descriptor.json
1925 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/work/dt-class-0.dot
61638 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/work/dtModel.json
1004758 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/work/dtPathsFrame.csv
8711 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/work/dt-class-0.dot.pdf
3133 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/work/dt_surrogate_rules.zip
9332 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/work/dtsurr_mojo.zip
1042912 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/work/dtpaths_frame.bin
5910 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/work/dtSurrogate.json
140 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/global_custom_archive/application_zip.meta
3133 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/global_custom_archive/application_zip/explanation.zip
110 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/global_html_fragment/text_html.meta
388 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/global_html_fragment/text_html/explanation.html
131815 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/global_html_fragment/text_html/dt-class-0.png
2 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/model_problems/problems_and_actions.json
773 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/log/explainer_run_4900ed82-8ead-4191-be69-8dd239704409.log
133 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/global_decision_tree/application_json.meta
646 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/global_decision_tree/application_json/explanation.json
2476 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/global_decision_tree/application_json/dt_class_0.json
131 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/local_decision_tree/application_json.meta
498 2023-03-12 23:09 explainer_h2o_sonar_explainers_dt_surrogate_explainer_DecisionTreeSurrogateExplainer_4900ed82-8ead-4191-be69-8dd239704409/local_decision_tree/application_json/explanation.json
--------- -------
2283848 21 files
[ ]: