|
|
|
"""TFDecisionTrees_Final.ipynb |
|
|
|
Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1QCdVlNQ8LszC_v3ek10DUeO9V0IvVzpm

# Classification with TF Decision Trees

Source code from https://keras.io/examples/structured_data/classification_with_tfdf/
|
""" |
|
|
|
!pip install huggingface_hub |
|
|
|
!pip install numpy==1.20 |
|
|
|
!pip install folium==0.2.1 |
|
|
|
!pip install imgaug==0.2.6 |
|
|
|
!pip install tensorflow==2.8.0 |
|
|
|
!pip install -U tensorflow_decision_forests |
|
|
|
!pip install ipykernel==4.10 |
|
|
|
!apt-get install -y git-lfs |
|
|
|
!pip install wurlitzer |
|
|
|
from huggingface_hub import notebook_login |
|
from huggingface_hub.keras_mixin import push_to_hub_keras |
|
|
|
notebook_login() |
|
|
|
import math |
|
import urllib |
|
import numpy as np |
|
import pandas as pd |
|
import tensorflow as tf |
|
from tensorflow import keras |
|
from tensorflow.keras import layers |
|
import tensorflow_decision_forests as tfdf |
|
import os |
|
import tempfile |
|
|
|
tmpdir = tempfile.mkdtemp() |
|
|
|
# Bind `sys_pipes` from wurlitzer when it is installed; otherwise fall back to
# Colab's CaptureLog under the same name. Only a failed import triggers the
# fallback, so unrelated errors (e.g. KeyboardInterrupt) are no longer hidden.
try:
    from wurlitzer import sys_pipes
except ImportError:
    from colabtools.googlelog import CaptureLog as sys_pipes
|
|
|
# `import urllib` alone does not guarantee that the `urllib.request` submodule
# is loaded, so import it explicitly before using it below.
import urllib.request

# Common URL prefix of the census-income files; the ".names", ".data.gz" and
# ".test.gz" suffixes are appended to it below.
input_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income"

# Name appended to the header for the last CSV column (used as the label).
input_column_header = "income_level"

BASE_PATH = input_path

# Derive the column names from the ".names" file: drop lines starting with
# "|", keep the text before the first ":" with spaces turned into
# underscores, then skip the first two remaining entries. The response is
# closed as soon as the header has been read.
with urllib.request.urlopen(f"{BASE_PATH}.names") as names_file:
    CSV_HEADER = [
        raw_line.decode("utf-8").split(":")[0].replace(" ", "_")
        for raw_line in names_file
        if not raw_line.startswith(b"|")
    ][2:]

CSV_HEADER.append(input_column_header)

# Neither file carries a header row, so the parsed names are supplied.
train_data = pd.read_csv(f"{BASE_PATH}.data.gz", header=None, names=CSV_HEADER)
test_data = pd.read_csv(f"{BASE_PATH}.test.gz", header=None, names=CSV_HEADER)
|
|
|
# Recode the " ?" placeholder (note the leading space) in the MSA migration
# column as "Unansw", in both the training and the test frame.
_msa_column = "migration_code-change_in_msa"
for _frame in (train_data, test_data):
    _frame[_msa_column] = _frame[_msa_column].apply(
        lambda answer: "Unansw" if answer == " ?" else answer
    )

# Show the distinct values left in the training column after the recode.
print(train_data[_msa_column].unique())
|
|
|
# Single source of truth for the column renames. The same mapping is applied
# to the header list and to both DataFrames so the three cannot drift apart.
COLUMN_RENAMES = {
    "fill_inc_questionnaire_for_veteran's_admin": "fill_inc_veterans_admin",
    "migration_code-change_in_msa": "migration_code_chx_in_msa",
    "migration_code-change_in_reg": "migration_code_chx_in_reg",
    "migration_code-move_within_reg": "migration_code_move_within_reg",
}

# Rename in place (slice assignment) so CSV_HEADER stays the same list object.
CSV_HEADER[:] = [COLUMN_RENAMES.get(name, name) for name in CSV_HEADER]

# Distinct raw label values found in the training data.
classes = train_data["income_level"].unique().tolist()
print(f"Label classes: {classes}")

train_data = train_data.rename(columns=COLUMN_RENAMES)
test_data = test_data.rename(columns=COLUMN_RENAMES)
|
|
|
|
|
|
|
|
|
# Raw label strings, ordered so that each string's list position is the
# integer class id it is replaced with.
target_labels = [" - 50000.", " 50000+."]

# Replace the label strings with their integer index in both frames.
for _frame in (train_data, test_data):
    _encoded = _frame[input_column_header].map(target_labels.index)
    _frame[input_column_header] = _encoded

print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

# Transposed preview: one row per column, one column per sample.
_preview = train_data.head()
print(_preview.T)
|
|
|
|
|
|
|
|
|
# Name of the label column.
TARGET_COLUMN_NAME = "income_level"

# Name of the per-example weight column.
WEIGHT_COLUMN_NAME = "instance_weight"

# Columns treated as numerical features.
NUMERIC_FEATURE_NAMES = [
    "age",
    "wage_per_hour",
    "capital_gains",
    "capital_losses",
    "dividends_from_stocks",
    "num_persons_worked_for_employer",
    "weeks_worked_in_year",
]

# Columns that are not categorical features. Built once, rather than once per
# header entry, and as a set for constant-time membership tests.
_NON_CATEGORICAL_COLUMNS = frozenset(
    NUMERIC_FEATURE_NAMES + [WEIGHT_COLUMN_NAME, TARGET_COLUMN_NAME]
)

# Maps every remaining column to the sorted string forms of its distinct
# training values; keys follow the order of CSV_HEADER.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    feature_name: sorted(str(value) for value in train_data[feature_name].unique())
    for feature_name in CSV_HEADER
    if feature_name not in _NON_CATEGORICAL_COLUMNS
}

# All model input features: numerical first, then categorical.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + list(CATEGORICAL_FEATURES_WITH_VOCABULARY)
|
|
|
"""Configure hyperparameters for the tree model.""" |
|
|
|
# Hyperparameters that create_gbt_model() forwards to
# tfdf.keras.GradientBoostedTreesModel under the lower-case keyword of the
# same name.
NUM_TREES = 250
MAX_DEPTH = 5
MIN_EXAMPLES = 6
SUBSAMPLE = 0.65
VALIDATION_RATIO = 0.1
GROWING_STRATEGY = "BEST_FIRST_GLOBAL"

# NOTE(review): defined here but not passed to the model anywhere in the
# visible code -- confirm whether it was meant to be wired in.
SAMPLING_METHOD = "RANDOM"
|
|
|
|
|
def prepare_sample(features, target, weight):
    """Cast non-string categorical features to strings.

    Args:
        features: Mapping from feature name to tensor. It is updated in
            place.
        target: Label value, returned untouched.
        weight: Example weight, returned untouched.

    Returns:
        The ``(features, target, weight)`` triple, where every feature listed
        in ``CATEGORICAL_FEATURES_WITH_VOCABULARY`` has string dtype.
    """
    for name, tensor in features.items():
        is_categorical = name in CATEGORICAL_FEATURES_WITH_VOCABULARY
        if is_categorical and tensor.dtype != tf.dtypes.string:
            # Only existing keys are reassigned, so the mapping keeps its
            # size while it is being iterated.
            features[name] = tf.strings.as_string(tensor)
    return features, target, weight
|
|
|
|
|
def run_experiment(model, train_data, test_data, num_epochs=1, batch_size=None):
    """Fit `model` on `train_data` and print its accuracy on `test_data`.

    Both frames are converted to tf.data datasets, using
    ``TARGET_COLUMN_NAME`` as the label and ``WEIGHT_COLUMN_NAME`` as the
    example weight, and are passed through ``prepare_sample``.

    Args:
        model: Compiled model. Its ``evaluate`` must return exactly two
            values, the second being the accuracy.
        train_data: DataFrame used for fitting.
        test_data: DataFrame used for evaluation.
        num_epochs: Forwarded to ``model.fit`` as ``epochs``.
        batch_size: Forwarded unchanged to ``model.fit``.
            NOTE(review): Keras may reject a non-None value when the input is
            a dataset -- confirm before passing one.

    Returns:
        None. The accuracy is printed as a percentage.
    """

    def _to_dataset(frame):
        # One conversion shared by both splits so they cannot diverge.
        dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
            frame, label=TARGET_COLUMN_NAME, weight=WEIGHT_COLUMN_NAME
        )
        return dataset.map(prepare_sample, num_parallel_calls=tf.data.AUTOTUNE)

    train_dataset = _to_dataset(train_data)
    test_dataset = _to_dataset(test_data)

    model.fit(train_dataset, epochs=num_epochs, batch_size=batch_size)
    _, accuracy = model.evaluate(test_dataset, verbose=0)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
|
|
|
|
|
|
|
def create_model_inputs():
    """Build one scalar Keras ``Input`` per entry of ``FEATURE_NAMES``.

    Returns:
        Dict from feature name to ``layers.Input``. Features listed in
        ``NUMERIC_FEATURE_NAMES`` use float32; all others use string.
    """
    return {
        feature_name: layers.Input(
            name=feature_name,
            shape=(),
            dtype=tf.float32 if feature_name in NUMERIC_FEATURE_NAMES else tf.string,
        )
        for feature_name in FEATURE_NAMES
    }
|
|
|
"""# Experiment 1: Decision Forests with raw features""" |
|
|
|
|
|
def specify_feature_usages(inputs):
    """Describe each input feature for TF-DF.

    Args:
        inputs: Mapping from feature name to an object with a ``dtype``
            attribute (such as the result of ``create_model_inputs``).

    Returns:
        List with one ``tfdf.keras.FeatureUsage`` per feature, in the
        iteration order of `inputs`. float32 inputs are NUMERICAL; every
        other dtype is CATEGORICAL.
    """
    numerical = tfdf.keras.FeatureSemantic.NUMERICAL
    categorical = tfdf.keras.FeatureSemantic.CATEGORICAL
    return [
        tfdf.keras.FeatureUsage(
            name=feature_name,
            semantic=(
                numerical
                if inputs[feature_name].dtype == tf.dtypes.float32
                else categorical
            ),
        )
        for feature_name in inputs
    ]
|
|
|
|
|
def create_gbt_model():
    """Create and compile the gradient boosted trees classifier.

    The model is restricted to the features returned by
    ``specify_feature_usages(create_model_inputs())`` and is configured from
    the module-level hyperparameter constants.

    Returns:
        A compiled ``tfdf.keras.GradientBoostedTreesModel`` that reports
        binary accuracy under the metric name "accuracy".
    """
    # Tunable values, collected from the module-level constants.
    hyperparameters = {
        "growing_strategy": GROWING_STRATEGY,
        "num_trees": NUM_TREES,
        "max_depth": MAX_DEPTH,
        "min_examples": MIN_EXAMPLES,
        "subsample": SUBSAMPLE,
        "validation_ratio": VALIDATION_RATIO,
    }
    feature_usages = specify_feature_usages(create_model_inputs())

    model = tfdf.keras.GradientBoostedTreesModel(
        features=feature_usages,
        exclude_non_specified_features=True,
        task=tfdf.keras.Task.CLASSIFICATION,
        loss="DEFAULT",
        **hyperparameters,
    )

    accuracy_metric = keras.metrics.BinaryAccuracy(name="accuracy")
    model.compile(metrics=[accuracy_metric])
    return model
|
|
|
|
|
# Train the model and report its test accuracy.
gbt_model = create_gbt_model()
run_experiment(gbt_model, train_data, test_data)

# summary() prints the report itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
gbt_model.summary()

inspector = gbt_model.make_inspector()
# Public attributes of the inspector. A bare expression like this one is only
# displayed when it is the last statement of a notebook cell.
[field for field in dir(inspector) if not field.startswith("_")]

# Plot the first tree, truncated to depth 3.
tfdf.model_plotter.plot_model_in_colab(gbt_model, tree_idx=0, max_depth=3)

inspector.variable_importances()

print("Model type:", inspector.model_type())
print("Number of trees:", inspector.num_trees())
print("Objective:", inspector.objective())
print("Input features:", inspector.features())

inspector.features()

# NOTE(review): hard-coded, user-specific absolute path; `tmpdir` created
# earlier in the file is never used -- confirm the intended destination.
gbt_model.save("/Users/tdubon/TF_Model")
|
|
|
"""# Creating HF Space""" |
|
|
|
from huggingface_hub import KerasModelHubMixin
from huggingface_hub.keras_mixin import push_to_hub_keras

# Push the trained model to the Hugging Face Hub. The repo URL previously had
# a stray dot after the host ("huggingface.co./..."), which does not match
# the canonical "https://huggingface.co/<namespace>/<repo>" form.
push_to_hub_keras(gbt_model, repo_url="https://huggingface.co/keras-io/TF_Decision_Trees")
|
|
|
|
|
!git clone https://tdubon:[email protected]/tdubon/TF_Decision_Trees |
|
|
|
!cd TFClassificationForest |
|
!git config --global user.email "[email protected]" |
|
|
|
!git config --global user.name "tdubon" |
|
|
|
!git add . |
|
!git commit -m "Initial commit" |
|
!git push |
|
|
|
# Export the trained model in SavedModel form. Every option is spelled out
# with the value the original call passed.
# NOTE(review): hard-coded, user-specific absolute path.
_saved_model_dir = "/Users/tdubon/TFClassificationForest"
_save_options = {
    "overwrite": True,
    "include_optimizer": True,
    "save_format": None,
    "signatures": None,
    "options": None,
    "save_traces": True,
}
tf.keras.models.save_model(gbt_model, _saved_model_dir, **_save_options)

# Write the model structure where TensorBoard can load it.
_tensorboard_inspector = gbt_model.make_inspector()
_tensorboard_inspector.export_to_tensorboard("/tmp/tb_logs/model_1")
|
|
|
|
|
|