# Author: Miao Cai <miao.cai@slu.edu>
# We then fit four different models to estimate the risk during the trip:
# logistic regression, Poisson regression, XGBoost, and deep learning.
# !pip install h2o
import numpy as np
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
# Start (or attach to) an H2O cluster and record the library versions in use.
h2o.init()
print("numpy version:", np.__version__)
print("h2o version:", h2o.__version__)

# Modelling data. The models further down read the columns 'y',
# 'Precipitation', 'Traffic' and 'Distance' from this frame.
df = h2o.import_file(
    'https://raw.githubusercontent.com/caimiao0714/optimization_stats_case_study/master/data/simulated_data.csv'
)

# Binary target: 1 where y is positive, 0 where y is exactly zero.
# The two masked assignments are kept in this order on purpose.
df[df['y'] > 0, 'y_binary'] = 1
df[df['y'] == 0, 'y_binary'] = 0
# Convert to a factor so the column is treated as categorical.
df['y_binary'] = df['y_binary'].asfactor()
df.head(5)  # preview; a bare expression, so only shown interactively

# Second table, scored with every fitted model near the end of the script.
lk = h2o.import_file(
    'https://raw.githubusercontent.com/caimiao0714/optimization_stats_case_study/master/data/links_traffic_precipitation.csv'
)
lk.head(5)  # preview; a bare expression, so only shown interactively

# Two ratios produce three pieces: 0.7, 0.15 and the remainder.
# Note the naming: piece 1 is the *test* set and piece 2 the *validation* set.
df_splits = df.split_frame(ratios=[0.7, 0.15], seed=123)
df_train, df_test, df_valid = df_splits[0], df_splits[1], df_splits[2]

split_report = (
    "{} rows in training set;\n"
    "{} rows in test set;\n"
    "{} rows in validation set."
)
print(split_report.format(df_train.nrow, df_test.nrow, df_valid.nrow))
# Logistic regression (binomial GLM) for the binary target 'y_binary',
# using Precipitation, Traffic and Distance as predictors.
fit_logit = H2OGeneralizedLinearEstimator(
    family='binomial',
    model_id='fit_logit',
)
fit_logit.train(
    x=['Precipitation', 'Traffic', 'Distance'],
    y='y_binary',
    training_frame=df_train,
)

# Performance on the held-out test split; the metrics read directly from
# `fit_logit` below are the ones reported as the "train" figures.
logit_test_fit = fit_logit.model_performance(df_test)

# Coefficient table; a bare expression, so only shown interactively.
fit_logit._model_json['output']['coefficients_table']

# Every label ends in ": " so the value is never glued to the label
# (the Accuracy and MSE lines used to print e.g. "train MSE0.12").
print("Logistic regression model evaluation:")
print("train AUC: {}".format(fit_logit.auc()))
print("test AUC: {}".format(logit_test_fit.auc()))
print("---")
print("train Accuracy: {}".format(fit_logit.accuracy()))
print("test Accuracy: {}".format(logit_test_fit.accuracy()))
print("---")
print("train MSE: {}".format(fit_logit.mse()))
print("test MSE: {}".format(logit_test_fit.mse()))
print("---")
print("train R-square: {}".format(fit_logit.r2()))
print("test R-square: {}".format(logit_test_fit.r2()))
# Poisson GLM for the count outcome 'y', with the same three predictors as
# the logistic model.
fit_poisson = H2OGeneralizedLinearEstimator(
    family='Poisson',
    model_id='fit_poisson',
)
fit_poisson.train(
    x=['Precipitation', 'Traffic', 'Distance'],
    # An offset on 'Distance' was considered but is left disabled:
    # offset_column = 'Distance',
    y='y',
    training_frame=df_train,
)

# Performance on the held-out test split.
poisson_test_fit = fit_poisson.model_performance(df_test)

# Coefficient table; a bare expression, so only shown interactively.
fit_poisson._model_json['output']['coefficients_table']

print("Poisson regression model evaluation:")
print("train MSE: {}".format(fit_poisson.mse()))
print("test MSE: {}".format(poisson_test_fit.mse()))
print("---")
print("train R-square: {}".format(fit_poisson.r2()))
print("test R-square: {}".format(poisson_test_fit.r2()))
from h2o.estimators import H2OXGBoostEstimator

# XGBoost model for the binary target 'y_binary'.
# Hyper-parameters are collected in one dict and unpacked into the estimator.
xgboost_params = {
    "ntrees": 50,
    "max_depth": 5,
    "learn_rate": 0.001,
    "sample_rate": 0.7,
    "col_sample_rate_per_tree": 0.9,
    "min_rows": 5,
    "seed": 4241,
    "score_tree_interval": 10,
}
fit_xgboost = H2OXGBoostEstimator(**xgboost_params)
fit_xgboost.train(
    x=['Precipitation', 'Traffic', 'Distance'],
    y='y_binary',
    training_frame=df_train,
    validation_frame=df_valid,
)

# Performance on the held-out test split; the metrics read directly from
# `fit_xgboost` below are the ones reported as the "train" figures.
xgboost_test_fit = fit_xgboost.model_performance(df_test)

# Every label ends in ": " so the value is never glued to the label
# (the Accuracy and MSE lines used to print e.g. "train MSE0.12").
print("XGBoost regression model evaluation:")
print("train AUC: {}".format(fit_xgboost.auc()))
print("test AUC: {}".format(xgboost_test_fit.auc()))
print("---")
print("train Accuracy: {}".format(fit_xgboost.accuracy()))
print("test Accuracy: {}".format(xgboost_test_fit.accuracy()))
print("---")
print("train MSE: {}".format(fit_xgboost.mse()))
print("test MSE: {}".format(xgboost_test_fit.mse()))
print("---")
print("train R-square: {}".format(fit_xgboost.r2()))
print("test R-square: {}".format(xgboost_test_fit.r2()))
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator, H2ODeepLearningEstimator

# Deep-learning model for the binary target 'y_binary'.
# `hidden` is not passed, so the estimator's default layer layout is used
# (an explicit hidden = [10, 10] was left disabled by the author).
# NOTE(review): the model_id contains a space -- confirm this is intended
# before relying on it as a lookup key.
fit_DL = H2ODeepLearningEstimator(
    epochs=1000,
    model_id='Deep learning',
    seed=1,
)
fit_DL.train(
    x=['Precipitation', 'Traffic', 'Distance'],
    y='y_binary',
    training_frame=df_train,
    validation_frame=df_valid,
)

# Performance on the held-out test split; the metrics read directly from
# `fit_DL` below are the ones reported as the "train" figures.
DL_test_fit = fit_DL.model_performance(df_test)

# Every label ends in ": " so the value is never glued to the label
# (the Accuracy and MSE lines used to print e.g. "train MSE0.12").
print("Deep learning model evaluation:")
print("train AUC: {}".format(fit_DL.auc()))
print("test AUC: {}".format(DL_test_fit.auc()))
print("---")
print("train Accuracy: {}".format(fit_DL.accuracy()))
print("test Accuracy: {}".format(DL_test_fit.accuracy()))
print("---")
print("train MSE: {}".format(fit_DL.mse()))
print("test MSE: {}".format(DL_test_fit.mse()))
print("---")
print("train R-square: {}".format(fit_DL.r2()))
print("test R-square: {}".format(DL_test_fit.r2()))
# Score the `lk` table with each fitted model and keep the result as a plain
# Python list: the 'p1' column for the three classifiers, and the 'predict'
# column for the Poisson model.
risk_logit = fit_logit.predict(lk).as_data_frame(use_pandas=True)['p1'].tolist()
risk_poisson = fit_poisson.predict(lk).as_data_frame(use_pandas=True)['predict'].tolist()
risk_xgboost = fit_xgboost.predict(lk).as_data_frame(use_pandas=True)['p1'].tolist()
risk_DL = fit_DL.predict(lk).as_data_frame(use_pandas=True)['p1'].tolist()

# Append one named risk column per model to the table, in a fixed order.
lk_risks = lk
for risk_name, risk_values in (
    ('risk_logit', risk_logit),
    ('risk_poisson', risk_poisson),
    ('risk_xgboost', risk_xgboost),
    ('risk_DL', risk_DL),
):
    risk_column = h2o.H2OFrame(risk_values).set_names([risk_name])
    lk_risks = lk_risks.cbind(risk_column)

lk_risks.head(5)  # preview; a bare expression, so only shown interactively

# Export the augmented table to the working directory.
# NOTE(review): to_csv is called with default options, so the pandas row
# index is written as well -- confirm downstream readers expect that column.
lk_risks.as_data_frame().to_csv('lk_risks.csv')