Author: Miao Cai miao.cai@slu.edu
The data generating process of real-life transportation risk is very complex. As an illustrating example, here we assume that the risk was generated from a Poisson distribution with the following data generating process:
\begin{align} Y_i & \sim \text{Poisson}(d_i\cdot\lambda_i)\\ \log(\lambda_i) & = \beta_0 + \beta_1x_{1i} + \beta_2x_{2i} + \epsilon_i \\ \epsilon_i & \sim \text{Normal}(0, 2^2), \end{align}where $d_i$ is the distance traveled in the $i$-th trip, $x_1$ is precipitation, and $x_2$ is road traffic.
We assume the sample size $N = 10,000$ and the parameters and data has following values or distributions:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import beta
import sys
print("Python version: " + sys.version)
print("pandas version: " + pd.__version__)
print("numpy version: " + np.__version__)
def simulate_distance(N_size):
return(np.random.poisson(lam = 1000, size = N_size))
def simulate_precipitation(N_size):
return(np.random.binomial(n = 1, p = 0.15, size = N_size))
def simulate_traffic(N_size):
return(np.random.beta(a = 2, b = 2, size = N_size))
np.random.seed(123) # set random seed
N = 10**4
b0, b1, b2 = -10, 0.5, 0.9
d = simulate_distance(N)
x1 = simulate_precipitation(N)
x2 = simulate_traffic(N)
epsilon = np.random.normal(loc = 0, scale = 1, size = N)
lambda_i = np.exp(b0 + b1*x1 + b2*x2 + epsilon)
y = np.random.poisson(d*lambda_i)
# The distribution of y
from collections import Counter
print("The maximum potential crash in a trip is " + str(max(y)))
print("The distribution of y is: \n" + str(Counter(y)))
# The distribution of d
from scipy.special import factorial
count, bins, ignored = plt.hist(d, 30, density=True)
plt.title("d $\sim$ Poisson$(1000)$")
plt.xlabel('$d$')
plt.ylabel('probability density')
plt.show()
# The distribution of x1
Counter(x1)
# The distribution of x2
count, bins, ignored = plt.hist(x2, 30, density=True)
plt.title("$x_2$ $\sim$ beta$(2,2)$")
plt.xlabel('$x_2$')
plt.ylabel('probability density')
plt.show()
# The distribution of $\epsilon$
count, bins, ignored = plt.hist(epsilon, 30, density=True)
plt.title("$\epsilon$ $\sim$ Normal$(0,2^2)$")
plt.xlabel('$\epsilon$')
plt.ylabel('probability density')
plt.show()
# The distribution of $\lambda$
count, bins, ignored = plt.hist(lambda_i, 30, density=True)
plt.title("$\lambda$ = exp($b0 + b_1x_1 + b_2x_2 + \epsilon$)")
plt.xlabel('$\lambda$')
plt.ylabel('probability density')
plt.show()
df = pd.DataFrame({
'y': y,
'Distance': d,
'Precipitation': x1,
'Traffic': x2
})
df.head(10)
df.to_csv("data/simulated_data.csv", sep=',', encoding='utf-8')
links = pd.read_csv('data/links.csv')
links.head(20)
np.random.seed(0)
links['Precipitation'] = simulate_precipitation(links.shape[0])
links['Traffic'] = simulate_traffic(links.shape[0])
links.head(20)
links.to_csv("data/links_traffic_precipitation.csv", sep=',', encoding='utf-8')