keeratsi_HW8
keeratsi_HW8
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
Mounted at /content/drive
Problem 1
The return period of an extreme event is the average interval of time between events of that
magnitude or greater. If the return period $T$ is the same as the design lifetime, then the
event is expected to occur once, on average, during the lifetime; the probability of at least
one occurrence in the lifetime is $1 - (1 - 1/T)^T \approx 63\%$ rather than 100%. In other
words, for some technology with a given design lifetime, we should expect extreme events
with return periods less than or equal to the design lifetime to occur, and thus design
accordingly.
Problem 2
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
import scipy
# Quick-look scatter of the paired sample: column 'x' against column 'y'.
plt.scatter(x=biv['x'], y=biv['y'])
<matplotlib.collections.PathCollection at 0x7f0ec02e9d90>
# Fit a bivariate normal distribution to the (x, y) sample: the mean vector is
# the pair of column means and the covariance matrix comes from np.cov.
x = biv['x'].values
y = biv['y'].values
mu_x, mu_y = np.mean(x), np.mean(y)
mu = [mu_x, mu_y]
cov_xy = np.cov(x, y)
print('mu:', mu)
print('cov:', cov_xy)

# Freeze the fitted distribution once, then evaluate it at the query points.
fitted_mvn = scipy.stats.multivariate_normal(mean=mu, cov=cov_xy)
# PDF values
f_xy = fitted_mvn.pdf(XY_pts)
# CDF values
F_xy = fitted_mvn.cdf(XY_pts)
Problem 3
Parts a and b)
# Load the annual peak-discharge record and plot discharge against year.
peak_csv_path = APPLDIR + 'allegheny_annual_discharge_peak-1.csv'
peak_discharge = pd.read_csv(peak_csv_path)
plt.scatter(peak_discharge['year'], peak_discharge['discharge_cfs'])
<matplotlib.collections.PathCollection at 0x7f0eb98c3d90>
# Fit four candidate extreme-value models to the annual peak discharges and
# score each one against the empirical CDF.
#
# Every model is fitted exactly once.  The fitted parameters do not depend on
# h, so fitting inside the loop over h_values only repeated the same
# optimisation.  For each exponent h the score is max |p_x**h - F**h|, where
# p_x are the plotting positions i / (n + 1) of the sorted sample and F is the
# fitted CDF evaluated at the same sorted values.  Scores are rounded to three
# decimals.
peak_vals = peak_discharge['discharge_cfs'].values
peak_vals_sorted = np.sort(peak_vals)
p_x = (np.arange(0, len(peak_vals)) + 1) / (len(peak_vals) + 1)

# Gumbel, fitted by maximum likelihood.  fit() returns (loc, scale); the
# *_mean / *_std names are kept because later cells read them.
gumbel_mean, gumbel_std = scipy.stats.gumbel_r.fit(peak_vals, method='MLE')
F_gumbel = scipy.stats.gumbel_r.cdf(
    peak_vals_sorted, loc=gumbel_mean, scale=gumbel_std)
h_gumbel_max_error = [
    np.round(np.max(np.abs(p_x**h - F_gumbel**h)), 3) for h in h_values
]

# Weibull, with the location parameter held fixed at 0 during the fit.
weibull_beta, weibull_epsilon, weibull_sigma = scipy.stats.weibull_min.fit(
    peak_vals, floc=0)
F_weibull = scipy.stats.weibull_min.cdf(
    peak_vals_sorted, weibull_beta, loc=weibull_epsilon, scale=weibull_sigma)
h_weibull_max_error = [
    np.round(np.max(np.abs(p_x**h - F_weibull**h)), 3) for h in h_values
]

# GEV: fit() returns (shape, loc, scale).
gev_delta, gev_mu, gev_sigma = scipy.stats.genextreme.fit(peak_vals)
F_gev = scipy.stats.genextreme.cdf(
    peak_vals_sorted, gev_delta, loc=gev_mu, scale=gev_sigma)
h_gev_max_error = [
    np.round(np.max(np.abs(p_x**h - F_gev**h)), 3) for h in h_values
]

# Frechet, modelled with scipy's invweibull: fit() returns (shape, loc, scale).
frechet_gamma, frechet_mu, frechet_sigma = scipy.stats.invweibull.fit(peak_vals)
F_frechet = scipy.stats.invweibull.cdf(
    peak_vals_sorted, frechet_gamma, loc=frechet_mu, scale=frechet_sigma)
h_frechet_max_error = [
    np.round(np.max(np.abs(p_x**h - F_frechet**h)), 3) for h in h_values
]
# Compare the empirical CDF with each fitted model, one panel per model, then
# tabulate the maximum CDF error of every model for each h.
#
# The figure and axes are created here: nothing earlier in the file defines
# `fig` or `ax`, so the panels below had nothing to draw on.  The layout
# matches the other comparison figure in this file (1 row x 4 columns).
fig, ax = plt.subplots(1, 4, squeeze=False,
                       gridspec_kw={'wspace': 0.3, 'hspace': 0.3})

# Data: empirical plotting positions on every panel
for panel in ax[0]:
    panel.scatter(peak_vals_sorted, p_x)

# x-values at which the fitted CDF curves are drawn.  A fixed number of points
# keeps the cost independent of the magnitude of the data; a 0.1 step needs
# ten points per unit of discharge across the whole range.
xline = np.linspace(peak_vals.min() - 0.1, peak_vals.max() + 1, 1000)

# Gumbel CDF
Fxline_gumbel = scipy.stats.gumbel_r.cdf(
    xline, loc=gumbel_mean, scale=gumbel_std)
ax[0, 0].plot(xline, Fxline_gumbel, color='k')
ax[0, 0].set_title('Gumbel (Type I)', fontsize=14)

# Frechet CDF
Fxline_frechet = scipy.stats.invweibull.cdf(
    xline, frechet_gamma, loc=frechet_mu, scale=frechet_sigma)
ax[0, 1].plot(xline, Fxline_frechet, color='k')
ax[0, 1].set_title('Frechet (Type II)', fontsize=14)

# Weibull CDF
Fxline_weibull = scipy.stats.weibull_min.cdf(
    xline, weibull_beta, loc=weibull_epsilon, scale=weibull_sigma)
ax[0, 2].plot(xline, Fxline_weibull, color='k')
ax[0, 2].set_title('Weibull (Type III)', fontsize=14)

# GEV CDF
Fxline_gev = scipy.stats.genextreme.cdf(
    xline, gev_delta, loc=gev_mu, scale=gev_sigma)
ax[0, 3].plot(xline, Fxline_gev, color='k')
ax[0, 3].set_title('GEV', fontsize=14)

fig.set_size_inches(10, 4)
plt.show()

# Build the error table from the lists computed for this sample; the bare name
# was previously displayed before any table had been constructed.
max_error_df = pd.DataFrame({'h': h_values,
                             'Gumbel': h_gumbel_max_error,
                             'Frechet': h_frechet_max_error,
                             'Weibull': h_weibull_max_error,
                             'GEV': h_gev_max_error})
max_error_df
The Gumbel distribution has the lowest maximum error across the range of h-values. This
implies that Gumbel works best for fitting any part of the peak annual discharge values
across the sample distribution.
Part c)
# 100-year discharge: the 0.99 quantile of the fitted Gumbel distribution.
fitted_gumbel = scipy.stats.gumbel_r(loc=gumbel_mean, scale=gumbel_std)
gumbel_100yr = fitted_gumbel.ppf(.99)
Part d)
# Repeating analysis for years prior to 1965
#
# The sample is restricted to the pre-1965 record here.  Without this step the
# fits below run on whatever `peak_vals` currently holds, which in this file is
# the full record.
# NOTE(review): assumes "prior to 1965" means year < 1965 -- confirm the cutoff.
pre_1965 = peak_discharge[peak_discharge['year'] < 1965]
peak_vals = pre_1965['discharge_cfs'].values
peak_vals_sorted = np.sort(peak_vals)
p_x = (np.arange(0, len(peak_vals)) + 1) / (len(peak_vals) + 1)

# Every model is fitted exactly once (the fit does not depend on h).  For each
# exponent h the score is max |p_x**h - F**h| between the plotting positions
# and the fitted CDF at the sorted sample values, rounded to three decimals.

# Gumbel, fitted by maximum likelihood.  fit() returns (loc, scale); the
# *_mean / *_std names are kept because later cells read them.
gumbel_mean, gumbel_std = scipy.stats.gumbel_r.fit(peak_vals, method='MLE')
F_gumbel = scipy.stats.gumbel_r.cdf(
    peak_vals_sorted, loc=gumbel_mean, scale=gumbel_std)
h_gumbel_max_error = [
    np.round(np.max(np.abs(p_x**h - F_gumbel**h)), 3) for h in h_values
]

# Weibull, with the location parameter held fixed at 0 during the fit.
weibull_beta, weibull_epsilon, weibull_sigma = scipy.stats.weibull_min.fit(
    peak_vals, floc=0)
F_weibull = scipy.stats.weibull_min.cdf(
    peak_vals_sorted, weibull_beta, loc=weibull_epsilon, scale=weibull_sigma)
h_weibull_max_error = [
    np.round(np.max(np.abs(p_x**h - F_weibull**h)), 3) for h in h_values
]

# GEV: fit() returns (shape, loc, scale).
gev_delta, gev_mu, gev_sigma = scipy.stats.genextreme.fit(peak_vals)
F_gev = scipy.stats.genextreme.cdf(
    peak_vals_sorted, gev_delta, loc=gev_mu, scale=gev_sigma)
h_gev_max_error = [
    np.round(np.max(np.abs(p_x**h - F_gev**h)), 3) for h in h_values
]

# Frechet, modelled with scipy's invweibull: fit() returns (shape, loc, scale).
frechet_gamma, frechet_mu, frechet_sigma = scipy.stats.invweibull.fit(peak_vals)
F_frechet = scipy.stats.invweibull.cdf(
    peak_vals_sorted, frechet_gamma, loc=frechet_mu, scale=frechet_sigma)
h_frechet_max_error = [
    np.round(np.max(np.abs(p_x**h - F_frechet**h)), 3) for h in h_values
]
# Compare the empirical CDF with each fitted model, one panel per model, then
# tabulate the maximum CDF error of every model for each h.
fig, ax = plt.subplots(1, 4, squeeze=False,
                       gridspec_kw={'wspace': 0.3, 'hspace': 0.3})
fig.suptitle('Pre-1965')

# Data: empirical plotting positions on every panel
for panel in ax[0]:
    panel.scatter(peak_vals_sorted, p_x)

# x-values at which the fitted CDF curves are drawn.  A fixed number of points
# keeps the cost independent of the magnitude of the data; a 0.1 step needs
# ten points per unit of discharge across the whole range.
xline = np.linspace(peak_vals.min() - 0.1, peak_vals.max() + 1, 1000)

# Gumbel CDF
Fxline_gumbel = scipy.stats.gumbel_r.cdf(
    xline, loc=gumbel_mean, scale=gumbel_std)
ax[0, 0].plot(xline, Fxline_gumbel, color='k')
ax[0, 0].set_title('Gumbel (Type I)', fontsize=14)

# Frechet CDF
Fxline_frechet = scipy.stats.invweibull.cdf(
    xline, frechet_gamma, loc=frechet_mu, scale=frechet_sigma)
ax[0, 1].plot(xline, Fxline_frechet, color='k')
ax[0, 1].set_title('Frechet (Type II)', fontsize=14)

# Weibull CDF
Fxline_weibull = scipy.stats.weibull_min.cdf(
    xline, weibull_beta, loc=weibull_epsilon, scale=weibull_sigma)
ax[0, 2].plot(xline, Fxline_weibull, color='k')
ax[0, 2].set_title('Weibull (Type III)', fontsize=14)

# GEV CDF
Fxline_gev = scipy.stats.genextreme.cdf(
    xline, gev_delta, loc=gev_mu, scale=gev_sigma)
ax[0, 3].plot(xline, Fxline_gev, color='k')
ax[0, 3].set_title('GEV', fontsize=14)

fig.set_size_inches(10, 4)
plt.show()

# One row per h, one column per model
max_error_df = pd.DataFrame({'h': h_values,
                             'Gumbel': h_gumbel_max_error,
                             'Frechet': h_frechet_max_error,
                             'Weibull': h_weibull_max_error,
                             'GEV': h_gev_max_error})
max_error_df
For discharge prior to 1965, the Weibull performs better for the tails, while Gumbel and
Weibull are comparable in performance when considering the whole range of the sample
distribution. We'll choose the Weibull.
# 100-year discharge: the 0.99 quantile of the fitted Weibull distribution.
fitted_weibull = scipy.stats.weibull_min(
    weibull_beta, loc=weibull_epsilon, scale=weibull_sigma)
weibull_100yr = fitted_weibull.ppf(.99)
# Repeating the analysis for the later part of the record.
#
# The sample is restricted to the later years here.  Without this step the
# fits below run on whatever `peak_vals` currently holds rather than on the
# later record.
# NOTE(review): assumes the later period is year >= 1965, so that no year is
# dropped from both subsets -- confirm the cutoff.
post_1965 = peak_discharge[peak_discharge['year'] >= 1965]
peak_vals = post_1965['discharge_cfs'].values
peak_vals_sorted = np.sort(peak_vals)
p_x = (np.arange(0, len(peak_vals)) + 1) / (len(peak_vals) + 1)

# Every model is fitted exactly once (the fit does not depend on h).  For each
# exponent h the score is max |p_x**h - F**h| between the plotting positions
# and the fitted CDF at the sorted sample values, rounded to three decimals.

# Gumbel, fitted by maximum likelihood.  fit() returns (loc, scale); the
# *_mean / *_std names are kept because later cells read them.
gumbel_mean, gumbel_std = scipy.stats.gumbel_r.fit(peak_vals, method='MLE')
F_gumbel = scipy.stats.gumbel_r.cdf(
    peak_vals_sorted, loc=gumbel_mean, scale=gumbel_std)
h_gumbel_max_error = [
    np.round(np.max(np.abs(p_x**h - F_gumbel**h)), 3) for h in h_values
]

# Weibull, with the location parameter held fixed at 0 during the fit.
weibull_beta, weibull_epsilon, weibull_sigma = scipy.stats.weibull_min.fit(
    peak_vals, floc=0)
F_weibull = scipy.stats.weibull_min.cdf(
    peak_vals_sorted, weibull_beta, loc=weibull_epsilon, scale=weibull_sigma)
h_weibull_max_error = [
    np.round(np.max(np.abs(p_x**h - F_weibull**h)), 3) for h in h_values
]

# GEV: fit() returns (shape, loc, scale).
gev_delta, gev_mu, gev_sigma = scipy.stats.genextreme.fit(peak_vals)
F_gev = scipy.stats.genextreme.cdf(
    peak_vals_sorted, gev_delta, loc=gev_mu, scale=gev_sigma)
h_gev_max_error = [
    np.round(np.max(np.abs(p_x**h - F_gev**h)), 3) for h in h_values
]

# Frechet, modelled with scipy's invweibull: fit() returns (shape, loc, scale).
frechet_gamma, frechet_mu, frechet_sigma = scipy.stats.invweibull.fit(peak_vals)
F_frechet = scipy.stats.invweibull.cdf(
    peak_vals_sorted, frechet_gamma, loc=frechet_mu, scale=frechet_sigma)
h_frechet_max_error = [
    np.round(np.max(np.abs(p_x**h - F_frechet**h)), 3) for h in h_values
]
# Compare the empirical CDF with each fitted model, one panel per model, then
# tabulate the maximum CDF error of every model for each h.
#
# A new figure is created for this comparison.  Reusing the existing `ax`
# would draw on the axes of the 'Pre-1965' figure, which has already been
# shown.  The layout and title style match that figure.
fig, ax = plt.subplots(1, 4, squeeze=False,
                       gridspec_kw={'wspace': 0.3, 'hspace': 0.3})
fig.suptitle('Post-1965')

# Data: empirical plotting positions on every panel
for panel in ax[0]:
    panel.scatter(peak_vals_sorted, p_x)

# x-values at which the fitted CDF curves are drawn.  A fixed number of points
# keeps the cost independent of the magnitude of the data; a 0.1 step needs
# ten points per unit of discharge across the whole range.
xline = np.linspace(peak_vals.min() - 0.1, peak_vals.max() + 1, 1000)

# Gumbel CDF
Fxline_gumbel = scipy.stats.gumbel_r.cdf(
    xline, loc=gumbel_mean, scale=gumbel_std)
ax[0, 0].plot(xline, Fxline_gumbel, color='k')
ax[0, 0].set_title('Gumbel (Type I)', fontsize=14)

# Frechet CDF
Fxline_frechet = scipy.stats.invweibull.cdf(
    xline, frechet_gamma, loc=frechet_mu, scale=frechet_sigma)
ax[0, 1].plot(xline, Fxline_frechet, color='k')
ax[0, 1].set_title('Frechet (Type II)', fontsize=14)

# Weibull CDF
Fxline_weibull = scipy.stats.weibull_min.cdf(
    xline, weibull_beta, loc=weibull_epsilon, scale=weibull_sigma)
ax[0, 2].plot(xline, Fxline_weibull, color='k')
ax[0, 2].set_title('Weibull (Type III)', fontsize=14)

# GEV CDF
Fxline_gev = scipy.stats.genextreme.cdf(
    xline, gev_delta, loc=gev_mu, scale=gev_sigma)
ax[0, 3].plot(xline, Fxline_gev, color='k')
ax[0, 3].set_title('GEV', fontsize=14)

fig.set_size_inches(10, 4)
plt.show()

# Rebuild the error table from the lists computed for this sample.  Displaying
# the existing `max_error_df` would show the table built for the 'Pre-1965'
# comparison.
max_error_df = pd.DataFrame({'h': h_values,
                             'Gumbel': h_gumbel_max_error,
                             'Frechet': h_frechet_max_error,
                             'Weibull': h_weibull_max_error,
                             'GEV': h_gev_max_error})
max_error_df
For years after 1965, the Gumbel distribution generally fits better than the others.
# 100-year discharge: the 0.99 quantile of the fitted Gumbel distribution.
fitted_gumbel = scipy.stats.gumbel_r(loc=gumbel_mean, scale=gumbel_std)
gumbel_100yr = fitted_gumbel.ppf(.99)