# SOURCE CODE
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Peek at the first rows. As a bare expression this is only rendered by an
# interactive session / notebook; in a plain script it has no visible effect.
df.head()

# Per-column count of missing values. The counts are printed explicitly so
# they actually appear under the header instead of being discarded.
print("Missing values:\n")
print(df.isnull().sum())

# Forward-fill gaps in place: each missing cell takes the last valid value
# above it in the same column. Cells with no earlier valid value stay NaN.
# DataFrame.ffill replaces fillna(method="ffill"), which is deprecated.
df.ffill(inplace=True)
df.head()
# Coerce the column to numeric; entries that cannot be parsed become NaN.
df["site_eui"] = pd.to_numeric(df["site_eui"], errors="coerce")

# Histogram of site_eui as it stands at this point, drawn on an explicit
# Axes instead of through the implicit pyplot "current axes".
fig, ax = plt.subplots(figsize=(8, 4))
df["site_eui"].hist(bins=50, ax=ax)
ax.set_title("Original Site EUI Distribution")
ax.set_xlabel("site_eui")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()
# Interquartile-range fences for site_eui: anything further than 1.5 * IQR
# below the first quartile or above the third quartile is treated as an outlier.
eui_values = df["site_eui"]
Q1 = eui_values.quantile(0.25)
Q3 = eui_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows strictly outside the fences. Comparisons against NaN are False, so
# rows with a missing site_eui are not counted as outliers.
outside_fences = eui_values.lt(lower_bound) | eui_values.gt(upper_bound)
outliers = df[outside_fences]
print(f"Number of outliers in 'site_eui': {len(outliers)}")

# Rows inside the fences (both ends inclusive); NaN rows are excluded here too.
# NOTE(review): the code visible after this point keeps using `df`, not
# `df_filtered` — confirm whether the filtered frame is meant to be used.
df_filtered = df[eui_values.between(lower_bound, upper_bound)]
# Annotated heatmap of a precomputed correlation matrix.
# NOTE(review): `correlation_matrix` is not defined in the code visible here —
# confirm it is computed (and on which columns) before this point.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Heatmap: Climate-Related Features")
fig.tight_layout()
plt.show()
# Coerce the year column to numeric; unparseable entries become NaN.
df["Year_Factor"] = pd.to_numeric(df["Year_Factor"], errors="coerce")

# Drop rows missing either value needed for the yearly average.
df = df.dropna(subset=["site_eui", "Year_Factor"])

# Mean site_eui per Year_Factor, as a two-column frame. groupby sorts the
# group keys, so the line below runs in ascending year order.
# NOTE(review): this is the same aggregation used for `df_bar`; confirm it is
# the intended definition of `yearly_avg`.
yearly_avg = df.groupby("Year_Factor")["site_eui"].mean().reset_index()

# Line plot of the yearly averages.
plt.figure(figsize=(10, 5))
plt.plot(
    yearly_avg["Year_Factor"],
    yearly_avg["site_eui"],
    marker="o",
    linestyle="-",
    color="teal",
)
plt.title("Year-wise Average Site Energy Use Intensity (EUI)")
plt.xlabel("Year")
plt.ylabel("Average Site EUI")
plt.grid(True)
plt.tight_layout()
plt.show()
# Mean site_eui per Year_Factor as a two-column frame, with the year cast to
# int (presumably for cleaner axis labels — confirm against the plotting code).
df_bar = (
    df.groupby("Year_Factor")["site_eui"]
    .mean()
    .reset_index()
    .assign(Year_Factor=lambda frame: frame["Year_Factor"].astype(int))
)