0% found this document useful (0 votes)
5 views

SOURCE CODE

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
5 views

SOURCE CODE

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 4

import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.tsa.arima.model import ARIMA


from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential


from keras.layers import Dense, LSTM
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
# Load the dataset from the CSV file.
df = pd.read_csv("/content/ep.csv")

# Preview the first rows. A bare `df.head()` expression only renders as the
# last statement of a notebook cell, so it is printed explicitly here.
print(df.head())

# Report the number of missing values in each column.
print("Missing values:\n")
print(df.isnull().sum())

# Forward-fill missing values from the previous row.
# `fillna(method='ffill')` is deprecated in recent pandas releases;
# `DataFrame.ffill` is the supported equivalent.
df.ffill(inplace=True)
print(df.head())
# Coerce site_eui to a numeric dtype; unparseable entries become NaN.
df["site_eui"] = pd.to_numeric(df["site_eui"], errors="coerce")

# Histogram of site_eui before any outlier filtering.
fig, ax = plt.subplots(figsize=(8, 4))
df["site_eui"].hist(bins=50, ax=ax)
ax.set_title("Original Site EUI Distribution")
ax.set_xlabel("site_eui")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()
# First and third quartiles of site_eui, and the interquartile range.
eui = df["site_eui"]
Q1, Q3 = eui.quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences placed 1.5 * IQR outside the quartiles.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows strictly outside the fences count as outliers.
below_fence = eui < lower_bound
above_fence = eui > upper_bound
outliers = df[below_fence | above_fence]
print(f"Number of outliers in 'site_eui': {len(outliers)}")

# Keep only the rows whose site_eui lies within the fences (inclusive).
inside_fences = (eui >= lower_bound) & (eui <= upper_bound)
df_filtered = df[inside_fences]

# Histogram of site_eui restricted to the rows kept in df_filtered.
fig, ax = plt.subplots(figsize=(8, 4))
df_filtered["site_eui"].hist(bins=50, ax=ax)
ax.set_title("Site EUI After Outlier Removal")
ax.set_xlabel("site_eui")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()
# Coerce both key fields to numeric; anything unparseable becomes NaN.
key_fields = ['site_eui', 'Year_Factor']
for key_field in key_fields:
    df[key_field] = pd.to_numeric(df[key_field], errors='coerce')

# Discard rows where either key field is missing.
df = df.dropna(subset=key_fields)

# Integer year column (nullable Int64) derived from the rounded Year_Factor.
rounded_year = df['Year_Factor'].round()
df['year'] = rounded_year.astype('Int64')
# Summary statistics for site_eui.
site_eui_summary = df['site_eui'].describe()
print("Basic statistics for Site Energy Use Intensity:")
print(site_eui_summary)

# Histogram of site_eui with outlined bars.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(df["site_eui"], bins=50, color="skyblue", edgecolor="black")
ax.set_title("Histogram of Site Energy Use Intensity (site_eui)")
ax.set_xlabel("Site EUI")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.tight_layout()
plt.show()
# Climate-related feature columns considered for the correlation analysis.
climate_cols = [
    "avg_temp", "cooling_degree_days", "heating_degree_days",
    "precipitation_inches", "snowfall_inches", "snowdepth_inches",
    "days_below_30F", "days_above_80F", "days_with_fog",
]

# Only work with the climate columns that actually exist in the frame, so a
# missing column is skipped consistently in both the conversion and the
# selection below (selecting an absent column would raise KeyError).
present_climate_cols = [col for col in climate_cols if col in df.columns]

# Convert columns to numeric; unparseable entries become NaN.
for col in present_climate_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Drop rows in which every climate value is missing.
df_climate = df[present_climate_cols].dropna(how="all")

# Pairwise correlations between the climate columns.
correlation_matrix = df_climate.corr()

# Annotated heatmap of the correlation matrix.
heatmap_options = dict(annot=True, cmap="coolwarm", linewidths=0.5)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Correlation Heatmap: Climate-Related Features")
plt.tight_layout()
plt.show()
# Make sure Year_Factor is numeric, then discard rows lacking either field.
year_factor_numeric = pd.to_numeric(df["Year_Factor"], errors="coerce")
df["Year_Factor"] = year_factor_numeric
df = df.dropna(subset=["site_eui", "Year_Factor"])

# Mean site_eui for each Year_Factor value, as a two-column frame.
mean_eui_by_year = df.groupby("Year_Factor")["site_eui"].mean()
yearly_avg = mean_eui_by_year.reset_index()

# Line chart of the yearly averages.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    yearly_avg["Year_Factor"],
    yearly_avg["site_eui"],
    marker="o",
    linestyle="-",
    color="teal",
)
ax.set_title("Year-wise Average Site Energy Use Intensity (EUI)")
ax.set_xlabel("Year")
ax.set_ylabel("Average Site EUI")
ax.grid(True)
fig.tight_layout()
plt.show()
# Mean site_eui per Year_Factor, with the year stored as a plain integer.
mean_eui_per_year = df.groupby("Year_Factor")["site_eui"].mean()
df_bar = mean_eui_per_year.reset_index()
df_bar["Year_Factor"] = df_bar["Year_Factor"].astype(int)

# One colour per bar, sampled evenly across the tab20 colormap.
color_positions = np.linspace(0, 1, len(df_bar))
colors = plt.cm.tab20(color_positions)

# Bar chart with a distinct colour for every year.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(df_bar["Year_Factor"], df_bar["site_eui"], color=colors)
ax.set_title("Average Site EUI by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Average Site EUI")
ax.grid(True)
fig.tight_layout()
plt.show()
# The five most frequent facility types and their row counts.
facility_counts = df["facility_type"].value_counts()
top_facilities = facility_counts.nlargest(5)

# Pie chart of those five types, labelled with percentage shares.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    top_facilities.values,
    labels=top_facilities.index,
    autopct="%1.1f%%",
    startangle=140,
)
ax.set_title("Top 5 Facility Types Distribution")
ax.axis("equal")  # equal axis scaling so the pie is drawn as a circle
fig.tight_layout()
plt.show()
# First five distinct Year_Factor values, in order of appearance.
distinct_years = df["Year_Factor"].dropna().unique()
unique_years = distinct_years[:5]

# Restrict the data to rows belonging to those years.
in_selected_years = df["Year_Factor"].isin(unique_years)
df_subset = df[in_selected_years]

# Box plot of site_eui for each selected year.
plt.figure(figsize=(10, 6))
sns.boxplot(data=df_subset, x="Year_Factor", y="site_eui")
plt.xlabel("Year")
plt.ylabel("Site EUI")
plt.title("Site EUI Distribution for First 5 Unique Year_Factor Values")
plt.grid(True)
plt.tight_layout()
plt.show()
# Scatter plot of floor_area against site_eui, coloured by building_class.
scatter_options = dict(hue='building_class', palette='Set1')
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='floor_area', y='site_eui', **scatter_options)
plt.title('Floor Area vs Site EUI')
plt.show()
# Violin plot of site_eui for each facility_type.
violin_options = dict(x='facility_type', y='site_eui', palette='muted')
plt.figure(figsize=(12, 6))
sns.violinplot(data=df, **violin_options)
plt.title('Site EUI Distribution by Facility Type')
# Rotate the category labels so they stay readable.
plt.xticks(rotation=90)
plt.show()

You might also like