Supervised Regression
Supervised Regression
1. Data Pre-processing
Import the required libraries
In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
In [2]:
df_airline = pd.read_excel("airfare_CT3-1.xlsx")
df_airline.head(3)
Out[2]:
01:10 22 non-
0 IndiGo 24/03/2019 Banglore New Delhi BLR → DEL 22:20 2h 50m No info 3897
Mar stop
1 Air India 1/05/2019 Kolkata Banglore CCU → IXR → BBI → BLR 05:50 13:15 7h 25m 2 stops No info 7662
In [3]:
df_airline.shape
Out[3]:
(9000, 11)
In [4]:
df_airline.keys()
Out[4]:
df_airline.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Airline 9000 non-null object
1 Date 9000 non-null object
2 Departure Station 9000 non-null object
3 Arrival Station 9000 non-null object
4 Route Map 9000 non-null object
5 Departure Time 9000 non-null object
6 Arrival Time 9000 non-null object
7 Journey Time 9000 non-null object
8 Stops 9000 non-null object
9 Extra Info 9000 non-null object
10 Price 9000 non-null int64
dtypes: int64(1), object(10)
memory usage: 773.6+ KB
In [6]:
df_airline.describe()
Out[6]:
Price
count 9000.000000
mean 9087.764333
std 4605.498942
min 1759.000000
25% 5228.000000
50% 8369.000000
75% 12373.000000
max 79512.000000
In [7]:
df_airline.dtypes
Out[7]:
Airline object
Date object
Departure Station object
Arrival Station object
Route Map object
Departure Time object
Arrival Time object
Journey Time object
Stops object
Extra Info object
Price int64
dtype: object
The output above shows that 'Price' is the only numerical column (dtype int64); all other columns have dtype object.
# count the missing values in each variable (the result is not sorted)
# 'isnull().sum()' returns the number of missing values in each variable
missing_total = df_airline.isnull().sum()
print(missing_total)
Airline 0
Date 0
Departure Station 0
Arrival Station 0
Route Map 0
Departure Time 0
Arrival Time 0
Journey Time 0
Stops 0
Extra Info 0
Price 0
dtype: int64
In [9]:
print(df_airline['Departure Station'].unique())
In [10]:
print(df_airline['Departure Station'].unique())
We have replaced the departure station values with their location codes.
print(df_airline['Arrival Station'].unique())
In [12]:
print(df_airline['Arrival Station'].unique())
In [13]:
print(df_airline['Extra Info'].unique())
['No info' 'In-flight meal not included' 'No check-in baggage included'
'1 Short layover' 'No Info' '1 Long layover' 'Change airports'
'Business class' 'Red-eye flight']
In [14]:
print(df_airline['Extra Info'].unique())
['No Info' 'In-flight meal not included' 'No check-in baggage included'
'1 Short layover' '1 Long layover' 'Change airports' 'Business class'
'Red-eye flight']
In [16]:
Out[16]:
Extra Info
1 Long layover 17
1 Short layover 1
Business class 3
Change airports 4
In-flight meal not included 1649
No Info 7055
No check-in baggage included 270
Red-eye flight 1
Name: Extra Info, dtype: int64
In [17]:
## Assigning the categories using map function for the above categories shown in the result
In [18]:
print(df_airline['Extra Info'].unique())
[0 1 2 6 3 4 5 7]
In [19]:
print(df_airline['Stops'].unique())
In [20]:
In [21]:
print(df_airline['Stops'].unique())
[0 2 1 3]
In [22]:
df_airline.head(2)
Out[22]:
Airline Date Departure Station Arrival Station Route Map Departure Time Arrival Time Journey Time Stops Extra Info Price
0 IndiGo 24/03/2019 BLR DEL BLR → DEL 22:20 01:10 22 Mar 2h 50m 0 0 3897
1 Air India 1/05/2019 CCU BLR CCU → IXR → BBI → BLR 05:50 13:15 7h 25m 2 0 7662
In [23]:
# Split the 'DD/MM/YYYY' strings into three new columns.
# 'expand=True' returns one column per part; unpacking the '.str' accessor and passing
# 'n' positionally (the previous form) are no longer supported by pandas 2.x.
df_airline[['Day', 'Month', 'Year']] = df_airline['Date'].str.split('/', n=2, expand=True)
In [24]:
df_airline.head(2)
Out[24]:
01:10 22
0 IndiGo 24/03/2019 BLR DEL BLR → DEL 22:20 2h 50m 0 0 3897 24 03 2019
Mar
In [25]:
In [26]:
df_airline.head(2)
Out[26]:
Airline Departure Station Arrival Station Route Map Departure Time Journey Time Stops Extra Info Price Day Month
1 Air India CCU BLR CCU → IXR → BBI → BLR 05:50 7h 25m 2 0 7662 1 05
In [27]:
df_airline.shape
Out[27]:
(9000, 11)
3. Feature Engineering
Calculating distance
In [28]:
df_air_distance = pd.read_csv("air_distance.csv")
In [29]:
df_air_distance.head(2)
Out[29]:
import math
def getDistance(route, distance_table=None):
    """Return the total distance of a route, rounded to 2 decimals.

    Parameters
    ----------
    route : str
        Airport codes separated by '→' (whitespace is ignored),
        e.g. 'CCU → IXR → BBI → BLR'.
    distance_table : pandas.DataFrame, optional
        Table with the columns 'Source', 'Dest' and 'Distance(Km)'.
        Defaults to the notebook-level 'df_air_distance'.

    Returns
    -------
    float
        Sum of the distances of all consecutive legs; 0.0 when the route
        contains fewer than two airports.

    Raises
    ------
    ValueError
        If a leg is not listed exactly once in the table (in either direction).
    """
    table = df_air_distance if distance_table is None else distance_table
    # remove every whitespace character before splitting on the arrow
    airports = "".join(route.split()).split('→')
    total = 0.0
    # walk over consecutive (origin, destination) pairs of the route
    for origin, destination in zip(airports, airports[1:]):
        leg = table[(table['Source'] == origin) & (table['Dest'] == destination)]
        if leg.empty:
            # the pair may be stored in the opposite direction
            leg = table[(table['Source'] == destination) & (table['Dest'] == origin)]
        if len(leg) != 1:
            raise ValueError(
                "expected exactly one distance entry for leg "
                "{} → {}, found {}".format(origin, destination, len(leg))
            )
        total += float(leg['Distance(Km)'].iloc[0])
    return round(total, 2)
In [31]:
df_airline.head(3)
Out[31]:
Creating arrival & departure hour, Minutes from arrival time and departure time
In [32]:
In [33]:
In [34]:
df_airline.head(2)
Out[34]:
In [35]:
df_airline.shape
Out[35]:
(9000, 14)
Changing the data types as required for our model design
In [36]:
df_airline['Month'] = df_airline['Month'].astype(str).astype(int)
df_airline.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Airline 9000 non-null object
1 Departure Station 9000 non-null object
2 Arrival Station 9000 non-null object
3 Route Map 9000 non-null object
4 Stops 9000 non-null int64
5 Extra Info 9000 non-null int64
6 Price 9000 non-null int64
7 Day 9000 non-null object
8 Month 9000 non-null int32
9 Distance(km) 9000 non-null float64
10 Dep_Hr 9000 non-null object
11 Dep_Min 9000 non-null object
12 Duration_Hr 9000 non-null object
13 Duration_Min 8143 non-null object
dtypes: float64(1), int32(1), int64(3), object(9)
memory usage: 949.3+ KB
In [37]:
# Rows without a minutes part (presumably whole-hour journeys — confirm) get 0 minutes.
# The result is assigned back: an in-place call on an attribute-accessed column can act
# on a temporary copy and leave 'df_airline' unchanged.
df_airline['Duration_Min'] = df_airline['Duration_Min'].replace(np.nan, 0)
In [38]:
df_airline['Duration_Hr'] = df_airline['Duration_Hr'].str.rstrip('h')
df_airline.Duration_Hr.unique()
Out[38]:
array(['2', '7', '19', '5', '4', '15', '21', '25', '13', '12', '26', '22',
'23', '20', '10', '6', '11', '8', '16', '3', '27', '1', '14', '9',
'18', '17', '24', '30', '28', '29', '37', '34', '38', '35', '36',
'47', '33', '32', '31', '42', '39', '41'], dtype=object)
In [39]:
df_airline.head(2)
Out[39]:
4. Regularization
In [41]:
df_airline.head(2)
Out[41]:
Airline Source Dest Route Map Stops Info Price Day Month Distance(km) Dep_Hr Dep_Min Duration_Hr Duration_Min
1 Air India CCU BLR CCU → IXR → BBI → BLR 2 0 7662 1 5 1838.55 05 50 7 25
Exporting the cleaned dataset as csv file
In [42]:
df_airline.to_csv('Cleaned_airline.csv', index=False)
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
In [44]:
df = pd.read_csv('Cleaned_airline.csv')
df.head(2)
Out[44]:
Airline Source Dest Route Map Stops Info Price Day Month Distance(km) Dep_Hr Dep_Min Duration_Hr Duration_Min
1 Air India CCU BLR CCU → IXR → BBI → BLR 2 0 7662 1 5 1838.55 5 50 7 25
In [45]:
plt.figure(figsize=(12,6))
# keep only numeric columns: 'corr()' raises on string columns in pandas >= 2.0
df.select_dtypes(include='number').corr()['Price'].sort_values().plot(kind='bar');
In [46]:
# number of flights per airline
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="Airline", palette='Set3', ax=ax)
ax.set_title('Count of Airlines', size=30)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
In [47]:
# price distribution per airline
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxenplot(data=df, x='Airline', y='Price', palette='Set3', ax=ax)
ax.set_title('Airlines vs Price', size=30)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
In [48]:
# number of flights per departure station
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Source', palette='Set2', ax=ax)
ax.set_title('Count of Source', size=30)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
In [49]:
# price distribution per departure station
plt.figure(figsize=(12,6))
sns.boxenplot(x= 'Source', y= 'Price', data=df, palette='Set3')
# the plot shows Price by Source, so the title names those two variables
plt.title('Source vs Price', size=30)
plt.xticks(rotation=90)
plt.show()
In [50]:
# number of flights per day of the month
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Day', palette='Set2', ax=ax)
ax.set_title('Count of Days', size=30)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
In [51]:
# mean price per day of the month
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df, x='Day', y='Price', palette='Set2', ax=ax)
ax.set_title('Days vs Price', size=30)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
In [52]:
# Convert month numbers to short names for plotting.
# 'replace' (unlike 'map') leaves values that are not keys untouched, so re-running
# this cell does not turn the already converted month names into NaN.
df['Month'] = df['Month'].replace({
    1:'JAN', 2:'FEB', 3:'MAR', 4:'APR', 5:'MAY', 6:'JUN',
    7:'JUL', 8:'AUG', 9:'SEP', 10:'OCT', 11:'NOV', 12:'DEC'})
In [53]:
# mean price per month
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df, x='Month', y='Price', palette='Set2', ax=ax)
ax.set_title('Month vs Price', size=30)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
In [54]:
# mean price per number of stops
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df, x='Stops', y='Price', palette='Set2', ax=ax)
ax.set_title('Stops vs Price', size=30)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
In [55]:
# mean price per encoded 'Extra Info' category
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df, x='Info', y='Price', palette='Set2', ax=ax)
ax.set_title('Extra Info vs Price', size=30)
ax.tick_params(axis='x', labelrotation=90)
plt.show()
In [56]:
# combine hours and minutes into a single numeric duration (hours * 60 + minutes);
# despite its name the column is not boolean — the name is kept because later cells use it
df['Duration_bool'] = (df['Duration_Hr']*60)+df['Duration_Min']
plt.figure(figsize=(12,6))
# no 'hue' variable is mapped, so 'palette' would have no effect and is omitted
sns.scatterplot(x= 'Duration_bool', y ='Price', data=df)
plt.title('Duration vs Price', size=30)
plt.xticks(rotation=90)
plt.show()
In [57]:
# Remove outliers with the 1.5 * IQR rule for every column listed in 'ncol'.
# The bounds are called 'lower_bound' / 'upper_bound' so that the built-ins 'min' and
# 'max' are not shadowed for the rest of the kernel session.
ncol = ["Duration_bool"]
for col in ncol:
    q75, q25 = np.percentile(df.loc[:, col], [75, 25])
    iqr = q75 - q25
    lower_bound = q25 - (iqr * 1.5)
    upper_bound = q75 + (iqr * 1.5)
    # rows on or beyond either bound are treated as outliers
    is_outlier = (df[col] <= lower_bound) | (df[col] >= upper_bound)
    df = df.loc[~is_outlier]
# drop rows that still contain a missing value in any column
df = df.dropna()
# keep only the modelling columns and rename the duration column
df1 = df[['Airline', 'Source', 'Dest', 'Stops',
          'Info', 'Price', 'Day', 'Month', 'Distance(km)', 'Duration_bool']]
df1 = df1.rename(columns={'Duration_bool': 'Duration'})
# convert the month names back to month numbers
df1['Month'] = df1['Month'].map({
    'JAN':1, 'FEB':2, 'MAR':3, 'APR':4, 'MAY':5, 'JUN':6,
    'JUL':7, 'AUG':8, 'SEP':9, 'OCT':10, 'NOV':11, 'DEC':12})
df.head(2)
Out[57]:
Airline Source Dest Route Map Stops Info Price Day Month Distance(km) Dep_Hr Dep_Min Duration_Hr Duration_Min Duration_bool
In [58]:
# separate the target column from the predictors
y = df1['Price']
X = df1.drop(columns=['Price'])
In [59]:
df1.to_csv('final_airfare.csv', index=False)
In [61]:
# import 'stats'
from scipy import stats
In [62]:
df = pd.read_csv('./final_airfare.csv')
Out[62]:
Airline Source Dest Stops Info Price Day Month Distance(km) Duration
In [63]:
# standardize the target variable explicitly and store it in a new variable 'y'
y = (df_target - df_target.mean()) / df_target.std()
In [65]:
# concat the dummy variables with numeric features to create a dataframe of all independent variables
# 'axis=1' concats the dataframes along columns
X = pd.concat([df_num_scaled, dummy_var], axis = 1)
Out[65]:
A
Airline_Jet
Airline_Air Airline_Jet Airline_Multiple
Stops Info Day Month Distance(km) Duration Airline_GoAir Airline_IndiGo Airways
India Airways carriers
Business
Train-test split
In [66]:
# check the dimensions of the train & test subset using 'shape'
# print dimension of train set
print('X_train', X_train.shape)
print('y_train', y_train.shape)
# create a generalized function to calculate the RMSE values for train set
def get_train_rmse(model):
# predict the output of the target variable from the train data
train_pred = model.predict(X_train)
# predict the output of the target variable from the test data
test_pred = model.predict(X_test)
MAPE Calculation
In [69]:
def get_test_mape(model):
# predict the output of the target variable from the test data
test_pred = model.predict(X_test)
# create a function to update the score card for comparison of the scores from different algorithms
# pass the model name, model build, alpha and l1_ratio as input parameters
# if 'alpha' and/or 'l1_ratio' is not specified, the function assigns '-'
def update_score_card(algorithm_name, model, alpha = '-', l1_ratio = '-'):
# plot a bar plot with Coefficient on the x-axis and Variable names on y-axis
# pass the data to the parameter, 'sorted_coeff' to plot the barplot
sns.barplot(x = "Coefficient", y = "Variable", data = sorted_coeff)
In [73]:
# create a function 'Get_score' that fits a model on one fold and scores it on the held-out fold
# 'Get_score' takes 5 input parameters
def Get_score(model, X_train_k, X_test_k, y_train_k, y_test_k):
    """Fit `model` on the training fold and return its score on the test fold.

    The returned value is whatever `model.score` computes for
    `(X_test_k, y_test_k)` after `model.fit(X_train_k, y_train_k)` has run.
    """
    model.fit(X_train_k, y_train_k)
    test_score = model.score(X_test_k, y_test_k)
    return test_score
In [75]:
# call the function 'Get_score()' and append the returned score to the list 'scores'
scores.append(Get_score(LinearRegression(), X_train_k, X_test_k, y_train_k, y_test_k))
In [76]:
In [77]:
# use the for loop to build the regression model for each cross validation
# use split() to split the dataset into two subsets; one with (n-1) data points and another with 1 data point
# where, n = total number of observations
In [79]:
In [80]:
# color the cell in the column 'Test_RMSE' having minimum RMSE value
# 'style.highlight_min' assigns color to the minimum value
# pass specified color to the parameter, 'color'
# pass the data to limit the color assignment to the parameter, 'subset'
score_card.style.highlight_min(color = 'lightblue', subset = 'Test_RMSE')
Out[81]:
Model_Name Alpha (Wherever Required) l1-ratio R-Squared Adj. R-Squared Test_RMSE Test_MAPE
2. Module Creation
In [82]:
# fix the seed so that the fitted model and the reported scores are reproducible across runs
gradBoost = GradientBoostingRegressor(random_state=42)
gradBoost.fit(X_train, y_train)
prediction = gradBoost.predict(X_test)
print('RMSE : {}'.format(np.sqrt(mean_squared_error(y_test, prediction))))
RMSE : 0.3878337073231023
In [83]:
Out[83]:
(0.8340958488952346, 0.8512828235466694)
In [84]:
MAE: 0.26135248119793697
MSE: 0.15041498453598176
RMSE: 0.3878337073231023
In [85]:
# predicted vs. actual target values on the test set
fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(y_test, prediction, alpha=0.5)
ax.set_xlabel("y_test")
ax.set_ylabel("y_pred")
plt.show()
import pickle
# the context manager closes the file, so the pickle is completely written to disk
with open('final_model.pkl', 'wb') as file:
    pickle.dump(gradBoost, file)
In [87]:
In [88]:
Out[88]:
0.8512828235466694
We have now created a predictive model and saved it permanently to disk, together with all required pre-processing steps, so that it can be reused whenever new data needs to be tested.