SN Travel Jupyter Notebook PDF
SN Travel Jupyter Notebook PDF
In [362]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [363]:
travel_df_train = pd.read_csv('Traveldata_train.csv')
travel_df_test = pd.read_csv('Traveldata_test.csv')
survey_df_train = pd.read_csv('Surveydata_train.csv')
survey_df_test = pd.read_csv('Surveydata_test.csv')
In [364]:
def exp_data_ana(df,target,tvar):
pd.set_option('display.expand_frame_repr', False)
# print('\n****Display first 5 rows****')
# print('****************************')
# print(df.head())
if(target==True):
#print the % of each class in the Target variable
perclass = df[tvar].value_counts(normalize=True)
print('\n****Target Varible Distribution****')
print('************************************')
print('{} Yes\t:\t{}% \n{} No\t:\t{}%'.format(tvar,round(perclass[1]*100,2),
dups = df.duplicated().sum()
if dups ==0:
print('There is no duplicate values in the data.')
else:
print(dups)
In [365]:
df_train = travel_df_train.merge(survey_df_train,how='left',on=['ID'])
df_test = travel_df_test.merge(survey_df_test,how='left',on=['ID'])
In [366]:
exp_data_ana(df_train,True,'Overall_Experience')
In [367]:
exp_data_ana(df_test,False,False)
In [368]:
cat=[]
num=[]
for i in df_train.loc[:, ~df_train.columns. isin(['EmployeeID'])]:
if df_train[i].dtype=="object":
cat.append(i)
else:
num.append(i)
Catogorical Variables :
*****************
['Gender', 'CustomerType', 'TypeTravel', 'Travel_Class', 'Seat_comfor
t', 'Seat_Class', 'Arrival_time_convenient', 'Catering', 'Platform_loc
ation', 'Onboardwifi_service', 'Onboard_entertainment', 'Online_suppor
t', 'Onlinebooking_Ease', 'Onboard_service', 'Leg_room', 'Baggage_hand
ling', 'Checkin_service', 'Cleanliness', 'Online_boarding']
Numerical Variables :
*****************
['ID', 'Age', 'Travel_Distance', 'DepartureDelay_in_Mins', 'ArrivalDe
lay_in_Mins', 'Overall_Experience']
In [369]:
Female 47815
Male 46487
Name: Gender, dtype: int64
In [370]:
# bins = [7,30,45,60,85]
# labels=['7-30','31-45','46-60','61-85']
# bins = [0,320,640,960,1280,1600]
# labels=['0-320','321-640','641-960','961-1280','1281-1600']
# df_train['DepartureDelay_in_MinsRange'] = pd.cut(df_train['DepartureDelay_in_Mins
# bins = [0,320,640,960,1280,1600]
# labels=['0-320','321-640','641-960','961-1280','1281-1600']
# df_train['ArrivalDelay_in_MinsRange'] = pd.cut(df_train['ArrivalDelay_in_Mins'],bi
In [371]:
nmstr=df_train['CustomerType'].isnull().sum()
nmste=df_test['CustomerType'].isnull().sum()
print(f'Number of missing values in CustomerType: \n Train\t:\t{nmstr}\n Test\t:\t{n
Number of missing values in CustomerType:
Train : 8951
Test : 3383
In [372]:
df_train.groupby(['Travel_Class'])['CustomerType'].agg(pd.Series.mode)
Out[372]:
Travel_Class
Business Loyal Customer
Eco Loyal Customer
Name: CustomerType, dtype: object
In [373]:
In [374]:
nmstr=df_train['CustomerType'].isnull().sum()
nmste=df_test['CustomerType'].isnull().sum()
print(f'Number of missing values in CustomerType: \n Train\t:\t{nmstr}\n Test\t:\t{n
In [375]:
nmstr=df_train['Age'].isnull().sum()
nmste=df_test['Age'].isnull().sum()
print(f'Number of missing values in Age: \n Train\t:\t{nmstr}\n Test\t:\t{nmste}')
Number of missing values in Age:
Train : 33
Test : 11
In [376]:
df_train.groupby(by=['Gender','CustomerType','Travel_Class'])['Age'].median()
Out[376]:
In [377]:
In [378]:
nmstr=df_train['Age'].isnull().sum()
nmste=df_test['Age'].isnull().sum()
print(f'Number of missing values in Age: \n Train\t:\t{nmstr}\n Test\t:\t{nmste}')
In [379]:
nmstr=df_train['DepartureDelay_in_Mins'].isnull().sum()
nmste=df_test['DepartureDelay_in_Mins'].isnull().sum()
print(f'Number of missing values in DepartureDelay_in_Mins: \n Train\t:\t{nmstr}\n T
In [380]:
df_train['DepartureDelay_in_Mins'] = df_train['DepartureDelay_in_Mins'].fillna(0)
df_test['DepartureDelay_in_Mins'] = df_test['DepartureDelay_in_Mins'].fillna(0)
# df_train['ArrivalDelay_in_Mins'] = df_train['ArrivalDelay_in_Mins'].fillna(0)
# df_test['ArrivalDelay_in_Mins'] = df_test['ArrivalDelay_in_Mins'].fillna(0)
In [381]:
nmstr=df_train['DepartureDelay_in_Mins'].isnull().sum()
nmste=df_test['DepartureDelay_in_Mins'].isnull().sum()
print(f'Number of missing values in DepartureDelay_in_Mins: \n Train\t:\t{nmstr}\n T
Number of missing values in DepartureDelay_in_Mins:
Train : 0
Test : 0
In [382]:
nmstr=df_train['ArrivalDelay_in_Mins'].isnull().sum()
nmste=df_test['ArrivalDelay_in_Mins'].isnull().sum()
print(f'Number of missing values in ArrivalDelay_in_Mins: \n Train\t:\t{nmstr}\n Tes
Number of missing values in ArrivalDelay_in_Mins:
Train : 357
Test : 123
In [383]:
round(df_train.groupby(by=['Arrival_time_convenient'])['ArrivalDelay_in_Mins'].mean(
Out[383]:
Arrival_time_convenient
acceptable 15.28
excellent 14.55
extremely poor 12.96
good 15.05
need improvement 15.60
poor 15.34
Name: ArrivalDelay_in_Mins, dtype: float64
In [384]:
df_train['ArrivalDelay_in_Mins'] = df_train['ArrivalDelay_in_Mins'].fillna(df_train.
df_test['ArrivalDelay_in_Mins'] = df_test['ArrivalDelay_in_Mins'].fillna(df_test.gro
In [385]:
df_train['ArrivalDelay_in_Mins'] = df_train['ArrivalDelay_in_Mins'].fillna(0)
df_test['ArrivalDelay_in_Mins'] = df_test['ArrivalDelay_in_Mins'].fillna(0)
In [386]:
nmstr=df_train['ArrivalDelay_in_Mins'].isnull().sum()
nmste=df_test['ArrivalDelay_in_Mins'].isnull().sum()
print(f'Number of missing values in ArrivalDelay_in_Mins: \n Train\t:\t{nmstr}\n Tes
In [387]:
nmstr=df_train['Arrival_time_convenient'].isnull().sum()
nmste=df_test['Arrival_time_convenient'].isnull().sum()
print(f'Number of missing values in Arrival_time_convenient: \n Train\t:\t{nmstr}\n
In [388]:
df_train['Arrival_time_convenient'] = df_train['Arrival_time_convenient'].fillna('0'
df_test['Arrival_time_convenient'] = df_test['Arrival_time_convenient'].fillna('0')
In [389]:
df_train['Arrival_time_convenient'].value_counts()
Out[389]:
good 19574
excellent 17684
acceptable 15177
need improvement 14990
poor 13692
0 8930
extremely poor 4332
Name: Arrival_time_convenient, dtype: int64
In [390]:
df_test['Arrival_time_convenient'].value_counts()
Out[390]:
good 7361
excellent 6589
acceptable 5844
need improvement 5684
poor 5131
0 3325
extremely poor 1668
Name: Arrival_time_convenient, dtype: int64
In [391]:
df_train['Arrival_time_convenient']=df_train.apply(
lambda row: 'excellent' if (row['Arrival_time_convenient']=='0' and row['Arrival
axis=1
)
df_train['Arrival_time_convenient']=df_train.apply(
lambda row: 'acceptable' if (row['Arrival_time_convenient']=='0' and (row['Arriv
axis=1
)
df_train['Arrival_time_convenient']=df_train.apply(
lambda row: 'poor' if (row['Arrival_time_convenient']=='0' and (row['ArrivalDela
axis=1
)
df_train['Arrival_time_convenient']=df_train.apply(
lambda row: 'good' if (row['Arrival_time_convenient']=='0' and (row['ArrivalDela
axis=1
)
df_train['Arrival_time_convenient']=df_train.apply(
lambda row: 'extremely poor' if (row['Arrival_time_convenient']=='0' and (row['A
axis=1
)
df_train['Arrival_time_convenient']=df_train.apply(
lambda row: 'need improvement' if (row['Arrival_time_convenient']=='0' and (row[
axis=1
)
df_train['Arrival_time_convenient'].value_counts()
Out[391]:
excellent 23101
good 19575
acceptable 18391
need improvement 14990
poor 13990
extremely poor 4332
Name: Arrival_time_convenient, dtype: int64
In [392]:
df_test['Arrival_time_convenient']=df_test.apply(
lambda row: 'excellent' if (row['Arrival_time_convenient']=='0' and row['Arrival
axis=1
)
df_test['Arrival_time_convenient']=df_test.apply(
lambda row: 'acceptable' if (row['Arrival_time_convenient']=='0' and (row['Arriv
axis=1
)
df_test['Arrival_time_convenient']=df_test.apply(
lambda row: 'poor' if (row['Arrival_time_convenient']=='0' and (row['ArrivalDela
axis=1
)
df_test['Arrival_time_convenient']=df_test.apply(
lambda row: 'good' if (row['Arrival_time_convenient']=='0' and (row['ArrivalDela
axis=1
)
df_test['Arrival_time_convenient']=df_test.apply(
lambda row: 'extremely poor' if (row['Arrival_time_convenient']=='0' and (row['A
axis=1
)
df_test['Arrival_time_convenient']=df_test.apply(
lambda row: 'need improvement' if (row['Arrival_time_convenient']=='0' and (row[
axis=1
)
df_test['Arrival_time_convenient'].value_counts()
Out[392]:
excellent 8564
good 7361
acceptable 7075
need improvement 5684
poor 5250
extremely poor 1668
Name: Arrival_time_convenient, dtype: int64
In [393]:
nmstr=df_train['Arrival_time_convenient'].isnull().sum()
nmste=df_test['Arrival_time_convenient'].isnull().sum()
print(f'Number of missing values in Arrival_time_convenient: \n Train\t:\t{nmstr}\n
In [394]:
nmstr=df_train['TypeTravel'].isnull().sum()
nmste=df_test['TypeTravel'].isnull().sum()
print(f'Number of missing values in TypeTravel: \n Train\t:\t{nmstr}\n Test\t:\t{nms
In [395]:
df_train.groupby(['CustomerType','Travel_Class'])['TypeTravel'].agg(pd.Series.mode)
Out[395]:
CustomerType Travel_Class
Loyal Customer Business Business travel
Eco Personal Travel
disloyal Customer Business Business travel
Eco Business travel
Name: TypeTravel, dtype: object
In [396]:
In [397]:
nmstr=df_train['TypeTravel'].isnull().sum()
nmste=df_test['TypeTravel'].isnull().sum()
print(f'Number of missing values in TypeTravel: \n Train\t:\t{nmstr}\n Test\t:\t{nms
In [398]:
nmstr=df_train['TypeTravel'].isnull().sum()
nmste=df_test['TypeTravel'].isnull().sum()
print(f'Number of missing values in TypeTravel: \n Train\t:\t{nmstr}\n Test\t:\t{nms
Number of missing values in TypeTravel:
Train : 0
Test : 0
In [400]:
cat = ['Gender','Seat_comfort','Catering','Platform_location',
'Onboardwifi_service','Onboard_entertainment',
'Online_support','Onlinebooking_Ease','Leg_room','Baggage_handling','Checkin_service
for i in cat:
df_train[i] = df_train[i].fillna(df_train[i].mode()[0])
df_test[i] = df_test[i].fillna(df_test[i].mode()[0])
In [402]:
nmstr=df_train['Onboard_service'].isnull().sum()
nmste=df_test['Onboard_service'].isnull().sum()
print(f'Number of missing values in Onboard_service : \n Train\t:\t{nmstr}\n Test\t:
In [401]:
df_train.groupby(['Onboardwifi_service','Onboard_entertainment'])['Onboard_service']
Out[401]:
Onboardwifi_service Onboard_entertainment
acceptable acceptable good
excellent good
extremely poor acceptable
good good
need improvement good
poor good
excellent acceptable good
excellent good
extremely poor acceptable
good good
need improvement good
poor good
extremely poor acceptable [acceptable, excellent]
excellent need improvement
good excellent
need improvement good
poor need improvement
good acceptable good
excellent good
extremely poor good
good good
need improvement good
poor good
need improvement acceptable good
excellent good
extremely poor good
good good
need improvement acceptable
poor good
poor acceptable acceptable
excellent excellent
extremely poor acceptable
good good
need improvement good
poor good
Name: Onboard_service, dtype: object
In [403]:
df_train['Onboard_service'] = df_train.groupby(['Onboardwifi_service','Onboard_enter
df_test['Onboard_service'] = df_test.groupby(['Onboardwifi_service','Onboard_enterta
In [404]:
In [405]:
In [ ]:
num
Out[293]:
['ID',
'Age',
'Travel_Distance',
'DepartureDelay_in_Mins',
'ArrivalDelay_in_Mins',
'Overall_Experience']
In [ ]:
# fig, axis=plt.subplots(nrows=4,ncols=2)
# fig.set_size_inches(15,17)
# fig.tight_layout()
# for i in range(1,5):
# sns.distplot(df_train[num[i]],ax=axis[i-1][0]);
# sns.boxplot(df_train[num[i]],ax=axis[i-1][1]);
In [ ]:
In [ ]:
In [ ]:
# fig, axis=plt.subplots(nrows=4,ncols=2)
# fig.set_size_inches(15,17)
# fig.tight_layout()
# for i in range(1,5):
# sns.distplot(df_train[num[i]],ax=axis[i-1][0]);
# sns.boxplot(df_train[num[i]],ax=axis[i-1][1]);
In [406]:
In [407]:
In [408]:
def display_dataframe(df):
numeric_col_mask = df.dtypes.apply(lambda d: issubclass(np.dtype(d).type, np.num
# Style
display(df.style.set_properties(subset=df.columns[numeric_col_mask], # right-ali
**{'width':'5em', 'height':'3em','text-align':'right','b
.set_properties(subset=df.columns[~numeric_col_mask], # left-align the n
**{'width':'5em', 'text-align':'left'})\
.format(lambda x: '{:,.0f}'.format(x) if x > 1e3 else '{:,.2f}'.format(x
subset=pd.IndexSlice[:,df.columns[numeric_col_mask]])\
.hide_index()\
# .highlight_max('color: green')\
.set_table_styles([d])) # center the header
In [409]:
In [410]:
def con_mat(y_train,y_predict_train,y_test,y_predict_test):
fig, axis=plt.subplots(nrows=1,ncols=2)
fig.set_size_inches(10,4)
fig.tight_layout()
cm=confusion_matrix(y_train,y_predict_train,labels=[0, 1])
cm=confusion_matrix(y_test,y_predict_test,labels=[0, 1])
In [411]:
def scores_train_test(model,X_train,X_test,y_train,y_test,y_predict_train,y_predict_
#model=bgcl
print(str(model).split('(')[0])
print('********************************\n')
model_name = str(model).split('(')[0]
s[0][0] = model_name + '_' + mname +'_Train'
s[1][0] = model_name + '_' + mname +'_Test'
s[0][prf+1]=round(model.score(X_train,y_train)*100,2)
s[1][prf+1]=round(model.score(X_test,y_test)*100,2)
s[0][prf+2]=round(auc*100,2)
s[1][prf+2]=round(auc1*100,2)
df = pd.DataFrame(data=s,columns=['Scores','Precision','Recall','F-Score','Accur
con_mat(y_train,y_predict_train,y_test,y_predict_test)
plt.figure(figsize=(5,5))
plt.legend(loc="lower right")
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
display_dataframe(df)
return(df)
In [412]:
In [413]:
tvar = 'Overall_Experience'
In [414]:
X = df_train.drop(['ID',tvar], axis=1)
y = df_train[tvar]
In [415]:
In [416]:
scaler = StandardScaler()
In [417]:
sc_train = scaler.fit_transform(X_train)
X_train_sc = pd.DataFrame(sc_train, index=X_train.index, columns=X_train.columns)
In [418]:
sc_test = scaler.transform(X_test)
X_test_sc = pd.DataFrame(sc_test, index=X_test.index, columns=X_test.columns)
In [438]:
rfcl = RandomForestClassifier(random_state=0,max_features=14)
rfcl.fit(X_train, y_train)
rf_train = rfcl.predict(X_train)
rf_test = rfcl.predict(X_test)
res_df = scores_train_test(rfcl,X_train,X_test,y_train,y_test,rf_train,rf_test,'Base
RandomForestClassifier
********************************
In [ ]:
# features = X_train.columns
# importances = rfcl.feature_importances_
# indices = np.argsort(importances)
# plt.title('Feature Importances')
# plt.barh(range(len(indices)), importances[indices], color='b', align='center')
# plt.yticks(range(len(indices)), [features[i] for i in indices])
# plt.xlabel('Relative Importance')
# plt.show()
In [420]:
def rfrun(x_train,train_labels,x_test,max_f,no_est,max_dep,min_sam,min_spl):
param_grid = {
'criterion': ['gini'],
'max_depth': max_dep, #,7,9],
'max_features':max_f, #,32],
'min_samples_leaf': min_sam,#15,20],
'min_samples_split': min_spl,#75,60],
'n_estimators': no_est
}
rfcl = RandomForestClassifier(random_state=1)
rfcl = grid_search.best_estimator_
rfcl
rfcl_y_predict_train = rfcl.predict(x_train)
rfcl_y_predict_test = rfcl.predict(x_test)
return(rfcl,rfcl_y_predict_train,rfcl_y_predict_test)
In [451]:
res_df = scores_train_test(rfcl_tuned,X_train,X_test,y_train,y_test,rfcl_y_predict_t
---------------------------------------------------------------------
------
KeyboardInterrupt Traceback (most recent call
last)
<ipython-input-450-a2387e5d0eb9> in <module>
1 from sklearn.model_selection import GridSearchCV
----> 2 rfcl_tuned,rfcl_y_predict_train,rfcl_y_predict_test = rfrun(
3 X_train,y_train,X_test,[13,17,21],[100,200,300],[15,17,2
5],[1,3],[3,6])
4
5 res_df = scores_train_test(rfcl_tuned,X_train,X_test,y_train,
y_test,rfcl_y_predict_train,rfcl_y_predict_test,'Tuned')
In [442]:
res_df = scores_train_test(rfcl_tuned,X_train,X_test,y_train,y_test,rfcl_y_predict_t
RandomForestClassifier
********************************
In [ ]:
In [ ]:
# for i in range(1,9,2):
# KNN_Model = KNeighborsClassifier(n_neighbors=i,metric='euclidean')
# KNN_Model.fit(X_train_sc,y_train)
# y_test_p = KNN_Model.predict(X_test_sc)
# print(f'Accuracy Score for K={i} : ',KNN_Model.score(X_test_sc,y_test))
In [ ]:
In [441]:
def nnrun(x_train,train_labels,x_test,hid_ly,max_int,tol,sol,act):
param_grid = {
'hidden_layer_sizes': hid_ly,
'max_iter': max_int,
'activation':act,
'solver': sol,
'tol': tol,
'random_state':[0] #1
}
nncl = MLPClassifier(random_state=0)
grid_search.fit(x_train, train_labels)
grid_search.best_params_
print(grid_search.best_params_)
nn_model = grid_search.best_estimator_
nn_model
nn_train = nn_model.predict(x_train)
nn_test = nn_model.predict(x_test)
return(nn_model,nn_train,nn_test)
---------------------------------------------------------------------
------
KeyboardInterrupt Traceback (most recent call
last)
<ipython-input-440-a2387e5d0eb9> in <module>
1 from sklearn.model_selection import GridSearchCV
----> 2 rfcl_tuned,rfcl_y_predict_train,rfcl_y_predict_test = rfrun(
3 X_train,y_train,X_test,[13,17,21],[100,200,300],[15,17,2
5],[1,3],[3,6])
4
5 res_df = scores_train_test(rfcl_tuned,X_train,X_test,y_train,
y_test,rfcl_y_predict_train,rfcl_y_predict_test,'Tuned')
nn_model,nn_train_p,nn_test_p = nnrun(X_train_sc,y_train,X_test_sc,[100],[1000],[0.0
In [ ]:
res_df = scores_train_test(nn_model,X_train_sc,X_test_sc,y_train,y_test,nn_train_p,n
In [443]:
df_test.drop(tvar,axis=1,inplace=True)
In [444]:
final_model = rfcl_tuned.fit(X,y)
In [445]:
rf_output = final_model.predict(df_test.drop('ID',axis=1))
In [446]:
rf_output
Out[446]:
In [447]:
df_test[tvar]= rf_output
In [448]:
m='tuned'
In [449]:
df_test[['ID',tvar]].to_csv('./Hack_submission_'+m+'.csv',index=False)