Day 21 - Code Jupyter Notebook
Day 21 - Code Jupyter Notebook
Import Libraries
In [2]: import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import seaborn as sns
Import Dataset
In [3]: df = pd.read_csv('train.csv')
In [4]: df.head()
Out[4]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Emb
Braund,
0 1 0 3 Mr. Owen male 22.0 1 0 A/5 21171 7.2500 NaN
Harris
Cumings,
Mrs. John
Bradley
1 2 1 1 female 38.0 1 0 PC 17599 71.2833 C85
(Florence
Briggs
Th...
Heikkinen,
STON/O2.
2 3 1 3 Miss. female 26.0 0 0 7.9250 NaN
3101282
Laina
Futrelle,
Mrs.
Jacques
3 4 1 1 female 35.0 1 0 113803 53.1000 C123
Heath
(Lily May
Peel)
Allen, Mr.
4 5 0 3 William male 35.0 0 0 373450 8.0500 NaN
Henry
In [5]: df = pd.read_csv('train.csv')[['Age','Pclass','SibSp','Parch','Survived']]
In [6]: df.head()
Out[6]:
Age Pclass SibSp Parch Survived
0 22.0 3 1 0 0
1 38.0 1 1 0 1
2 26.0 3 0 0 1
3 35.0 1 1 0 1
4 35.0 3 0 0 0
Drop NA Values
In [7]: df.dropna(inplace=True)
In [8]: df.sample(5)
Out[8]:
Age Pclass SibSp Parch Survived
663 36.0 3 0 0 0
498 25.0 1 1 2 0
342 28.0 2 0 0 0
136 19.0 1 0 2 1
884 25.0 3 0 0 0
Separate X and y
In [9]: X = df.iloc[:,0:4]
y = df.iloc[:,-1]
In [10]: X.head()
Out[10]:
Age Pclass SibSp Parch
0 22.0 3 1 0
1 38.0 1 1 0
2 26.0 3 0 0
3 35.0 1 1 0
4 35.0 3 0 0
Out[11]: 0.6933333333333332
Out[13]:
Age Pclass SibSp Parch Family_size
0 22.0 3 1 0 2
1 38.0 1 1 0 2
2 26.0 3 0 0 1
3 35.0 1 1 0 2
4 35.0 3 0 0 1
In [15]: myfunc(4)
Out[15]: 1
Apply myfunc to Family_size
In [16]: X['Family_type'] = X['Family_size'].apply(myfunc)
In [17]: X.head()
Out[17]:
Age Pclass SibSp Parch Family_size Family_type
0 22.0 3 1 0 2 1
1 38.0 1 1 0 2 1
2 26.0 3 0 0 1 0
3 35.0 1 1 0 2 1
4 35.0 3 0 0 1 0
Out[19]:
Age Pclass Family_type
0 22.0 3 1
1 38.0 1 1
2 26.0 3 0
3 35.0 1 1
4 35.0 3 0
Out[20]: 0.7003174603174602
In [22]: df.head()
Out[22]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Emb
Braund,
0 1 0 3 Mr. Owen male 22.0 1 0 A/5 21171 7.2500 NaN
Harris
Cumings,
Mrs. John
Bradley
1 2 1 1 female 38.0 1 0 PC 17599 71.2833 C85
(Florence
Briggs
Th...
Heikkinen,
STON/O2.
2 3 1 3 Miss. female 26.0 0 0 7.9250 NaN
3101282
Laina
Futrelle,
Mrs.
Jacques
3 4 1 1 female 35.0 1 0 113803 53.1000 C123
Heath
(Lily May
Peel)
Allen, Mr.
4 5 0 3 William male 35.0 0 0 373450 8.0500 NaN
Henry
Extract Salutation (Title) from Name
In [25]: df['Title'] = df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0
In [26]:
df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
Out[26]: 0 Mr
1 Mrs
2 Miss
3 Mrs
4 Mr
...
886 Rev
887 Miss
888 Miss
889 Mr
890 Mr
Name: 0, Length: 891, dtype: object
In [27]: df[['Title','Name']]
Out[27]:
Title Name
Out[28]: 0.7003174603174602