RobiSetiawan - Tugas 4 .Ipynb
RobiSetiawan - Tugas 4 .Ipynb
ipynb - Colaboratory
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as scp
from google.colab import drive
# Mount Google Drive so files stored there are reachable from the Colab VM.
drive.mount('/content/drive')
import os
# Adjust this to the Google Drive folder that holds the CSV file.
# The path is absolute (anchored at the mount point above) so the cell can be
# re-run safely; a relative path only resolves while the working directory is
# still the one the notebook started in.
os.chdir("/content/drive/My Drive/Colab Notebooks")
Mounted at /content/drive
# Load the Titanic passenger data from the current working directory.
csv_name = 'data_titanic3.csv'
titanic3 = pd.read_csv(csv_name)
# Report the (rows, columns) shape, then the per-column dtype / non-null summary.
frame_shape = titanic3.shape
print("data: ", frame_shape)
titanic3.info()
<class 'pandas.core.frame.DataFrame'>
VARIABLE DESCRIPTIONS:
pclass Passenger Class
(1 = 1st; 2 = 2nd; 3 = 3rd)
survival Survival
(0 = No; 1 = Yes)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare ---> tarif
cabin Cabin
embarked Port of Embarkation
(C = Cherbourg; Q = Queenstown; S = Southampton)
boat Lifeboat
body Body Identification Number
home.dest Home/Destination
SPECIAL NOTES:
Pclass is a proxy for socio-economic status (SES)
1st ~ Upper; 2nd ~ Middle;
3rd ~ Lower
titanic3.head()
pclass survived name sex age sibsp parch ticket fare cabin e
Allen,
Miss.
0 1 1 female 29.00 0 0 24160 211.3375 B5
Elisabeth
Walton
Allison,
Master. C22
1 1 1 male 0.92 1 2 113781 151.5500
Hudson C26
Trevor
Allison,
Miss. C22
2 1 0 female 2.00 1 2 113781 151.5500
Helen C26
Loraine
titanic3.describe()
def countplot(column):
    """Draw a bar chart of how many rows fall in each category of a column.

    Args:
        column: Name of the column in the module-level ``titanic3`` frame
            whose values are counted.

    Returns:
        Whatever ``sns.countplot`` returns (the notebook output shows a
        matplotlib ``AxesSubplot``).
    """
    return sns.countplot(x=column, data=titanic3)
countplot('pclass')
<matplotlib.axes._subplots.AxesSubplot at 0x7f05d3b55990>
countplot('survived')
https://colab.research.google.com/drive/180L7zc2I0bxe6hd_3bl8Q8uzM43J5T8A?authuser=1#scrollTo=_HvBzUlSScEA&printMode=true 3/13
03/11/21 15.03 RobiSetiawan_Tugas 4 .ipynb - Colaboratory
<matplotlib.axes._subplots.AxesSubplot at 0x7f05d39fff90>
countplot('embarked')
<matplotlib.axes._subplots.AxesSubplot at 0x7f05d355b050>
countplot('sex')
<matplotlib.axes._subplots.AxesSubplot at 0x7f05d34d7a50>
# Ticket fare plotted against the survival outcome.
titanic3.plot(kind="scatter", x="fare", y="survived", figsize=(6, 4))
https://colab.research.google.com/drive/180L7zc2I0bxe6hd_3bl8Q8uzM43J5T8A?authuser=1#scrollTo=_HvBzUlSScEA&printMode=true 4/13
03/11/21 15.03 RobiSetiawan_Tugas 4 .ipynb - Colaboratory
<matplotlib.axes._subplots.AxesSubplot at 0x7f05d349b750>
# Pearson correlation matrix, restricted to the numeric/boolean columns.
# Text columns (name, sex, ticket, ...) cannot be correlated; selecting the
# numeric columns explicitly keeps this working on pandas versions where
# DataFrame.corr() no longer drops non-numeric columns silently.
titanic3.select_dtypes(include=['number', 'bool']).corr(method='pearson')
corr = titanic3.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(10, 10))
# Annotated heatmap of the matrix; vmax=.8 is the upper anchor of the colour map.
sns.heatmap(corr, vmax=.8, linewidths=0.01,
            square=True, annot=True, cmap='YlGnBu', linecolor="white")
# Trailing semicolon suppresses the notebook's echo of the returned object.
plt.title('Correlation between features');
https://colab.research.google.com/drive/180L7zc2I0bxe6hd_3bl8Q8uzM43J5T8A?authuser=1#scrollTo=_HvBzUlSScEA&printMode=true 5/13
03/11/21 15.03 RobiSetiawan_Tugas 4 .ipynb - Colaboratory
def compute_freq_chi2(x, y):
    """Print the frequency table of two series and its chi-square test result.

    Args:
        x: Values used for the rows of the frequency table.
        y: Values used for the columns of the frequency table.

    Returns:
        None. The table, the test statistic and the p-value are printed.
    """
    freqtab = pd.crosstab(x, y)
    print("Frequency table")
    print("============================")
    print(freqtab)
    print("============================")
    # chi2_contingency also returns the degrees of freedom and the expected
    # frequencies; only the statistic and the p-value are reported here.
    chi2, pval, _dof, _expected = scp.chi2_contingency(freqtab)
    print("ChiSquare test statistic: ", chi2)
    print("p-value: ", pval)
compute_freq_chi2(titanic3.survived,titanic3.pclass)
Frequency table
============================
pclass 1 2 3
survived
============================
p-value: 1.7208259588256052e-28
compute_freq_chi2(titanic3.survived,titanic3.embarked)
Frequency table
============================
embarked C Q S
survived
0 120 79 610
1 150 44 304
============================
p-value: 2.471880987482563e-10
compute_freq_chi2(titanic3.survived,titanic3.sex)
Frequency table
============================
survived
0 127 682
1 339 161
============================
p-value: 4.589924936952945e-81
# Fare distribution per port of embarkation, split by survival outcome.
sns.boxplot(data=titanic3, x="embarked", y="fare", hue="survived");
https://colab.research.google.com/drive/180L7zc2I0bxe6hd_3bl8Q8uzM43J5T8A?authuser=1#scrollTo=_HvBzUlSScEA&printMode=true 6/13
03/11/21 15.03 RobiSetiawan_Tugas 4 .ipynb - Colaboratory
# Box plots of the feature columns, with the target column 'survived' removed.
data2 = titanic3.drop(columns=['survived'])
data2.boxplot(figsize=(20, 3))
/usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py:83: VisibleDeprecationW
return array(a, dtype, copy=False, order=order)
<matplotlib.axes._subplots.AxesSubplot at 0x7f05c970bb90>
3. Data Preprocessing
def cek_null(df):
    """Print the columns of ``df`` that contain missing values.

    For each such column the report shows ``Total`` (number of null entries)
    and ``Percent`` (that count divided by the number of rows, i.e. a
    fraction rather than a value out of 100), ordered from most to fewest
    nulls.

    Args:
        df: DataFrame to inspect.

    Returns:
        None. The report is printed.
    """
    col_na = df.isnull().sum().sort_values(ascending=False)
    percent = col_na / len(df)
    missing_data = pd.concat([col_na, percent], axis=1, keys=['Total', 'Percent'])
    # Only columns with at least one null are worth reporting.
    print(missing_data[missing_data['Total'] > 0])
cek_null(titanic3)
Total Percent
https://colab.research.google.com/drive/180L7zc2I0bxe6hd_3bl8Q8uzM43J5T8A?authuser=1#scrollTo=_HvBzUlSScEA&printMode=true 7/13
03/11/21 15.03 RobiSetiawan_Tugas 4 .ipynb - Colaboratory
embarked 2 0.001528
fare 1 0.000764
titanic3['cabin'].value_counts()
G6 5
C78 4
D 4
..
C148 1
C50 1
E40 1
C128 1
B19 1
# Count the cabins listed per passenger: the cabin field is split on spaces,
# and position k of the result counts the rows that have a k-th entry.
cabin_parts = titanic3['cabin'].str.split(" ", expand=True)
# Shift the 0-based column labels so the result is indexed from 1.
cabin_parts.count().rename(lambda position: position + 1)
1 295
2 41
3 15
4 5
dtype: int64
# Count survivors and non-survivors grouped by the first character of the
# cabin value (cabin.str[:1]).
cabin_initial = titanic3.cabin.str[:1]
survival_counts = titanic3.groupby([cabin_initial, 'survived']).survived.count()
survival_counts.unstack()
survived 0 1
cabin
A 11.0 11.0
B 18.0 47.0
C 37.0 57.0
D 14.0 32.0
E 11.0 30.0
F 8.0 13.0
G 2.0 3.0
T 1.0 NaN
# Mean fare per cabin initial (first character of the cabin value), split by
# survival outcome.
(titanic3
 .groupby([titanic3.cabin.str[:1], 'survived'])
 .fare
 .mean()
 .unstack())
survived 0 1
cabin
A 34.168182 48.320445
B 76.363422 140.007628
C 111.878832 105.361112
D 46.653264 55.787247
E 46.431436 57.546807
F 11.241150 22.287500
G 10.462500 16.700000
T 35.500000 NaN
# Handle missing data: rows without a cabin value get the placeholder 'U',
# then survivors / non-survivors are recounted per cabin initial.
titanic3['cabin'] = titanic3['cabin'].fillna(value='U')
cabin_initial = titanic3.cabin.str[:1]
survival_counts = titanic3.groupby([cabin_initial, 'survived']).survived.count()
survival_counts.unstack()
survived 0 1
cabin
A 11.0 11.0
B 18.0 47.0
C 37.0 57.0
D 14.0 32.0
E 11.0 30.0
F 8.0 13.0
G 2.0 3.0
T 1.0 NaN
U 707.0 307.0
# Mean fare per cabin initial and survival outcome, computed after missing
# cabins were replaced by the 'U' placeholder.
(titanic3
 .groupby([titanic3.cabin.str[:1], 'survived'])
 .fare
 .mean()
 .unstack())
survived 0 1
cabin
A 34.168182 48.320445
B 76.363422 140.007628
C 111.878832 105.361112
D 46.653264 55.787247
E 46.431436 57.546807
F 11.241150 22.287500
G 10.462500 16.700000
T 35.500000 NaN
U 16.529378 25.119516
# Working copy without the columns listed below, followed by a preview and a
# missing-value report for what remains.
dropped_columns = ['name', 'ticket', 'body', 'cabin', 'home.dest']
titanic3_cleaned = titanic3.drop(columns=dropped_columns)
titanic3_cleaned.head()
cek_null(titanic3_cleaned)
Total Percent
embarked 2 0.001528
fare 1 0.000764
# Rows without a boat value get the literal label 'None', then rows are
# counted per boat value and survival outcome.
titanic3_cleaned['boat'] = titanic3_cleaned['boat'].fillna(value='None')
boat_counts = titanic3_cleaned.groupby([titanic3_cleaned.boat, 'survived']).boat.count()
boat_counts.unstack()
https://colab.research.google.com/drive/180L7zc2I0bxe6hd_3bl8Q8uzM43J5T8A?authuser=1#scrollTo=_HvBzUlSScEA&printMode=true 10/13
03/11/21 15.03 RobiSetiawan_Tugas 4 .ipynb - Colaboratory
survived 0 1
boat
1 NaN 5.0
10 NaN 29.0
11 NaN 25.0
12 1.0 18.0
13 NaN 39.0
13 15 NaN 2.0
13 15 B NaN 1.0
14 1.0 32.0
15 NaN 37.0
15 16 NaN 1.0
16 NaN 23.0
2 NaN 13.0
3 NaN 26.0
4 NaN 31.0
5 NaN 27.0
57 NaN 2.0
59 NaN 1.0
6 NaN 20.0
7 NaN 23.0
8 NaN 23.0
8 10 NaN 1.0
9 NaN 25.0
A 4.0 7.0
B 1.0 8.0
C 1.0 37.0
CD NaN 2.0
D 1.0 19.0
titanic3_cleaned.head()
https://colab.research.google.com/drive/180L7zc2I0bxe6hd_3bl8Q8uzM43J5T8A?authuser=1#scrollTo=_HvBzUlSScEA&printMode=true 11/13
03/11/21 15.03 RobiSetiawan_Tugas 4 .ipynb - Colaboratory
Total Percent
embarked 2 0.001528
fare 1 0.000764
# Impute missing ages with the median age. fillna() needs the computed value:
# passing the string 'median' would store that text in the column and turn
# it into a non-numeric column.
titanic3_cleaned['age'] = titanic3_cleaned['age'].fillna(titanic3_cleaned['age'].median())
cek_null(titanic3_cleaned)
Total Percent
embarked 2 0.001528
fare 1 0.000764
# Missing embarkation ports are filled with 'C'.
titanic3_cleaned["embarked"] = titanic3_cleaned["embarked"].fillna(value='C')
# Select the rows whose fare is missing.
missing_fare = titanic3_cleaned['fare'].isnull()
titanic3_cleaned[missing_fare]
# Drop, in place, every row that still has a null in any column.
titanic3_cleaned.dropna(inplace=True)
titanic3_cleaned.head()
0 1 1 female 29 0 0 211.3375 S 2
Kode Teks
# Assignment title followed by the author's identification lines.
for identity_line in ("Tugas 4", "Robi Setiawan", "Si7K", "181410156"):
    print(identity_line)
https://colab.research.google.com/drive/180L7zc2I0bxe6hd_3bl8Q8uzM43J5T8A?authuser=1#scrollTo=_HvBzUlSScEA&printMode=true 12/13
03/11/21 15.03 RobiSetiawan_Tugas 4 .ipynb - Colaboratory
print ( 181410156 )
Tugas 4
Robi Setiawan
Si7K
181410156
https://colab.research.google.com/drive/180L7zc2I0bxe6hd_3bl8Q8uzM43J5T8A?authuser=1#scrollTo=_HvBzUlSScEA&printMode=true 13/13