Ass 1 Dsbda
Ass 1 Dsbda
[2]: # 1. Import all the required Python libraries (numpy, pandas, matplotlib, seaborn, ...)
[111]: # 2. Locate an open-source dataset from the web (e.g. https://www.kaggle.com)
# https://www.kaggle.com/datasets/rajgupta2019/medical-insurance-dataset - Medical Insurance Dataset
[2]: df = pd.read_csv(r'C:\Users\Aditi\Downloads\Test_Data.csv')
[3]: df.head()
1
489 False False False False False False
490 False False False False False False
491 False False False False False False
[6]: age 1
gender 0
bmi 1
smoker 0
region 0
children 1
dtype: int64
[7]: # Fill missing values with the mode of the 'children' column
df['children'].fillna(df['children'].mode()[0], inplace=True)
[8]: # Fill missing values with the mean of the 'age' column
df['age'].fillna(df['age'].mean(), inplace=True)
[9]: # Fill missing values with the median of the 'bmi' column
df['bmi'].fillna(df['bmi'].median(), inplace=True)
2
[11]: age gender bmi smoker region children
0 40.000000 male 29.900000 no southwest 2.0
1 47.000000 male 29.959061 no southwest 1.0
2 54.000000 female 28.880000 no northeast 2.0
3 38.844276 male 30.568094 no northeast 1.0
4 59.130049 male 33.132854 yes northeast 4.0
[13]: df.tail()
[15]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492 entries, 0 to 491
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 492 non-null float64
1 gender 492 non-null object
2 bmi 492 non-null float64
3 smoker 492 non-null object
4 region 492 non-null object
5 children 492 non-null float64
dtypes: float64(3), object(3)
memory usage: 23.2+ KB
[17]: (492, 6)
[19]: df.describe
3
0 40.000000 male 29.900000 no southwest 2.0
1 47.000000 male 29.959061 no southwest 1.0
2 54.000000 female 28.880000 no northeast 2.0
3 38.844276 male 30.568094 no northeast 1.0
4 59.130049 male 33.132854 yes northeast 4.0
.. … … … … … …
487 51.000000 male 27.740000 no northeast 1.0
488 33.000000 male 42.400000 no southwest 5.0
489 47.769999 male 29.064615 no northeast 4.0
490 41.530738 female 24.260852 no southeast 5.0
491 36.000000 male 33.400000 yes southwest 2.0
[21]: 2952
[23]: df.dtypes
[24]: ' We need to perform datatype conversions to ensure that our data is in \nthe
appropriate format for analysis and modeling '
df=df.astype({"age":int})
df=df.astype({"children":int})
4
[26]: df.dtypes
[27]: df.head()
5
[36]: # Use pandas get_dummies to perform one-hot encoding on categorical data
df_encoded = pd.get_dummies(df[categorical_columns])
[37]: print(df_encoded)
[39]: # Concatenate the one-hot encoded categorical columns with the non-categorical columns
6
4 59 33.132854 4 0 1 0
.. … … … … … …
487 51 27.740000 1 0 1 1
488 33 42.400000 5 0 1 1
489 47 29.064615 4 0 1 1
490 41 24.260852 5 1 0 1
491 36 33.400000 2 0 1 0
region_southwest
0 1
1 1
2 0
3 0
4 0
.. …
487 0
488 1
489 0
490 0
491 1
[ ]: # Alternative conversion method for when there are too many categorical columns to list manually
'''cols = df_cat.columns
7
df_cat = cat_2_num(df_cat, cols)'''