0% found this document useful (0 votes)
21 views22 pages

Lec ExploratoryDataAnalysis1Unit5Part1

Uploaded by

k626856k
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
21 views22 pages

Lec ExploratoryDataAnalysis1Unit5Part1

Uploaded by

k626856k
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 22

Lec_ExploratoryDataAnalysis1Unit5Part1

October 3, 2023

[4]: import pandas as pd


import numpy as np

# Small demo table: contains NaN entries and a repeated name ('Alice')
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Alice'],
    'Age': [25, 30, np.nan, 35, 28, 25],
    'Gender': ['Female', 'Male', 'Male', 'Male', 'Female', 'Female'],
    'Salary': [50000, 60000, 45000, 70000, np.nan, 55000],
}
df = pd.DataFrame(data)
print("Dataset", "\n", df)


def _show(heading, value):
    """Print a heading line followed by the value it describes."""
    print(heading)
    print(value)


# --- Handling missing data ---
# Boolean mask that is True wherever a cell is NaN
_show("Missing values in the DataFrame:", df.isna())

# Discard every row that has at least one NaN (modifies df in place)
df.dropna(inplace=True)
_show("\nDataFrame after removing rows with missing values:", df)

# --- Data transformation: removing duplicates ---
# Rows are compared across all columns, so the two 'Alice' rows
# (which differ in Salary) are both kept.
df.drop_duplicates(inplace=True)
_show("\nDataFrame after removing duplicates:", df)

# --- Transforming data using a function or mapping ---
# Example transformation: square every value of the 'Age' column
df['Age'] = df['Age'] ** 2
_show("\nDataFrame after transforming 'Age' column:", df)

# --- Replacing values ---
# Shorten the gender labels: 'Female' -> 'F', 'Male' -> 'M'
gender_codes = {'Female': 'F', 'Male': 'M'}
df['Gender'] = df['Gender'].replace(gender_codes)
_show("\nDataFrame after replacing values in 'Gender' column:", df)

# --- Detecting and filtering outliers ---
# Flag rows whose 'Salary' lies more than 2 standard deviations from the mean
from scipy import stats
z_scores = np.abs(stats.zscore(df['Salary']))
outliers = df.loc[z_scores > 2]
_show("\nOutliers in 'Salary' column:", outliers)

# --- Functions in pandas ---
# Average of the remaining salaries
mean_salary = df['Salary'].mean()
print("\nMean Salary:", mean_salary)

Dataset
Name Age Gender Salary
0 Alice 25.0 Female 50000.0
1 Bob 30.0 Male 60000.0
2 Charlie NaN Male 45000.0
3 David 35.0 Male 70000.0
4 Eva 28.0 Female NaN
5 Alice 25.0 Female 55000.0
Missing values in the DataFrame:
Name Age Gender Salary
0 False False False False
1 False False False False
2 False True False False
3 False False False False
4 False False False True
5 False False False False

DataFrame after removing rows with missing values:


Name Age Gender Salary
0 Alice 25.0 Female 50000.0
1 Bob 30.0 Male 60000.0
3 David 35.0 Male 70000.0
5 Alice 25.0 Female 55000.0

DataFrame after removing duplicates:


Name Age Gender Salary
0 Alice 25.0 Female 50000.0
1 Bob 30.0 Male 60000.0
3 David 35.0 Male 70000.0

2
5 Alice 25.0 Female 55000.0

DataFrame after transforming 'Age' column:


Name Age Gender Salary
0 Alice 625.0 Female 50000.0
1 Bob 900.0 Male 60000.0
3 David 1225.0 Male 70000.0
5 Alice 625.0 Female 55000.0

DataFrame after replacing values in 'Gender' column:


Name Age Gender Salary
0 Alice 625.0 F 50000.0
1 Bob 900.0 M 60000.0
3 David 1225.0 M 70000.0
5 Alice 625.0 F 55000.0

Outliers in 'Salary' column:


Empty DataFrame
Columns: [Name, Age, Gender, Salary]
Index: []

Mean Salary: 58750.0

1 Handling Missing Data


[5]: import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt # Matlab-style plotting

[6]: df=pd.read_csv("data.csv")

[7]: df.head()

[7]: Make Model Year Engine Fuel Type Engine HP \


0 BMW 1 Series M 2011 premium unleaded (required) 335.0
1 BMW 1 Series 2011 premium unleaded (required) 300.0
2 BMW 1 Series 2011 premium unleaded (required) 300.0
3 BMW 1 Series 2011 premium unleaded (required) 230.0
4 BMW 1 Series 2011 premium unleaded (required) 230.0

Engine Cylinders Transmission Type Driven_Wheels Number of Doors \


0 6.0 MANUAL rear wheel drive 2.0
1 6.0 MANUAL rear wheel drive 2.0
2 6.0 MANUAL rear wheel drive 2.0
3 6.0 MANUAL rear wheel drive 2.0
4 6.0 MANUAL rear wheel drive 2.0

3
Market Category Vehicle Size Vehicle Style \
0 Factory Tuner,Luxury,High-Performance Compact Coupe
1 Luxury,Performance Compact Convertible
2 Luxury,High-Performance Compact Coupe
3 Luxury,Performance Compact Coupe
4 Luxury Compact Convertible

highway MPG city mpg Popularity MSRP


0 26 19 3916 46135
1 28 19 3916 40650
2 28 20 3916 36350
3 28 18 3916 29450
4 28 18 3916 34500

[9]: df.columns.tolist()

[9]: ['Make',
'Model',
'Year',
'Engine Fuel Type',
'Engine HP',
'Engine Cylinders',
'Transmission Type',
'Driven_Wheels',
'Number of Doors',
'Market Category',
'Vehicle Size',
'Vehicle Style',
'highway MPG',
'city mpg',
'Popularity',
'MSRP']

[10]: #Uniform format


df.columns = df.columns.str.lower().str.replace(' ', '_')
df.columns.tolist()

[10]: ['make',
'model',
'year',
'engine_fuel_type',
'engine_hp',
'engine_cylinders',
'transmission_type',
'driven_wheels',
'number_of_doors',

4
'market_category',
'vehicle_size',
'vehicle_style',
'highway_mpg',
'city_mpg',
'popularity',
'msrp']

[11]: df = df.rename(columns={'msrp': 'price'})


df.columns.tolist()

[11]: ['make',
'model',
'year',
'engine_fuel_type',
'engine_hp',
'engine_cylinders',
'transmission_type',
'driven_wheels',
'number_of_doors',
'market_category',
'vehicle_size',
'vehicle_style',
'highway_mpg',
'city_mpg',
'popularity',
'price']

[12]: string_columns = list(df.dtypes[df.dtypes == 'object'].index)


# Normalise the values of every text column: lower-case them and replace
# spaces with underscores. The loop body must be indented to be valid Python.
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')
# Display the column names (they are not changed by the loop above)
df.columns.tolist()

[12]: ['make',
'model',
'year',
'engine_fuel_type',
'engine_hp',
'engine_cylinders',
'transmission_type',
'driven_wheels',
'number_of_doors',
'market_category',
'vehicle_size',
'vehicle_style',
'highway_mpg',
'city_mpg',

5
'popularity',
'price']

[13]: df.head()

[13]: make model year engine_fuel_type engine_hp \


0 bmw 1_series_m 2011 premium_unleaded_(required) 335.0
1 bmw 1_series 2011 premium_unleaded_(required) 300.0
2 bmw 1_series 2011 premium_unleaded_(required) 300.0
3 bmw 1_series 2011 premium_unleaded_(required) 230.0
4 bmw 1_series 2011 premium_unleaded_(required) 230.0

engine_cylinders transmission_type driven_wheels number_of_doors \


0 6.0 manual rear_wheel_drive 2.0
1 6.0 manual rear_wheel_drive 2.0
2 6.0 manual rear_wheel_drive 2.0
3 6.0 manual rear_wheel_drive 2.0
4 6.0 manual rear_wheel_drive 2.0

market_category vehicle_size vehicle_style \


0 factory_tuner,luxury,high-performance compact coupe
1 luxury,performance compact convertible
2 luxury,high-performance compact coupe
3 luxury,performance compact coupe
4 luxury compact convertible

highway_mpg city_mpg popularity price


0 26 19 3916 46135
1 28 19 3916 40650
2 28 20 3916 36350
3 28 18 3916 29450
4 28 18 3916 34500

[15]: from scipy import stats


from scipy.stats import norm
from sklearn.preprocessing import StandardScaler

[18]: sns.distplot(df['price']);

C:\Users\agarw\anaconda3\lib\site-packages\seaborn\distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)

6
[19]: sns.distplot(df['price'] , fit=norm);

C:\Users\agarw\anaconda3\lib\site-packages\seaborn\distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)

7
[25]: sns.distplot(df['price'] , fit=norm);
# Fit a normal distribution to the prices (location -> mu, scale -> sigma)
(mu, sigma) = norm.fit(df['price'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Check skewness and kurtosis of the data
print("Skewness: %f" % df['price'].skew())
print("Kurtosis: %f" % df['price'].kurt())

# Annotate the distribution plot. The label is a raw string so that
# \mu and \sigma reach matplotlib's mathtext unchanged instead of being
# treated as (invalid) Python escape sequences.
plt.legend(
    [r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
    loc='best',
)
plt.ylabel('Frequency')
plt.title('Price distribution')

# Also draw the QQ-plot, on its own figure
fig = plt.figure()
res = stats.probplot(df['price'], plot=plt)
plt.show()

C:\Users\agarw\anaconda3\lib\site-packages\seaborn\distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).

8
warnings.warn(msg, FutureWarning)

mu = 40594.74 and sigma = 60106.58

Skewness: 11.771987
Kurtosis: 268.926276

9
kurtosis
In probability theory and statistics, kurtosis is a measure of the “tailedness” of the probability
distribution of a real-valued random variable.
Like skewness, kurtosis describes a particular aspect of a probability distribution.
There are different ways to quantify kurtosis for a theoretical distribution, and there are
corresponding ways of estimating it using a sample from a population.
Different measures of kurtosis may have different interpretations.
[29]: # Set the variable and data for the scatter plot
engine_col = 'engine_hp'
engine_data = pd.concat([df['price'], df[engine_col]], axis=1)

engine_data.head()

[29]: price engine_hp


0 46135 335.0
1 40650 300.0
2 36350 300.0
3 29450 230.0
4 34500 230.0

10
[30]: # Create the scatter plot
# Plot price against engine horsepower
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x=engine_data[engine_col], y=engine_data['price'])
# Cap the y axis at 800,000 for this plot
ax.set_ylim([0, 800000])
# Title and x label describe what is plotted: engine_col is 'engine_hp'
ax.set_title("Scatter plot of engine horsepower and price")
ax.set_xlabel("Engine Horsepower (hp)")
ax.set_ylabel("Price ($)")

# Show the plot
plt.show()

[32]: engine_cylinders_col = 'engine_cylinders'


# Two-column frame holding price and the cylinder count side by side
engine_cylinders_price_data = pd.concat(
    [df['price'], df[engine_cylinders_col]], axis=1
)

# Create the box plot: one box of prices per cylinder count
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    x=engine_cylinders_col,
    y='price',
    data=engine_cylinders_price_data,
    ax=ax,
)
ax.set_title('Box plot of engine cylinder and price')
ax.set_xlabel('Number of Engine Cylinders')
ax.set_ylabel('Price ($)')
plt.xticks(rotation=45)

# Show the plot
plt.show()

[42]: # Select the top car makes by frequency


make_col = 'make'
# Select the top 5 car makes by frequency
top_makes = df['make'].value_counts().nlargest(5)
top_makes

[42]: chevrolet 1123


ford 881
volkswagen 809
toyota 746
dodge 626
Name: make, dtype: int64

12
[43]: top_makes = df['make'].value_counts().nlargest(5).index.tolist()
print(top_makes)
# Create a new DataFrame that only includes the top makes
top_make_data = df[df[make_col].isin(top_makes)]
top_make_data.head()

['chevrolet', 'ford', 'volkswagen', 'toyota', 'dodge']

[43]: make model year engine_fuel_type engine_hp engine_cylinders \


479 toyota 4runner 2014 regular_unleaded 270.0 6.0
480 toyota 4runner 2014 regular_unleaded 270.0 6.0
481 toyota 4runner 2014 regular_unleaded 270.0 6.0
482 toyota 4runner 2014 regular_unleaded 270.0 6.0
483 toyota 4runner 2014 regular_unleaded 270.0 6.0

transmission_type driven_wheels number_of_doors market_category \


479 automatic rear_wheel_drive 4.0 NaN
480 automatic rear_wheel_drive 4.0 NaN
481 automatic four_wheel_drive 4.0 NaN
482 automatic four_wheel_drive 4.0 NaN
483 automatic four_wheel_drive 4.0 NaN

vehicle_size vehicle_style highway_mpg city_mpg popularity price


479 midsize 4dr_suv 23 17 2031 41365
480 midsize 4dr_suv 23 17 2031 35740
481 midsize 4dr_suv 22 17 2031 37615
482 midsize 4dr_suv 22 17 2031 34695
483 midsize 4dr_suv 22 17 2031 35725

[44]: # Create the box plot with the top makes


fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=make_col, y='price', data=top_make_data, ax=ax)
ax.set_title('Box plot of top car brands and price')
ax.set_xlabel('Car Brand')
ax.set_ylabel('Price ($)')
plt.xticks(rotation=45)

# Show the plot


plt.show()

13
[45]: # correlation matrix

[47]: plt.figure(figsize=(7,6))
# Correlate only the numeric columns. Selecting them explicitly avoids the
# error raised by DataFrame.corr() on pandas >= 2.0 when the frame still
# contains text (object) columns such as 'make' or 'model'.
correlation = df.select_dtypes(include='number').corr()
sns.heatmap(correlation, annot=True)
correlation

[47]: year engine_hp engine_cylinders number_of_doors \


year 1.000000 0.351794 -0.041479 0.263787
engine_hp 0.351794 1.000000 0.779988 -0.102713
engine_cylinders -0.041479 0.779988 1.000000 -0.140088
number_of_doors 0.263787 -0.102713 -0.140088 1.000000
highway_mpg 0.258240 -0.406563 -0.621606 0.118570
city_mpg 0.198171 -0.439371 -0.600776 0.120881
popularity 0.073049 0.037501 0.041145 -0.048272
price 0.227590 0.662008 0.531312 -0.126635

highway_mpg city_mpg popularity price

14
year 0.258240 0.198171 0.073049 0.227590
engine_hp -0.406563 -0.439371 0.037501 0.662008
engine_cylinders -0.621606 -0.600776 0.041145 0.531312
number_of_doors 0.118570 0.120881 -0.048272 -0.126635
highway_mpg 1.000000 0.886829 -0.020991 -0.160043
city_mpg 0.886829 1.000000 -0.003217 -0.157676
popularity -0.020991 -0.003217 1.000000 -0.048476
price -0.160043 -0.157676 -0.048476 1.000000

pairplot
[48]: sns.set()
cols = ['year', 'engine_hp', 'engine_cylinders', 'number_of_doors', 'price',]
sns.pairplot(df[cols], height = 2.5)
plt.show();

15
2 DATA CLEANSING
[53]: #check missing ratio
# Percentage of missing values in each column
data_na = (df.isnull().sum() / len(df)) * 100
print(data_na)

# Exclude the columns with no missing values (ratio == 0), sort the rest from
# most to least missing, and keep at most the first 30 entries.
data_na = (
    data_na.drop(data_na[data_na == 0].index)
    .sort_values(ascending=False)[:30]
)

missing_data = pd.DataFrame({'Missing Ratio': data_na})
missing_data.head(20)

16
make 0.000000
model 0.000000
year 0.000000
engine_fuel_type 0.025180
engine_hp 0.579151
engine_cylinders 0.251805
transmission_type 0.000000
driven_wheels 0.000000
number_of_doors 0.050361
market_category 31.408427
vehicle_size 0.000000
vehicle_style 0.000000
highway_mpg 0.000000
city_mpg 0.000000
popularity 0.000000
price 0.000000
dtype: float64

[53]: Missing Ratio


market_category 31.408427
engine_hp 0.579151
engine_cylinders 0.251805
number_of_doors 0.050361
engine_fuel_type 0.025180

[54]: fig, ax = plt.subplots(figsize=(15, 12))


sns.barplot(x=data_na.index, y=data_na, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set(xlabel='Features', ylabel='Percent of missing values',
title='Percent missing data by feature')
ax.grid(True)
plt.show()

17
Drop Duplicates
[56]: print(df.shape)
duplicate_rows_df = df[df.duplicated()]
print("number of duplicate rows: ", duplicate_rows_df.shape)

(11914, 16)
number of duplicate rows: (715, 16)

[58]: #drop duplicates


df = df.drop_duplicates()
print(df.shape)

(11199, 16)

18
3 Deal with missing values
[60]: df.head().T

[60]: 0 \
make bmw
model 1_series_m
year 2011
engine_fuel_type premium_unleaded_(required)
engine_hp 335.0
engine_cylinders 6.0
transmission_type manual
driven_wheels rear_wheel_drive
number_of_doors 2.0
market_category factory_tuner,luxury,high-performance
vehicle_size compact
vehicle_style coupe
highway_mpg 26
city_mpg 19
popularity 3916
price 46135

1 2 \
make bmw bmw
model 1_series 1_series
year 2011 2011
engine_fuel_type premium_unleaded_(required) premium_unleaded_(required)
engine_hp 300.0 300.0
engine_cylinders 6.0 6.0
transmission_type manual manual
driven_wheels rear_wheel_drive rear_wheel_drive
number_of_doors 2.0 2.0
market_category luxury,performance luxury,high-performance
vehicle_size compact compact
vehicle_style convertible coupe
highway_mpg 28 28
city_mpg 19 20
popularity 3916 3916
price 40650 36350

3 4
make bmw bmw
model 1_series 1_series
year 2011 2011
engine_fuel_type premium_unleaded_(required) premium_unleaded_(required)
engine_hp 230.0 230.0
engine_cylinders 6.0 6.0

19
transmission_type manual manual
driven_wheels rear_wheel_drive rear_wheel_drive
number_of_doors 2.0 2.0
market_category luxury,performance luxury
vehicle_size compact compact
vehicle_style coupe convertible
highway_mpg 28 28
city_mpg 18 18
popularity 3916 3916
price 29450 34500

[61]: df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11199 entries, 0 to 11913
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 make 11199 non-null object
1 model 11199 non-null object
2 year 11199 non-null int64
3 engine_fuel_type 11196 non-null object
4 engine_hp 11130 non-null float64
5 engine_cylinders 11169 non-null float64
6 transmission_type 11199 non-null object
7 driven_wheels 11199 non-null object
8 number_of_doors 11193 non-null float64
9 market_category 7823 non-null object
10 vehicle_size 11199 non-null object
11 vehicle_style 11199 non-null object
12 highway_mpg 11199 non-null int64
13 city_mpg 11199 non-null int64
14 popularity 11199 non-null int64
15 price 11199 non-null int64
dtypes: float64(3), int64(5), object(8)
memory usage: 1.5+ MB

[62]: # engine_fuel_type 11196 non-null object


# Missing values are filled with the mode (the column is categorical, so a
# mean is not meaningful) of the same 'model' group.
# If a group has no mode (all of its values are missing), the group is left
# unchanged: calling fillna(None) would raise a ValueError.
def _fill_with_group_mode(values):
    """Return *values* with NaNs replaced by the group's most frequent value."""
    modes = values.mode()  # computed once per group
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


df['engine_fuel_type'] = (
    df.groupby('model')['engine_fuel_type'].transform(_fill_with_group_mode)
)

[64]: df['engine_fuel_type'].isnull().sum()

20
[64]: 0

[65]: #number_of_doors 11193 non-null float64


# The missing values are filled with the mean (average) of 'number_of_doors'
# within the same 'model' group.
# NOTE(review): a group mean can be fractional, which is not a real door
# count - consider the group median or mode instead.
df['number_of_doors'] = (
    df.groupby('model')['number_of_doors']
    .transform(lambda x: x.fillna(x.mean()))
)

[67]: df['number_of_doors'].isnull().sum()

[67]: 0

[70]: # engine_cylinders 11169 non-null float64


# The missing values are filled with the mean (average) of 'engine_cylinders'
# within the same 'model' group.
# If every value in a group is missing, the group mean is NaN and those rows
# stay missing.
df['engine_cylinders'] = (
    df.groupby('model')['engine_cylinders']
    .transform(lambda x: x.fillna(x.mean()))
)

[71]: df['engine_cylinders'].isnull().sum()

[71]: 29

[73]: #As we utilize the groupby method, there may still be null values present in␣
,→our dataset.

[74]: df['engine_hp'] = df.groupby(['model', 'year'])['engine_hp'].transform(lambda x:


,→ x.fillna(x.mean()))

[75]: df['engine_hp'].isnull().sum()

[75]: 47

[76]: df.isnull().sum()

[76]: make 0
model 0
year 0
engine_fuel_type 0
engine_hp 47
engine_cylinders 29
transmission_type 0
driven_wheels 0
number_of_doors 0
market_category 3376
vehicle_size 0
vehicle_style 0
highway_mpg 0

21
city_mpg 0
popularity 0
price 0
dtype: int64

[77]: #As we utilize the groupby method, there may still be null values present in␣
,→our dataset.

To address this issue, we can use a rule-based method for imputing these remaining missing values.
[ ]:

22

You might also like