0% found this document useful (0 votes)

21 views22 pages

Lec ExploratoryDataAnalysis1Unit5Part1

Uploaded by

k626856k

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

21 views22 pages

Lec ExploratoryDataAnalysis1Unit5Part1

Uploaded by

k626856k

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 22

Lec_ExploratoryDataAnalysis1Unit5Part1

October 3, 2023

[4]: import pandas as pd

import numpy as np

# Create a sample dataset with missing data and duplicates.
# Rows 0 and 5 share Name/Age/Gender but differ in Salary, so they are
# near-duplicates rather than exact duplicates.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Alice'],
    'Age': [25, 30, np.nan, 35, 28, 25],
    'Gender': ['Female', 'Male', 'Male', 'Male', 'Female', 'Female'],
    'Salary': [50000, 60000, 45000, 70000, np.nan, 55000],
}

df = pd.DataFrame(data)

print("Dataset", "\n", df)

# Handling Missing Data
# Check for missing values
print("Missing values in the DataFrame:")
print(df.isnull())

# Remove rows with missing values
df.dropna(inplace=True)
print("\nDataFrame after removing rows with missing values:")
print(df)

# Data Transformation: Removing Duplicates

# Remove duplicate rows.
# drop_duplicates() compares every column, so row 5 is kept here: its
# Salary differs from row 0.
df.drop_duplicates(inplace=True)
print("\nDataFrame after removing duplicates:")
print(df)

# Transforming Data Using a Function or Mapping

# Example: transform the 'Age' column by squaring each value
# (vectorised; no per-element lambda is needed).
df['Age'] = df['Age'] ** 2
print("\nDataFrame after transforming 'Age' column:")
print(df)

# Replacing Values

# Replace 'Female' with 'F' and 'Male' with 'M' in the 'Gender' column
df['Gender'] = df['Gender'].replace({'Female': 'F', 'Male': 'M'})
print("\nDataFrame after replacing values in 'Gender' column:")
print(df)

# Detecting and Filtering Outliers

# Detect outliers in the 'Salary' column using the z-score.
# Rows with missing values were dropped above, so zscore sees complete data.
from scipy import stats

z_scores = np.abs(stats.zscore(df['Salary']))
# Flag rows more than 2 standard deviations away from the mean salary.
outliers = df[z_scores > 2]
print("\nOutliers in 'Salary' column:")
print(outliers)

# Functions in pandas
# Calculate the mean salary
mean_salary = df['Salary'].mean()
print("\nMean Salary:", mean_salary)

Dataset
Name Age Gender Salary
0 Alice 25.0 Female 50000.0
1 Bob 30.0 Male 60000.0
2 Charlie NaN Male 45000.0
3 David 35.0 Male 70000.0
4 Eva 28.0 Female NaN
5 Alice 25.0 Female 55000.0
Missing values in the DataFrame:
Name Age Gender Salary
0 False False False False
1 False False False False
2 False True False False
3 False False False False
4 False False False True
5 False False False False

DataFrame after removing rows with missing values:

Name Age Gender Salary
0 Alice 25.0 Female 50000.0
1 Bob 30.0 Male 60000.0
3 David 35.0 Male 70000.0
5 Alice 25.0 Female 55000.0

DataFrame after removing duplicates:

Name Age Gender Salary
0 Alice 25.0 Female 50000.0
1 Bob 30.0 Male 60000.0
3 David 35.0 Male 70000.0

2
5 Alice 25.0 Female 55000.0

DataFrame after transforming 'Age' column:

Name Age Gender Salary
0 Alice 625.0 Female 50000.0
1 Bob 900.0 Male 60000.0
3 David 1225.0 Male 70000.0
5 Alice 625.0 Female 55000.0

DataFrame after replacing values in 'Gender' column:

Name Age Gender Salary
0 Alice 625.0 F 50000.0
1 Bob 900.0 M 60000.0
3 David 1225.0 M 70000.0
5 Alice 625.0 F 55000.0

Outliers in 'Salary' column:

Empty DataFrame
Columns: [Name, Age, Gender, Salary]
Index: []

Mean Salary: 58750.0

1 Handling Missing Data

[5]: import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt # Matlab-style plotting

[6]: df=pd.read_csv("data.csv")

[7]: df.head()

[7]: Make Model Year Engine Fuel Type Engine HP \

0 BMW 1 Series M 2011 premium unleaded (required) 335.0
1 BMW 1 Series 2011 premium unleaded (required) 300.0
2 BMW 1 Series 2011 premium unleaded (required) 300.0
3 BMW 1 Series 2011 premium unleaded (required) 230.0
4 BMW 1 Series 2011 premium unleaded (required) 230.0

Engine Cylinders Transmission Type Driven_Wheels Number of Doors \

0 6.0 MANUAL rear wheel drive 2.0
1 6.0 MANUAL rear wheel drive 2.0
2 6.0 MANUAL rear wheel drive 2.0
3 6.0 MANUAL rear wheel drive 2.0
4 6.0 MANUAL rear wheel drive 2.0

3
Market Category Vehicle Size Vehicle Style \
0 Factory Tuner,Luxury,High-Performance Compact Coupe
1 Luxury,Performance Compact Convertible
2 Luxury,High-Performance Compact Coupe
3 Luxury,Performance Compact Coupe
4 Luxury Compact Convertible

highway MPG city mpg Popularity MSRP

0 26 19 3916 46135
1 28 19 3916 40650
2 28 20 3916 36350
3 28 18 3916 29450
4 28 18 3916 34500

[9]: df.columns.tolist()

[9]: ['Make',
'Model',
'Year',
'Engine Fuel Type',
'Engine HP',
'Engine Cylinders',
'Transmission Type',
'Driven_Wheels',
'Number of Doors',
'Market Category',
'Vehicle Size',
'Vehicle Style',
'highway MPG',
'city mpg',
'Popularity',
'MSRP']

[10]: #Uniform format

df.columns = df.columns.str.lower().str.replace(' ', '_')
df.columns.tolist()

[10]: ['make',
'model',
'year',
'engine_fuel_type',
'engine_hp',
'engine_cylinders',
'transmission_type',
'driven_wheels',
'number_of_doors',

4
'market_category',
'vehicle_size',
'vehicle_style',
'highway_mpg',
'city_mpg',
'popularity',
'msrp']

[11]: df = df.rename(columns={'msrp': 'price'})

df.columns.tolist()

[11]: ['make',
'model',
'year',
'engine_fuel_type',
'engine_hp',
'engine_cylinders',
'transmission_type',
'driven_wheels',
'number_of_doors',
'market_category',
'vehicle_size',
'vehicle_style',
'highway_mpg',
'city_mpg',
'popularity',
'price']

[12]: string_columns = list(df.dtypes[df.dtypes == 'object'].index)

# Normalise every string cell the same way as the column names:
# lower-case, with spaces replaced by underscores.
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')
df.columns.tolist()

[12]: ['make',
'model',
'year',
'engine_fuel_type',
'engine_hp',
'engine_cylinders',
'transmission_type',
'driven_wheels',
'number_of_doors',
'market_category',
'vehicle_size',
'vehicle_style',
'highway_mpg',
'city_mpg',

5
'popularity',
'price']

[13]: df.head()

[13]: make model year engine_fuel_type engine_hp \

0 bmw 1_series_m 2011 premium_unleaded_(required) 335.0
1 bmw 1_series 2011 premium_unleaded_(required) 300.0
2 bmw 1_series 2011 premium_unleaded_(required) 300.0
3 bmw 1_series 2011 premium_unleaded_(required) 230.0
4 bmw 1_series 2011 premium_unleaded_(required) 230.0

engine_cylinders transmission_type driven_wheels number_of_doors \

0 6.0 manual rear_wheel_drive 2.0
1 6.0 manual rear_wheel_drive 2.0
2 6.0 manual rear_wheel_drive 2.0
3 6.0 manual rear_wheel_drive 2.0
4 6.0 manual rear_wheel_drive 2.0

market_category vehicle_size vehicle_style \

0 factory_tuner,luxury,high-performance compact coupe
1 luxury,performance compact convertible
2 luxury,high-performance compact coupe
3 luxury,performance compact coupe
4 luxury compact convertible

highway_mpg city_mpg popularity price

0 26 19 3916 46135
1 28 19 3916 40650
2 28 20 3916 36350
3 28 18 3916 29450
4 28 18 3916 34500

[15]: from scipy import stats

from scipy.stats import norm
from sklearn.preprocessing import StandardScaler

[18]: sns.distplot(df['price']);

C:\Users\agarw\anaconda3\lib\site-packages\seaborn\distributions.py:2619:
FutureWarning: `distplot` is a deprecated function and will be removed in a
future version. Please adapt your code to use either `displot` (a figure-level
function with similar flexibility) or `histplot` (an axes-level function for
histograms).
warnings.warn(msg, FutureWarning)

6
[19]: sns.distplot(df['price'] , fit=norm);

7
[25]: sns.distplot(df['price'] , fit=norm);
# Fit a normal distribution to the prices and report its parameters.
(mu, sigma) = norm.fit(df['price'])
print(f'\n mu = {mu:.2f} and sigma = {sigma:.2f}\n')

# Check skewness and kurtosis of the data
print(f"Skewness: {df['price'].skew():f}")
print(f"Kurtosis: {df['price'].kurt():f}")

# Now label the distribution plot drawn above.
# Raw string: '\m' and '\s' are invalid escape sequences in a normal literal.
plt.legend(
    [rf'Normal dist. ($\mu=$ {mu:.2f} and $\sigma=$ {sigma:.2f} )'],
    loc='best',
)
# With a fitted curve the histogram is normalised, so the axis is a density.
plt.ylabel('Density')
plt.title('Price distribution')

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(df['price'], plot=plt)
plt.show()

8
warnings.warn(msg, FutureWarning)

mu = 40594.74 and sigma = 60106.58

Skewness: 11.771987
Kurtosis: 268.926276

9
Kurtosis
In probability theory and statistics, kurtosis is a measure of the “tailedness” of the probability
distribution of a real-valued random variable.
Like skewness, kurtosis describes a particular aspect of a probability distribution.
There are different ways to quantify kurtosis for a theoretical distribution, and there are
corresponding ways of estimating it using a sample from a population.
Different measures of kurtosis may have different interpretations.
The value printed above comes from pandas `Series.kurt()`, which reports excess kurtosis (Fisher's
definition, 0 for a normal distribution), so 268.9 indicates extremely heavy tails in the price data.
[29]: # Set the variable and data for the scatter plot
# Build a two-column frame (price, engine horsepower) for the scatter plot.
engine_col = 'engine_hp'
engine_data = df[['price', engine_col]].copy()

engine_data.head()

[29]: price engine_hp

0 46135 335.0
1 40650 300.0
2 36350 300.0
3 29450 230.0
4 34500 230.0

10
[30]: # Create the scatter plot
# Plot price against engine horsepower.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x=engine_data[engine_col], y=engine_data['price'])
# Cap the y-axis at 800,000; any more expensive cars fall outside the view.
ax.set_ylim([0, 800000])
# Labels describe the plotted columns: engine_col ('engine_hp') on x, price on y.
ax.set_title("Scatter plot of engine horsepower and price")
ax.set_xlabel("Engine Horsepower (hp)")
ax.set_ylabel("Price ($)")

# Show the plot
plt.show()

[32]: engine_cylinders_col = 'engine_cylinders'

# Keep only the two columns the box plot needs.
engine_cylinders_price_data = pd.concat(
    [df['price'], df[engine_cylinders_col]], axis=1
)

# Create the box plot: one box of prices per cylinder count
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    x=engine_cylinders_col,
    y='price',
    data=engine_cylinders_price_data,
    ax=ax,
)

ax.set_title('Box plot of engine cylinder and price')
ax.set_xlabel('Number of Engine Cylinders')
ax.set_ylabel('Price ($)')
plt.xticks(rotation=45)

# Show the plot
plt.show()

[42]: # Select the top car makes by frequency

make_col = 'make'
# Select the top 5 car makes by frequency
top_makes = df['make'].value_counts().nlargest(5)
top_makes

[42]: chevrolet 1123

ford 881
volkswagen 809
toyota 746
dodge 626
Name: make, dtype: int64

12
[43]: top_makes = df['make'].value_counts().nlargest(5).index.tolist()
print(top_makes)
# Create a new DataFrame that only includes the top makes
top_make_data = df[df[make_col].isin(top_makes)]
top_make_data.head()

['chevrolet', 'ford', 'volkswagen', 'toyota', 'dodge']

[43]: make model year engine_fuel_type engine_hp engine_cylinders \

479 toyota 4runner 2014 regular_unleaded 270.0 6.0
480 toyota 4runner 2014 regular_unleaded 270.0 6.0
481 toyota 4runner 2014 regular_unleaded 270.0 6.0
482 toyota 4runner 2014 regular_unleaded 270.0 6.0
483 toyota 4runner 2014 regular_unleaded 270.0 6.0

transmission_type driven_wheels number_of_doors market_category \

479 automatic rear_wheel_drive 4.0 NaN
480 automatic rear_wheel_drive 4.0 NaN
481 automatic four_wheel_drive 4.0 NaN
482 automatic four_wheel_drive 4.0 NaN
483 automatic four_wheel_drive 4.0 NaN

vehicle_size vehicle_style highway_mpg city_mpg popularity price

479 midsize 4dr_suv 23 17 2031 41365
480 midsize 4dr_suv 23 17 2031 35740
481 midsize 4dr_suv 22 17 2031 37615
482 midsize 4dr_suv 22 17 2031 34695
483 midsize 4dr_suv 22 17 2031 35725

[44]: # Create the box plot with the top makes

# One box of prices per make, restricted to the rows in top_make_data.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=top_make_data, x=make_col, y='price', ax=ax)
ax.set(
    title='Box plot of top car brands and price',
    xlabel='Car Brand',
    ylabel='Price ($)',
)
plt.xticks(rotation=45)

# Render the figure
plt.show()

13
[45]: # correlation matrix

[47]: plt.figure(figsize=(7,6))
# Correlate numeric columns only: the frame still holds text columns
# (make, model, ...), and pandas >= 2.0 raises on those instead of
# silently skipping them.
correlation = df.select_dtypes(include='number').corr()
sns.heatmap(correlation, annot=True)
correlation

[47]: year engine_hp engine_cylinders number_of_doors \

year 1.000000 0.351794 -0.041479 0.263787
engine_hp 0.351794 1.000000 0.779988 -0.102713
engine_cylinders -0.041479 0.779988 1.000000 -0.140088
number_of_doors 0.263787 -0.102713 -0.140088 1.000000
highway_mpg 0.258240 -0.406563 -0.621606 0.118570
city_mpg 0.198171 -0.439371 -0.600776 0.120881
popularity 0.073049 0.037501 0.041145 -0.048272
price 0.227590 0.662008 0.531312 -0.126635

highway_mpg city_mpg popularity price

14
year 0.258240 0.198171 0.073049 0.227590
engine_hp -0.406563 -0.439371 0.037501 0.662008
engine_cylinders -0.621606 -0.600776 0.041145 0.531312
number_of_doors 0.118570 0.120881 -0.048272 -0.126635
highway_mpg 1.000000 0.886829 -0.020991 -0.160043
city_mpg 0.886829 1.000000 -0.003217 -0.157676
popularity -0.020991 -0.003217 1.000000 -0.048476
price -0.160043 -0.157676 -0.048476 1.000000

pairplot
[48]: sns.set()
cols = ['year', 'engine_hp', 'engine_cylinders', 'number_of_doors', 'price',]
sns.pairplot(df[cols], height = 2.5)
plt.show();

15
2 DATA CLEANSING
[53]: #check missing ratio
# Percentage of missing values per column
data_na = (df.isnull().sum() / len(df)) * 100
print(data_na)

# Keep only the columns that actually have missing values, largest
# ratio first (at most 30 columns).
data_na = data_na[data_na != 0].sort_values(ascending=False)[:30]

missing_data = pd.DataFrame({'Missing Ratio': data_na})

missing_data.head(20)

16
make 0.000000
model 0.000000
year 0.000000
engine_fuel_type 0.025180
engine_hp 0.579151
engine_cylinders 0.251805
transmission_type 0.000000
driven_wheels 0.000000
number_of_doors 0.050361
market_category 31.408427
vehicle_size 0.000000
vehicle_style 0.000000
highway_mpg 0.000000
city_mpg 0.000000
popularity 0.000000
price 0.000000
dtype: float64

[53]: Missing Ratio

market_category 31.408427
engine_hp 0.579151
engine_cylinders 0.251805
number_of_doors 0.050361
engine_fuel_type 0.025180

[54]: fig, ax = plt.subplots(figsize=(15, 12))

sns.barplot(x=data_na.index, y=data_na, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set(xlabel='Features', ylabel='Percent of missing values',
title='Percent missing data by feature')
ax.grid(True)
plt.show()

17
Drop Duplicate
[56]: print(df.shape)
duplicate_rows_df = df[df.duplicated()]
print("number of duplicate rows: ", duplicate_rows_df.shape)

(11914, 16)
number of duplicate rows: (715, 16)

[58]: #drop duplicates

df = df.drop_duplicates()
print(df.shape)

(11199, 16)

18
3 Deal with missing values
[60]: df.head().T

[60]: 0 \
make bmw
model 1_series_m
year 2011
engine_fuel_type premium_unleaded_(required)
engine_hp 335.0
engine_cylinders 6.0
transmission_type manual
driven_wheels rear_wheel_drive
number_of_doors 2.0
market_category factory_tuner,luxury,high-performance
vehicle_size compact
vehicle_style coupe
highway_mpg 26
city_mpg 19
popularity 3916
price 46135

1 2 \
make bmw bmw
model 1_series 1_series
year 2011 2011
engine_fuel_type premium_unleaded_(required) premium_unleaded_(required)
engine_hp 300.0 300.0
engine_cylinders 6.0 6.0
transmission_type manual manual
driven_wheels rear_wheel_drive rear_wheel_drive
number_of_doors 2.0 2.0
market_category luxury,performance luxury,high-performance
vehicle_size compact compact
vehicle_style convertible coupe
highway_mpg 28 28
city_mpg 19 20
popularity 3916 3916
price 40650 36350

3 4
make bmw bmw
model 1_series 1_series
year 2011 2011
engine_fuel_type premium_unleaded_(required) premium_unleaded_(required)
engine_hp 230.0 230.0
engine_cylinders 6.0 6.0

19
transmission_type manual manual
driven_wheels rear_wheel_drive rear_wheel_drive
number_of_doors 2.0 2.0
market_category luxury,performance luxury
vehicle_size compact compact
vehicle_style coupe convertible
highway_mpg 28 28
city_mpg 18 18
popularity 3916 3916
price 29450 34500

[61]: df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11199 entries, 0 to 11913
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 make 11199 non-null object
1 model 11199 non-null object
2 year 11199 non-null int64
3 engine_fuel_type 11196 non-null object
4 engine_hp 11130 non-null float64
5 engine_cylinders 11169 non-null float64
6 transmission_type 11199 non-null object
7 driven_wheels 11199 non-null object
8 number_of_doors 11193 non-null float64
9 market_category 7823 non-null object
10 vehicle_size 11199 non-null object
11 vehicle_style 11199 non-null object
12 highway_mpg 11199 non-null int64
13 city_mpg 11199 non-null int64
14 popularity 11199 non-null int64
15 price 11199 non-null int64
dtypes: float64(3), int64(5), object(8)
memory usage: 1.5+ MB

[62]: # engine_fuel_type 11196 non-null object

# Fill missing fuel types with the most frequent fuel type of the same
# 'model' group (the column holds labels, so the mode is used, not a mean).
def _fill_with_group_mode(values):
    """Return *values* with NaNs replaced by the group's most frequent value.

    If the group has no non-missing values there is no mode; the group is
    returned unchanged, because ``fillna(None)`` raises ``ValueError``.
    """
    modes = values.mode()
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


df['engine_fuel_type'] = df.groupby('model')['engine_fuel_type'].transform(
    _fill_with_group_mode
)

[64]: df['engine_fuel_type'].isnull().sum()

20
[64]: 0

[65]: #number_of_doors 11193 non-null float64

# Fill missing door counts with the mean 'number_of_doors' of the same
# 'model' group.
# NOTE(review): a group mean can be fractional for a count-like column;
# consider the median or mode if whole numbers are required.
df['number_of_doors'] = df.groupby('model')['number_of_doors'].transform(
    lambda x: x.fillna(x.mean())
)

[67]: df['number_of_doors'].isnull().sum()

[67]: 0

[70]: # engine_cylinders 11169 non-null float64

# Fill missing cylinder counts with the mean 'engine_cylinders' of the same
# 'model' group.  Groups in which every value is missing have a NaN mean,
# so their rows stay missing after this step.
df['engine_cylinders'] = df.groupby('model')['engine_cylinders'].transform(
    lambda x: x.fillna(x.mean())
)

[71]: df['engine_cylinders'].isnull().sum()

[71]: 29

[73]: # Because each fill uses the mean of its own group, a group whose values are all
# missing has no mean to fill with, so some null values can remain in our dataset.

[74]: df['engine_hp'] = df.groupby(['model', 'year'])['engine_hp'].transform(lambda x:

,→ x.fillna(x.mean()))

[75]: df['engine_hp'].isnull().sum()

[75]: 47

[76]: df.isnull().sum()

[76]: make 0
model 0
year 0
engine_fuel_type 0
engine_hp 47
engine_cylinders 29
transmission_type 0
driven_wheels 0
number_of_doors 0
market_category 3376
vehicle_size 0
vehicle_style 0
highway_mpg 0

21
city_mpg 0
popularity 0
price 0
dtype: int64

[77]: #As we utilize the groupby method, there may still be null values present in␣
,→our dataset.

To address this issue, we can use a rule-based method for imputing these remaining missing values
[ ]:

BMW 3-Series (E36) 1992-1999: How to Build and Modify
From Everand
BMW 3-Series (E36) 1992-1999: How to Build and Modify
Eddie Nakato
5/5 (5)
Chrysler Slant Six Engines: How to Rebuild and Modify
From Everand
Chrysler Slant Six Engines: How to Rebuild and Modify
Doug Dutra
No ratings yet
LS Swaps: How to Swap GM LS Engines into Almost Anything
From Everand
LS Swaps: How to Swap GM LS Engines into Almost Anything
Jefferson Bryant
3.5/5 (2)
Python Codes
No ratings yet
Python Codes
17 pages
Data Frames and Charts 2: 2.1 Dealing With Missing Values
No ratings yet
Data Frames and Charts 2: 2.1 Dealing With Missing Values
12 pages
Intro to Exploratory Data Analysis Eda in Python
No ratings yet
Intro to Exploratory Data Analysis Eda in Python
7 pages
Data Visualization EDA-print
No ratings yet
Data Visualization EDA-print
18 pages
Data Analytics Using Python
No ratings yet
Data Analytics Using Python
7 pages
Eda Notes
No ratings yet
Eda Notes
4 pages
Analysis and Prediction of House Prices by Linear Regression Model
No ratings yet
Analysis and Prediction of House Prices by Linear Regression Model
91 pages
EDA+Cheatsheet+-+Class+Note
No ratings yet
EDA+Cheatsheet+-+Class+Note
29 pages
Data Analysis Report
No ratings yet
Data Analysis Report
74 pages
Engo 645
No ratings yet
Engo 645
9 pages
Data Treatment
No ratings yet
Data Treatment
6 pages
Internship
No ratings yet
Internship
23 pages
Data Exploration in Python PDF
No ratings yet
Data Exploration in Python PDF
1 page
EDA Cheatsheet - Class Note
No ratings yet
EDA Cheatsheet - Class Note
29 pages
Exp_5_Exploratory_Data_Analysis_sdk_ok
No ratings yet
Exp_5_Exploratory_Data_Analysis_sdk_ok
13 pages
Data Analysis: Data Preparation
No ratings yet
Data Analysis: Data Preparation
9 pages
EDA Cheatsheet - Class Note
No ratings yet
EDA Cheatsheet - Class Note
29 pages
EDA Cheatsheet - Class Note
No ratings yet
EDA Cheatsheet - Class Note
29 pages
EDA Cheatsheet - Class Note
No ratings yet
EDA Cheatsheet - Class Note
29 pages
1.5 Data Analysis with Python- Exploratory Data Analysis 1
No ratings yet
1.5 Data Analysis with Python- Exploratory Data Analysis 1
17 pages
lec18
No ratings yet
lec18
17 pages
DAV_WEEK8_240953580
No ratings yet
DAV_WEEK8_240953580
15 pages
Problem Statement Is To Predict Price Column Based On Data With 24 Columns With Over 200 Data Entries Using Linear Regression
No ratings yet
Problem Statement Is To Predict Price Column Based On Data With 24 Columns With Over 200 Data Entries Using Linear Regression
5 pages
EDA+Cheatsheet+ +Class+Note
No ratings yet
EDA+Cheatsheet+ +Class+Note
29 pages
Machine Learning With Python - Part-2
No ratings yet
Machine Learning With Python - Part-2
27 pages
Data Wrangling
No ratings yet
Data Wrangling
24 pages
2,3. Introduction Pandas & Matplotlib - Copy
No ratings yet
2,3. Introduction Pandas & Matplotlib - Copy
32 pages
Technologyname Phase2
No ratings yet
Technologyname Phase2
20 pages
EDA+Cheatsheet+ +Class+Note
No ratings yet
EDA+Cheatsheet+ +Class+Note
29 pages
EDA+Cheatsheet+ +Class+Note
No ratings yet
EDA+Cheatsheet+ +Class+Note
29 pages
EDA Cheatsheet - Class Note
No ratings yet
EDA Cheatsheet - Class Note
29 pages
Data Mining Lab 03
No ratings yet
Data Mining Lab 03
10 pages
Python Pandas Matplot
No ratings yet
Python Pandas Matplot
15 pages
PythonForMachineLearning
No ratings yet
PythonForMachineLearning
66 pages
Pyt On Visualization
No ratings yet
Pyt On Visualization
50 pages
L6 and 7-Data Preprocessing-coding
No ratings yet
L6 and 7-Data Preprocessing-coding
34 pages
Trilokesh Assignment
No ratings yet
Trilokesh Assignment
15 pages
An Extensive Step by Step Guide To Exploratory Data Analysis
No ratings yet
An Extensive Step by Step Guide To Exploratory Data Analysis
26 pages
3rd Semester DDM AI DAA DEV Print Pages For Spiral Record 25-1-24 - Removed
No ratings yet
3rd Semester DDM AI DAA DEV Print Pages For Spiral Record 25-1-24 - Removed
28 pages
05 Pandas (1)
No ratings yet
05 Pandas (1)
12 pages
AL Notes
No ratings yet
AL Notes
61 pages
Phython Example
No ratings yet
Phython Example
12 pages
Series and Pandas Methods
No ratings yet
Series and Pandas Methods
5 pages
Practical Example Full Notes
No ratings yet
Practical Example Full Notes
48 pages
Data Mining Using Python Manual
No ratings yet
Data Mining Using Python Manual
69 pages
CSE445 NSU Week_3
No ratings yet
CSE445 NSU Week_3
48 pages
data analysis
No ratings yet
data analysis
42 pages
lec19
No ratings yet
lec19
14 pages
Lesson 2 - Data Preprocessing
100% (1)
Lesson 2 - Data Preprocessing
72 pages
BDA File
No ratings yet
BDA File
26 pages
lec20
No ratings yet
lec20
24 pages
exp1
No ratings yet
exp1
5 pages
DAV Assign6
No ratings yet
DAV Assign6
8 pages
Python Basics - Hamza Zahoor
No ratings yet
Python Basics - Hamza Zahoor
6 pages
PR Final File
No ratings yet
PR Final File
70 pages
Data Pre Processing
No ratings yet
Data Pre Processing
2 pages
LS Gen IV Engines 2005 - Present: How to Build Max Performance
From Everand
LS Gen IV Engines 2005 - Present: How to Build Max Performance
Mike Mavrigian
5/5 (2)
Predictive Maintenance Project Milestone Report
No ratings yet
Predictive Maintenance Project Milestone Report
7 pages
Unit 4 - DT - Final
No ratings yet
Unit 4 - DT - Final
132 pages
Data Handling Using Pandas-1
No ratings yet
Data Handling Using Pandas-1
25 pages
Roaa.CV_2024
No ratings yet
Roaa.CV_2024
3 pages
Get Started With Google Colab For Machine Learning and Deep Learning
No ratings yet
Get Started With Google Colab For Machine Learning and Deep Learning
14 pages
DocScanner 14-Mar-2025 11-59-converted
No ratings yet
DocScanner 14-Mar-2025 11-59-converted
64 pages
Machine learning
No ratings yet
Machine learning
9 pages
EXP-3
No ratings yet
EXP-3
10 pages
Constitution
No ratings yet
Constitution
3 pages
r22-1-9-ml-lab-manual-r22-regulations
No ratings yet
r22-1-9-ml-lab-manual-r22-regulations
24 pages
XII - Informatics Practices - 2021-22 - Term - Wise
No ratings yet
XII - Informatics Practices - 2021-22 - Term - Wise
5 pages
Report Print
No ratings yet
Report Print
22 pages
Practical File (Xii - Ip Final)
No ratings yet
Practical File (Xii - Ip Final)
35 pages
Chapter 1 and 2 Series and Data Frame
No ratings yet
Chapter 1 and 2 Series and Data Frame
45 pages
Python Pandas Cheatsheety
No ratings yet
Python Pandas Cheatsheety
7 pages
Orange IP065 12 QP
No ratings yet
Orange IP065 12 QP
9 pages
Machine Learning Guide for Oil and Gas Using Python Hoss Belyadipdf download
100% (2)
Machine Learning Guide for Oil and Gas Using Python Hoss Belyadipdf download
78 pages
JinElSaawy PortfolioManagementusingReinforcementLearning Report
No ratings yet
JinElSaawy PortfolioManagementusingReinforcementLearning Report
6 pages
Anurag-Sah (Data Engineer) - 2
No ratings yet
Anurag-Sah (Data Engineer) - 2
2 pages
Practical-file-IP-Class-12-244
No ratings yet
Practical-file-IP-Class-12-244
20 pages
Pythdatascience
No ratings yet
Pythdatascience
7 pages
Financial Analytics With Python
100% (1)
Financial Analytics With Python
40 pages
A Python Data Analyst's Toolkit: Learn Python and Python-Based Libraries With Applications in Data Analysis and Statistics Gayathri Rajagopalan
100% (7)
A Python Data Analyst's Toolkit: Learn Python and Python-Based Libraries With Applications in Data Analysis and Statistics Gayathri Rajagopalan
62 pages
1.2.1. Retrieving Data - 1.2.2. Cleaning Data
No ratings yet
1.2.1. Retrieving Data - 1.2.2. Cleaning Data
35 pages
internship_report MONICA finall
No ratings yet
internship_report MONICA finall
37 pages
pylab manual
No ratings yet
pylab manual
25 pages
Computer Science
No ratings yet
Computer Science
7 pages
Chapter2 2
No ratings yet
Chapter2 2
27 pages
SpaceY Data Analytics Final Presentation DJ
No ratings yet
SpaceY Data Analytics Final Presentation DJ
50 pages
Big Data Management 1st Edition Fausto Pedro García Márquez download pdf
100% (1)
Big Data Management 1st Edition Fausto Pedro García Márquez download pdf
45 pages

Lec ExploratoryDataAnalysis1Unit5Part1

Uploaded by

Lec ExploratoryDataAnalysis1Unit5Part1

Uploaded by

Lec_ExploratoryDataAnalysis1Unit5Part1

[4]: import pandas as pd

# Create a sample dataset with missing data and duplicates

print("Dataset", "\n", df)

# Remove rows with missing values

# Data Transformation: Removing Duplicates

# Transforming Data Using a Function or Mapping

# Detecting and Filtering Outliers

DataFrame after removing rows with missing values:

DataFrame after removing duplicates:

DataFrame after transforming 'Age' column:

DataFrame after replacing values in 'Gender' column:

Outliers in 'Salary' column:

Mean Salary: 58750.0

1 Handling Missing Dat

[7]: Make Model Year Engine Fuel Type Engine HP \

Engine Cylinders Transmission Type Driven_Wheels Number of Doors \

highway MPG city mpg Popularity MSRP

[10]: #Uniform format

[11]: df = df.rename(columns={'msrp': 'price'})

[12]: string_columns = list(df.dtypes[df.dtypes == 'object'].index)

[13]: make model year engine_fuel_type engine_hp \

engine_cylinders transmission_type driven_wheels number_of_doors \

market_category vehicle_size vehicle_style \

highway_mpg city_mpg popularity price

[15]: from scipy import stats

#Now plot the distribution

#Get also the QQ-plot

mu = 40594.74 and sigma = 60106.58

[29]: price engine_hp

# Show the plot

[32]: engine_cylinders_col = 'engine_cylinders'

# Create the box plot

ax.set_title('Box plot of engine cylinder and price')

# Show the plot

[42]: # Select the top car makes by frequency

[42]: chevrolet 1123

['chevrolet', 'ford', 'volkswagen', 'toyota', 'dodge']

[43]: make model year engine_fuel_type engine_hp engine_cylinders \

transmission_type driven_wheels number_of_doors market_category \

vehicle_size vehicle_style highway_mpg city_mpg popularity price

[44]: # Create the box plot with the top makes

# Show the plot

[47]: year engine_hp engine_cylinders number_of_doors \

highway_mpg city_mpg popularity price

data_na = data_na.drop(data_na[data_na == 0].index).

missing_data = pd.DataFrame({'Missing Ratio' :data_na})

[53]: Missing Ratio

[54]: fig, ax = plt.subplots(figsize=(15, 12))

[58]: #drop duplicates

[62]: # engine_fuel_type 11196 non-null object

# If the mode is empty, it uses 'None' as the fallback value.

[65]: #number_of_doors 11193 non-null float64

[70]: # engine_cylinders 11169 non-null float64

[74]: df['engine_hp'] = df.groupby(['model', 'year'])['engine_hp'].transform(lambda x:

You might also like