
DEPARTMENT OF COMPUTER SCIENCE & ENGINEERING
BIRLA INSTITUTE OF TECHNOLOGY, MESRA – 835 215

DATA ANALYTICS LAB FILE

NAME: Harshit Vinayak
ROLL NO.: MT/AI/10011/24
BRANCH: AI & ML
COURSE: M. TECH
YEAR: 2024-2026
SUBJECT: DATA ANALYTICS LAB
import numpy as np
import pandas as pd

data = pd.read_csv('haberman.csv')
data

30 64 1 1.1
0 30 62 3 1
1 30 65 0 1
2 31 59 2 1
3 31 65 4 1
4 33 58 10 1
.. .. .. .. ...
300 75 62 1 1
301 76 67 0 1
302 77 65 3 1
303 78 65 1 2
304 83 58 2 2

[305 rows x 4 columns]
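Note the odd header row (`30  64  1  1.1`): haberman.csv ships without a header line, so read_csv consumed the first record as column names, which is why only 305 of the dataset's 306 records load. A minimal fix, assuming the same file:

data = pd.read_csv('haberman.csv', header=None,
                   names=['Age', 'Year_of_Treatment', 'Lymph_Nodes', 'Survival_Status'])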

# Assign meaningful column names


data.columns = ['Age', 'Year_of_Treatment', 'Lymph_Nodes',
'Survival_Status']

print(data.columns)

Index(['Age', 'Year_of_Treatment', 'Lymph_Nodes', 'Survival_Status'],
      dtype='object')

import matplotlib.pyplot as plt


import seaborn as sns

# Assign meaningful column names


data.columns = ['Age', 'Year_of_Treatment', 'Lymph_Nodes',
'Survival_Status']

# Univariate analysis: Distribution of Age


plt.figure(figsize=(8, 5))
sns.histplot(data['Age'], kde=True, bins=15, color='skyblue')
plt.title('Distribution of Age')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()
# Age distribution as a plot
plt.figure(figsize=(8, 5))
sns.kdeplot(data['Age'], fill=True, color='skyblue', label='Age Distribution')
plt.title('Age Distribution (KDE Plot)')
plt.xlabel('Age')
plt.ylabel('Density')
plt.legend()
plt.show()
# Age grouped by Survival Status
plt.figure(figsize=(8, 5))
sns.boxplot(x='Survival_Status', y='Age', data=data, palette='Set2')
plt.title('Age by Survival Status')
plt.xlabel('Survival Status (1 = Survived, 2 = Did Not Survive)')
plt.ylabel('Age')
plt.show()

FutureWarning (seaborn, emitted by each of the grouped box and violin plots below): Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.


# Year of Treatment grouped by Survival Status
plt.figure(figsize=(8, 5))
sns.boxplot(x='Survival_Status', y='Year_of_Treatment', data=data,
palette='Set2')
plt.title('Year of Treatment by Survival Status')
plt.xlabel('Survival Status (1 = Survived, 2 = Did Not Survive)')
plt.ylabel('Year of Treatment')
plt.show()

# Lymph Nodes grouped by Survival Status
plt.figure(figsize=(8, 5))
sns.boxplot(x='Survival_Status', y='Lymph_Nodes', data=data,
palette='Set2')
plt.title('Number of Positive Axillary Lymph Nodes by Survival Status')
plt.xlabel('Survival Status (1 = Survived, 2 = Did Not Survive)')
plt.ylabel('Number of Positive Axillary Lymph Nodes')
plt.show()

# PDF and CDF for Age, Year of Treatment, and Lymph Nodes
plt.figure(figsize=(15, 10))

plt.subplot(3, 3, 1)
sns.kdeplot(data['Age'], color='blue', fill=True, label='PDF')
plt.title('PDF of Age')
plt.xlabel('Age')

plt.subplot(3, 3, 2)
sns.ecdfplot(data['Age'], color='blue', label='CDF')
plt.title('CDF of Age')
plt.xlabel('Age')

# PDF and CDF for Year of Treatment


plt.subplot(3, 3, 4)
sns.kdeplot(data['Year_of_Treatment'], color='orange', fill=True,
label='PDF')
plt.title('PDF of Year of Treatment')
plt.xlabel('Year of Treatment')

plt.subplot(3, 3, 5)
sns.ecdfplot(data['Year_of_Treatment'], color='orange', label='CDF')
plt.title('CDF of Year of Treatment')
plt.xlabel('Year of Treatment')
# PDF and CDF for Lymph Nodes
plt.subplot(3, 3, 7)
sns.kdeplot(data['Lymph_Nodes'], color='green', fill=True,
label='PDF')
plt.title('PDF of Lymph Nodes')
plt.xlabel('Number of Lymph Nodes')

plt.subplot(3, 3, 8)
sns.ecdfplot(data['Lymph_Nodes'], color='green', label='CDF')
plt.title('CDF of Lymph Nodes')
plt.xlabel('Number of Lymph Nodes')

plt.tight_layout()
plt.show()
# Violin plots for Age, Year of Treatment, and Lymph Nodes grouped by Survival Status
plt.figure(figsize=(15, 10))

plt.subplot(3, 1, 1)
sns.violinplot(x='Survival_Status', y='Age', data=data,
palette="muted")
plt.title('Violin Plot: Age vs Survival Status')

plt.subplot(3, 1, 2)
sns.violinplot(x='Survival_Status', y='Year_of_Treatment', data=data,
palette="muted")
plt.title('Violin Plot: Year of Treatment vs Survival Status')
plt.subplot(3, 1, 3)
sns.violinplot(x='Survival_Status', y='Lymph_Nodes', data=data,
palette="muted")
plt.title('Violin Plot: Lymph Nodes vs Survival Status')

plt.tight_layout()
plt.show()

# Scatter plot and pair plot for the dataset
sns.pairplot(data, hue='Survival_Status', diag_kind='kde',
palette='Set2')
plt.show()
These snippets cover: age distribution analysis, plotting the age distribution, and the relationship of age, year of treatment, and number of positive lymph nodes with survival status.

Observations:
1. The age of patients ranges from 30 to 83 years. The distribution is slightly right-skewed, with the majority of patients aged 40-60.
2. The KDE plot confirms a peak around the 50s, indicating that most patients are middle-aged.
3. Patients who survived (status = 1) tend to have a slightly wider age range, but the median age for both survival statuses is similar.
4. Patients treated in later years (closer to 1969) have slightly better survival outcomes; earlier treatment years are associated with more cases of non-survival.
5. Patients with fewer positive lymph nodes (closer to 0) have higher survival rates; non-survivors tend to have significantly more positive lymph nodes.
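A quick numeric check of observation 5, assuming the renamed `data` frame from above; `groupby` summarises lymph-node counts per survival status:

# Median and spread of positive lymph nodes for survivors (1) vs non-survivors (2)
print(data.groupby('Survival_Status')['Lymph_Nodes'].describe())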
LabAssignment1_2

In [138… import pandas as pd


import numpy as np

In [145… df=pd.read_csv("/Users/user/Documents/BIT Mesra /M.tech/2nd Sem/DA LAB/ca_san_francisco_2020_04_01.csv")

DtypeWarning: Columns (6,11,12,17) have mixed types. Specify dtype option on import or set low_memory=False.
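The DtypeWarning can be avoided by disabling chunked type inference, as the message itself suggests; a sketch with the same file:

df = pd.read_csv("/Users/user/Documents/BIT Mesra /M.tech/2nd Sem/DA LAB/ca_san_francisco_2020_04_01.csv",
                 low_memory=False)  # read the whole file before inferring column dtypes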

In [146… df.head(100)

Out[146…     raw_row_number        date      time                    location        lat         lng district  ...
         0           869921  01-08-2014   0:01:00        MASONIC AV & FELL ST  37.773004 -122.445873      NaN  ...
         1           869922  01-08-2014   0:01:00               GEARY&10TH AV  37.780898 -122.468586      NaN  ...
         2           869923  01-08-2014   0:15:00         SUTTER N OCTAVIA ST  37.786919 -122.426718      NaN  ...
         3           869924  01-08-2014   0:18:00           3RD ST & DAVIDSON  37.746380 -122.392005      NaN  ...
         4           869925  01-08-2014   0:19:00   DIVISADERO ST. & BUSH ST.  37.786348 -122.440003      NaN  ...
         ..             ...         ...       ...                         ...        ...         ...      ...  ...
         95          870016  01-08-2014  11:47:00               1785 HAYES ST  37.773812 -122.445718      NaN  ...
         96          870017  01-08-2014  11:50:00        SUNSET & LAKE MERCED  37.722672 -122.494838      NaN  ...
         97          870018  01-08-2014  11:50:00  GREAT HIGHWAY & TARAVAL ST  37.741687 -122.506964      NaN  ...
         98          870019  01-08-2014  11:50:00                2345 LOMBARD  37.799123 -122.439892      NaN  ...
         99          870020  01-08-2014  11:51:00                 FULTON/35TH  37.772089 -122.494930      NaN  ...

         100 rows × 22 columns
In [147… df.size


Out[147… 19911540

In [148… missing_data=df.isnull().sum()
print(missing_data)

raw_row_number 0
date 0
time 35
location 43
lat 1697
lng 1697
district 52187
subject_age 58888
subject_race 0
subject_sex 0
type 2
arrest_made 1
citation_issued 2
warning_issued 0
outcome 15682
contraband_found 851689
search_conducted 0
search_vehicle 1
search_basis 851688
reason_for_stop 2212
raw_search_vehicle_description 0
raw_result_of_contact_description 0
dtype: int64

In [149… dataTypes=df.dtypes
dataTypes

Out[149… raw_row_number object


date object
time object
location object
lat float64
lng float64
district object
subject_age float64
subject_race object
subject_sex object
type object
arrest_made object
citation_issued object
warning_issued bool
outcome object
contraband_found object
search_conducted bool
search_vehicle object
search_basis object
reason_for_stop object
raw_search_vehicle_description object
raw_result_of_contact_description object
dtype: object

In [150… df_cleaned=df.dropna(subset=['time','location','type','arrest_made','citation_issued','search_vehicle'])
df_cleaned.isna().sum()


Out[150… raw_row_number 0
date 0
time 0
location 0
lat 1654
lng 1654
district 52181
subject_age 58881
subject_race 0
subject_sex 0
type 0
arrest_made 0
citation_issued 0
warning_issued 0
outcome 15681
contraband_found 851610
search_conducted 0
search_vehicle 0
search_basis 851609
reason_for_stop 2212
raw_search_vehicle_description 0
raw_result_of_contact_description 0
dtype: int64

In [151… df_cleaned.size

Out[151… 19909692

In [152… sum_age=df_cleaned['subject_age'].sum()
age_count=df_cleaned['subject_age'].notna().sum()
age_mean=sum_age/age_count
df_cleaned.loc[ : ,'subject_age']=df_cleaned['subject_age'].fillna(age_mean)
df_cleaned.isnull().sum()

Out[152… raw_row_number 0
date 0
time 0
location 0
lat 1654
lng 1654
district 52181
subject_age 0
subject_race 0
subject_sex 0
type 0
arrest_made 0
citation_issued 0
warning_issued 0
outcome 15681
contraband_found 851610
search_conducted 0
search_vehicle 0
search_basis 851609
reason_for_stop 2212
raw_search_vehicle_description 0
raw_result_of_contact_description 0
dtype: int64
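The manual sum/count above reproduces what pandas provides directly (`Series.mean()` skips NaN by default); an equivalent one-liner:

df_cleaned.loc[:, 'subject_age'] = df_cleaned['subject_age'].fillna(df_cleaned['subject_age'].mean())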

In [153… df_cleaned['subject_age'].head(100)


Out[153… 0 37.818918
1 37.818918
2 37.818918
3 37.818918
4 37.818918
...
96 37.818918
97 37.818918
98 37.818918
99 37.818918
100 37.818918
Name: subject_age, Length: 100, dtype: float64

In [154… value_counts_reason_stop=df_cleaned['reason_for_stop'].value_counts()
max_value=value_counts_reason_stop.idxmax()
max_value

Out[154… 'Moving Violation'

In [155… df_cleaned.loc[:, 'reason_for_stop'] = df_cleaned['reason_for_stop'].fillna(max_value)

In [156… df_cleaned.isnull().sum()

Out[156… raw_row_number 0
date 0
time 0
location 0
lat 1654
lng 1654
district 52181
subject_age 0
subject_race 0
subject_sex 0
type 0
arrest_made 0
citation_issued 0
warning_issued 0
outcome 15681
contraband_found 851610
search_conducted 0
search_vehicle 0
search_basis 851609
reason_for_stop 0
raw_search_vehicle_description 0
raw_result_of_contact_description 0
dtype: int64

In [157… value_counts_search_basis=df_cleaned['search_basis'].value_counts()
max_value=value_counts_search_basis.idxmax()
max_value
df_cleaned.loc[:, 'search_basis'] = df_cleaned['search_basis'].fillna(max_value)
df_cleaned.isnull().sum()


Out[157… raw_row_number 0
date 0
time 0
location 0
lat 1654
lng 1654
district 52181
subject_age 0
subject_race 0
subject_sex 0
type 0
arrest_made 0
citation_issued 0
warning_issued 0
outcome 15681
contraband_found 851610
search_conducted 0
search_vehicle 0
search_basis 0
reason_for_stop 0
raw_search_vehicle_description 0
raw_result_of_contact_description 0
dtype: int64
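The same value_counts/idxmax pattern repeats below for `district`, `outcome`, and `contraband_found`; a compact alternative sketch using `Series.mode()` (the column list is an assumption based on the cells that follow):

for col in ['reason_for_stop', 'search_basis', 'district', 'outcome', 'contraband_found']:
    # mode()[0] is the most frequent value, equivalent to value_counts().idxmax()
    df_cleaned.loc[:, col] = df_cleaned[col].fillna(df_cleaned[col].mode()[0])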

In [158… df_cleaned.loc[:,'lat'] = df_cleaned['lat'].interpolate(method='linear')


df_cleaned.loc[:,'lng'] = df_cleaned['lng'].interpolate(method='linear')
print(df_cleaned[['lat', 'lng']].head())

lat lng
0 37.773004 -122.445873
1 37.780898 -122.468586
2 37.786919 -122.426718
3 37.746380 -122.392005
4 37.786348 -122.440003

In [159… value_counts_district=df_cleaned['district'].value_counts()
max_value=value_counts_district.idxmax()
max_value
df_cleaned.loc[:, 'district'] = df_cleaned['district'].fillna(max_value)
df_cleaned.isnull().sum()


Out[159… raw_row_number 0
date 0
time 0
location 0
lat 0
lng 0
district 0
subject_age 0
subject_race 0
subject_sex 0
type 0
arrest_made 0
citation_issued 0
warning_issued 0
outcome 15681
contraband_found 851610
search_conducted 0
search_vehicle 0
search_basis 0
reason_for_stop 0
raw_search_vehicle_description 0
raw_result_of_contact_description 0
dtype: int64

In [160… value_counts_outcome=df_cleaned['outcome'].value_counts()
max_value=value_counts_outcome.idxmax()
max_value
df_cleaned.loc[:, 'outcome'] = df_cleaned['outcome'].fillna(max_value)
df_cleaned.isnull().sum()

Out[160… raw_row_number 0
date 0
time 0
location 0
lat 0
lng 0
district 0
subject_age 0
subject_race 0
subject_sex 0
type 0
arrest_made 0
citation_issued 0
warning_issued 0
outcome 0
contraband_found 851610
search_conducted 0
search_vehicle 0
search_basis 0
reason_for_stop 0
raw_search_vehicle_description 0
raw_result_of_contact_description 0
dtype: int64

In [161… value_counts_outcome=df_cleaned['contraband_found'].value_counts()
max_value=value_counts_outcome.idxmax()
max_value
df_cleaned.loc[:, 'contraband_found'] = df_cleaned['contraband_found'].fillna(max_value)
df_cleaned.isnull().sum()


FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt in to the future behavior, set pd.set_option('future.no_silent_downcasting', True).
Out[161… raw_row_number 0
date 0
time 0
location 0
lat 0
lng 0
district 0
subject_age 0
subject_race 0
subject_sex 0
type 0
arrest_made 0
citation_issued 0
warning_issued 0
outcome 0
contraband_found 0
search_conducted 0
search_vehicle 0
search_basis 0
reason_for_stop 0
raw_search_vehicle_description 0
raw_result_of_contact_description 0
dtype: int64

In [162… df_cleaned.isnull().sum()

Out[162… raw_row_number 0
date 0
time 0
location 0
lat 0
lng 0
district 0
subject_age 0
subject_race 0
subject_sex 0
type 0
arrest_made 0
citation_issued 0
warning_issued 0
outcome 0
contraband_found 0
search_conducted 0
search_vehicle 0
search_basis 0
reason_for_stop 0
raw_search_vehicle_description 0
raw_result_of_contact_description 0
dtype: int64

In [163… df_cleaned = df_cleaned.drop_duplicates()


df_cleaned.isnull().sum()


Out[163… raw_row_number 0
date 0
time 0
location 0
lat 0
lng 0
district 0
subject_age 0
subject_race 0
subject_sex 0
type 0
arrest_made 0
citation_issued 0
warning_issued 0
outcome 0
contraband_found 0
search_conducted 0
search_vehicle 0
search_basis 0
reason_for_stop 0
raw_search_vehicle_description 0
raw_result_of_contact_description 0
dtype: int64

In [164… unique_values = df_cleaned.nunique()


print(unique_values)

raw_row_number 904986
date 3469
time 1440
location 312994
lat 64380
lng 54321
district 13
subject_age 92
subject_race 5
subject_sex 2
type 1
arrest_made 2
citation_issued 2
warning_issued 2
outcome 3
contraband_found 2
search_conducted 2
search_vehicle 2
search_basis 3
reason_for_stop 26
raw_search_vehicle_description 36
raw_result_of_contact_description 28
dtype: int64

In [165… df_cleaned = df_cleaned.apply(lambda x: x.map(lambda y: y.upper() if isinstance(y, str) else y))

In [166… print(df_cleaned)


       raw_row_number        date      time                    location  \
0              869921  01-08-2014   0:01:00        MASONIC AV & FELL ST
1              869922  01-08-2014   0:01:00               GEARY&10TH AV
2              869923  01-08-2014   0:15:00         SUTTER N OCTAVIA ST
3              869924  01-08-2014   0:18:00           3RD ST & DAVIDSON
4              869925  01-08-2014   0:19:00   DIVISADERO ST. & BUSH ST.
...               ...         ...       ...                         ...
905065         893005  31-10-2014  23:47:00        ORTEGA ST & 38TH AVE
905066         893006  31-10-2014  23:48:00              DUBOCE/BELCHER
905067         893007  31-10-2014  23:49:00  NEW MONTGOMERY & STEVENSON
905068         893008  31-10-2014  23:50:00                 BURKE & 3RD
905069         893009  31-10-2014  23:50:00             FILLMORE/WALLER

              lat         lng district  subject_age            subject_race  \
0       37.773004 -122.445873        H    37.818918  ASIAN/PACIFIC ISLANDER
1       37.780898 -122.468586        H    37.818918                   BLACK
2       37.786919 -122.426718        H    37.818918                HISPANIC
3       37.746380 -122.392005        H    37.818918                HISPANIC
4       37.786348 -122.440003        H    37.818918                   WHITE
...           ...         ...      ...          ...                     ...
905065  37.751482 -122.496725        H    37.818918                   WHITE
905066  37.769388 -122.430272        H    37.818918  ASIAN/PACIFIC ISLANDER
905067  37.788297 -122.401527        H    37.818918                   WHITE
905068  37.745006 -122.387067        H    37.818918                   BLACK
905069  37.771315 -122.430458        H    37.818918                   WHITE

       subject_sex  ...  citation_issued  warning_issued   outcome  \
0           FEMALE  ...            False            True   WARNING
1             MALE  ...             True           False  CITATION
2             MALE  ...             True           False  CITATION
3             MALE  ...            False            True   WARNING
4             MALE  ...             True           False  CITATION
...            ...  ...              ...             ...       ...
905065        MALE  ...            False            True   WARNING
905066        MALE  ...             True           False  CITATION
905067        MALE  ...             True           False  CITATION
905068        MALE  ...            False            True   WARNING
905069        MALE  ...             True           False  CITATION

       contraband_found  search_conducted  search_vehicle  search_basis  \
0                 False             False           False         OTHER
1                 False             False           False         OTHER
2                 False             False           False         OTHER
3                 False             False           False         OTHER
4                 False             False           False         OTHER
...                 ...               ...             ...           ...
905065            False             False           False         OTHER
905066            False             False           False         OTHER
905067            False             False           False         OTHER
905068            False             False           False         OTHER
905069            False             False           False         OTHER

                                  reason_for_stop  \
0       MECHANICAL OR NON-MOVING VIOLATION (V.C.)
1       MECHANICAL OR NON-MOVING VIOLATION (V.C.)
2       MECHANICAL OR NON-MOVING VIOLATION (V.C.)
3       MECHANICAL OR NON-MOVING VIOLATION (V.C.)
4       MECHANICAL OR NON-MOVING VIOLATION (V.C.)
...                                           ...
905065                           MOVING VIOLATION
905066                           MOVING VIOLATION
905067                           MOVING VIOLATION
905068  MECHANICAL OR NON-MOVING VIOLATION (V.C.)
905069                           MOVING VIOLATION

       raw_search_vehicle_description raw_result_of_contact_description
0                            NO SEARCH                           WARNING
1                            NO SEARCH                          CITATION
2                            NO SEARCH                          CITATION
3                            NO SEARCH                           WARNING
4                            NO SEARCH                          CITATION
...                                ...                               ...
905065                       NO SEARCH                           WARNING
905066                       NO SEARCH                          CITATION
905067                       NO SEARCH                          CITATION
905068                       NO SEARCH                           WARNING
905069                       NO SEARCH                          CITATION

[904986 rows x 22 columns]

In [167… df_cleaned.head()

Out[167…    raw_row_number        date     time                   location        lat         lng district  ...
         0          869921  01-08-2014  0:01:00       MASONIC AV & FELL ST  37.773004 -122.445873        H  ...
         1          869922  01-08-2014  0:01:00              GEARY&10TH AV  37.780898 -122.468586        H  ...
         2          869923  01-08-2014  0:15:00        SUTTER N OCTAVIA ST  37.786919 -122.426718        H  ...
         3          869924  01-08-2014  0:18:00          3RD ST & DAVIDSON  37.746380 -122.392005        H  ...
         4          869925  01-08-2014  0:19:00  DIVISADERO ST. & BUSH ST.  37.786348 -122.440003        H  ...

         5 rows × 22 columns
In [168… import matplotlib.pyplot as plt
import seaborn as sns


data_selected = df_cleaned[['subject_age']]

plt.figure(figsize=(10, 6))

sns.boxplot(data_selected)
plt.title('Boxplot for Age ')
plt.show()

In [169… Q1 = df_cleaned['subject_age'].quantile(0.25)
Q3 = df_cleaned['subject_age'].quantile(0.75)
IQR = Q3 - Q1

df_no_outliers_IQR = df_cleaned[(df_cleaned['subject_age'] >= (Q1 - 1.5 * IQR)) &
                                (df_cleaned['subject_age'] <= (Q3 + 1.5 * IQR))]

print("Data after removing outliers using IQR:")


print(df_no_outliers_IQR[['subject_age']].head())

df_no_outliers_IQR.head()

Data after removing outliers using IQR:


subject_age
0 37.818918
1 37.818918
2 37.818918
3 37.818918
4 37.818918


Out[169…    raw_row_number        date     time                   location        lat         lng district  ...
         0          869921  01-08-2014  0:01:00       MASONIC AV & FELL ST  37.773004 -122.445873        H  ...
         1          869922  01-08-2014  0:01:00              GEARY&10TH AV  37.780898 -122.468586        H  ...
         2          869923  01-08-2014  0:15:00        SUTTER N OCTAVIA ST  37.786919 -122.426718        H  ...
         3          869924  01-08-2014  0:18:00          3RD ST & DAVIDSON  37.746380 -122.392005        H  ...
         4          869925  01-08-2014  0:19:00  DIVISADERO ST. & BUSH ST.  37.786348 -122.440003        H  ...

         5 rows × 22 columns
In [104… from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [105… numerical_columns = ['subject_age']


data_numerical = df_cleaned[numerical_columns]

In [108… scaler_minmax = MinMaxScaler()


data_normalized = scaler_minmax.fit_transform(data_numerical)
data_normalized_df = pd.DataFrame(data_normalized, columns=numerical_columns)

In [109… scaler_standard = StandardScaler()


data_standardized = scaler_standard.fit_transform(data_numerical)
data_standardized_df = pd.DataFrame(data_standardized, columns=numerical_columns)

In [111… df_cleaned['age_normalized'] = data_normalized_df


df_cleaned['age_standardized'] = data_standardized_df
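One caveat: `df_cleaned` keeps its original, gappy index after `dropna`, while the scaler outputs wrapped in new DataFrames get a fresh RangeIndex, so the column assignments above align by index and can silently introduce NaNs. A safer sketch assigns the raw arrays positionally:

df_cleaned['age_normalized'] = data_normalized.ravel()      # NumPy array: positional, no index alignment
df_cleaned['age_standardized'] = data_standardized.ravel()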

In [112… df_cleaned.head(100)


Out[112…     raw_row_number        date      time                    location        lat         lng  ...
         0           869921  01-08-2014   0:01:00        MASONIC AV & FELL ST  37.773004 -122.445873  ...
         1           869922  01-08-2014   0:01:00               GEARY&10TH AV  37.780898 -122.468586  ...
         2           869923  01-08-2014   0:15:00         SUTTER N OCTAVIA ST  37.786919 -122.426718  ...
         3           869924  01-08-2014   0:18:00           3RD ST & DAVIDSON  37.746380 -122.392005  ...
         4           869925  01-08-2014   0:19:00   DIVISADERO ST. & BUSH ST.  37.786348 -122.440003  ...
         ..             ...         ...       ...                         ...        ...         ...  ...
         96          870017  01-08-2014  11:50:00        SUNSET & LAKE MERCED  37.722672 -122.494838  ...
         97          870018  01-08-2014  11:50:00  GREAT HIGHWAY & TARAVAL ST  37.741687 -122.506964  ...
         98          870019  01-08-2014  11:50:00                2345 LOMBARD  37.799123 -122.439892  ...
         99          870020  01-08-2014  11:51:00                 FULTON/35TH  37.772089 -122.494930  ...
         100         870021  01-08-2014  12:00:00                 PIERCE/EDDY  37.781014 -122.435621  ...

         100 rows × 24 columns
In [114… df_cleaned['age_standardized'] = data_standardized_df.round(decimals=8)

In [115… df_cleaned.head(100)


Out[115…     raw_row_number        date      time                    location        lat         lng  ...
         0           869921  01-08-2014   0:01:00        MASONIC AV & FELL ST  37.773004 -122.445873  ...
         1           869922  01-08-2014   0:01:00               GEARY&10TH AV  37.780898 -122.468586  ...
         2           869923  01-08-2014   0:15:00         SUTTER N OCTAVIA ST  37.786919 -122.426718  ...
         3           869924  01-08-2014   0:18:00           3RD ST & DAVIDSON  37.746380 -122.392005  ...
         4           869925  01-08-2014   0:19:00   DIVISADERO ST. & BUSH ST.  37.786348 -122.440003  ...
         ..             ...         ...       ...                         ...        ...         ...  ...
         96          870017  01-08-2014  11:50:00        SUNSET & LAKE MERCED  37.722672 -122.494838  ...
         97          870018  01-08-2014  11:50:00  GREAT HIGHWAY & TARAVAL ST  37.741687 -122.506964  ...
         98          870019  01-08-2014  11:50:00                2345 LOMBARD  37.799123 -122.439892  ...
         99          870020  01-08-2014  11:51:00                 FULTON/35TH  37.772089 -122.494930  ...
         100         870021  01-08-2014  12:00:00                 PIERCE/EDDY  37.781014 -122.435621  ...

         100 rows × 24 columns
DAI 10th feb-2.ipynb

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

# Load the Iris dataset


iris = datasets.load_iris()
X = iris.data # Features (4D)
y = iris.target # Labels (0, 1, 2)
labels = iris.target_names # Class names

# Standardize the data


X_std = StandardScaler().fit_transform(X)

# Compute class means


mean_vectors = [np.mean(X_std[y == cl], axis=0) for cl in np.unique(y)]

# Compute Within-Class Scatter Matrix (Sw)


Sw = sum([(X_std[y == cl] - mean).T @ (X_std[y == cl] - mean) for cl, mean in zip(np.unique(y), mean_vectors)])

# Compute Between-Class Scatter Matrix (Sb)


overall_mean = np.mean(X_std, axis=0)
Sb = sum([len(X_std[y == cl]) * np.outer(mean - overall_mean, mean - overall_mean) for cl, mean in zip(np.unique(y), mean_vectors)])

# Solve for eigenvectors of Sw⁻¹Sb


eigvals, eigvecs = np.linalg.eig(np.linalg.inv(Sw) @ Sb)

# Select top 2 eigenvectors


W = eigvecs[:, np.argsort(eigvals)[-2:]] # Pick two largest eigenvectors
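The export jumps from computing `W` straight to plotting `X_lda`, so the projection cell is missing; a minimal reconstruction consistent with the plot code below:

# Project the standardized data onto the two discriminant directions.
# np.linalg.eig can return a complex dtype with ~0 imaginary parts, so take the real part.
X_lda = np.real(X_std @ W)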



# Plot after LDA


plt.subplot(1, 2, 2)
for cl, label in zip(np.unique(y), labels):
plt.scatter(X_lda[y == cl, 0], X_lda[y == cl, 1], label=label)
plt.xlabel("LDA Component 1")
plt.ylabel("LDA Component 2")
plt.title("Data after LDA")
plt.legend()

plt.tight_layout()
plt.show()

[Figure: "Data after LDA" scatter plot, LDA Component 1 vs LDA Component 2, classes setosa, versicolor, virginica.]

mtai1000124_lab3_lda.ipynb

 3

X_train, X_test, y_train, y_test = train_test_split(X_lda, y, test_size=0.3, random_state=42)

clf_lda = RandomForestClassifier(n_estimators=100, random_state=42)


clf_lda.fit(X_train, y_train)

y_pred_lda = clf_lda.predict(X_test)
lda_accuracy = accuracy_score(y_test, y_pred_lda)
print(f"LDA + Random Forest Accuracy: {lda_accuracy:.4f}")

LDA + Random Forest Accuracy: 1.0000

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.3, random_state=42)

clf_pca = RandomForestClassifier(n_estimators=100, random_state=42)


clf_pca.fit(X_train_pca, y_train_pca)

y_pred_pca = clf_pca.predict(X_test_pca)
pca_accuracy = accuracy_score(y_test_pca, y_pred_pca)

print(f"Random Forest Accuracy using LDA: {lda_accuracy:.4f}")


print(f"Random Forest Accuracy using PCA: {pca_accuracy:.4f}")

Random Forest Accuracy using LDA: 1.0000


Random Forest Accuracy using PCA: 0.9778

components = [1, 2, 3]
accuracy_scores = []

for n in components:
    pca = PCA(n_components=n)
    X_pca = pca.fit_transform(X)

    X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.3, random_state=42)

    clf_pca = RandomForestClassifier(n_estimators=100, random_state=42)
    clf_pca.fit(X_train_pca, y_train_pca)

    y_pred_pca = clf_pca.predict(X_test_pca)
    accuracy = accuracy_score(y_test_pca, y_pred_pca)


    accuracy_scores.append(accuracy)
    print(f"Random Forest Accuracy using PCA ({n} components): {accuracy:.4f}")

plt.figure(figsize=(8, 5))
plt.plot(components, accuracy_scores, marker='o', linestyle='-', color='b', label='Accuracy')
plt.xlabel("Number of PCA Components")
plt.ylabel("Accuracy")
plt.title("Random Forest Accuracy with Different PCA Components")
plt.xticks(components)
plt.legend()
plt.grid()
plt.show()

Random Forest Accuracy using PCA (1 components): 0.9111


Random Forest Accuracy using PCA (2 components): 0.9778
Random Forest Accuracy using PCA (3 components): 1.0000

mtai1000124_lab4_dt.ipynb

MTAI1001124

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
import seaborn as sns


df = pd.read_csv("CarPrice_Assignment.csv")

missing_values = df.isnull().sum()

print("Missing Values in Each Column:\n", missing_values)

df = df.dropna()

median_price = df['price'].median()
df['price_category'] = (df['price'] > median_price).astype(int)  # 1 = high, 0 = low

df = df.drop(columns=['car_ID', 'CarName', 'price'])

df = pd.get_dummies(df, drop_first=True)

Missing Values in Each Column:


car_ID 0
symboling 0
CarName 0
fueltype 0
aspiration 0
doornumber 0
carbody 0
drivewheel 0
enginelocation 0
wheelbase 0
carlength 0
carwidth 0
carheight 0
curbweight 0
enginetype 0
cylindernumber 0
enginesize 0
fuelsystem 0
boreratio 0
stroke 0
compressionratio 0
horsepower 0
peakrpm 0
citympg 0
highwaympg 0
price 0
dtype: int64

X = df.drop(columns=['price_category'])
y = df['price_category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = DecisionTreeClassifier(random_state=42)


clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)


print(f"Accuracy: {accuracy:.4f}")

# Compute confusion matrix


conf_matrix = confusion_matrix(y_test, y_pred)

# Create a heatmap
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["Low Price", "High Price"], yticklabels=["Low Price", "High Price"])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix Heatmap")
plt.show()

print("Classification Report:\n", classification_report(y_test, y_pred, target_name


Accuracy: 0.9756

Classification Report:
               precision    recall  f1-score   support

   Low Price       1.00      0.96      0.98        23
  High Price       0.95      1.00      0.97        18

    accuracy                           0.98        41
   macro avg       0.97      0.98      0.98        41
weighted avg       0.98      0.98      0.98        41

clf_tuned = DecisionTreeClassifier(max_depth=4, random_state=42)


clf_tuned.fit(X_train, y_train)

plt.figure(figsize=(20, 10))
plot_tree(clf_tuned, feature_names=X.columns, class_names=["Low Price", "High Price"])
plt.show()

y_pred_tuned = clf_tuned.predict(X_test)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
print(f"Tuned Model Accuracy: {accuracy_tuned:.4f}")


Tuned Model Accuracy: 0.9512


# Extract values from confusion matrix


tn, fp, fn, tp = conf_matrix.ravel()

sensitivity = tp / (tp + fn)


print(f"Sensitivity: {sensitivity:.4f}")

specificity = tn / (tn + fp)


print(f"Specificity: {specificity:.4f}")

Sensitivity: 1.0000
Specificity: 0.9565

mtai1000124_lab5_linear_regression.ipynb

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures

df = pd.read_csv('advertising.csv')
df

ID TV Radio Newspaper Sales

0 1 230.1 37.8 69.2 22.1

1 2 44.5 39.3 45.1 10.4

2 3 17.2 45.9 69.3 9.3

3 4 151.5 41.3 58.5 18.5

4 5 180.8 10.8 58.4 12.9

... ... ... ... ... ...

195 196 38.2 3.7 13.8 7.6

196 197 94.2 4.9 8.1 9.7

197 198 177.0 9.3 6.4 12.8

198 199 283.6 42.0 66.2 25.5

199 200 232.1 8.6 8.7 13.4

200 rows × 5 columns

X = df[['TV', 'Radio', 'Newspaper']]


y = df['Sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)


rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")


print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")

Mean Squared Error (MSE): 3.17


Root Mean Squared Error (RMSE): 1.78
Mean Absolute Error (MAE): 1.46
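R² would round out these error metrics; a one-line sketch with the same test split (r2_score is an extra import, not used elsewhere in this cell):

from sklearn.metrics import r2_score
print(f"R^2 on the test set: {r2_score(y_test, y_pred):.4f}")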


plt.figure(figsize=(8, 6))
plt.scatter(y_pred, y_test, color='orange', alpha=0.6, label='Actual vs Predicted')
plt.plot([y_pred.min(), y_pred.max()], [y_test.min(), y_test.max()], 'b--', lw=2, label='Ideal Fit')
plt.xlabel('Predicted Sales')
plt.ylabel('Actual Sales')
plt.title('Predicted vs Actual Sales')
plt.legend()
plt.show()


cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')


print(f"Cross-validated for full model: {cv_scores.mean():.4f}")

cv_scores_individual = {}
for feature in ['TV', 'Radio', 'Newspaper']:
X_single = df[[feature]]
scores = cross_val_score(LinearRegression(), X_single, y, cv=5, scoring='r2')
cv_scores_individual[feature] = scores.mean()

print("\nCross-validated for individual features:")


for feature, score in cv_scores_individual.items():
print(f"{feature}: {score:.4f}")

Cross-validated R² for full model: 0.8871

Cross-validated R² for individual features:


TV: 0.5926
Radio: 0.3106
Newspaper: 0.0239


plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
plt.scatter(df['TV'], df['Sales'], color='green', alpha=0.6)
plt.xlabel('TV Advertising')
plt.ylabel('Sales')
plt.title('TV vs Sales')

plt.subplot(1, 3, 2)
plt.scatter(df['Radio'], df['Sales'], color='black', alpha=0.6)
plt.xlabel('Radio Advertising')
plt.ylabel('Sales')
plt.title('Radio vs Sales')

plt.subplot(1, 3, 3)
plt.scatter(df['Newspaper'], df['Sales'], color='purple', alpha=0.6)
plt.xlabel('Newspaper Advertising')
plt.ylabel('Sales')
plt.title('Newspaper vs Sales')

plt.tight_layout()
plt.show()


results = {}
cv_results = {}

for degree in [1, 2, 3]:
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    X_poly = poly.fit_transform(X)

    model = LinearRegression()
    model.fit(X_train_poly, y_train)

    y_train_pred = model.predict(X_train_poly)
    y_test_pred = model.predict(X_test_poly)

    mse_train = mean_squared_error(y_train, y_train_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    rmse_train = np.sqrt(mse_train)
    rmse_test = np.sqrt(mse_test)

    results[degree] = {
        'Train_MSE': mse_train,
        'Test_MSE': mse_test,
        'Train_MAE': mae_train,
        'Test_MAE': mae_test,
        'Train_RMSE': rmse_train,
        'Test_RMSE': rmse_test
    }

    mse_cv_scores = -cross_val_score(model, X_poly, y, scoring='neg_mean_squared_error', cv=5)

    cv_results[degree] = {
        'CV_MSE_Mean': mse_cv_scores.mean(),
        'CV_MSE_Std': mse_cv_scores.std()
    }


results_df = pd.DataFrame(results).T
cv_df = pd.DataFrame(cv_results).T
print("Error Metrics:")
print(results_df.round(3))
print("\nCross-Validation MSE:")
print(cv_df.round(3))

Error Metrics:
Train_MSE Test_MSE Train_MAE Test_MAE Train_RMSE Test_RMSE
1 2.676 2.908 1.234 1.275 1.636 1.705
2 1.908 1.443 1.057 0.903 1.381 1.201
3 1.698 1.812 0.985 0.935 1.303 1.346

Cross-Validation MSE:
CV_MSE_Mean CV_MSE_Std
1 2.842 1.061
2 1.994 0.769
3 2.186 0.846


plt.figure(figsize=(8, 5))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.tight_layout()
plt.show()

DAI 10march.ipynb

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the dataset


df = pd.read_csv('Mall_Customers.csv.csv')

# Display first few rows


display(df.head())

CustomerID Gender Age Annual Income (k$) Spending Score (1-100)

0 1 Male 19 15 39

1 2 Male 21 15 81

2 3 Female 20 16 6

3 4 Female 23 16 77

4 5 Female 31 17 40

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CustomerID 200 non-null int64
1 Gender 200 non-null object
2 Age 200 non-null int64
3 Annual Income (k$) 200 non-null int64
4 Spending Score (1-100) 200 non-null int64
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


plt.scatter(df['Age'], df['Spending Score (1-100)'])


plt.xlabel('Age')
plt.ylabel('Spending Score')
plt.show()


plt.scatter(df['Age'], df['Annual Income (k$)'])


plt.xlabel('Age')
plt.ylabel('Annual Income')
plt.show()


plt.scatter(df['Annual Income (k$)'], df['Spending Score (1-100)'])


plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.show()

# Selecting relevant features for clustering


X = df[['Annual Income (k$)', 'Spending Score (1-100)']]


sse = []
for i in range(1, 10):
    km = KMeans(n_clusters=i)
    km.fit(df[['Annual Income (k$)', 'Spending Score (1-100)']])
    sse.append(km.inertia_)

FutureWarning (sklearn.cluster._kmeans, emitted once per fit here and in the later KMeans cells): the default value of `n_init` will change from 10 to 'auto' in a future release; set `n_init` explicitly to suppress the warning.

sse

[269981.28,
181363.59595959593,
106348.37306211119,
73679.78903948834,
44448.45544793371,
37233.814510710006,
30259.65720728547,
25028.02047526941,
21830.041978049438]


# Finding the optimal number of clusters using the Elbow Method (SSE)
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)



# Plotting the SSE (WCSS) Elbow Method curve


plt.figure(figsize=(8, 5))
plt.plot(range(1, 11), wcss, marker='o', linestyle='-', color='b')
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS (Within-Cluster Sum of Squares)")
plt.title("Elbow Method using SSE")
plt.grid(True)
plt.show()


# Finding the optimal number of clusters using Silhouette Score

silhouette_scores = []
silhouette_values = {}
for i in range(2, 11):  # silhouette score is only defined for 2 or more clusters
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    cluster_labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    silhouette_values[i] = silhouette_avg


# Print Silhouette Scores for different cluster numbers
print("Silhouette Scores for different cluster numbers:")
for k, v in silhouette_values.items():
    print(f"Clusters: {k}, Silhouette Score: {v:.4f}")  # four decimals

Silhouette Scores for different cluster numbers:


Clusters: 2, Silhouette Score: 0.2969
Clusters: 3, Silhouette Score: 0.4676
Clusters: 4, Silhouette Score: 0.4932
Clusters: 5, Silhouette Score: 0.5539
Clusters: 6, Silhouette Score: 0.5398
Clusters: 7, Silhouette Score: 0.5288
Clusters: 8, Silhouette Score: 0.4548
Clusters: 9, Silhouette Score: 0.4561
Clusters: 10, Silhouette Score: 0.4411
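The argmax can be read off programmatically from the dict built above; a small sketch:

best_k = max(silhouette_values, key=silhouette_values.get)  # k with the highest silhouette score
print(f"Best number of clusters by silhouette score: {best_k}")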


# Plotting the Silhouette Score curve


plt.figure(figsize=(8, 5))
plt.plot(range(2, 11), silhouette_scores, marker='o', linestyle='-', color='g')
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Score")
plt.title("Elbow Method using Silhouette Score")
plt.grid(True)
plt.show()

# Applying K-means clustering with optimal clusters (from Elbow Method, assume 5)
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
df['Cluster'] = kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_



km = KMeans(n_clusters = 4)
predicted = km.fit_predict(df[['Annual Income (k$)', 'Spending Score (1-100)']])
predicted

array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 3, 2, 3, 2, 3, 2, 3,
2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
2, 3], dtype=int32)

df['Cluster'] = predicted
df

     CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)  Cluster

0             1    Male   19                  15                      39        0
1             2    Male   21                  15                      81        1
2             3  Female   20                  16                       6        0
3             4  Female   23                  16                      77        1
4             5  Female   31                  17                      40        0
..          ...     ...  ...                 ...                     ...      ...
195         196  Female   35                 120                      79        3
196         197  Female   45                 126                      28        2
197         198    Male   32                 126                      74        3
198         199    Male   32                 137                      18        2
199         200    Male   30                 137                      83        3

200 rows × 6 columns


df1 = df[df.Cluster==0]
df2 = df[df.Cluster==1]
df3 = df[df.Cluster==2]
df4 = df[df.Cluster==3]
df5 = df[df.Cluster==4]
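Note that `km` above was fit with `n_clusters = 4`, so labels only run 0-3 and `df5` ends up empty; the elbow and silhouette analyses both pointed to 5 clusters. A consistent refit might look like:

km = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)
df['Cluster'] = km.fit_predict(df[['Annual Income (k$)', 'Spending Score (1-100)']])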

plt.scatter(df1['Annual Income (k$)'],df1['Spending Score (1-100)'],color='green')


plt.scatter(df2['Annual Income (k$)'],df2['Spending Score (1-100)'],color='red')
plt.scatter(df3['Annual Income (k$)'],df3['Spending Score (1-100)'],color='black')
plt.scatter(df4['Annual Income (k$)'],df4['Spending Score (1-100)'],color='c')
plt.scatter(df5['Annual Income (k$)'],df5['Spending Score (1-100)'],color='blue')

plt.scatter(km.cluster_centers_[:,0],km.cluster_centers_[:,1],
color='purple',marker='*',label='centroid')

plt.xlabel('Annual Income (k$)')


plt.ylabel('Spending Score (1-100)')
plt.legend()

<matplotlib.legend.Legend at 0x175b5d1f0>

 2) Fuzzy C means


!pip install scikit-fuzzy

Collecting scikit-fuzzy
Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading scikit_fuzzy-0.5.0-py2.py3-none-any.whl (920 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 920.8/920.8 kB 2.1 MB/s eta 0:00:00
Installing collected packages: scikit-fuzzy
Successfully installed scikit-fuzzy-0.5.0

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import skfuzzy as fuzz # Fuzzy C-Means library

# Load the dataset


df = pd.read_csv('Mall_Customers.csv.csv')

display(df.head())

CustomerID Gender Age Annual Income (k$) Spending Score (1-100)

0 1 Male 19 15 39

1 2 Male 21 15 81

2 3 Female 20 16 6

3 4 Female 23 16 77

4 5 Female 31 17 40

# Selecting relevant features for clustering


X = df[['Annual Income (k$)', 'Spending Score (1-100)']].values.T  # Transpose for skfuzzy, which expects (features, samples)

# Number of clusters (assume 5 based on previous K-means results)


n_clusters = 5

# Applying Fuzzy C-Means clustering


cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
X, n_clusters, m=2, error=0.005, maxiter=1000, init=None
)


# Assigning cluster labels based on maximum membership
cluster_labels = np.argmax(u, axis=0)
df['Fuzzy Cluster'] = cluster_labels

# Print the membership values for the first 10 customers
print("Fuzzy Membership Values for the first 10 customers:")
for i in range(10):
    print(f"Customer {i+1}: {u[:, i]}")

Fuzzy Membership Values for the first 10 customers:


Customer 1: [0.0550021 0.16838362 0.04536537 0.18600858 0.54524033]
Customer 2: [0.01355536 0.04630829 0.02583153 0.88343684 0.03086798]
Customer 3: [0.04608055 0.06588782 0.02193332 0.04396698 0.82213134]
Customer 4: [0.01156024 0.04165177 0.02096481 0.89781777 0.02800541]
Customer 5: [0.05518953 0.17976226 0.04605764 0.19265389 0.52633668]
Customer 6: [0.01018598 0.03770879 0.01839446 0.90881604 0.02489473]
Customer 7: [0.04263926 0.06030849 0.0196823 0.03872618 0.83864377]
Customer 8: [0.02482971 0.07665261 0.058066 0.79473005 0.04572164]
Customer 9: [0.0533099 0.07034345 0.02351724 0.04420687 0.80862253]
Customer 10: [0.01072754 0.04288594 0.0184509 0.9000948 0.02784081]

# Categorizing customers based on clusters
categories = {
    0: "Frugal",    # Low Income, Low Spending
    1: "Careless",  # Low Income, High Spending
    2: "Sensible",  # Moderate Income, Moderate Spending
    3: "Lavish",    # High Income, High Spending
    4: "Cautious"   # High Income, Low Spending
}
df['Category'] = df['Fuzzy Cluster'].map(categories)

# Scatter plot of clusters


plt.figure(figsize=(8, 6))
colors = ['red', 'blue', 'green', 'purple', 'orange']
for i in range(n_clusters):
plt.scatter(df.loc[df['Fuzzy Cluster'] == i, 'Annual Income (k$)'],
df.loc[df['Fuzzy Cluster'] == i, 'Spending Score (1-100)'],
label=f'Cluster {i}', s=100, color=colors[i])

# Marking centroids
plt.scatter(cntr[:, 0], cntr[:, 1], s=300, c='black', marker='X', label='Centroids')

plt.xlabel("Annual Income (k$)")


plt.ylabel("Spending Score (1-100)")
plt.title("Fuzzy C-Means Clustering")
plt.legend()
plt.grid(True)


plt.show()

# Display sample results


display(df[['Annual Income (k$)', 'Spending Score (1-100)', 'Fuzzy Cluster', 'Category']])

Annual Income (k$) Spending Score (1-100) Fuzzy Cluster Category

0 15 39 4 Cautious

1 15 81 3 Lavish

2 16 6 4 Cautious

3 16 77 3 Lavish

4 17 40 4 Cautious

... ... ... ... ...

95 60 52 1 Careless

96 60 47 1 Careless

97 60 50 1 Careless

98 61 42 1 Careless
DAI 17th march.ipynb

K-means clustering on two parameters: Annual Income and Spending Score

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial import distance_matrix
from sklearn.cluster import AgglomerativeClustering

# Load dataset
df = pd.read_csv('Mall_Customers.csv.csv')

display(df.head())

# Select relevant features


X = df[['Annual Income (k$)', 'Spending Score (1-100)']].values

CustomerID Gender Age Annual Income (k$) Spending Score (1-100)

0 1 Male 19 15 39

1 2 Male 21 15 81

2 3 Female 20 16 6

3 4 Female 23 16 77

4 5 Female 31 17 40

# Hierarchical Clustering
num_clusters = 5 # Determined from the dendrogram
hc = AgglomerativeClustering(n_clusters=num_clusters, affinity='euclidean', linkage='ward')
y_hc = hc.fit_predict(X)
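The FutureWarning emitted below comes from the `affinity` argument, which newer scikit-learn releases rename to `metric` (and Ward linkage requires euclidean distance anyway); a version-robust call, assuming scikit-learn >= 1.2:

hc = AgglomerativeClustering(n_clusters=num_clusters, metric='euclidean', linkage='ward')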

# Visualizing the clusters


plt.figure(figsize=(8, 6))
colors = ['red', 'blue', 'green', 'purple', 'orange']
for i in range(num_clusters):
    plt.scatter(X[y_hc == i, 0], X[y_hc == i, 1], s=100, c=colors[i], label=f'Cluster {i}')

plt.xlabel('Annual Income (k$)')


plt.ylabel('Spending Score (1-100)')
plt.title('Customer Segmentation using Hierarchical Clustering')
plt.legend()
plt.show()

FutureWarning (sklearn.cluster._agglomerative): the `affinity` parameter is deprecated; use `metric` instead.


# Display the Dendrogram


plt.figure(figsize=(10, 6))
linked = linkage(X, method='ward') # Using Ward's method for minimum variance

dendrogram(linked)
plt.title("Dendrogram for Hierarchical Clustering")
plt.xlabel("Customers")
plt.ylabel("Euclidean Distance")
plt.show()


# Display the Proximity Matrix


proximity_matrix = distance_matrix(X, X)
print("Proximity Matrix:")
print(proximity_matrix)

Proximity Matrix:
[[ 0. 42. 33.01514804 ... 116.38728453 123.79418403
129.69194269]
[ 42. 0. 75.00666637 ... 111.22050171 137.3062271
122.01639234]
[ 33.01514804 75.00666637 0. ... 129.32130528 121.59358536
143.42245291]
...
[116.38728453 111.22050171 129.32130528 ... 0. 57.07013229
14.2126704 ]
[123.79418403 137.3062271 121.59358536 ... 57.07013229 0.
65. ]
[129.69194269 122.01639234 143.42245291 ... 14.2126704 65.
0. ]]
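
A side note (an addition, not in the original notebook): distance_matrix materializes the full N×N array even though it is symmetric with a zero diagonal. SciPy's pdist computes only the condensed upper triangle, and squareform expands it back whenever the full matrix is needed:

from scipy.spatial.distance import pdist, squareform

condensed = pdist(X)          # N*(N-1)/2 pairwise Euclidean distances
full = squareform(condensed)  # same values as distance_matrix(X, X)
print(condensed.shape, full.shape)  # e.g. (19900,) and (200, 200) if the dataset has 200 rows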


LAB 8

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split

iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)  # random_state value truncated in the export; 42 assumed

normalizer_l1 = Normalizer(norm='l1')
X_train_l1 = normalizer_l1.fit_transform(X_train)
X_test_l1 = normalizer_l1.transform(X_test)

normalizer_l2 = Normalizer(norm='l2')
X_train_l2 = normalizer_l2.fit_transform(X_train)
X_test_l2 = normalizer_l2.transform(X_test)

print("Original
print("\nL2 Normalized
Scaled Features (First 3 Samples):\n", X_train[:3])
X_train_l2[:3])
print("\nL1 Normalized Features (First 3 Samples):\n", X_train_l1[:3])
print("\nL2 Normalized Features (First 3 Samples):\n", X_train_l2[:3])

Original Scaled Features (First 3 Samples):


[[-1.50652052 1.24920112 -1.56757623 -1.3154443 ]
[-0.17367395 3.09077525 -1.2833891 -1.05217993]
[ 1.03800476 0.09821729 0.36489628 0.26414192]]

L1 Normalized Features (First 3 Samples):


[[-0.26717315 0.22153897 -0.27800105 -0.23328683]
[-0.0310131 0.55192236 -0.22917588 -0.18788866]
[ 0.58801798 0.05563898 0.20670962 0.14963341]]

L2 Normalized Features (First 3 Samples):


[[-0.53204399 0.44116886 -0.55360647 -0.46456335]
[-0.04944541 0.87995141 -0.36538407 -0.29955825]
[ 0.91390488 0.0864748 0.32127068 0.23256212]]
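
A quick sanity check (an addition, not in the original lab): after L1 normalization each row's absolute values sum to 1, and after L2 normalization each row has unit Euclidean norm:

import numpy as np

# Rows of the L1-normalized matrix sum to 1 in absolute value
print(np.abs(X_train_l1).sum(axis=1)[:3])      # ~ [1. 1. 1.]
# Rows of the L2-normalized matrix have Euclidean norm 1
print(np.linalg.norm(X_train_l2, axis=1)[:3])  # ~ [1. 1. 1.]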


LAB 9

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

 1

diabetes X_test,
X_train, = load_diabetes()
y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta
X = diabetes.data
y = diabetes.target
feature_names = diabetes.feature_names

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_sta

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

print("Linear Regression R² score:", r2_score(y_test, y_pred_lr))


print("Linear Regression RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_lr)))

Linear Regression R² score: 0.4526027629719197


Linear Regression RMSE: 53.853445836765914

lasso = Lasso(alpha=1.0)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)

print("Lasso R² score:", r2_score(y_test, y_pred_lasso))


print("Lasso RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_lasso)))

Lasso R² score: 0.3575918767219115


Lasso RMSE: 58.340172450954185

https://colab.research.google.com/drive/1XzD0vTh0lvEBhVjBTXXf3X…oRJxDt?authuser=4#scrollTo=1f887668-3ebe-4f78-ba67-30dca3c3f6ff Page 1 of 6
mtai1000124_lab9.ipynb - Colab 22/04/25, 2:39 PM

 2

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

print("Ridge R² score:", r2_score(y_test, y_pred_ridge))


print("Ridge RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_ridge)))

# Compare coefficients
print("\nLasso Coefficients:\n", lasso.coef_)
print("\nRidge Coefficients:\n", ridge.coef_)

Ridge R² score: 0.41915292635986545


Ridge RMSE: 55.47446204180109

Lasso Coefficients:
[ 0. -0. 413.43184792 34.83051518 0.
0. -0. 0. 258.15289363 0. ]

Ridge Coefficients:
[ 45.36737726 -76.66608563 291.33883165 198.99581745 -0.53030959
-28.57704987 -144.51190505 119.26006559 230.22160832 112.14983004]

Lasso (L1) regularization can shrink some coefficients exactly to zero, effectively performing feature selection. Ridge (L2) regularization shrinks coefficients towards zero but does not eliminate them entirely. Lasso is useful when we expect only a few features to be important; Ridge is preferred when all features contribute a bit.
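
To see the selection effect numerically (a small addition, using the models fitted above), count the non-zero coefficients in each model:

# Lasso zeroes out coefficients; Ridge only shrinks them
print("Non-zero Lasso coefficients:", np.sum(lasso.coef_ != 0))  # 3 of 10 here
print("Non-zero Ridge coefficients:", np.sum(ridge.coef_ != 0))  # all 10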

 3


lasso_cv = GridSearchCV(Lasso(), param_grid={'alpha': np.linspace(0.01, 1, 100)}, cv=5, scoring='r2')  # tail truncated in the export; cv=5, scoring='r2' mirrors the Ridge search below


lasso_cv.fit(X_train, y_train)

best_lasso = lasso_cv.best_estimator_
y_pred_best_lasso = best_lasso.predict(X_test)

print("Best alpha for Lasso:", lasso_cv.best_params_['alpha'])


print("Best Lasso R² score:", r2_score(y_test, y_pred_best_lasso))

Best alpha for Lasso: 0.08


Best Lasso R² score: 0.47151662767175617


alphas = np.linspace(0.01, 10, 100)


ridge_cv = GridSearchCV(Ridge(), param_grid={'alpha': alphas}, cv=5, scoring='r2')
ridge_cv.fit(X_train, y_train)

best_ridge = ridge_cv.best_estimator_
y_pred_best_ridge = best_ridge.predict(X_test)

print("Best alpha for Ridge:", ridge_cv.best_params_['alpha'])


print("Best Ridge R² score:", r2_score(y_test, y_pred_best_ridge))

results = ridge_cv.cv_results_
plt.plot(alphas, results['mean_test_score'])
plt.xlabel("Alpha")
plt.ylabel("Cross-validated R²")
plt.title("Ridge Alpha vs R²")
plt.grid(True)
plt.show()

Best alpha for Ridge: 0.1109090909090909


Best Ridge R² score: 0.4610763713046666


 4

zero_coef_indices = np.where(best_lasso.coef_ == 0)[0]


zero_features = [feature_names[i] for i in zero_coef_indices]

print("Features eliminated (coeff=0) by Lasso:", zero_features)

Features eliminated (coeff=0) by Lasso: ['age', 's2', 's4']

models = {
"Linear Regression": (lr, y_pred_lr),
"Lasso (best alpha)": (best_lasso, y_pred_best_lasso),
"Ridge (best alpha)": (best_ridge, y_pred_best_ridge)
}

for name, (model, y_pred) in models.items():
    print(f"\n{name}")
    print("R² score:", r2_score(y_test, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

Linear Regression
R² score: 0.4526027629719197
RMSE: 53.853445836765914

Lasso (best alpha)


R² score: 0.47151662767175617
RMSE: 52.91488545922437

Ridge (best alpha)


R² score: 0.4610763713046666
RMSE: 53.43499944501818


Using GridSearchCV, the optimal alpha for Ridge regression was found to be 0.1109, achieving an R² score of 0.4611 and RMSE of 53.43 on the test set. The performance plot showed that as alpha increased, the model's R² score decreased, indicating that excessive regularization led to underfitting. The best alpha provided a good balance between bias and variance.
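
As an aside (not part of the original lab), scikit-learn's RidgeCV performs the same alpha search with efficient leave-one-out cross-validation by default; a minimal sketch over the same grid:

from sklearn.linear_model import RidgeCV

ridge_loo = RidgeCV(alphas=np.linspace(0.01, 10, 100))  # LOO-CV over the same alpha grid
ridge_loo.fit(X_train, y_train)
print("RidgeCV-selected alpha:", ridge_loo.alpha_)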


MTAI1001124(harshit.2).ipynb

In [6]: import pandas as pd


file_path = "rollingsales.csv"
df = pd.read_csv(file_path)

In [2]: df.head()
Out[2]:
   BOROUGH NEIGHBORHOOD  BUILDING CLASS CATEGORY  TAX CLASS AT PRESENT   BLOCK   LOT EASEMENT BUILDING CLASS AT PRESENT
0        2     BATHGATE  01 ONE FAMILY DWELLINGS                     1  2907.0  24.0      NaN                        A1
1        2     BATHGATE  01 ONE FAMILY DWELLINGS                     1  3030.0  69.0      NaN                        A1
2        2     BATHGATE  01 ONE FAMILY DWELLINGS                     1  3046.0  10.0      NaN                        A1
3        2     BATHGATE  01 ONE FAMILY DWELLINGS                     1  3046.0  27.0      NaN                        A1
4        2     BATHGATE  01 ONE FAMILY DWELLINGS                     1  3046.0  40.0      NaN                        A1

5 rows × 21 columns (remaining columns truncated in the export)

In [4]: #Check for missing values


missing_summary = df.isnull().sum()
print("Missing values in each column:\n", missing_summary[missing_summary >

# Drop columns with excessive missing values (e.g., 'EASEMENT' and 'APARTMENT NUMBER')
df.drop(columns=['EASEMENT', 'APARTMENT NUMBER'], inplace=True, errors='ignore')

Missing values in each column:


BLOCK 4
LOT 7
BUILDING CLASS AT PRESENT 1
ADDRESS 12
ZIP CODE 4
RESIDENTIAL UNITS 1169
COMMERCIAL UNITS 1638
TOTAL UNITS 1139
LAND SQUARE FEET 1669
GROSS SQUARE FEET 1667
YEAR BUILT 605
BUILDING CLASS AT TIME OF SALE 1
SALE PRICE 9
SALE DATE 3
dtype: int64


In [5]: #Fill numerical columns with the median


num_cols = df.select_dtypes(include=['number']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

#Fill categorical columns with the mode
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

In [10]: # Trim spaces from column names


df.columns = df.columns.str.strip()

# Remove duplicate rows
df.drop_duplicates(inplace=True)

# Identify numerical columns
num_cols = df.select_dtypes(include=['number']).columns

# Remove outliers using the IQR method
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

# Save the cleaned dataset
cleaned_file_path = "rollingsales_no_outliers.csv"
df.to_csv(cleaned_file_path, index=False)

print(f"Cleaned dataset saved as {cleaned_file_path}")

Cleaned dataset saved as rollingsales_no_outliers.csv

In [ ]: ​

MTAI1001124(Harshit).ipynb

In [2]:
import pandas as pd

# Load the dataset


file_path = "rollingsales.csv"
df = pd.read_csv(file_path)
df.head()

Out[2]:
   BOROUGH NEIGHBORHOOD  BUILDING CLASS CATEGORY  TAX CLASS AT PRESENT   BLOCK   LOT EASEMENT BUILDING CLASS AT PRESENT     ADDRESS
0        2     BATHGATE  01 ONE FAMILY DWELLINGS                     1  2907.0  24.0      NaN                        A1  409 ...
1        2     BATHGATE  01 ONE FAMILY DWELLINGS                     1  3030.0  69.0      NaN                        A1  444 ...
2        2     BATHGATE  01 ONE FAMILY DWELLINGS                     1  3046.0  10.0      NaN                        A1  WASHIN...
3        2     BATHGATE  01 ONE FAMILY DWELLINGS                     1  3046.0  27.0      NaN                        A1  WASHIN...
4        2     BATHGATE  01 ONE FAMILY DWELLINGS                     1  3046.0  40.0      NaN                        A1  BAT...

5 rows × 21 columns (remaining columns and addresses truncated in the export)

In [3]:
# Trim whitespace from column names
df.columns = df.columns.str.strip()

# Drop the 'EASEMENT' column since it's entirely null


df.drop(columns=['EASEMENT'], inplace=True)

# Convert 'SALE PRICE', 'LAND SQUARE FEET', and 'GROSS SQUARE FEET' to numeric, handling commas
df['SALE PRICE'] = pd.to_numeric(df['SALE PRICE'].str.replace(',', ''), errors='coerce')
df['LAND SQUARE FEET'] = pd.to_numeric(df['LAND SQUARE FEET'].str.replace(',', ''), errors='coerce')
df['GROSS SQUARE FEET'] = pd.to_numeric(df['GROSS SQUARE FEET'].str.replace(',', ''), errors='coerce')

# Convert ZIP CODE to integer, dropping NaNs first


df['ZIP CODE'] = df['ZIP CODE'].dropna().astype(int)

# Remove duplicate rows


df.drop_duplicates(inplace=True)

# Remove sale price outliers: Transactions with SALE PRICE = 0 are likely invalid
df = df[df['SALE PRICE'] > 0]

# Compute mean sale price per ZIP code


mean_sale_price_by_zip = df.groupby('ZIP CODE')['SALE PRICE'].mean().sort_index()

# Display the mean sale price per ZIP code
print(mean_sale_price_by_zip)

ZIP CODE
10451.0 1.501295e+06
10452.0 1.250472e+06
10453.0 1.721677e+06
10454.0 2.554382e+06
10455.0 8.571722e+05
10456.0 9.594280e+05
10457.0 1.703850e+06
10458.0 1.346083e+06
10459.0 9.214199e+05
10460.0 1.122764e+06
10461.0 1.015786e+06
10462.0 5.525803e+05
10463.0 1.726541e+06
10464.0 7.024356e+05
10465.0 7.157930e+05
10466.0 6.505253e+05
10467.0 8.504955e+05
10468.0 1.300199e+06
10469.0 8.046570e+05
10470.0 8.942567e+05
10471.0 1.279964e+06
10472.0 8.675365e+05
10473.0 9.937852e+05
10474.0 2.406104e+06
10475.0 5.628368e+05
Name: SALE PRICE, dtype: float64

In [5]:
# Plot mean sale price per ZIP code
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 6))
mean_sale_price_by_zip.plot(kind='bar', color='skyblue', edgecolor='black')
plt.xlabel("ZIP Code")
plt.ylabel("Mean Sale Price ($)")
plt.title("Mean Sale Price by ZIP Code")
plt.xticks(rotation=90) # Rotate labels for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

SET B: Manager Salary
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import pandas as pd

# Load the CSV file


file_path = '2302023 Raw Data - Ask A Manager Salary Survey 2021.csv'  # Update path if needed
df = pd.read_csv(file_path)

df.head(10)

   How old are you?  What industry do you work in?  Job title                                  Salary   Currency  Country         Education        Gender
0  25-34             Education (Higher Education)   Research and Instruction Librarian         55,000   USD       United States   Master's degree  Woman
1  25-34             Computing or Tech              Change & Internal Communications Manager   54,600   GBP       United Kingdom  College degree   Non-binary
2  25-34             Accounting, Banking & Finance  Marketing Specialist                       34,000   USD       US              College degree   Woman
3  25-34             Nonprofits                     Program Manager                            62,000   USD       USA             College degree   Woman
4  25-34             Accounting, Banking & Finance  Accounting Manager                         60,000   USD       US              College degree   Woman
5  25-34             Education (Higher Education)   Scholarly Publishing Librarian             62,000   USD       USA             Master's degree  Man
6  25-34             Publishing                     Publishing Assistant                       33,000   USD       USA             College degree   Woman
7  25-34             Education (Primary/Secondary)  Librarian                                  50,000   USD       United States   Master's degree  Man
8  45-54             Computing or Tech              Systems Analyst                            112,000  USD       US              College degree   Woman
9  35-44             Accounting, Banking & Finance  Senior Accountant                          45,000   USD       United States   College degree   Woman

(The original output also shows the Timestamp, job-title context, additional monetary compensation, currency context, income context, U.S. state, city, years of experience overall and in field, and race columns; they are omitted here for readability.)

#Keep only relevant columns
columns = {
    "age": "How old are you?",
    "industry": "What industry do you work in?",
    "country": "What country do you work in?",
    "exp_field": "How many years of professional work experience do you have in your field?",
    "exp_overall": "How many years of professional work experience do you have overall?",
    "education": "What is your highest level of education completed?",
    "salary": "What is your annual salary? (You'll indicate the currency in a later question. If you are part-time or hourly, please enter an annualized equivalent -- what you would earn if you worked the job 40 hours a week, 52 weeks a year.)"
}
data = df[list(columns.values())]
data.rename(columns={v: k for k, v in columns.items()}, inplace=True)

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py:5039:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation:


https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#
returning-a-view-versus-a-copy
return super().rename(

Data Preprocessing
data

         age                       industry         country   exp_field
0      25-34   Education (Higher Education)   United States   5-7 years
1      25-34              Computing or Tech  United Kingdom   5-7 years
2      25-34  Accounting, Banking & Finance              US   2-4 years
3      25-34                     Nonprofits             USA   5-7 years
4      25-34  Accounting, Banking & Finance              US   5-7 years
...      ...                            ...             ...         ...
27935  25-34                      Oil & Gas        Colombia  8-10 years
27936  25-34              Computing or Tech   United States   2-4 years
27937  25-34   Engineering or Manufacturing   United States   5-7 years
27938  25-34              Computing or Tech         denmark   2-4 years
27939  35-44    Marketing, Advertising & PR              US  8-10 years

       exp_overall        education  salary
0        5-7 years  Master's degree  55,000
1       8-10 years   College degree  54,600
2        2-4 years   College degree  34,000
3       8-10 years   College degree  62,000
4       8-10 years   College degree  60,000
...            ...              ...     ...
27935   8-10 years  Master's degree   25000
27936  11-20 years     Some college   55000
27937    5-7 years   College degree   87000
27938    2-4 years   College degree   64000
27939  11-20 years  Master's degree  150000

[27940 rows x 7 columns]

#Clean the salary column
data['salary'] = data['salary'].str.replace(',', '').str.replace('$', '').str.strip()
data['salary'] = pd.to_numeric(data['salary'], errors='coerce')

C:\Users\BIT~1.L5-\AppData\Local\Temp/ipykernel_8184/3036456837.py:2:
FutureWarning: The default value of regex will change from True to
False in a future version. In addition, single character regular
expressions will *not* be treated as literal strings when regex=True.
data['salary'] = data['salary'].str.replace(',',
'').str.replace('$', '').str.strip()
C:\Users\BIT~1.L5-\AppData\Local\Temp/ipykernel_8184/3036456837.py:2:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation:


https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#
returning-a-view-versus-a-copy
data['salary'] = data['salary'].str.replace(',',
'').str.replace('$', '').str.strip()
C:\Users\BIT~1.L5-\AppData\Local\Temp/ipykernel_8184/3036456837.py:3:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation:


https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#
returning-a-view-versus-a-copy
data['salary'] = pd.to_numeric(data['salary'], errors='coerce')

#Standardize country values
data['country'] = data['country'].replace({
    'United States': 'USA', 'US': 'USA', 'U.S.': 'USA', 'America': 'USA'
})

C:\Users\BIT~1.L5-\AppData\Local\Temp/ipykernel_8184/4290375551.py:2:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation:


https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#
returning-a-view-versus-a-copy
data['country'] = data['country'].replace({

#Convert ranges (e.g., "5-7 years") to mid-points
def convert_experience(value):
    if pd.isnull(value):
        return np.nan
    value = str(value).lower().replace("years", "").replace("year", "").strip()
    value = re.sub(r"[^\d\-\+]", "", value)
    if '-' in value:
        parts = value.split('-')
        return (int(parts[0]) + int(parts[1])) / 2
    elif '+' in value:
        return int(value.replace('+', ''))
    elif value.isdigit():
        return int(value)
    return np.nan

data['age'] = data['age'].apply(convert_experience)
data['exp_field'] = data['exp_field'].apply(convert_experience)
data['exp_overall'] = data['exp_overall'].apply(convert_experience)
C:\Users\BIT~1.L5-\AppData\Local\Temp/ipykernel_8184/3789075842.py:16:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation:


https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#
returning-a-view-versus-a-copy
data['age'] = data['age'].apply(convert_experience)
C:\Users\BIT~1.L5-\AppData\Local\Temp/ipykernel_8184/3789075842.py:17:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation:


https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#
returning-a-view-versus-a-copy
data['exp_field'] = data['exp_field'].apply(convert_experience)
C:\Users\BIT~1.L5-\AppData\Local\Temp/ipykernel_8184/3789075842.py:18:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation:


https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#
returning-a-view-versus-a-copy
data['exp_overall'] = data['exp_overall'].apply(convert_experience)

# Drop missing values
data.dropna(subset=['age', 'industry', 'country', 'exp_field', 'exp_overall', 'education', 'salary'], inplace=True)

C:\ProgramData\Anaconda3\lib\site-packages\pandas\util\
_decorators.py:311: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation:


https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#
returning-a-view-versus-a-copy
return func(*args, **kwargs)

#Show cleaned data summary


print("Cleaned Dataset Shape:", data.shape)
print(data.head())

Cleaned Dataset Shape: (27658, 7)


age industry country exp_field \
0 29.5 Education (Higher Education) USA 6.0
1 29.5 Computing or Tech United Kingdom 6.0
2 29.5 Accounting, Banking & Finance USA 3.0
3 29.5 Nonprofits USA 6.0
4 29.5 Accounting, Banking & Finance USA 6.0

exp_overall education salary


0 6.0 Master's degree 55000
1 9.0 College degree 54600
2 3.0 College degree 34000
3 9.0 College degree 62000
4 9.0 College degree 60000

import seaborn as sns


sns.set_theme(style="whitegrid", palette="muted")
plt.figure(figsize=(12, 6))
sns.histplot(data['salary'], bins=40, kde=True, color="#4C72B0",
edgecolor='black')
plt.title('📊 Distribution of Annual Salaries', fontsize=16,
fontweight='bold')
plt.xlabel('Annual Salary', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
plt.figure(figsize=(14, 6))
sns.boxplot(data=data, x='education', y='salary', palette="coolwarm")
plt.xticks(rotation=30, ha='right')
plt.title('🎓 Salary Distribution by Education Level', fontsize=16,
fontweight='bold')
plt.xlabel('Education Level', fontsize=12)
plt.ylabel('Annual Salary', fontsize=12)
plt.tight_layout()
plt.show()
plt.figure(figsize=(12, 6))
sns.scatterplot(data=data, x='age', y='salary', hue='education',
palette='Set2', alpha=0.7)
plt.title(' Salary vs Age Colored by Education', fontsize=16,
fontweight='bold')
plt.xlabel('Age', fontsize=12)
plt.ylabel('Annual Salary', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.4)
plt.legend(title='Education', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\backends\
backend_agg.py:240: RuntimeWarning: Glyph 128202 missing from current
font.
font.set_text(s, 0.0, flags=flags)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\backends\
backend_agg.py:203: RuntimeWarning: Glyph 128202 missing from current
font.
font.set_text(s, 0, flags=flags)

C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\backends\
backend_agg.py:240: RuntimeWarning: Glyph 127891 missing from current
font.
font.set_text(s, 0.0, flags=flags)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\backends\
backend_agg.py:203: RuntimeWarning: Glyph 127891 missing from current
font.
font.set_text(s, 0, flags=flags)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\backends\
backend_agg.py:240: RuntimeWarning: Glyph 129489 missing from current
font.
font.set_text(s, 0.0, flags=flags)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\backends\
backend_agg.py:240: RuntimeWarning: Glyph 127891 missing from current
font.
font.set_text(s, 0.0, flags=flags)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\backends\
backend_agg.py:203: RuntimeWarning: Glyph 129489 missing from current
font.
font.set_text(s, 0, flags=flags)
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\backends\
backend_agg.py:203: RuntimeWarning: Glyph 127891 missing from current
font.
font.set_text(s, 0, flags=flags)

df.isnull().sum()

Timestamp                                                                       0
How old are you?                                                                0
What industry do you work in?                                                  72
Job title                                                                       0
If your job title needs additional context, please clarify here:           20708
What is your annual salary? (You'll indicate the currency ...)                  0
How much additional monetary compensation do you get, if any (...)          7253
Please indicate the currency                                                   0
If "Other," please indicate the currency here:                             27743
If your income needs additional context, please provide it here:           24906
What country do you work in?                                                    0
If you're in the U.S., what state do you work in?                            4981
What city do you work in?                                                      75
How many years of professional work experience do you have overall?             0
How many years of professional work experience do you have in your field?       0
What is your highest level of education completed?                            213
What is your gender?                                                          166
What is your race? (Choose all that apply.)                                   168
dtype: int64

PREDICTION
#One-Hot Encode categorical columns
encoded_data = pd.get_dummies(data, columns=['industry', 'country',
'education'], drop_first=True)

#Define features (X) and target (y)


X = encoded_data.drop(columns=['salary'])
y = encoded_data['salary']

#Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2, random_state=42)

# Fit Linear Regression model


from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

#Make predictions
y_pred = model.predict(X_test)

y_pred

array([138269.17263877, 25470.09947507, 60645.21151675, ...,


161942.70923415, 35076.83261679, 114042.09947507])

# Evaluate the model


from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)


print("R² Score:", r2)

Mean Squared Error: 4.1353752330515114e+30


R² Score: -8.104091221244569e+18
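
The astronomically negative R² likely reflects numerical instability rather than a modeling subtlety: one-hot encoding the free-text country and industry columns yields hundreds of sparse, collinear dummies, and unregularized least squares responds with huge offsetting coefficients. A hedged sketch (not in the original notebook) of one standard remedy, refitting the same split with Ridge regularization:

from sklearn.linear_model import Ridge

# The L2 penalty tames the collinear one-hot dummies
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
print("Ridge R² on test set:", ridge_model.score(X_test, y_test))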

import matplotlib.pyplot as plt


import seaborn as sns

# Scatter plot of Actual vs Predicted


plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, color='dodgerblue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
'r--') # Diagonal line
plt.xlabel('Actual Salary')
plt.ylabel('Predicted Salary')
plt.title('Actual vs Predicted Salary')
plt.grid(True)
plt.tight_layout()
plt.show()
