0% found this document useful (0 votes)
18 views8 pages

Untitled2.Ipynb - Colab

The document outlines a data analysis and machine learning workflow using a heart disease dataset in Python with libraries such as pandas, seaborn, and scikit-learn. It includes data preprocessing, visualization of distributions and correlations, and the implementation of various classification models including Logistic Regression, Decision Tree, Random Forest, and SVM, along with their evaluation metrics. The analysis reveals insights into the dataset and the performance of different models in predicting heart disease.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
18 views8 pages

Untitled2.Ipynb - Colab

The document outlines a data analysis and machine learning workflow using a heart disease dataset in Python with libraries such as pandas, seaborn, and scikit-learn. It includes data preprocessing, visualization of distributions and correlations, and the implementation of various classification models including Logistic Regression, Decision Tree, Random Forest, and SVM, along with their evaluation metrics. The analysis reveals insights into the dataset and the performance of different models in predicting heart disease.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 8

5/11/25, 10:35 PM Untitled2.ipynb - Colab

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

from google.colab import files


# Ask the user to upload a file through Colab; the result is looked up by
# file name further down when the CSV is read.
uploaded = files.upload()

import io
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the uploaded CSV into a DataFrame and take a first look at it.

# `uploaded` is keyed by file name. Fail with a clear message rather than a
# bare IndexError if the upload step produced nothing.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and choose a CSV file.")

# Automatically get filename: use the first uploaded entry.
filename = next(iter(uploaded))

# Read the file: the uploaded content is wrapped in an in-memory binary
# stream so pandas can parse it like a file.
df = pd.read_csv(io.BytesIO(uploaded[filename]))

# Show DataFrame details: first rows, schema, and per-column missing counts.
print(df.head())
# df.info() writes its report itself and returns None, so wrapping it in
# print() would add a stray "None" line after the report.
df.info()
print(df.isnull().sum())

# Plot the target column to see the class balance.
sns.countplot(x='target', data=df)
plt.title("Heart Disease Count (0=No, 1=Yes)")
plt.show()

https://colab.research.google.com/drive/1dOSJItEe0P2tYlbFeuBxId4-CO9tj7_x#scrollTo=6oxSkoppFBZY&printMode=true 1/8
5/11/25, 10:35 PM Untitled2.ipynb - Colab

Choose Files heart.csv


heart.csv(text/csv) - 39689 bytes, last modified: 5/11/2025 - 100% done
Saving heart.csv to heart.csv
age sex chest pain type resting bp s cholesterol fasting blood sugar \
0 40 1 2 140 289 0
1 49 0 3 160 180 0
2 37 1 2 130 283 0
3 48 0 4 138 214 0
4 54 1 3 150 195 0

resting ecg max heart rate exercise angina oldpeak ST slope target
0 0 172 0 0.0 1 0
1 0 156 0 1.0 2 1
2 1 98 0 0.0 1 0
3 0 108 1 1.5 2 1
4 0 122 0 0.0 1 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 1190 non-null int64
1 sex 1190 non-null int64
2 chest pain type 1190 non-null int64
3 resting bp s 1190 non-null int64
4 cholesterol 1190 non-null int64
5 fasting blood sugar 1190 non-null int64
6 resting ecg 1190 non-null int64
7 max heart rate 1190 non-null int64
8 exercise angina 1190 non-null int64
9 oldpeak 1190 non-null float64
10 ST slope 1190 non-null int64
11 target 1190 non-null int64
dtypes: float64(1), int64(11)
memory usage: 111.7 KB
None
age 0
sex 0
chest pain type 0
resting bp s 0
cholesterol 0
fasting blood sugar 0
resting ecg 0
max heart rate 0
exercise angina 0
oldpeak 0
ST slope 0
target 0
dtype: int64

import seaborn as sns


import matplotlib.pyplot as plt
import pandas as pd
# Use seaborn's "white" style for the plots.
sns.set(style="white")

# One row of three wide panels so the annotated heatmaps have room.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(24, 8))

# 1️⃣ Missing Values Heatmap: boolean mask of df, one cell per value.
missing_mask = df.isnull()
missing_ax = axes[0]
sns.heatmap(missing_mask, cbar=False, cmap='viridis', ax=missing_ax)
missing_ax.set_title("Missing Values Heatmap")

# 2️⃣ Correlation Heatmap with increased spreading


https://colab.research.google.com/drive/1dOSJItEe0P2tYlbFeuBxId4-CO9tj7_x#scrollTo=6oxSkoppFBZY&printMode=true 2/8
5/11/25, 10:35 PM Untitled2.ipynb - Colab
# 2️⃣ Correlation Heatmap with increased spreading
# Correlation matrix of df's columns, annotated to two decimals.
corr = df.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=axes[1], fmt='.2f', annot_kws={"size": 10})
axes[1].set_title("Correlation Between Features")

# 3️⃣ Categorical Pivot Heatmap: Sex vs Target Count
# aggfunc='size' counts the rows in each (sex, target) cell; combinations
# with no rows are filled with 0 so the integer annotation format works.
pivot_table = df.pivot_table(index='sex', columns='target', aggfunc='size', fill_value=0)
sns.heatmap(pivot_table, annot=True, fmt='d', cmap="YlGnBu", ax=axes[2])
axes[2].set_title("Target vs Sex Heatmap (Count)")

# Adjust layout to avoid overlap.
# tight_layout() recomputes the subplot spacing, so a preceding
# subplots_adjust(wspace=...) call is discarded. Ask for the extra
# horizontal gap through tight_layout's own padding argument instead.
plt.tight_layout(w_pad=3.0)
plt.show()

import seaborn as sns


import matplotlib.pyplot as plt

# Subplots: 3 rows x 2 columns = 6 graphs
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 14))

# The four histograms drawn with a KDE overlay in this part of the cell:
# (axis, column, color, title, x-label).
kde_panels = (
    (axes[0, 0], 'cholesterol', 'blue', "Cholesterol Distribution", "Cholesterol Level"),
    (axes[0, 1], 'age', 'purple', "Age Distribution", "Age"),
    (axes[1, 0], 'max heart rate', 'red', "Max Heart Rate Distribution", "Max Heart Rate"),
    (axes[1, 1], 'oldpeak', 'green', "Oldpeak (ST Depression) Distribution", "Oldpeak"),
)
for panel_ax, column, color, title, xlabel in kde_panels:
    sns.histplot(df[column], kde=True, color=color, ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel(xlabel)

# Y-axis labels for the Cholesterol, Age and Max Heart Rate panels; the
# Oldpeak panel's y-label is set by the statement that follows this part.
for panel_ax in (axes[0, 0], axes[0, 1], axes[1, 0]):
    panel_ax.set_ylabel("Frequency")

https://colab.research.google.com/drive/1dOSJItEe0P2tYlbFeuBxId4-CO9tj7_x#scrollTo=6oxSkoppFBZY&printMode=true 3/8
5/11/25, 10:35 PM Untitled2.ipynb - Colab
# Finish the Oldpeak panel with its y-axis label.
axes[1, 1].set_ylabel("Frequency")

# Bottom row of the grid:
# (axis, column, KDE overlay?, color, title, x-label, y-label).
bottom_row = (
    (axes[2, 0], 'resting bp s', True, 'orange',
     "Resting Blood Pressure Distribution", "Resting BP", "Frequency"),
    (axes[2, 1], 'fasting blood sugar', False, 'teal',
     "Fasting Blood Sugar Distribution", "Fasting Blood Sugar", "Count"),
)
for panel_ax, column, with_kde, color, title, xlabel, ylabel in bottom_row:
    sns.histplot(df[column], kde=with_kde, color=color, ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel(ylabel)

# Adjust layout so the panels do not overlap, then render the figure.
plt.tight_layout()
plt.show()

https://colab.research.google.com/drive/1dOSJItEe0P2tYlbFeuBxId4-CO9tj7_x#scrollTo=6oxSkoppFBZY&printMode=true 4/8
5/11/25, 10:35 PM Untitled2.ipynb - Colab

from sklearn.preprocessing import MinMaxScaler

# Scaler instance applied to the feature columns further down.
scaler = MinMaxScaler()

# Columns to rescale: every feature column, with 'target' left out.
numerical_columns = [
    'age',
    'sex',
    'chest pain type',
    'resting bp s',
    'cholesterol',
    'fasting blood sugar',
    'resting ecg',
    'max heart rate',
    'exercise angina',
    'oldpeak',
    'ST slope',
]

https://colab.research.google.com/drive/1dOSJItEe0P2tYlbFeuBxId4-CO9tj7_x#scrollTo=6oxSkoppFBZY&printMode=true 5/8
5/11/25, 10:35 PM Untitled2.ipynb - Colab

# Rescale the feature columns without leaking test-set statistics.
#
# Fitting the scaler on every row lets the min/max of the rows that later
# become the test set influence the transform. Fit on the training rows
# only, then apply that fitted transform to the whole frame.
#
# NOTE(review): the row selection reuses the same test_size and random_state
# as the train/test split performed later; with the same number of rows this
# is expected to pick the same training rows. Keep the arguments in sync and
# confirm if either call changes.
train_rows, held_out_rows = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)
scaler.fit(df.iloc[train_rows][numerical_columns])

# Held-out rows may now fall slightly outside [0, 1]; that is expected when
# the scaler has not seen them.
df[numerical_columns] = scaler.transform(df[numerical_columns])

print(df.head())

age sex chest pain type resting bp s cholesterol \


0 0.244898 1.0 0.333333 0.70 0.479270
1 0.428571 0.0 0.666667 0.80 0.298507
2 0.183673 1.0 0.333333 0.65 0.469320
3 0.408163 0.0 1.000000 0.69 0.354892
4 0.530612 1.0 0.666667 0.75 0.323383

fasting blood sugar resting ecg max heart rate exercise angina \
0 0.0 0.0 0.788732 0.0
1 0.0 0.0 0.676056 0.0
2 0.0 0.5 0.267606 0.0
3 0.0 0.0 0.338028 1.0
4 0.0 0.0 0.436620 0.0

oldpeak ST slope target


0 0.295455 0.333333 0
1 0.409091 0.666667 1
2 0.295455 0.333333 0
3 0.465909 0.666667 1
4 0.295455 0.333333 0

from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns.
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, columns) shape of each partition.
print(f"Training data size: {X_train.shape}")
print(f"Test data size: {X_test.shape}")

Training data size: (952, 11)


Test data size: (238, 11)

# Import necessary libraries


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# df is expected to be loaded and preprocessed by this point.

# Features (X) are every column except 'target'; the label (y) is 'target'.
y = df['target']
X = df.drop(columns='target')

# 80% of the rows go to training and 20% to testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train four classifiers on the same training split and predict the test
# split with each one. fit() returns the estimator itself, so construction
# and training are chained.

# Model 1: Logistic Regression (max_iter set to 1000)
logreg_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)

# Model 2: Decision Tree (seeded)
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

# Model 3: Random Forest (seeded)
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Model 4: Support Vector Machine (SVM) with a linear kernel
svm_model = SVC(kernel='linear').fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

# Evaluate models

# Logistic Regression: compute the metrics first, then print them.
logreg_acc_value = accuracy_score(y_test, logreg_pred)
logreg_conf_matrix = confusion_matrix(y_test, logreg_pred)
print("Logistic Regression Accuracy:", logreg_acc_value)
print("Logistic Regression Confusion Matrix:\n", logreg_conf_matrix)

https://colab.research.google.com/drive/1dOSJItEe0P2tYlbFeuBxId4-CO9tj7_x#scrollTo=6oxSkoppFBZY&printMode=true 6/8
5/11/25, 10:35 PM Untitled2.ipynb - Colab
# Per-class precision/recall/F1 for Logistic Regression.
print("Logistic Regression Classification Report:\n", classification_report(y_test, logreg_pred))

# The other three models get the same three reports, in the same order:
# accuracy, confusion matrix, classification report.
remaining_models = (
    ("Decision Tree", dt_pred),
    ("Random Forest", rf_pred),
    ("SVM", svm_pred),
)
for model_name, predictions in remaining_models:
    print(f"{model_name} Accuracy:", accuracy_score(y_test, predictions))
    print(f"{model_name} Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print(f"{model_name} Classification Report:\n", classification_report(y_test, predictions))

Logistic Regression Accuracy: 0.8529411764705882


Logistic Regression Confusion Matrix:
[[ 90 17]
[ 18 113]]
Logistic Regression Classification Report:
precision recall f1-score support

0 0.83 0.84 0.84 107


1 0.87 0.86 0.87 131

accuracy 0.85 238


macro avg 0.85 0.85 0.85 238
weighted avg 0.85 0.85 0.85 238

Decision Tree Accuracy: 0.8991596638655462


Decision Tree Confusion Matrix:
[[ 99 8]
[ 16 115]]
Decision Tree Classification Report:
precision recall f1-score support

0 0.86 0.93 0.89 107


1 0.93 0.88 0.91 131

accuracy 0.90 238


macro avg 0.90 0.90 0.90 238
weighted avg 0.90 0.90 0.90 238

Random Forest Accuracy: 0.9453781512605042


Random Forest Confusion Matrix:
[[ 98 9]
[ 4 127]]
Random Forest Classification Report:
precision recall f1-score support

0 0.96 0.92 0.94 107


1 0.93 0.97 0.95 131

accuracy 0.95 238


macro avg 0.95 0.94 0.94 238
weighted avg 0.95 0.95 0.95 238

SVM Accuracy: 0.8571428571428571


SVM Confusion Matrix:
[[ 90 17]
[ 17 114]]
SVM Classification Report:
precision recall f1-score support

0 0.84 0.84 0.84 107


1 0.87 0.87 0.87 131

accuracy 0.86 238


macro avg 0.86 0.86 0.86 238
weighted avg 0.86 0.86 0.86 238

from sklearn.svm import SVC


from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt

# Model 4: Support Vector Machine (SVM)
# Refit the linear SVM with probability=True and a fixed seed, then refresh
# its test-set predictions.
svm_model = SVC(kernel='linear', probability=True, random_state=42).fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

# Logistic Regression Evaluation
# Fraction of test rows predicted correctly; printed as a percentage by the
# statements that follow.
logreg_accuracy = accuracy_score(y_test, logreg_pred)
print("Logistic Regression Evaluation:")
https://colab.research.google.com/drive/1dOSJItEe0P2tYlbFeuBxId4-CO9tj7_x#scrollTo=6oxSkoppFBZY&printMode=true 7/8
5/11/25, 10:35 PM Untitled2.ipynb - Colab
# Accuracy as a percentage with two decimals.
logreg_accuracy_pct = logreg_accuracy * 100
print(f"Accuracy: {logreg_accuracy_pct:.2f}%")
# Heading and value are emitted on separate lines via sep="\n".
print("Confusion Matrix:", confusion_matrix(y_test, logreg_pred), sep="\n")
print("Classification Report:", classification_report(y_test, logreg_pred), sep="\n")

# ROC Curve: Logistic Regression
# Score the test rows with the probability in column 1 of predict_proba
# (the second class; the target is coded 0/1 in this dataset).
logreg_scores = logreg_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, logreg_scores)
roc_auc = auc(fpr, tpr)

# List the first ten operating points. The loop body must be indented;
# left flush with the `for`, the cell fails with an IndentationError.
print("\n🔎 Logistic Regression Threshold Values (First 10):")
for threshold, tp_rate, fp_rate in zip(thresholds[:10], tpr[:10], fpr[:10]):
    print(f"Threshold: {threshold:.4f}, TPR: {tp_rate:.4f}, FPR: {fp_rate:.4f}")

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='blue', label=f'Logistic Regression (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
# Annotate roughly every tenth point on the curve with its threshold.
step = max(1, len(thresholds) // 10)
for threshold, fp_rate, tp_rate in zip(thresholds[::step], fpr[::step], tpr[::step]):
    plt.annotate(f'{threshold:.2f}', (fp_rate, tp_rate), textcoords="offset points", xytext=(5, -10), ha='left', fontsize=8)
plt.title('ROC Curve with Thresholds (Logistic Regression)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# Decision Tree Evaluation
print("\nDecision Tree Evaluation:")
dt_accuracy = accuracy_score(y_test, dt_pred)
# Accuracy as a percentage with two decimals.
dt_accuracy_pct = dt_accuracy * 100
print(f"Accuracy: {dt_accuracy_pct:.2f}%")
# Heading and value are emitted on separate lines via sep="\n".
print("Confusion Matrix:", confusion_matrix(y_test, dt_pred), sep="\n")
print("Classification Report:", classification_report(y_test, dt_pred), sep="\n")

# ROC Curve: Decision Tree
# Score the test rows with the probability in column 1 of predict_proba
# (the second class; the target is coded 0/1 in this dataset).
dt_scores = dt_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, dt_scores)
roc_auc = auc(fpr, tpr)

# List the first ten operating points. The loop body must be indented;
# left flush with the `for`, the cell fails with an IndentationError.
print("\n🔎 Decision Tree Threshold Values (First 10):")
for threshold, tp_rate, fp_rate in zip(thresholds[:10], tpr[:10], fpr[:10]):
    print(f"Threshold: {threshold:.4f}, TPR: {tp_rate:.4f}, FPR: {fp_rate:.4f}")

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='red', label=f'Decision Tree (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
# Annotate roughly every tenth point on the curve with its threshold.
step = max(1, len(thresholds) // 10)
for threshold, fp_rate, tp_rate in zip(thresholds[::step], fpr[::step], tpr[::step]):
    plt.annotate(f'{threshold:.2f}', (fp_rate, tp_rate), textcoords="offset points", xytext=(5, -10), ha='left', fontsize=8)
plt.title('ROC Curve with Thresholds (Decision Tree)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# Random Forest Evaluation
print("\nRandom Forest Evaluation:")
rf_accuracy = accuracy_score(y_test, rf_pred)
# Accuracy as a percentage with two decimals.
rf_accuracy_pct = rf_accuracy * 100
print(f"Accuracy: {rf_accuracy_pct:.2f}%")
# Heading and value are emitted on separate lines via sep="\n".
print("Confusion Matrix:", confusion_matrix(y_test, rf_pred), sep="\n")
print("Classification Report:", classification_report(y_test, rf_pred), sep="\n")

# ROC Curve: Random Forest
# Score the test rows with the probability in column 1 of predict_proba
# (the second class; the target is coded 0/1 in this dataset).
rf_scores = rf_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, rf_scores)
roc_auc = auc(fpr, tpr)

# List the first ten operating points. The loop body must be indented;
# left flush with the `for`, the cell fails with an IndentationError.
print("\n🔎 Random Forest Threshold Values (First 10):")
for threshold, tp_rate, fp_rate in zip(thresholds[:10], tpr[:10], fpr[:10]):
    print(f"Threshold: {threshold:.4f}, TPR: {tp_rate:.4f}, FPR: {fp_rate:.4f}")

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='green', label=f'Random Forest (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
# Annotate roughly every tenth point on the curve with its threshold.
step = max(1, len(thresholds) // 10)
for threshold, fp_rate, tp_rate in zip(thresholds[::step], fpr[::step], tpr[::step]):
    plt.annotate(f'{threshold:.2f}', (fp_rate, tp_rate), textcoords="offset points", xytext=(5, -10), ha='left', fontsize=8)
plt.title('ROC Curve with Thresholds (Random Forest)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# SVM Evaluation
print("\nSVM Evaluation:")
https://colab.research.google.com/drive/1dOSJItEe0P2tYlbFeuBxId4-CO9tj7_x#scrollTo=6oxSkoppFBZY&printMode=true 8/8

You might also like