
Assignment-3

Q1.1. Binary classification (Binomial) (Titanic dataset): the dependent variable can take only two possible values, such as 0 or 1, or Pass or Fail. The logistic model outputs a probability p = sigmoid(theta^T x) and predicts class 1 when p >= 0.5.
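A minimal numeric sketch of that decision rule before the full listing below; the weights and feature values here are made up purely for illustration:

import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

theta = np.array([0.5, -1.2, 0.8])   # hypothetical weights (first entry is the bias)
x = np.array([1.0, 0.3, 2.0])        # hypothetical feature vector (leading 1 for the bias)
p = sigmoid(theta.dot(x))            # probability of class 1, here about 0.85
prediction = 1 if p >= 0.5 else 0    # threshold at 0.5
print(p, prediction)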

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv(r"Z:\ML LAB\Titanic.csv")  # raw string so the backslashes are not treated as escapes
print(df)

# convert string categories to numeric codes; a vectorised map replaces the
# original element-wise loops and avoids chained-assignment warnings
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})
df['Embarked'] = df['Embarked'].map({'C': 1, 'Q': 2, 'S': 3})

# drop the free-text columns that are not used as features
df = df.drop(['Name', 'Cabin', 'Ticket'], axis=1)

# fill missing values
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

# optional IQR-based outlier filter on Age (left commented out, as in the original run)
# q1 = df['Age'].quantile(0.25)
# q3 = df['Age'].quantile(0.75)
# IQR = q3 - q1
# lb = q1 - 1.5 * IQR
# ub = q3 + 1.5 * IQR   # upper bound uses q3, not q1
# df = df[(df['Age'] >= lb) & (df['Age'] <= ub)]

X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = df['Survived'].values  # plain NumPy array keeps the manual gradient-descent code index-free

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def cost_function(X, y, theta):
    # binary cross-entropy loss
    m = len(y)
    h = sigmoid(X.dot(theta))
    cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
    return cost

def gradient_descent(X, y, theta, alpha, epochs):
    m = len(y)
    cost_history = []
    for epoch in range(epochs):
        h = sigmoid(X.dot(theta))
        gradient = (1 / m) * X.T.dot(h - y)
        theta -= alpha * gradient
        cost = cost_function(X, y, theta)
        print(epoch, cost)  # progress trace, as in the original
        cost_history.append(cost)
    return theta, cost_history

# add the intercept (bias) column
X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]
theta_initial = np.zeros(X_train_bias.shape[1])

alpha = 0.01
iteration = 1000
theta_final, cost_history = gradient_descent(X_train_bias, y_train, theta_initial, alpha, iteration)

plt.figure(figsize=(5, 5))
plt.plot(range(iteration), cost_history, color='blue')  # was range(epochs); epochs is undefined at this scope
plt.title('Cost Function over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Cost')
plt.grid(True)
plt.show()

predict = sigmoid(X_test_bias.dot(theta_final))  # predicted survival probabilities
print('predict and y_test values', predict, y_test)
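A short follow-up sketch, not part of the original listing, assuming the usual 0.5 threshold to turn the probabilities into class labels and report test accuracy:

y_pred = (predict >= 0.5).astype(int)   # threshold the probabilities at 0.5
accuracy = np.mean(y_pred == y_test)    # fraction of correct predictions
print('test accuracy:', accuracy)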


Q2. Multi-class classification (Multinomial) (Iris dataset): the dependent variable can take three or more possible values; here the three Iris species are encoded as classes 0, 1, and 2 and a softmax (multinomial logistic regression) model is trained with gradient descent.
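A minimal sketch of the softmax idea used in the class below (the class scores here are made up): softmax maps a vector of scores to probabilities that are positive and sum to 1.

import numpy as np

scores = np.array([2.0, 1.0, 0.1])     # hypothetical class scores
exp_s = np.exp(scores - scores.max())  # subtract the max for numerical stability
probs = exp_s / exp_s.sum()
print(probs, probs.sum())              # approx [0.659 0.242 0.099], sums to 1.0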

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import seaborn as sns

file_path = 'C:\\Users\\abhis\\Downloads\\Iris.csv'
df = pd.read_csv(file_path)

df['Species'] = df['Species'].map({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2})

X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']].values
y = df['Species'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

class LogisticRegressionWithGD:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.cost_history = []

    def softmax(self, z):
        # subtract the row-wise max for numerical stability
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def cost_function(self, X, y, theta):
        # multi-class cross-entropy loss
        m = len(y)
        predictions = self.softmax(np.dot(X, theta))
        one_hot_y = np.eye(theta.shape[1])[y]
        cost = -np.sum(one_hot_y * np.log(predictions)) / m
        return cost

    def fit(self, X, y):
        m, n = X.shape
        X = np.hstack([np.ones((m, 1)), X])  # add the bias column
        num_classes = len(np.unique(y))
        theta = np.zeros((n + 1, num_classes))
        for epoch in range(self.epochs):
            predictions = self.softmax(np.dot(X, theta))
            error = predictions - np.eye(num_classes)[y.flatten()]
            gradient = np.dot(X.T, error) / m
            theta -= self.learning_rate * gradient
            cost = self.cost_function(X, y, theta)
            self.cost_history.append(cost)
        self.theta = theta  # store for predict(); the original relied on the caller setting this
        return theta

    def predict(self, X):
        X = np.hstack([np.ones((X.shape[0], 1)), X])
        predictions = self.softmax(np.dot(X, self.theta))
        return np.argmax(predictions, axis=1)

model = LogisticRegressionWithGD(learning_rate=0.01, epochs=1000)
model.theta = model.fit(X_train, y_train)

plt.plot(range(len(model.cost_history)), model.cost_history, label='Cost')
plt.xlabel('Epochs')
plt.ylabel('Cost')
plt.title('Epochs vs Cost for Multi-Class Logistic Regression (Gradient Descent)')
plt.legend()
plt.show()

y_pred = model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Setosa', 'Versicolor', 'Virginica'],
            yticklabels=['Setosa', 'Versicolor', 'Virginica'])
plt.title('Confusion Matrix for Multi-Class Iris Classification')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
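As a small follow-up not in the original listing, the overall test accuracy can be reported from the same predictions; sklearn's accuracy_score is one way:

from sklearn.metrics import accuracy_score

print('test accuracy:', accuracy_score(y_test, y_pred))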

plt.figure(figsize=(8, 6))
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.Paired, edgecolors='k', s=50)
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.title('Iris Dataset: Sepal Length vs Sepal Width')
plt.show()

# build a 2-D mesh over the first two (standardised) features; the two petal
# features are held at 0 (their post-scaling mean) so the 4-feature model can
# still be evaluated at every grid point
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
grid_points = np.c_[xx.ravel(), yy.ravel(), np.zeros_like(xx.ravel()), np.zeros_like(yy.ravel())]

Z = model.predict(grid_points)
Z = Z.reshape(xx.shape)

plt.contourf(xx, yy, Z, alpha=0.8)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='k', s=50)
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.title('Decision Boundaries of Multi-Class Logistic Regression (Gradient Descent)')
plt.show()
