ML Lab Manual 2025-2
Prepared by
C. Jyothsna, Assistant Professor, CSE
Source Code:
import numpy as np
# Sample data (replace with your actual data)
data = [1, 2, 3, 4, 5, 5, 6, 7, 8, 9]
# Calculate mean
mean = np.mean(data)
print(f"Mean: {mean}")
# Calculate median
median = np.median(data)
print(f"Median: {median}")
# Calculate mode
from scipy import stats
mode = stats.mode(data)
print(f"Mode: {mode.mode[0]}")
# Calculate variance
variance = np.var(data)
print(f"Variance: {variance}")
# Calculate standard deviation
std_dev = np.std(data)
print(f"Standard Deviation: {std_dev}")
Output:
Mean: 5.0
Median: 5.0
Mode: 5
Variance: 6.0
Standard Deviation: 2.449489742783178
Attribute selection
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif
# Load dataset
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
# Select the top 2 features
selector = SelectKBest(score_func=f_classif, k=2)
X_selected = selector.fit_transform(X, y)
print("Selected Features:\n", X.columns[selector.get_support()])
Handling Missing Values
import pandas as pd
import numpy as np
# Create a sample DataFrame
data = {
'A': [1, 2, np.nan, 4],
'B': [np.nan, 2, 3, 4],
'C': [1, 2, 3, 4]
}
df = pd.DataFrame(data)
# Option 1: Drop rows with missing values
df_dropped = df.dropna()
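Only the first option appears in the excerpt; a common second option is to impute rather than drop. A minimal sketch on the same df (an assumed continuation, not part of the original listing):
# Option 2 (assumed): fill missing values with each column's mean
df_filled = df.fillna(df.mean())
print(df_filled)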
import pandas as pd
import numpy as np
df = pd.read_csv('Sales.csv')
print(df[df['Product Category']=='Electronics'])
print(df[df['Customer Region']=='West'])
# c. Discretization
# Equal-width binning
num_bins = 3
df['Discretized_value'] = pd.cut(df['Sales Amount'], bins=num_bins, labels=False)
#labels=False provides integer labels
print(df)
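The output below lists per-column null counts rather than the filtered or discretized frames, so it most likely comes from a missing-value check; a line such as the following (an assumed step, not shown in the listing) would produce it:
print(df.isnull().sum())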
Output:
Order ID 0
Date 0
Product Category 0
Sales Amount 0
Quantity 0
Customer Region 0
dtype: int64
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
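# Note: cm and ac printed below are not defined in this excerpt. A classifier
# step such as the following (an assumed example, not necessarily the model
# used in the original manual) would produce them:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)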
print(cm)
print(ac)
Output:
[[55 3]
[ 1 21]]
0.95
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('./diamonds.csv')
df
df.shape
df.drop('Unnamed: 0',inplace=True,axis=1)
df.info()
df.isna().sum()
df.isnull().sum()
df_new=pd.get_dummies(df,drop_first=True)
df_new
X=df_new.drop('price',axis=1)
Y=df_new['price']
train_x,test_x,train_y,test_y=train_test_split(X,Y,test_size=0.2,random_state=100)
train_y=train_y.to_numpy().reshape(-1,1)
test_y=test_y.to_numpy().reshape(-1,1)
# Scale X and y consistently: MinMax scaling followed by standardisation,
# with every scaler fit on the training data only
scale_x = MinMaxScaler().fit(train_x)
scale_y = MinMaxScaler().fit(train_y)
train_x = scale_x.transform(train_x)
test_x = scale_x.transform(test_x)
train_y = scale_y.transform(train_y)
test_y = scale_y.transform(test_y)
tran_x = StandardScaler().fit(train_x)
tran_y = StandardScaler().fit(train_y)
train_x = tran_x.transform(train_x)
test_x = tran_x.transform(test_x)
train_y = tran_y.transform(train_y)
test_y = tran_y.transform(test_y)
para = {
'n_neighbors':[3,5,7,12],
'weights' : ['uniform', 'distance']
}
dia_reg=GridSearchCV(KNeighborsRegressor(),para,cv=10)
dia_reg.fit(train_x,train_y)
dia_reg.best_score_
dia_reg.best_params_
reg = KNeighborsRegressor(n_neighbors=5, weights='distance')
reg.fit(train_x,train_y)
pred=reg.predict(test_x)
test_y=tran_y.inverse_transform(test_y)
test_y=scale_y.inverse_transform(test_y)
pred=tran_y.inverse_transform(pred)
pred=scale_y.inverse_transform(pred)
# Evaluate on the original price scale (y_true first, then predictions)
r2_score(test_y, pred)
mean_absolute_error(test_y, pred)
mean_squared_error(test_y, pred)
np.sqrt(mean_squared_error(test_y, pred))
Output:
0.951350450423818
391.5401970196721
695982.7207807746
834.2557885809211
Source Code
import pandas as pd
from google.colab import files
uploaded = files.upload()
Output:
1.0
array([0], dtype=int64)
array([1], dtype=int64)
Source Code:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
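The model-building and evaluation code for this experiment is not included in the excerpt; a minimal sketch of the usual flow, assuming a prepared feature matrix X and target y (names assumed), is:
# Assumed continuation: the dataset preparation defining X and y is not shown
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
# Typical evaluation, e.g. R^2 on the test split (metric import assumed)
from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred))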
Output
0.7875383967595564
7.425010498369913
2.724887245074539
Output:
[2 1 0 1 1 2 1 0 1 0 2 1 1 1 1 1 1 2 2 0 0 2 0 0 0 1 1 1 1 0]
[2 1 0 1 1 2 1 0 1 0 2 1 2 1 1 2 2 2 2 0 0 2 0 0 0 1 1 1 1 0]
0.9
Output:
RMSE:
96.389
Source Code:
import pandas as pd
from google.colab import files
uploaded = files.upload()
df = pd.read_csv("titanic.csv")
Source Code:
import pandas as pd
from sklearn.datasets import load_iris
iris = load_iris()
iris.feature_names
iris.target_names
Output:
0.93333333333333335
array([0])
Source Code:
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
from google.colab import files
Source Code
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from google.colab import files
uploaded = files.upload()
df = pd.read_csv("insurance_data.csv")
Output
0.4850044983805899
0.485 is less than 0.5, which means a person aged 35 will not buy insurance.
0.568565299077705
0.569 is greater than 0.5, which means a person aged 43 will buy insurance.
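A minimal sketch of the model behind these probabilities, assuming insurance_data.csv has columns named age and bought_insurance (column names assumed):
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(df[['age']], df['bought_insurance'])  # column names assumed
print(model.predict_proba([[35]])[:, 1])        # P(buys insurance | age 35)
print(model.predict_proba([[43]])[:, 1])        # P(buys insurance | age 43)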
Source Code
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Let's start by naming the features
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
# Reading the dataset through a Pandas function
irisdata = pd.read_csv(url, names=names)
# Takes first 4 columns and assign them to variable "X"
X = irisdata.iloc[:, 0:4]
# Takes the 5th column and assigns it to variable "y". Object dtype refers to strings.
y = irisdata.select_dtypes(include=[object])
X.head()
y.head()
# y actually contains all categories or classes:
y.Class.unique()
# Now transforming categorial into numerical values
le = preprocessing.LabelEncoder()
y = y.apply(le.fit_transform)
y.head()
# Now for train and test split (80% of dataset into training set and other 20% into test data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
# Feature scaling
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Finally for the MLP- Multilayer Perceptron
mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000)
mlp.fit(X_train, y_train.values.ravel())
predictions = mlp.predict(X_test)
print(predictions)
# Last thing: evaluation of algorithm performance in classifying flowers
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
Output:
[0 0 2 2 0 2 2 2 2 2 1 1 1 2 2 1 1 2 1 2 2 1 0 0 2 2 1 2 2 1]
[[ 5 0 0]
[ 0 8 0]
[ 0 1 16]]
precision recall f1-score support
accuracy 0.97 30
macro avg 0.96 0.98 0.97 30
Aim-12: Implement the K-means algorithm and apply it to the data you selected. Evaluate performance by measuring the sum of the Euclidean distance of each example from its class centre. Test the performance of the algorithm as a function of the parameter K.
Source Code
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()   # scaler assumed to be MinMaxScaler; its definition is not shown in the excerpt
scaler.fit(df[['Income($)']])
df['Income($)'] = scaler.transform(df[['Income($)']])
scaler.fit(df[['Age']])
df['Age'] = scaler.transform(df[['Age']])
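The excerpt stops at the scaling step; a minimal sketch of the clustering and the distance-versus-K check described in the aim, using the scaled Age and Income($) columns above (KMeans.inertia_ is the sum of squared distances of samples to their closest cluster centre):
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
sse = []
for k in range(1, 10):
    km = KMeans(n_clusters=k, n_init=10, random_state=0)
    km.fit(df[['Age', 'Income($)']])
    sse.append(km.inertia_)   # within-cluster sum of squared distances
print(sse)
plt.plot(range(1, 10), sse, marker='o')
plt.xlabel('K')
plt.ylabel('Sum of squared distances')
plt.show()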
Output:
Source Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# number of clusters
k = 3
# dimension of each cluster centre (number of features)
d = 4
# fuzzifier parameter m
m = 2
# number of iterations
MAX_ITERS = 12
# number of samples (df with the four iris feature columns is assumed to be loaded already)
n = df.shape[0]
plt.figure(0,figsize=(5,5)) #scatter plot of sepal length vs sepal width
plt.scatter(list(df.iloc[:,0]), list(df.iloc[:,1]), marker='o')
plt.axis('equal')
plt.xlabel('Sepal Length', fontsize=16)
plt.ylabel('Sepal Width', fontsize=16)
plt.title('Sepal Plot', fontsize=25,color='b')
plt.grid()
plt.show()
"""
weight = np.random.dirichlet(np.ones(k),n)
weight_arr = np.array(weight)
return weight_arr
def computeCentroids(weight_arr):
    C = []
    for i in range(k):
        weight_sum = np.power(weight_arr[:, i], m).sum()
        Cj = []
        for x in range(d):
            numerator = (df.iloc[:, x].values * np.power(weight_arr[:, i], m)).sum()
            c_val = numerator / weight_sum
            Cj.append(c_val)
        C.append(Cj)
    return C
def updateWeights(weight_arr, C):
    denom = np.zeros(n)
    for i in range(k):
        dist = (df.iloc[:, :].values - C[i])**2
        dist = np.sum(dist, axis=1)
        dist = np.sqrt(dist)
        denom = denom + np.power(1/dist, 1/(m-1))
    for i in range(k):
        dist = (df.iloc[:, :].values - C[i])**2
        dist = np.sum(dist, axis=1)
        dist = np.sqrt(dist)
        weight_arr[:, i] = np.divide(np.power(1/dist, 1/(m-1)), denom)
    return weight_arr
def plotData(z, C):
    plt.subplot(4, 3, z+1)  # scatter plot of petal length vs petal width
    plt.scatter(list(df.iloc[:, 2]), list(df.iloc[:, 3]), marker='o')
    for center in C:
        plt.scatter(center[2], center[3], marker='o', color='r')
    plt.axis('equal')
    plt.xlabel('Petal Length', fontsize=16)
    plt.ylabel('Petal Width', fontsize=16)
    plt.grid()
def FuzzyMeansAlgorithm():
    weight_arr = initializeMembershipWeights()
    plt.figure(figsize=(50, 50))
    for z in range(MAX_ITERS):
        C = computeCentroids(weight_arr)
        weight_arr = updateWeights(weight_arr, C)
        plotData(z, C)  # draw the current centroids on the subplot grid set up above
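Running the routine (assuming the four iris feature columns are already loaded into df, as the plotting code above expects):
FuzzyMeansAlgorithm()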
Output:
Source Code:
Z1 = multivariate_normal(m1, cov1)
Z2 = multivariate_normal(m2, cov2)
pos = np.empty(X.shape + (2,))  # a new array of given shape and type, without initializing entries
pos[:, :, 0] = X; pos[:, :, 1] = Y
return(eval1)
## Maximization step
def Mstep(eval1):
    # Weighted-mean update for the two component means. The accumulation loop
    # is reconstructed here; it uses the same responsibilities eval1 as the
    # covariance update below.
    num_mu1, din_mu1, num_mu2, din_mu2 = 0, 0, 0, 0
    for i in range(0, len(d)):
        num_mu1 += (1 - eval1[i]) * d[i]
        din_mu1 += (1 - eval1[i])
        num_mu2 += eval1[i] * d[i]
        din_mu2 += eval1[i]
    mu1 = num_mu1 / din_mu1
    mu2 = num_mu2 / din_mu2
    # Weighted covariance update
    num_s1, din_s1, num_s2, din_s2 = 0, 0, 0, 0
    for i in range(0, len(d)):
        q1 = np.matrix(d[i] - mu1)
        num_s1 += (1 - eval1[i]) * np.dot(q1.T, q1)
        din_s1 += (1 - eval1[i])
        q2 = np.matrix(d[i] - mu2)
        num_s2 += eval1[i] * np.dot(q2.T, q2)
        din_s2 += eval1[i]
    s1 = num_s1 / din_s1
    s2 = num_s2 / din_s2
    # Mixing proportion
    pi = sum(eval1) / len(d)
    lis2 = [mu1, mu2, s1, s2, pi]
    return lis2
def plot(lis1):
    mu1 = lis1[0]
    mu2 = lis1[1]
    s1 = lis1[2]
    s2 = lis1[3]
    Z1 = multivariate_normal(mu1, s1)
    Z2 = multivariate_normal(mu2, s2)
    pos = np.empty(X.shape + (2,))  # a new array of given shape and type, without initializing entries
    pos[:, :, 0] = X; pos[:, :, 1] = Y
Output: