ML Lab Manual
BASAVAKALYAN ENGINEERING
COLLEGE, BASAVAKALYAN
(Approved by AICTE New Delhi, Affiliated to VTU Belagavi & Recognized by
Govt. of Karnataka-ISO: 9001:2015 Certified)
NH-65, Basavakalyan, Bidar District-585327(Karnataka)
Prepared By:
Mr. Allamaprabhu Vastrad, Asst. Professor
Mrs. Sangeeta K, Instructor
Mr. Deepak G, Instructor
Machine Learning Laboratory-15CSL76
1. Implement and demonstrate the FIND-S algorithm for finding the most specific hypothesis
based on a given set of training data samples. Read the training data from a .CSV file.
import csv

attributes = [['Sunny', 'Rainy'],
              ['Warm', 'Cold'],
              ['Normal', 'High'],
              ['Strong', 'Weak'],
              ['Warm', 'Cool'],
              ['Same', 'Change']]
num_attributes = len(attributes)

print("\nThe most general hypothesis: [?,?,?,?,?,?]\n")
print("\nThe most specific hypothesis: [0,0,0,0,0,0]\n")

a = []
print("\nThe given Training Data Set\n")
with open('CSVFile.csv', 'r') as csvFile:
    reader = csv.reader(csvFile)
    for row in reader:
        a.append(row)
        print(row)

print("\nThe initial value of hypothesis:")
hypothesis = ['0'] * num_attributes
print(hypothesis)

# Initialize the hypothesis with the first training example
for j in range(0, num_attributes):
    hypothesis[j] = a[0][j]

print("\nFind-S: Finding a Maximally Specific Hypothesis\n")
for i in range(0, len(a)):
    if a[i][num_attributes] == 'Yes':        # consider only positive examples
        for j in range(0, num_attributes):
            if a[i][j] != hypothesis[j]:     # generalize any attribute that disagrees
                hypothesis[j] = '?'
    print("For training example No:{}".format(i), hypothesis)

print("\nThe final hypothesis is:")
print(hypothesis)
Dataset:
Output:
2. For a given set of training data examples stored in a .CSV file, implement and demonstrate
the Candidate-Elimination algorithm to output a description of the set of all hypotheses
consistent with the training examples.
import csv

a = []
print("\nThe Given Training Data Set\n")
with open('CSVFile.csv', 'r') as csvFile:
    reader = csv.reader(csvFile)
    for row in reader:
        a.append(row)
        print(row)

num_attributes = len(a[0]) - 1

# S: most specific hypothesis, G: most general hypothesis,
# temp: version space of hypotheses built from the negative examples
S = ['0'] * num_attributes
G = ['?'] * num_attributes
temp = []

# Initialize S with the first training example
for j in range(0, num_attributes):
    S[j] = a[0][j]

for i in range(0, len(a)):
    if a[i][num_attributes] == 'Yes':          # positive example: generalize S
        for j in range(0, num_attributes):
            if a[i][j] != S[j]:
                S[j] = '?'
        for j in range(0, num_attributes):
            for k in range(len(temp) - 1, -1, -1):
                # remove a hypothesis if it no longer matches the specific hypothesis
                if temp[k][j] != '?' and temp[k][j] != S[j]:
                    del temp[k]
        if len(temp) == 0:
            print("For Training Example No :{0} the hypothesis is G{0}".format(i + 1), G)
        else:
            print("For Training Example No :{0} the hypothesis is G{0}".format(i + 1), temp)
    if a[i][num_attributes] == 'No':           # negative example: specialize G
        for j in range(0, num_attributes):
            # keep separately any attribute of S that the negative example contradicts
            if S[j] != a[i][j] and S[j] != '?':
                G[j] = S[j]
                temp.append(G)                 # store the specialization in the version space
                G = ['?'] * num_attributes
Dataset:
Sunny Warm Normal Strong Warm Same Yes
Sunny Warm High Strong Warm Same Yes
Rainy Cold High Strong Warm Change No
Sunny Warm High Strong Cool Change Yes
Output:
The Given Training Data Set
3. Write a program to demonstrate the working of the decision tree based ID3 algorithm.
Use an appropriate data set for building the decision tree and apply this knowledge to classify
a new sample.
import math
import numpy as np
import csv

class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""

def read_data(filename):
    """Read the csv file and return the header and the data rows."""
    with open(filename, 'r') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',')
        metadata = next(datareader)
        traindata = []
        for row in datareader:
            traindata.append(row)
    return metadata, traindata

def subtables(data, col, delete):
    """Split the data into sub-tables, one for each value of the attribute in column col."""
    dict = {}
    items = np.unique(data[:, col])
    count = np.zeros(items.shape[0], dtype=np.int32)
    for x in range(items.shape[0]):
        for y in range(data.shape[0]):
            if data[y, col] == items[x]:
                count[x] += 1
    # count now holds how many times each value is present in column col
    for x in range(items.shape[0]):
        dict[items[x]] = np.empty((int(count[x]), data.shape[1]), dtype="<U32")
        pos = 0
        for y in range(data.shape[0]):
            if data[y, col] == items[x]:
                dict[items[x]][pos] = data[y]
                pos += 1
        if delete:
            dict[items[x]] = np.delete(dict[items[x]], col, 1)
    return items, dict

def entropy(S):
    """Calculate the entropy of the class column S."""
    items = np.unique(S)
    if items.size == 1:
        return 0
    counts = np.zeros(items.shape[0])
    sums = 0
    for x in range(items.shape[0]):
        counts[x] = sum(S == items[x]) / (S.size)
    for count in counts:
        sums += -1 * count * math.log(count, 2)
    return sums

def gain(data, col):
    """Information gain obtained by splitting the data on the attribute in column col."""
    items, dict = subtables(data, col, delete=False)
    total_size = data.shape[0]
    entropies = np.zeros(items.shape[0])
    for x in range(items.shape[0]):
        ratio = dict[items[x]].shape[0] / (total_size)
        entropies[x] = ratio * entropy(dict[items[x]][:, -1])
    total_entropy = entropy(data[:, -1])
    for x in range(entropies.shape[0]):
        total_entropy -= entropies[x]
    return total_entropy

def create_node(data, metadata):
    """Recursively build the ID3 decision tree."""
    if (np.unique(data[:, -1])).shape[0] == 1:
        node = Node("")
        node.answer = np.unique(data[:, -1])[0]
        return node
    gains = np.zeros(data.shape[1] - 1)
    for col in range(data.shape[1] - 1):
        gains[col] = gain(data, col)
    split = np.argmax(gains)                 # attribute with the highest information gain
    node = Node(metadata[split])
    metadata = np.delete(metadata, split, 0)
    items, dict = subtables(data, split, delete=True)
    for x in range(items.shape[0]):
        child = create_node(dict[items[x]], metadata)
        node.children.append((items[x], child))
    return node

def empty(size):
    """Generate the empty space needed for shaping the tree while printing."""
    s = ""
    for x in range(size):
        s += "   "
    return s

def print_tree(node, level):
    """Print the tree with one level of indentation per depth."""
    if node.answer != "":
        print(empty(level), node.answer)
        return
    print(empty(level), node.attribute)
    for value, child in node.children:
        print(empty(level + 1), value)
        print_tree(child, level + 2)

metadata, traindata = read_data("tennis.csv")
data = np.array(traindata)
node = create_node(data, metadata)
print_tree(node, 0)
Dataset:
outlook temp humidity windy play
sunny hot high Weak no
sunny hot high Strong no
overcast hot high Weak yes
rainy mild high Weak yes
rainy cool normal Weak yes
rainy cool normal Strong no
overcast cool normal Strong yes
sunny mild high Weak no
sunny cool normal Weak yes
rainy mild normal Weak yes
sunny mild normal Strong yes
overcast mild high Strong yes
overcast hot normal Weak yes
rainy mild high Strong no
Output:
outlook
   overcast
      yes
   rainy
      windy
         Strong
            no
         Weak
            yes
   sunny
      humidity
         high
            no
         normal
            yes
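The program above builds and prints the tree but does not classify a new sample, as the statement asks. A minimal sketch of that step, assuming the Node class, node and metadata from the listing and a hypothetical test row, might be:

def classify_sample(node, sample, metadata):
    # Walk down the tree until a leaf (answer) node is reached
    if node.answer != "":
        return node.answer
    col = list(metadata).index(node.attribute)
    for value, child in node.children:
        if value == sample[col]:
            return classify_sample(child, sample, metadata)
    return "unknown"   # attribute value never seen on this branch

# Hypothetical new sample: outlook=sunny, temp=cool, humidity=high, windy=Strong
print(classify_sample(node, ['sunny', 'cool', 'high', 'Strong'], metadata))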
4. Build an Artificial Neural Network by implementing the Backpropagation algorithm and test
the same using appropriate data sets.

import numpy as np

X = np.array(([2, 9], [1, 5], [3, 6]))            # Hours Studied, Hours Slept
y = np.array(([92], [86], [89]), dtype=float)     # Test Score
y = y / 100   # scale the scores to 0-1 so they lie in the range of the sigmoid output

# Sigmoid function: maps any value into the range (0, 1)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Derivative of the sigmoid, used while backpropagating the error
def derivatives_sigmoid(x):
    return x * (1 - x)

# Variable initialization
epoch = 10000               # setting training iterations
lr = 0.1                    # setting learning rate
inputlayer_neurons = 2      # number of features in the data set
hiddenlayer_neurons = 3     # number of hidden layer neurons
output_neurons = 1          # number of neurons in the output layer

# Weight and bias initialization
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bias_hidden = np.random.uniform(size=(1, hiddenlayer_neurons))
weight_hidden = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bias_output = np.random.uniform(size=(1, output_neurons))

for i in range(epoch):
    # Forward propagation
    hinp1 = np.dot(X, wh)
    hinp = hinp1 + bias_hidden
    hlayer_activation = sigmoid(hinp)
    outinp1 = np.dot(hlayer_activation, weight_hidden)
    outinp = outinp1 + bias_output
    output = sigmoid(outinp)

    # Backpropagation
    EO = y - output                          # error at the output layer
    outgrad = derivatives_sigmoid(output)
    # Delta at the output layer: the error multiplied by the slope of the output activation
    d_output = EO * outgrad
    # Propagate the error back to the hidden layer through the weights between
    # the hidden and output layer (weight_hidden.T)
    EH = d_output.dot(weight_hidden.T)
    # How much the hidden layer weights contributed to the error
    hiddengrad = derivatives_sigmoid(hlayer_activation)
    d_hiddenlayer = EH * hiddengrad

    # Gradient descent weight and bias updates
    weight_hidden += hlayer_activation.T.dot(d_output) * lr
    bias_output += np.sum(d_output, axis=0, keepdims=True) * lr
    wh += X.T.dot(d_hiddenlayer) * lr
    bias_hidden += np.sum(d_hiddenlayer, axis=0, keepdims=True) * lr

print("Input:\n" + str(X))
print("Actual Output:\n" + str(y))
print("Predicted Output:\n" + str(output))
Output:
Input:
[[2 9]
[1 5]
[3 6]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.8921829 ]
[0.88212774]
[0.89429156]]
5. Write a program to implement the naïve Bayesian classifier for a sample training data set
stored as a .CSV file. Compute the accuracy of the classifier, considering a few test data sets.
import random
import numpy as np
import csv

def read_data(filename):
    """Read the csv file and return the header and the data rows."""
    with open(filename, 'r') as csvfile:
        datareader = csv.reader(csvfile)
        metadata = next(datareader)
        traindata = []
        for row in datareader:
            traindata.append(row)
    return metadata, traindata

def splitDataset(dataset, splitRatio):
    """Randomly split the data into a training set and a test set."""
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    testSet = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(testSet))
        trainSet.append(testSet.pop(index))
    return trainSet, testSet

def classify(data, test):
    total_size = data.shape[0]
    print("training data size=", total_size)
    print("test data size=", test.shape[0])
    countYes = 0
    countNo = 0
    probYes = 0
    probNo = 0
    print("target count probability")
    # Prior probabilities of the two classes
    for x in range(data.shape[0]):
        if data[x, data.shape[1] - 1] == 'yes':
            countYes += 1
        if data[x, data.shape[1] - 1] == 'no':
            countNo += 1
    probYes = countYes / total_size
    probNo = countNo / total_size
    print('Yes', "\t", countYes, "\t", probYes)
    print('No', "\t", countNo, "\t", probNo)

    prob0 = np.zeros((test.shape[1] - 1))
    prob1 = np.zeros((test.shape[1] - 1))
    accuracy = 0
    print("instance prediction target")
    for t in range(test.shape[0]):
        for k in range(test.shape[1] - 1):
            count1 = count0 = 0
            for j in range(data.shape[0]):
                # how many times the attribute value appeared with 'no'
                if test[t, k] == data[j, k] and data[j, data.shape[1] - 1] == 'no':
                    count0 += 1
                # how many times the attribute value appeared with 'yes'
                if test[t, k] == data[j, k] and data[j, data.shape[1] - 1] == 'yes':
                    count1 += 1
            prob0[k] = count0 / countNo     # conditional probability given 'no'
            prob1[k] = count1 / countYes    # conditional probability given 'yes'
        probno = probNo
        probyes = probYes
        for i in range(test.shape[1] - 1):
            probno = probno * prob0[i]
            probyes = probyes * prob1[i]
        if probno > probyes:
            predict = 'no'
        else:
            predict = 'yes'
        print(t + 1, "\t", predict, "\t ", test[t, test.shape[1] - 1])
        if predict == test[t, test.shape[1] - 1]:
            accuracy += 1
    final_accuracy = (accuracy / test.shape[0]) * 100
    print("accuracy", final_accuracy, "%")
    return

metadata, traindata = read_data("tennis.csv")
splitRatio = 0.6
trainingset, testset = splitDataset(traindata, splitRatio)
training = np.array(trainingset)
testing = np.array(testset)
classify(training, testing)
Dataset:
outlook temp humidity windy play
sunny hot high Weak no
sunny hot high Strong no
overcast hot high Weak yes
rainy mild high Weak yes
rainy cool normal Weak yes
rainy cool normal Strong no
overcast cool normal Strong yes
sunny mild high Weak no
sunny cool normal Weak yes
rainy mild normal Weak yes
sunny mild normal Strong yes
overcast mild high Strong yes
overcast hot normal Weak yes
rainy mild high Strong no
Output:
training data size= 8
test data size= 6
target count probability
Yes 4 0.5
No 4 0.5
instance prediction target
1 no yes
2 yes yes
3 no yes
4 yes yes
5 yes yes
6 no no
accuracy 66.66666666666666 %
6. Assuming a set of documents that need to be classified, use the naïve Bayesian Classifier
model to perform this task. Built-in Java classes/API can be used to write the program.
Calculate the accuracy, precision, and recall for your data set.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

msg = pd.read_csv('naivetext1.csv', names=['message', 'label'])   # names -> names of the columns
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})             # map the text labels to 1/0
X = msg.message
Y = msg.labelnum

# Split the messages into a training set and a test set
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)

# Build the document-term (bag-of-words) matrices
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm = count_vect.transform(xtest)
df = pd.DataFrame(xtrain_dtm.toarray(), columns=count_vect.get_feature_names())
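The listing above stops after building the document-term matrices; the training, prediction, and evaluation steps implied by the output below are not shown. A minimal sketch of those steps, assuming scikit-learn's MultinomialNB and metrics module and the variable names from the listing, might look like this:

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Train a multinomial naive Bayes classifier on the training document-term matrix
clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)

# Accuracy, confusion matrix, recall and precision on the held-out messages
print('Accuracy metrics')
print('Accuracy of the classifier is', metrics.accuracy_score(ytest, predicted))
print('Confusion matrix')
print(metrics.confusion_matrix(ytest, predicted))
print('Recall and Precision')
print(metrics.recall_score(ytest, predicted))
print(metrics.precision_score(ytest, predicted))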
Dataset:
I love this sandwich pos
This is an amazing place pos
I feel very good about these beers pos
This is my best work pos
What an awesome view pos
I do not like this restaurant neg
I am tired of this stuff neg
I can't deal with this neg
He is my sworn enemy neg
My boss is horrible neg
This is an awesome place pos
I do not like the taste of this juice neg
I love to dance pos
I am sick and tired of this place neg
What a great holiday pos
That is a bad locality to stay neg
We will have good fun tomorrow pos
I went to my enemy's house today neg
Output:
Accuracy metrics
Accuracy of the classifier is 0.8
Confusion matrix
[[1 1]
[0 3]]
Recall and Precision
1.0
0.75
7. Write a program to construct a Bayesian network considering medical data. Use this
model to demonstrate the diagnosis of heart patients using the standard Heart Disease Data Set.
You can use Java/Python ML library classes/API.
import pandas as pd
data=pd.read_csv("heart_disease_data1.csv")
heart_disease=pd.DataFrame(data)
print(heart_disease)
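The listing jumps from loading the data to querying HeartDisease_infer, which is never constructed. A minimal sketch of the missing model-building step, assuming pgmpy's BayesianModel, MaximumLikelihoodEstimator and VariableElimination, and a hypothetical choice of network edges, could be:

from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

# Hypothetical network structure over the columns of the data set;
# the lab may use a different set of edges
model = BayesianModel([('age', 'heartdisease'), ('Gender', 'heartdisease'),
                       ('Family', 'heartdisease'), ('diet', 'heartdisease'),
                       ('Lifestyle', 'heartdisease'), ('cholestrol', 'heartdisease')])

# Learn the conditional probability tables from the data
model.fit(heart_disease, estimator=MaximumLikelihoodEstimator)

# Inference object used by the query below
HeartDisease_infer = VariableElimination(model)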
q = HeartDisease_infer.query(variables=['heartdisease'], evidence={
        'age': int(input('enter age')),
        'Gender': int(input('enter Gender')),
        'Family': int(input('enter Family history')),
        'diet': int(input('enter diet')),
        'Lifestyle': int(input('enter Lifestyle')),
        'cholestrol': int(input('enter cholestrol'))
    })
print(q['heartdisease'])
Dataset:
age Gender Family diet Lifestyle cholestrol heartdisease
0 0 1 1 3 0 1
0 1 1 1 3 0 1
1 0 0 0 2 1 1
4 0 1 1 3 2 0
3 1 1 0 0 2 0
2 0 1 1 1 0 1
4 0 1 0 2 0 1
0 0 1 1 3 0 1
3 1 1 0 0 2 0
1 1 0 0 0 2 1
4 1 0 1 2 0 1
4 0 1 1 3 2 0
2 1 0 0 0 0 0
2 0 1 1 1 0 1
3 1 1 0 0 1 0
0 0 1 0 0 2 1
1 1 0 1 2 1 1
3 1 1 1 0 1 0
4 0 1 1 3 2 0
Output:
For age enter SuperSeniorCitizen:0, SeniorCitizen:1, MiddleAged:2, Youth:3,
Teen:4
For Gender Enter Male:0, Female:1
For Family History Enter yes:1, No:0
For diet Enter High:0, Medium:1
for lifeStyle Enter Athlete:0, Active:1, Moderate:2, Sedentary:3
for cholesterol Enter High:0, BorderLine:1, Normal:2
enter age2
enter Gender0
enter Family history1
enter diet1
enter Lifestyle1
enter cholestrol1
+----------------+---------------------+
| heartdisease | phi(heartdisease) |
+================+=====================+
| heartdisease_0 | 1.0000 |
+----------------+---------------------+
| heartdisease_1 | 0.0000 |
+----------------+---------------------+
8. Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same data set
for clustering using k-Means algorithm. Compare the results of these two algorithms and
comment on the quality of clustering. You can add Java/Python ML library classes/API in the
program.
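The manual gives no listing for this experiment; a minimal sketch, assuming scikit-learn's KMeans and GaussianMixture applied to the labelled Iris measurements so both clusterings can be compared against the true classes, could be:

from sklearn import datasets, metrics
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

# Load a labelled data set so the clusterings can be compared with the true classes
iris = datasets.load_iris()
X, y = iris.data, iris.target

# k-Means clustering
kmeans_labels = KMeans(n_clusters=3).fit_predict(X)

# EM-based clustering with a Gaussian mixture model
gmm_labels = GaussianMixture(n_components=3).fit(X).predict(X)

# Agreement of each clustering with the true labels (higher is better)
print("k-Means adjusted Rand index :", metrics.adjusted_rand_score(y, kmeans_labels))
print("GMM (EM) adjusted Rand index:", metrics.adjusted_rand_score(y, gmm_labels))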
Output:
Observation: The EM-based clustering with a Gaussian mixture model matched the true
labels more closely than k-Means.
9. Write a program to implement k-Nearest Neighbour algorithm to classify the iris data
set. Print both correct and wrong predictions. Java/Python ML library classes can be used for
this problem.
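No listing accompanies this program in the manual; a minimal sketch, assuming scikit-learn's KNeighborsClassifier on the built-in Iris data, could be:

from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris = datasets.load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)

classifier = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Print each test sample with its prediction, marking correct and wrong predictions
for i in range(len(x_test)):
    result = "Correct" if y_pred[i] == y_test[i] else "Wrong"
    print(x_test[i], "Predicted:", iris.target_names[y_pred[i]],
          "Actual:", iris.target_names[y_test[i]], "->", result)

print("Confusion matrix is as follows")
print(metrics.confusion_matrix(y_test, y_pred))
print("Accuracy Metrics")
print(metrics.classification_report(y_test, y_pred))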
Output:
Confusion matrix is as follows
[[10 0 0]
[ 0 16 1]
[ 0 1 17]]
Accuracy Metrics
precision recall f1-score support
10. Implement the non-parametric Locally Weighted Regression algorithm in order to fit data
points. Select an appropriate data set for your experiment and draw graphs.

import numpy as np
import matplotlib.pyplot as plt

# Generate a noisy synthetic data set
x = np.linspace(-3, 3, 1000)
y = np.log(np.abs((x ** 2) - 1) + 0.5)
x += np.random.normal(scale=0.05, size=1000)
plt.scatter(x, y, alpha=0.3)

def local_regression(x0, X, Y, tau):
    # Fit a weighted linear regression around the query point x0 and predict at x0
    x0 = np.r_[1, x0]                        # add the bias term
    X = np.c_[np.ones(len(X)), X]
    xw = X.T * radial_kernel(x0, X, tau)     # weight each example by its closeness to x0
    beta = np.linalg.pinv(xw @ X) @ xw @ Y   # weighted least-squares solution
    return x0 @ beta

def radial_kernel(x0, X, tau):
    # Gaussian weights: points near x0 get weights close to 1, distant points close to 0
    return np.exp(np.sum((X - x0) ** 2, axis=1) / (-2 * tau ** 2))

def plot_lwr(tau):
    # Predict on a grid of query points and overlay the fitted curve on the data
    domain = np.linspace(-3, 3, num=300)
    prediction = [local_regression(x0, x, y, tau) for x0 in domain]
    plt.scatter(x, y, alpha=0.3)
    plt.plot(domain, prediction, color="red")
    return plt

plot_lwr(0.04)
plt.show()
Output: