pg 1. Implement and demonstrate the FIND-S algorithm for finding the most specific hypothesis
based on a given set of training data samples. Read the training data from a .CSV file.
import csv

num_attributes = 6
a = []

print("\n the given training data set\n")
with open('sports.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        a.append(row)
        print(row)

print("\n the initial value of hypothesis: ")
hypothesis = ['0'] * num_attributes
print(hypothesis)

# initialise the hypothesis with the first training example
for j in range(0, num_attributes):
    hypothesis[j] = a[0][j]

print("\n find s: finding a maximally specific hypothesis\n")
for i in range(0, len(a)):
    # labels appear as 'Yes'/'no' in the file, so compare case-insensitively
    if a[i][num_attributes].lower() == 'yes':
        for j in range(0, num_attributes):
            if a[i][j] != hypothesis[j]:
                hypothesis[j] = '?'
            else:
                hypothesis[j] = a[i][j]
    print("for training instance no:{0} the hypothesis is".format(i), hypothesis)

print("output")
print(hypothesis)
output:
the given training data set
['sunny', 'warm', 'normal', 'strong', 'warm', 'same', 'Yes']
['sunny', 'warm', 'high', 'strong', 'warm', 'same', 'Yes']
['sunny', 'warm', 'normal', 'strong', 'warm', 'same', 'no']
['sunny', 'warm', 'high', 'strong', 'cool', 'change', 'Yes']
the initial value of hypothesis:
['0', '0', '0', '0', '0', '0']
find s: finding a maximally specific hypothesis
for training instance no:0 the hypothesis is ['sunny', 'warm', 'normal', 'strong', 'warm', 'same']
for training instance no:1 the hypothesis is ['sunny', 'warm', '?', 'strong', 'warm', 'same']
for training instance no:2 the hypothesis is ['sunny', 'warm', '?', 'strong', 'warm', 'same']
for training instance no:3 the hypothesis is ['sunny', 'warm', '?', 'strong', '?', '?']
output
['sunny', 'warm', '?', 'strong', '?', '?']
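Note: the sports.csv file read above is assumed to hold one example per row, six attribute values followed by the class label, with no header line, matching the rows echoed in the output, e.g.:
sunny,warm,normal,strong,warm,same,Yes
sunny,warm,high,strong,warm,same,Yes
sunny,warm,normal,strong,warm,same,no
sunny,warm,high,strong,cool,change,Yes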
pg 2: 2. For a given set of training data examples stored in a .CSV file, implement and
demonstrate the Candidate-Elimination algorithm to output a description of the set of all
hypotheses consistent with the training examples.
import csv

a = []
print("\n The Given Training Data Set\n")
with open(r'C:\Users\ISE-68\Desktop\ws.csv', 'r') as csvFile:
    reader = csv.reader(csvFile)
    for row in reader:
        a.append(row)
        print(row)

num_attributes = len(a[0]) - 1
print("\n The initial value of hypothesis: ")
S = ['0'] * num_attributes
G = ['?'] * num_attributes
print("\n The most specific hypothesis S0:[0,0,0,0,0,0]\n")
print("\n The most general hypothesis G0:[?,?,?,?,?,?]\n")

# compare with the first training example
for j in range(0, num_attributes):
    S[j] = a[0][j]

# compare with the remaining training examples of the given data set
print("\n Candidate Elimination algorithm hypotheses version space computation\n")
temp = []
for i in range(0, len(a)):
    if a[i][num_attributes] == 'Yes':
        # generalise S and prune hypotheses in G that are inconsistent with S
        for j in range(0, num_attributes):
            if a[i][j] != S[j]:
                S[j] = '?'
        for j in range(0, num_attributes):
            for k in range(1, len(temp)):
                if temp[k][j] != '?' and temp[k][j] != S[j]:
                    del temp[k]
        print("for Training Example No:{0} the hypothesis is S{0} ".format(i+1), S)
        if len(temp) == 0:
            print("for Training Example No:{0} the hypothesis is G{0} ".format(i+1), G)
        else:
            print("for Training Example No:{0} the hypothesis is G{0} ".format(i+1), temp)
    if a[i][num_attributes] == 'No':
        # specialise G for every attribute on which S disagrees with the negative example
        for j in range(0, num_attributes):
            if S[j] != a[i][j] and S[j] != '?':
                G[j] = S[j]
                temp.append(G)
                G = ['?'] * num_attributes
        print("for Training Example No:{0} the hypothesis is S{0} ".format(i+1), S)
        print("for Training Example No:{0} the hypothesis is G{0} ".format(i+1), temp)
output:
The Given Training Data Set
['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same', 'Yes']
['Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same', 'Yes']
['Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change', 'No']
['Sunny', 'Warm', 'High', 'Strong', 'Cold', 'Change', 'Yes']
The initial value of hypothesis:
The most specific hypothesis S0:[0,0,0,0,0,0]
The most general hypothesis G0:[?,?,?,?,?,?]
Candidate Elimination algorithm hypotheses version space computation
for Training Example No:1 the hypothesis is S1 ['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same']
for Training Example No:1 the hypothesis is G1 ['?', '?', '?', '?', '?', '?']
for Training Example No:2 the hypothesis is S2 ['Sunny', 'Warm', '?', 'Strong', 'Warm', 'Same']
for Training Example No:2 the hypothesis is G2 ['?', '?', '?', '?', '?', '?']
for Training Example No:3 the hypothesis is S3 ['Sunny', 'Warm', '?', 'Strong', 'Warm', 'Same']
for Training Example No:3 the hypothesis is G3 [['Sunny', '?', '?', '?', '?', '?'], ['?', 'Warm', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', 'Same']]
for Training Example No:4 the hypothesis is S4 ['Sunny', 'Warm', '?', 'Strong', '?', '?']
for Training Example No:4 the hypothesis is G4 [['Sunny', '?', '?', '?', '?', '?'], ['?', 'Warm', '?', '?', '?', '?']]
pg 3: 3. Write a program to demonstrate the working of the decision tree based ID3 algorithm.
Use an appropriate data set for building the decision tree and apply this knowledge to classify a
new sample.
import numpy as np
import math
import csv

class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""

    def __str__(self):
        return self.attribute

def read_data(filename):
    with open(filename, 'r') as csvfile:
        datareader = csv.reader(csvfile)
        metadata = next(datareader)
        traindata = []
        for row in datareader:
            traindata.append(row)
        return (metadata, traindata)

def subtables(data, col, delete):
    dict = {}
    items = np.unique(data[:, col])   # get unique values in the given column
    count = np.zeros((items.shape[0], 1), dtype=np.int32)   # one counter per unique value
    for x in range(items.shape[0]):
        for y in range(data.shape[0]):
            if data[y, col] == items[x]:
                count[x] += 1
    # count holds the number of rows carrying each value of the column
    for x in range(items.shape[0]):
        dict[items[x]] = np.empty((int(count[x]), data.shape[1]), dtype="|S32")
        pos = 0
        for y in range(data.shape[0]):
            if data[y, col] == items[x]:
                dict[items[x]][pos] = data[y]
                pos += 1
        if delete:
            dict[items[x]] = np.delete(dict[items[x]], col, 1)
    return items, dict

def entropy(S):
    items = np.unique(S)
    if items.size == 1:
        return 0
    counts = np.zeros((items.shape[0], 1))
    sums = 0
    for x in range(items.shape[0]):
        counts[x] = sum(S == items[x]) / (S.size)
    for count in counts:
        sums += -1 * count * math.log(count, 2)
    return sums

def gain_ratio(data, col):
    items, dict = subtables(data, col, delete=False)
    # items are the unique values and dict holds the rows corresponding to each value
    total_size = data.shape[0]
    entropies = np.zeros((items.shape[0], 1))
    for x in range(items.shape[0]):
        ratio = dict[items[x]].shape[0] / (total_size)
        entropies[x] = ratio * entropy(dict[items[x]][:, -1])
    total_entropy = entropy(data[:, -1])
    for x in range(entropies.shape[0]):
        total_entropy -= entropies[x]
    return total_entropy

def create_node(data, metadata):
    if (np.unique(data[:, -1])).shape[0] == 1:
        node = Node("")
        node.answer = np.unique(data[:, -1])
        return node
    gains = np.zeros((data.shape[1] - 1, 1))
    # size of gains = number of attributes for which the gain is computed
    for col in range(data.shape[1] - 1):
        gains[col] = gain_ratio(data, col)
    split = np.argmax(gains)
    node = Node(metadata[split])
    metadata = np.delete(metadata, split, 0)
    items, dict = subtables(data, split, delete=True)
    for x in range(items.shape[0]):
        child = create_node(dict[items[x]], metadata)
        node.children.append((items[x], child))
    return node

def empty(size):
    s = ""
    for x in range(size):
        s += " "
    return s

def print_tree(node, level):
    if node.answer != "":
        print(empty(level), node.answer)
        return
    print(empty(level), node.attribute)
    for value, n in node.children:
        print(empty(level + 1), value)
        print_tree(n, level + 2)

metadata, traindata = read_data("tennis.csv")
data = np.array(traindata)
node = create_node(data, metadata)
print_tree(node, 0)
output:
 Outlook
  Overcast
   [b'Yes']
  Rainy
   Windy
    b'False'
     [b'Yes']
    b'True'
     [b'No']
  Sunny
   Humidity
    b'High'
     [b'No']
    b'Normal'
     [b'Yes']
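The exercise also asks to apply the tree to classify a new sample, which the listing above does not do. A minimal helper for that step is sketched below; it is not part of the original program and assumes the Node fields defined above, that subtables() drops the split column when a subtree is built, and that attribute values below the root are stored as bytes ("|S32"):
def classify(node, sample, metadata):
    # leaf node: return the stored class label
    if node.answer != "":
        return node.answer
    meta = list(metadata)
    idx = meta.index(node.attribute)   # column this node splits on
    for value, child in node.children:
        v = value.decode() if isinstance(value, bytes) else str(value)
        if v == sample[idx]:
            # drop the consumed column, mirroring how the subtree was built
            return classify(child, sample[:idx] + sample[idx+1:], meta[:idx] + meta[idx+1:])
    return None   # attribute value not seen in training
# hypothetical new sample, attribute order as in tennis.csv without the label column:
# print(classify(node, ['Sunny', 'Cool', 'High', 'True'], metadata))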
pg 4: 4. Build an Artificial Neural Network by implementing the Backpropagation algorithm and
test the same using appropriate data sets.
import numpy as np

X = np.array(([2,9],[1,5],[3,6]), dtype=float)
print("X=", X)
y = np.array(([92], [86], [89]), dtype=float)
print("y=", y)
y = y/100   # scale the target into the range of the sigmoid output
print("y=", y)

def sigmoid(x):
    return 1/(1 + np.exp(-x))

def derivatives_sigmoid(x):
    return x * (1 - x)

epoch = 10000
lr = 0.1
inputlayer_neurons = 2
hiddenlayer_neurons = 3
output_neurons = 1

wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
print("wh\n", wh)
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
print("bh\n", bh)
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
print("wout\n", wout)
bout = np.random.uniform(size=(1, output_neurons))
print("bout\n", bout)

for i in range(epoch):
    hinp1 = np.dot(X, wh)
    print("\n\n hinp1\n", hinp1)
    hinp = hinp1 + bh
    print("hinp\n", hinp)
    hlayer_act = sigmoid(hinp)
    print("hlayer_act\n", hlayer_act)
    outinp1 = np.dot(hlayer_act, wout)
    print("outinp1\n", outinp1)
    outinp = outinp1 + bout
    print("outinp\n", outinp)
    output = sigmoid(outinp)
    print("output\n", output)
    EO = y - output
    print("EO\n", EO)
    outgrad = derivatives_sigmoid(output)
    print("outgrad\n", outgrad)
    d_output = EO * outgrad
    print("d_output\n", d_output)
    EH = d_output.dot(wout.T)
    print("EH\n", EH)
    hiddengrad = derivatives_sigmoid(hlayer_act)
    print("hiddengrad\n", hiddengrad)
    d_hiddenlayer = EH * hiddengrad
    print("d_hiddenlayer\n", d_hiddenlayer)
    wout += hlayer_act.T.dot(d_output) * lr
    print("wout\n", wout)
    bout += np.sum(d_output, axis=0, keepdims=True) * lr
    print("bout\n", bout)
    wh += X.T.dot(d_hiddenlayer) * lr
    print("wh\n", wh)
    bh += np.sum(d_hiddenlayer, axis=0, keepdims=True) * lr
    print("bh\n", bh)

print("\n\n input: \n" + str(X))
print("actual output: \n" + str(y))
print("predicted output: \n", output)
output:
(omitted: the per-epoch prints inside the training loop make the log too large to reproduce here)
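The log is large because every intermediate value is printed on each of the 10000 epochs. A leaner variant of the same training loop (a sketch reusing the variable names above, with the prints moved outside the loop) would be:
for i in range(epoch):
    # forward pass
    hlayer_act = sigmoid(np.dot(X, wh) + bh)
    output = sigmoid(np.dot(hlayer_act, wout) + bout)
    # backward pass
    d_output = (y - output) * derivatives_sigmoid(output)
    d_hiddenlayer = d_output.dot(wout.T) * derivatives_sigmoid(hlayer_act)
    # weight and bias updates
    wout += hlayer_act.T.dot(d_output) * lr
    bout += np.sum(d_output, axis=0, keepdims=True) * lr
    wh += X.T.dot(d_hiddenlayer) * lr
    bh += np.sum(d_hiddenlayer, axis=0, keepdims=True) * lr
print("input:\n", X)
print("actual output:\n", y)
print("predicted output:\n", output)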
pg 5: 5. Write a program to implement the naïve Bayesian classifier for a sample training data
set stored as a .CSV file. Compute the accuracy of the classifier, considering a few test data sets.
import numpy as np
import math
import csv

def read_data(filename):
    with open(filename, 'r') as csvfile:
        datareader = csv.reader(csvfile)
        metadata = next(datareader)
        traindata = []
        for row in datareader:
            traindata.append(row)
        return (metadata, traindata)

def splitDataset(dataset, splitRatio):
    # splits the dataset into a training set and a test set based on the split ratio
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    testset = list(dataset)
    i = 0
    while len(trainSet) < trainSize:
        trainSet.append(testset.pop(i))
    return [trainSet, testset]

def classify(data, test):
    total_size = data.shape[0]
    print("training data size=", total_size)
    print("test data size=", test.shape[0])
    target = np.unique(data[:, -1])
    count = np.zeros((target.shape[0]), dtype=np.int32)
    prob = np.zeros((target.shape[0]), dtype=np.float32)
    print("target count probability")
    for y in range(target.shape[0]):
        for x in range(data.shape[0]):
            if data[x, data.shape[1]-1] == target[y]:
                count[y] += 1
        prob[y] = count[y] / total_size   # computes the prior probability of each target value
        print(target[y], "\t", count[y], "\t", prob[y])
    prob0 = np.zeros((test.shape[1]-1), dtype=np.float32)
    prob1 = np.zeros((test.shape[1]-1), dtype=np.float32)
    accuracy = 0
    print("Instance prediction target")
    for t in range(test.shape[0]):
        for k in range(test.shape[1]-1):   # for each attribute column
            count1 = count0 = 0
            for j in range(data.shape[0]):
                if test[t, k] == data[j, k] and data[j, data.shape[1]-1] == target[0]:
                    count0 += 1
                elif test[t, k] == data[j, k] and data[j, data.shape[1]-1] == target[1]:
                    count1 += 1
            prob0[k] = count0 / count[0]   # conditional probability of the attribute value given 'No'
            prob1[k] = count1 / count[1]   # conditional probability of the attribute value given 'Yes'
        probno = prob[0]
        probyes = prob[1]
        for i in range(test.shape[1]-1):
            probno = probno * prob0[i]
            probyes = probyes * prob1[i]
        if probno > probyes:   # prediction
            predict = 'No'
        else:
            predict = 'Yes'
        print(t+1, "\t", predict, "\t ", test[t, test.shape[1]-1])
        if predict == test[t, test.shape[1]-1]:   # computing accuracy
            accuracy += 1
    final_accuracy = (accuracy / test.shape[0]) * 100
    print("accuracy", final_accuracy, "%")
    return

metadata, traindata = read_data("tennis.csv")
splitRatio = 0.4
trainingset, testset = splitDataset(traindata, splitRatio)
training = np.array(trainingset)
testing = np.array(testset)
print("------------------Training Data-------------------")
print(trainingset)
print("-------------------Test Data-------------------")
print(testset)
classify(training, testing)
output:
------------------Training Data-------------------
[['Sunny', 'Hot', 'High', 'False', 'No'], ['Sunny', 'Hot', 'High', 'True', 'No'], ['Overcast', 'Hot', 'High', 'False', 'Yes'], ['Rainy', 'Mild', 'High', 'False', 'Yes'], ['Rainy', 'Cool', 'Normal', 'False', 'Yes']]
-------------------Test Data-------------------
[['Rainy', 'Cool', 'Normal', 'True', 'No'], ['Overcast', 'Cool', 'Normal', 'True', 'Yes'], ['Sunny', 'Mild', 'High', 'False', 'No'], ['Sunny', 'Cool', 'Normal', 'False', 'Yes'], ['Rainy', 'Mild', 'Normal', 'False', 'Yes'], ['Sunny', 'Mild', 'Normal', 'True', 'Yes'], ['Overcast', 'Mild', 'High', 'True', 'Yes'], ['Overcast', 'Hot', 'Normal', 'False', 'Yes'], ['Rainy', 'Mild', 'High', 'True', 'No']]
training data size= 5
test data size= 9
target count probability
No 2 0.4
Yes 3 0.6
Instance prediction target
1 Yes No
2 Yes Yes
3 Yes No
4 Yes Yes
5 Yes Yes
6 Yes Yes
7 Yes Yes
8 Yes Yes
9 Yes No
accuracy 66.66666666666666 %
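As a worked check of the first test instance ('Rainy', 'Cool', 'Normal', 'True') against the five training rows above: the priors are P(No) = 2/5 = 0.4 and P(Yes) = 3/5 = 0.6; the attribute counts give P(Rainy|No) = 0/2, P(Cool|No) = 0/2, P(Normal|No) = 0/2, P(True|No) = 1/2 and P(Rainy|Yes) = 2/3, P(Cool|Yes) = 1/3, P(Normal|Yes) = 1/3, P(True|Yes) = 0/3, so probno = 0.4 x 0 x 0 x 0 x 0.5 = 0 and probyes = 0.6 x (2/3) x (1/3) x (1/3) x 0 = 0. Since probno > probyes is false, the tie is broken as 'Yes'. Zero counts like these on such a small training split (no Laplace smoothing is applied) are largely why every test instance is predicted 'Yes'.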
pg 6: 6. Assuming a set of documents that need to be classified, use the naïve Bayesian
Classifier model to perform this task. Built-in Java classes/API can be used to write the
program. Calculate the accuracy, precision, and recall for your data set.
import pandas as pd
msg=pd.read_csv('naivetext1.csv',names=['message','label'])
print('The dimensions of the dataset',msg.shape)
msg['labelnum']=msg.label.map({'pos':1,'neg':0})
X=msg.message
y=msg.labelnum
print(X)
print(y)
#splitting the dataset into train and test data
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(X,y)
print(xtest.shape)
print(xtrain.shape)
print(ytest.shape)
print(ytrain.shape)
#output of count vectoriser is a sparse matrix
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm=count_vect.transform(xtest)
print(count_vect.get_feature_names())
df=pd.DataFrame(xtrain_dtm.toarray(),columns=count_vect.get_feature_names())
print(df)#tabular representation
print(xtrain_dtm) #sparse matrix representation
#Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(xtrain_dtm,ytrain)
predicted = clf.predict(xtest_dtm)
#printing accuracy metrics
from sklearn import metrics
print('Accuracy metrics')
print('Accuracy of the classifier is',metrics.accuracy_score(ytest,predicted))
print('Confusion matrix')
print(metrics.confusion_matrix(ytest,predicted))
print('Recall and Precision')
print(metrics.recall_score(ytest,predicted))
print(metrics.precision_score(ytest,predicted))
output:
Accuracy metrics
Accuracy of the classifier is 0.6
Confusion matrix
[[1 2]
[0 2]]
Recall and Precision
1.0
0.5
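As a cross-check against the confusion matrix [[1 2], [0 2]] (rows are actual neg/pos, columns are predicted neg/pos): accuracy = (1 + 2)/5 = 0.6, recall = 2/(2 + 0) = 1.0 and precision = 2/(2 + 2) = 0.5, matching the printed values.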
pg 7:
7. Write a program to construct a Bayesian network considering medical data. Use this model to
demonstrate the diagnosis of heart patients using the standard Heart Disease Data Set. You can
use Java/Python ML library classes/API.
# data analysis, splitting and wrangling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# machine learning
from sklearn.naive_bayes import GaussianNB
# column names in accordance with feature information
col_names = ['age','sex','chest_pain','blood_pressure',
'serum_cholestoral','fasting_blood_sugar', 'electrocardiographic',
'max_heart_rate','induced_angina','ST_depression',
'slope','no_of_vessels','thal','diagnosis']
# read the file
df = pd.read_csv("heart_disease_dataset.csv", names=col_names, header=None,
                 na_values="?")
print("Number of records: {}\nNumber of variables: {}".format(df.shape[0], df.shape[1]))
# display the first 5 lines
df.head()
df.info()
# extract numeric columns and find categorical ones
numeric_columns = ['serum_cholestoral', 'max_heart_rate', 'age', 'blood_pressure',
'ST_depression']
categorical_columns = [c for c in df.columns if c not in numeric_columns]
print(categorical_columns)
# count values of explained variable
df.diagnosis.value_counts()
# create a boolean vector and map it with corresponding values (True=1, False=0)
df.diagnosis = (df.diagnosis != 0).astype(int)
df.diagnosis.value_counts()
# view of descriptive statistics
df[numeric_columns].describe()
# count ill vs healthy people grouped by sex
df.groupby(['sex','diagnosis'])['diagnosis'].count()
# average diagnosis rate grouped by number of blood vessels detected by fluoroscopy
df[['no_of_vessels','diagnosis']].groupby('no_of_vessels').mean()
# show columns having missing values
df.isnull().sum()
# fill missing values with mode
df['no_of_vessels'].fillna(df['no_of_vessels'].mode()[0], inplace=True)
df['thal'].fillna(df['thal'].mode()[0], inplace=True)
# extract the target variable
X, y = df.iloc[:, :-1], df.iloc[:, -1]
print(X.shape)
print(y.shape)
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2606)
print ("train_set_x shape: " + str(X_train.shape))
print ("train_set_y shape: " + str(y_train.shape))
print ("test_set_x shape: " + str(X_test.shape))
print ("test_set_y shape: " + str(y_test.shape))
# scale feature matrices
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
model = GaussianNB()
# train model
model.fit(X_train,y_train)
# check accuracy and print out the results
fit_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print(f"Train accuracy: {fit_accuracy:0.2%}")
print(f"Test accuracy: {test_accuracy:0.2%}")
output:
Number of records: 303
Number of variables: 14
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age 303 non-null float64
sex 303 non-null float64
chest_pain 303 non-null float64
blood_pressure 303 non-null float64
serum_cholestoral 303 non-null float64
fasting_blood_sugar 303 non-null float64
electrocardiographic 303 non-null float64
max_heart_rate 303 non-null float64
induced_angina 303 non-null float64
ST_depression 303 non-null float64
slope 303 non-null float64
no_of_vessels 299 non-null float64
thal 301 non-null float64
diagnosis 303 non-null int64
dtypes: float64(13), int64(1)
memory usage: 33.2 KB
['sex', 'chest_pain', 'fasting_blood_sugar', 'electrocardiographic', 'induced_angina', 'slope', 'no_of_vessels', 'thal', 'diagnosis']
(303, 13)
(303,)
train_set_x shape: (212, 13)
train_set_y shape: (212,)
test_set_x shape: (91, 13)
test_set_y shape: (91,)
Train accuracy: 85.38%
Test accuracy: 86.81%
pg 8:
8. Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same dataset for
clustering using k-Means algorithm. Compare the results of these two algorithms and comment
on the quality of clustering. You can add Java/Python ML library classes/API in the program.
#from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
data=pd.read_csv("clusterdata.csv")
df1=pd.DataFrame(data)
print(df1)
f1 = df1['Distance_Feature'].values
f2 = df1['Speeding_Feature'].values
X=np.matrix(list(zip(f1,f2)))
plt.plot(1)
plt.subplot(511)
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.title('Dataset')
plt.ylabel('speeding_feature')
plt.xlabel('distance_feature')
plt.scatter(f1,f2)
colors = ['b', 'g', 'r']
markers = ['o', 'v', 's']
# create new plot and data for K- means algorithm
plt.plot(2)
ax=plt.subplot(513)
kmeans_model = KMeans(n_clusters=3).fit(X)
for i, l in enumerate(kmeans_model.labels_):
    fig1 = plt.plot(f1[i], f2[i], color=colors[l], marker=markers[l])
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.title('K- Means')
plt.ylabel('speeding_feature')
plt.xlabel('distance_feature')
# create new plot and data for gaussian mixture
plt.plot(3)
plt.subplot(515)
gmm=GaussianMixture(n_components=3).fit(X)
labels= gmm.predict(X)
for i, l in enumerate(labels):
    plt.plot(f1[i], f2[i], color=colors[l], marker=markers[l])
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.title('Gaussian Mixture')
plt.ylabel('speeding_feature')
plt.xlabel('distance_feature')
plt.show()
output:
Driver_ID Distance_Feature Speeding_Feature
0 3423311935 71.24 28
1 3423313212 52.53 25
2 3423313724 64.54 27
3 3423311373 55.69 22
4 3423310999 54.58 25
5 3423313857 41.91 10
6 3423312432 58.64 20
7 3423311434 52.02 8
8 3423311328 31.25 34
9 3423312488 44.31 19
10 3423311254 49.35 40
11 3423312943 58.07 45
12 3423312536 44.22 22
13 3423311542 55.73 19
14 3423312176 46.63 43
15 3423314176 52.97 32
16 3423314202 46.25 35
17 3423311346 51.55 27
18 3423310666 57.05 26
19 3423313527 58.45 30
20 3423312182 43.42 23
21 3423313590 55.68 37
22 3423312268 55.15 18
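To compare the quality of the two clusterings numerically, as the exercise asks, a silhouette score can be computed for both label assignments (a sketch assuming the same X, kmeans_model and labels variables from the listing; values closer to 1 indicate better-separated clusters):
from sklearn.metrics import silhouette_score
print("k-Means silhouette score:", silhouette_score(np.asarray(X), kmeans_model.labels_))
print("GMM silhouette score:", silhouette_score(np.asarray(X), labels))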
pg 9:
9. Write a program to implement the k-Nearest Neighbour algorithm to classify the iris data set. Print
both correct and wrong predictions. Java/Python ML library classes can be used for this
problem.
Program 9
#KNN algorithm
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix
#from sklearn import metrics
import pandas as pd
import numpy as np
from sklearn import datasets
iris=datasets.load_iris()
iris_data=iris.data
iris_labels=iris.target
#print(iris_data)
#print(iris_labels)
X_train,x_test,y_train,y_test=train_test_split(iris_data,iris_labels,test_size=0.20)
classifier=KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train,y_train)
y_pred=classifier.predict(x_test)
print('confusion_matrix is as follows')
print(confusion_matrix(y_test,y_pred))
print('Accuracy Metrics')
print(classification_report(y_test,y_pred))
output:
confusion_matrix is as follows
[[11 0 0]
[ 0 9 0]
[ 0 1 9]]
Accuracy Metrics
precision recall f1-score support
0 1.00 1.00 1.00 11
1 0.90 1.00 0.95 9
2 1.00 0.90 0.95 10
avg / total 0.97 0.97 0.97 30
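The exercise also asks to print the correct and wrong predictions individually, which the listing above does not do. A small addition after y_pred is computed (a sketch reusing the listing's variable names) could be:
for i, (actual, pred) in enumerate(zip(y_test, y_pred)):
    result = "Correct" if actual == pred else "Wrong"
    print(i, "predicted:", iris.target_names[pred], "actual:", iris.target_names[actual], "->", result)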
pg 10:
10. Implement the non-parametric Locally Weighted Regression algorithm in order to fit data
points. Select an appropriate data set for your experiment and draw graphs.
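For reference, the kernel(), localWeight() and localWeightRegression() functions below implement the standard locally weighted regression equations: each training point x_j receives a weight w_j = exp(-(x - x_j)(x - x_j)^T / (2k^2)) around the query point x, the local coefficients are beta(x) = (X^T W X)^(-1) X^T W y with W = diag(w_1, ..., w_m), and the prediction is yhat(x) = x * beta(x); the bandwidth k controls how local the fit is.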
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np1

def kernel(point, xmat, k):
    m, n = np1.shape(xmat)
    weights = np1.mat(np1.eye((m)))
    for j in range(m):
        diff = point - X[j]
        weights[j, j] = np1.exp(diff * diff.T / (-2.0 * k**2))
    return weights

def localWeight(point, xmat, ymat, k):
    wei = kernel(point, xmat, k)
    W = (X.T * (wei * X)).I * (X.T * (wei * ymat.T))
    return W

def localWeightRegression(xmat, ymat, k):
    m, n = np1.shape(xmat)
    ypred = np1.zeros(m)
    for i in range(m):
        ypred[i] = xmat[i] * localWeight(xmat[i], xmat, ymat, k)
    return ypred

# load data points
data = pd.read_csv('tips1.csv')
bill = np1.array(data.total_bill)
tip = np1.array(data.tip)

# prepare the design matrix: add a column of ones in front of bill
mbill = np1.mat(bill)   # mat treats the array as a matrix
mtip = np1.mat(tip)
m = np1.shape(mbill)[1]
print("******", m)
one = np1.mat(np1.ones(m))
X = np1.hstack((one.T, mbill.T))   # stack arrays in sequence horizontally (column wise)

# set k (the bandwidth) here
ypred = localWeightRegression(X, mtip, 2)
SortIndex = X[:, 1].argsort(0)
xsort = X[SortIndex][:, 0]

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(bill, tip, color='blue')
ax.plot(xsort[:, 1], ypred[SortIndex], color='red', linewidth=1)
plt.xlabel('Total bill')
plt.ylabel('Tip')
#plt.show();
output:
Text(0,0.5,'Tip')