ML lab manual
Semester: 6
Scheme: 2022
Program 1
Develop a program to create histograms for all numerical features and analyze the distribution of each
feature. Generate box plots for all numerical features and identify any outliers. Use California Housing
dataset.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

# Load the California Housing dataset as a DataFrame
housing_df = fetch_california_housing(as_frame=True).frame
numerical_features = housing_df.select_dtypes(include=[np.number]).columns

# Plot a histogram with a KDE overlay for each numerical feature
plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.histplot(housing_df[feature], kde=True, bins=30, color='blue')
    plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()
Outliers Detection:
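The listing above stops before the box-plot half of the exercise. Below is a minimal sketch that draws a box plot per feature and flags outliers with the 1.5 × IQR rule; the threshold matches seaborn's default whiskers, and treating it as the outlier cutoff is an assumption.

plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.boxplot(y=housing_df[feature], color='orange')
    plt.title(f'Box Plot of {feature}')
plt.tight_layout()
plt.show()

# Count points outside the 1.5 * IQR fences for each feature
for feature in numerical_features:
    q1, q3 = housing_df[feature].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    outliers = housing_df[(housing_df[feature] < lower) | (housing_df[feature] > upper)]
    print(f"{feature}: {len(outliers)} outliers")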
Program 2
Develop a program to compute the correlation matrix to understand the relationships between pairs of
features. Visualize the correlation matrix using a heatmap to know which variables have strong
positive/negative correlations. Create a pair plot to visualize pairwise relationships between features.
Use California Housing dataset.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
# Step 1: Load the California Housing Dataset
california_data = fetch_california_housing(as_frame=True)
data = california_data.frame
# Step 2: Compute the correlation matrix
correlation_matrix = data.corr()
# Step 3: Visualize the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of California Housing Features')
plt.show()
# Step 4: Create a pair plot to visualize pairwise relationships
sns.pairplot(data, diag_kind='kde', plot_kws={'alpha': 0.5})
plt.suptitle('Pair Plot of California Housing Features', y=1.02)
plt.show()
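Note: the pair plot above is drawn over all 20,640 rows and can take a long time to render. A sampled variant runs much faster; the sample size of 500 is an arbitrary choice for illustration:

sns.pairplot(data.sample(500, random_state=1), diag_kind='kde', plot_kws={'alpha': 0.5})
plt.show()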
Output:
Program 3
Develop a program to implement Principal Component Analysis (PCA) for reducing the dimensionality
of the Iris dataset from 4 features to 2.
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
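The manual shows only the imports for this program; below is a minimal sketch of the remaining steps, using the standard scikit-learn PCA API.

# Load the Iris dataset (4 features) and reduce it to 2 principal components
iris = load_iris()
X, y = iris.data, iris.target
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X)
print("Explained variance ratio:", pca.explained_variance_ratio_)

# Scatter plot of the two components, coloured by species
plt.figure(figsize=(8, 6))
for label, name in enumerate(iris.target_names):
    plt.scatter(X_reduced[y == label, 0], X_reduced[y == label, 1], label=name)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Iris Dataset (4 features to 2)')
plt.legend()
plt.show()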
Output:
Program 4
For a given set of training data examples stored in a .CSV file, implement and demonstrate the Find-S
algorithm to output a description of the set of all hypotheses consistent with the training examples.
import pandas as pd

def find_s_algorithm(file_path):
    data = pd.read_csv(file_path)
    print("Training data:")
    print(data)
    attributes = data.columns[:-1]
    class_label = data.columns[-1]
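    # The manual's listing stops here; what follows is a minimal sketch of
    # the remaining Find-S steps (assumes positive examples are labelled
    # 'Yes', as in the output below)
    hypothesis = None
    for _, row in data.iterrows():
        if row[class_label] == 'Yes':
            if hypothesis is None:
                # Initialise with the first positive example
                hypothesis = list(row[attributes])
            else:
                # Generalise any attribute that disagrees with this positive example
                hypothesis = [h if h == v else '?' for h, v in zip(hypothesis, row[attributes])]
    print("\nFinal hypothesis:", hypothesis)
    return hypothesis

find_s_algorithm('training_data.csv')  # hypothetical file name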
Output:
Training data:
Outlook Temperature Humidity Windy PlayTennis
0 Sunny Hot High False No
1 Sunny Hot High True No
2 Overcast Hot High False Yes
3 Rain Cold High False Yes
4 Rain Cold High True No
5 Overcast Hot High True Yes
6 Sunny Hot High False No
Program 5
Develop a program to implement the k-Nearest Neighbour algorithm to classify 100 randomly generated
values of x in the range [0,1]. Perform the following based on the dataset generated.
a. Label the first 50 points {x1,……,x50} as follows: if (xi ≤ 0.5), then xi ∊ Class1, else xi ∊ Class2
b. Classify the remaining points, x51,……,x100 using KNN. Perform this for k=1,2,3,4,5,20,30
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# Generate 100 random values in [0, 1]
data = np.random.rand(100)
# Label the first 50 points: Class1 if x <= 0.5, else Class2
labels = ["Class1" if x <= 0.5 else "Class2" for x in data[:50]]

def knn_classifier(train_data, train_labels, test_point, k):
    # Distance from the test point to every training point
    distances = [(abs(test_point - train_data[i]), train_labels[i]) for i in range(len(train_data))]
    distances.sort(key=lambda x: x[0])
    k_nearest_neighbors = distances[:k]
    k_nearest_labels = [label for _, label in k_nearest_neighbors]
    # Majority vote among the k nearest neighbours
    return Counter(k_nearest_labels).most_common(1)[0][0]

train_data = data[:50]
train_labels = labels
test_data = data[50:]
k_values = [1, 2, 3, 4, 5, 20, 30]
results = {}
for k in k_values:
    print(f"Results for k = {k}:")
    classified_labels = [knn_classifier(train_data, train_labels, test_point, k) for test_point in test_data]
    results[k] = classified_labels
    for i, (point, label) in enumerate(zip(test_data, classified_labels)):
        print(f"Point x{51 + i} (value: {point:.4f}) is classified as {label}")
    print("Classification complete.\n")

for k in k_values:
    classified_labels = results[k]
    class1_points = [test_data[i] for i in range(len(test_data)) if classified_labels[i] == "Class1"]
    class2_points = [test_data[i] for i in range(len(test_data)) if classified_labels[i] == "Class2"]
    plt.figure(figsize=(10, 6))
    plt.scatter(train_data, [0] * len(train_data),
                c=["blue" if label == "Class1" else "red" for label in train_labels],
                label="Training Data", marker="o")
    plt.scatter(class1_points, [1] * len(class1_points), c="blue", label="Class1 (Test)", marker="x")
    plt.scatter(class2_points, [1] * len(class2_points), c="red", label="Class2 (Test)", marker="x")
    plt.title(f"KNN Classification (k = {k})")
    plt.legend()
    plt.show()
Output:
Results for k = 1:
Point x51 (value: 0.2059) is classified as Class1
Point x52 (value: 0.2535) is classified as Class1
Point x53 (value: 0.4856) is classified as Class1
Point x54 (value: 0.9651) is classified as Class2
Point x55 (value: 0.3906) is classified as Class1
Program 6
Implement the non-parametric Locally Weighted Regression algorithm in order to fit data points. Select
an appropriate data set for your experiment and draw graphs.
import numpy as np
import matplotlib.pyplot as plt
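# The manual omits the regression computation itself; below is a minimal
# sketch using a Gaussian kernel and synthetic noisy-sine data. The data
# set, tau, and grid sizes are assumptions chosen only for illustration.
def locally_weighted_regression(x_query, X, y, tau):
    # Gaussian weights centred on the query point
    weights = np.exp(-(X - x_query) ** 2 / (2 * tau ** 2))
    # Weighted least squares on a [1, x] design matrix
    A = np.c_[np.ones_like(X), X]
    W = np.diag(weights)
    theta = np.linalg.pinv(A.T @ W @ A) @ A.T @ W @ y
    return np.array([1.0, x_query]) @ theta

np.random.seed(42)
X = np.linspace(0, 2 * np.pi, 100)
y = np.sin(X) + 0.1 * np.random.randn(100)
tau = 0.5
x_test = np.linspace(0, 2 * np.pi, 200)
y_pred = np.array([locally_weighted_regression(x, X, y, tau) for x in x_test])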
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='red', label='Training Data', alpha=0.7)
plt.plot(x_test, y_pred, color='blue', label=f'LWR Fit (tau={tau})', linewidth=2)
plt.xlabel('X', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Locally Weighted Regression', fontsize=14)
plt.legend(fontsize=10)
plt.grid(alpha=0.3)
plt.show()
Output:
Program 7
Develop a program to demonstrate the working of Linear Regression and Polynomial Regression. Use
Boston Housing Dataset for Linear Regression and Auto MPG Dataset (for vehicle fuel efficiency
prediction) for Polynomial Regression.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
def linear_regression_california():
    # Note: the Boston Housing dataset was removed from scikit-learn
    # (load_boston is gone as of version 1.2), so California Housing is
    # used for the linear regression part instead
    housing = fetch_california_housing(as_frame=True)
    X = housing.data[["AveRooms"]]
    y = housing.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Linear Regression (California Housing)")
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("R^2:", r2_score(y_test, y_pred))
def polynomial_regression_auto_mpg():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
    column_names = ["mpg", "cylinders", "displacement", "horsepower", "weight",
                    "acceleration", "model_year", "origin"]
    # The quoted "car name" field follows a tab in the raw file; comment='\t'
    # drops it so the eight numeric columns line up with the names above
    data = pd.read_csv(url, sep='\s+', names=column_names, na_values="?", comment='\t')
    data = data.dropna()
    X = data["displacement"].values.reshape(-1, 1)
    y = data["mpg"].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Degree-2 polynomial features, scaled, feeding a linear model
    poly_model = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(), LinearRegression())
    poly_model.fit(X_train, y_train)
    y_pred = poly_model.predict(X_test)
    print("Polynomial Regression (Auto MPG)")
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("R^2:", r2_score(y_test, y_pred))
if __name__ == "__main__":
print("Demonstrating Linear Regression and Polynomial Regression\n")
linear_regression_california()
polynomial_regression_auto_mpg()
Output:
Program 8
Develop a program to demonstrate the working of the decision tree algorithm. Use Breast Cancer Data
set for building the decision tree and apply this knowledge to classify a new sample.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
data = load_breast_cancer()
X = data.data
y = data.target
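# The manual's listing jumps from loading the data to plotting the tree;
# below is a minimal sketch of the missing training and new-sample steps
# (the hold-out split size and random_state are assumptions)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# Classify a "new" sample; here the first test instance stands in for one
new_sample = X_test[0].reshape(1, -1)
print("Predicted class for new sample:", data.target_names[clf.predict(new_sample)[0]])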
plt.figure(figsize=(12,8))
tree.plot_tree(clf, filled=True, feature_names=data.feature_names, class_names=data.target_names)
plt.title("Decision Tree - Breast Cancer Dataset")
plt.show()
Output:
Program 9
Develop a program to implement the Naive Bayesian classifier considering Olivetti Face Data set for
training. Compute the accuracy of the classifier, considering a few test data sets.
import numpy as np
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
data = fetch_olivetti_faces(shuffle=True, random_state=42)
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=1))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
cross_val_accuracy = cross_val_score(gnb, X, y, cv=5, scoring='accuracy')
print(f'\nCross-validation accuracy: {cross_val_accuracy.mean() * 100:.2f}%')
fig, axes = plt.subplots(3, 5, figsize=(12, 8))
for ax, image, label, prediction in zip(axes.ravel(), X_test, y_test, y_pred):
ax.imshow(image.reshape(64, 64), cmap=plt.cm.gray)
ax.set_title(f"True: {label}, Pred: {prediction}")
ax.axis('off')
plt.show()
Output:
Accuracy: 80.83%
Classification Report:
Program 10
Develop a program to implement k-means clustering using the Wisconsin Breast Cancer data set and
visualize the clustering result.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report
data = load_breast_cancer()
X = data.data
y = data.target
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Cluster into two groups (malignant / benign)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
y_kmeans = kmeans.fit_predict(X_scaled)
print("Confusion Matrix:")
print(confusion_matrix(y, y_kmeans))
print("\nClassification Report:")
print(classification_report(y, y_kmeans))
# Project to 2-D with PCA for visualisation
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
df['Cluster'] = y_kmeans
df['True Label'] = y
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1', s=100, edgecolor='black', alpha=0.7)
plt.title('K-Means Clustering of Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Cluster")
plt.show()
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='True Label', palette='coolwarm', s=100, edgecolor='black', alpha=0.7)
plt.title('True Labels of Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="True Label")
plt.show()
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1', s=100, edgecolor='black', alpha=0.7)
centers = pca.transform(kmeans.cluster_centers_)
plt.scatter(centers[:, 0], centers[:, 1], s=200, c='red', marker='X', label='Centroids')
plt.title('K-Means Clustering with Centroids')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Cluster")
plt.show()
Output: