ML Lab

The document outlines several programming tasks involving data analysis and machine learning using Python. It includes programs for statistical analysis, correlation analysis, PCA, k-NN classification, Locally Weighted Regression, Linear and Polynomial Regression, Decision Tree classification, Naive Bayes classification, and k-means clustering. Each program demonstrates loading datasets, performing computations, visualizing results, and evaluating model performance.

Program 1

Develop a program to load a dataset and select one numerical column. Compute the mean, median, mode, standard deviation, variance, and range for that column. Generate a histogram and boxplot to understand the distribution of the data. Identify any outliers using the IQR method. Then select a categorical variable, compute the frequency of each category, and display it as a bar chart or pie chart.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset (Modify the file path or URL as needed)


file_path = "your_dataset.csv" # Update with the actual dataset path
df = pd.read_csv(file_path)

# Display first few rows


print("First 5 rows of the dataset:")
print(df.head())

# Select a numerical column


num_col = "your_numerical_column" # Replace with actual numerical column name
if num_col not in df.columns:
    raise ValueError(f"Column '{num_col}' not found in dataset")

# Compute statistics
mean_value = df[num_col].mean()
median_value = df[num_col].median()
mode_value = df[num_col].mode()[0] # Mode might return multiple values
std_dev = df[num_col].std()
variance = df[num_col].var()
data_range = df[num_col].max() - df[num_col].min()

# Print statistics
print("\nStatistical Measures for", num_col)
print(f"Mean: {mean_value}")
print(f"Median: {median_value}")
print(f"Mode: {mode_value}")
print(f"Standard Deviation: {std_dev}")
print(f"Variance: {variance}")
print(f"Range: {data_range}")

# Plot Histogram
plt.figure(figsize=(6, 4))
sns.histplot(df[num_col], bins=20, kde=True)
plt.title(f"Histogram of {num_col}")
plt.xlabel(num_col)
plt.ylabel("Frequency")
plt.show()
# Plot Boxplot
plt.figure(figsize=(6, 4))
sns.boxplot(x=df[num_col])
plt.title(f"Boxplot of {num_col}")
plt.show()

# Detect Outliers using IQR


Q1 = df[num_col].quantile(0.25)
Q3 = df[num_col].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR


upper_bound = Q3 + 1.5 * IQR

outliers = df[(df[num_col] < lower_bound) | (df[num_col] > upper_bound)]


print(f"\nNumber of Outliers in {num_col}: {len(outliers)}")
print(outliers)

# Select a categorical column


cat_col = "your_categorical_column" # Replace with actual categorical column name
if cat_col not in df.columns:
    raise ValueError(f"Column '{cat_col}' not found in dataset")

# Compute category frequency


category_counts = df[cat_col].value_counts()

# Plot Bar Chart


plt.figure(figsize=(6, 4))
sns.barplot(x=category_counts.index, y=category_counts.values)
plt.title(f"Bar Chart of {cat_col}")
plt.xlabel(cat_col)
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.show()

# Plot Pie Chart


plt.figure(figsize=(6, 4))
plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', startangle=140)
plt.title(f"Pie Chart of {cat_col}")
plt.show()
Program 2

Develop a program to Load a dataset with at least two numerical columns (e.g., Iris, Titanic).
Plot a scatter plot of two variables and calculate their Pearson correlation coefficient. Write a
program to compute the covariance and correlation matrix for a dataset. Visualize the
correlation matrix using a heatmap to know which variables have strong positive/negative
correlations.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset (Update file path or dataset)


file_path = "your_dataset.csv" # Replace with actual dataset path
df = pd.read_csv(file_path)

# Display first few rows


print("First 5 rows of the dataset:")
print(df.head())

# Select two numerical columns for scatter plot and correlation


num_col1 = "your_numerical_column1" # Replace with actual column name
num_col2 = "your_numerical_column2" # Replace with actual column name

if num_col1 not in df.columns or num_col2 not in df.columns:
    raise ValueError(f"Columns '{num_col1}' or '{num_col2}' not found in dataset")

# Scatter plot
plt.figure(figsize=(6, 4))
sns.scatterplot(x=df[num_col1], y=df[num_col2])
plt.title(f"Scatter Plot: {num_col1} vs {num_col2}")
plt.xlabel(num_col1)
plt.ylabel(num_col2)
plt.show()

# Compute Pearson correlation coefficient


pearson_corr = df[num_col1].corr(df[num_col2])
print(f"\nPearson Correlation Coefficient between {num_col1} and {num_col2}:
{pearson_corr:.4f}")

# Compute Covariance Matrix


cov_matrix = df[[num_col1, num_col2]].cov()
print("\nCovariance Matrix:")
print(cov_matrix)

# Compute Correlation Matrix


corr_matrix = df.corr(numeric_only=True)  # numeric_only avoids errors from non-numeric columns
print("\nCorrelation Matrix:")
print(corr_matrix)
# Heatmap of Correlation Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix Heatmap")
plt.show()
Program 3

Develop a program to implement Principal Component Analysis (PCA) for reducing the
dimensionality of the Iris dataset from 4 features to 2.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris

# Load the Iris dataset


iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = iris.target

# Standardize the data (PCA is affected by scale)


scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.iloc[:, :-1]) # Exclude the species column

# Apply PCA to reduce to 2 dimensions


pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

# Create a new DataFrame with PCA components


pca_df = pd.DataFrame(pca_data, columns=['PC1', 'PC2'])
pca_df['species'] = df['species']

# Scatter plot of PCA results


plt.figure(figsize=(8, 6))
sns.scatterplot(x=pca_df['PC1'], y=pca_df['PC2'], hue=pca_df['species'], palette='coolwarm', alpha=0.7)
plt.title('PCA of Iris Dataset (4D → 2D)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Species', labels=iris.target_names)
plt.show()

# Explained variance ratio


explained_variance = pca.explained_variance_ratio_
print(f"Explained Variance by PC1: {explained_variance[0]:.4f}")
print(f"Explained Variance by PC2: {explained_variance[1]:.4f}")
print(f"Total Variance Explained: {sum(explained_variance):.4f}")
Program 4

Develop a program to load the Iris dataset. Implement the k-Nearest Neighbors (k-NN) algorithm for classifying flowers based on their features. Split the dataset into training and testing sets and evaluate the model using metrics like accuracy and F1-score. Test it for different values of k (e.g., k = 1, 3, 5) and evaluate the accuracy. Extend the k-NN algorithm to assign weights based on the distance of neighbors (e.g., weight = 1/d²). Compare the performance of weighted k-NN and regular k-NN on a synthetic or real-world dataset.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

# Load the Iris dataset


iris = load_iris()
X = iris.data # Features
y = iris.target # Labels

# Split dataset into 80% training and 20% testing


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the dataset (important for distance-based models)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Function to evaluate k-NN with different k values


def evaluate_knn(k_values, weighted=False):
    results = []
    for k in k_values:
        if weighted:
            # Weighted k-NN: weight each neighbor by the inverse squared distance
            knn = KNeighborsClassifier(n_neighbors=k, weights=lambda d: 1 / (d**2 + 1e-5))
        else:
            # Regular k-NN: all neighbors weighted equally
            knn = KNeighborsClassifier(n_neighbors=k, weights="uniform")

        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        results.append((k, accuracy, f1))
    return results

# Test different k values


k_values = [1, 3, 5]
knn_results = evaluate_knn(k_values, weighted=False)
weighted_knn_results = evaluate_knn(k_values, weighted=True)

# Convert results to DataFrame


knn_df = pd.DataFrame(knn_results, columns=['k', 'Accuracy', 'F1-Score'])
weighted_knn_df = pd.DataFrame(weighted_knn_results, columns=['k', 'Accuracy', 'F1-Score'])

# Print results
print("\nRegular k-NN Performance:")
print(knn_df)
print("\nWeighted k-NN Performance:")
print(weighted_knn_df)

# Plot comparison
plt.figure(figsize=(8, 5))
plt.plot(knn_df['k'], knn_df['Accuracy'], marker='o', label='Regular k-NN')
plt.plot(weighted_knn_df['k'], weighted_knn_df['Accuracy'], marker='s', linestyle='dashed', label='Weighted k-NN')
plt.xlabel("k (Number of Neighbors)")
plt.ylabel("Accuracy")
plt.title("k-NN vs. Weighted k-NN Performance")
plt.legend()
plt.show()
Program 5

Implement the non-parametric Locally Weighted Regression algorithm in order to fit data points. Select an appropriate data set for your experiment and draw graphs.
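For each query point x_q, the program below weights every training point with the Gaussian kernel w_i = exp(-(x_i - x_q)² / (2τ²)) and solves the weighted least-squares normal equation θ = (XᵀWX)⁻¹XᵀWy, so the prediction follows the local trend of the data. The bandwidth τ controls how local the fit is: smaller τ hugs the data more closely, larger τ approaches ordinary linear regression.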

import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist

# Generate synthetic dataset (Non-linear function)


np.random.seed(42)
X = np.linspace(-3, 3, 100)
y = np.sin(X) + np.random.normal(scale=0.1, size=len(X)) # True function + noise

# Reshape for matrix operations


X = X.reshape(-1, 1)

# Gaussian Kernel for Weights


def get_weights(X_train, x_query, tau):
    distances = cdist(X_train, x_query.reshape(1, -1), metric='euclidean')
    weights = np.exp(-(distances**2) / (2 * tau**2))  # Gaussian kernel
    return np.diag(weights.flatten())

# Locally Weighted Regression function


def locally_weighted_regression(X_train, y_train, x_query, tau):
    W = get_weights(X_train, x_query, tau)
    X_bias = np.c_[np.ones(X_train.shape[0]), X_train]  # Add bias term
    theta = np.linalg.pinv(X_bias.T @ W @ X_bias) @ X_bias.T @ W @ y_train
    return np.r_[1, x_query] @ theta  # Prediction for x_query

# Fit LWR on the dataset for multiple query points


tau_values = [0.1, 0.5, 1.0] # Different bandwidth values
plt.figure(figsize=(10, 6))

for tau in tau_values:
    y_pred = np.array([locally_weighted_regression(X, y, x, tau) for x in X])
    plt.plot(X, y_pred, label=f"LWR (τ={tau})")

# Plot original data


plt.scatter(X, y, color='black', label='Data Points', alpha=0.6)
plt.title("Locally Weighted Regression (LWR) for Different τ")
plt.xlabel("X")
plt.ylabel("y")
plt.legend()
plt.show()
Program 6

Develop a program to demonstrate the working of Linear Regression and Polynomial Regression. Use the Boston Housing Dataset for Linear Regression and the Auto MPG Dataset (for vehicle fuel efficiency prediction) for Polynomial Regression.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the Boston Housing dataset


boston = fetch_openml(name='boston', version=1, as_frame=True)
df = boston.frame

# Display dataset information


print(df.info())
print(df.describe())

# Define features and target variable


X = df.drop(columns='MEDV').astype(float)  # Cast to numeric: some OpenML columns load as categorical
y = df['MEDV'].astype(float)

# Split the data into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model


lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions on the test set


y_pred = lr_model.predict(X_test)

# Evaluate the model


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

# Plotting Actual vs Predicted values


plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7, color='b')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Actual vs Predicted MEDV')
plt.show()
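The code above covers only the Linear Regression half of the task. A minimal sketch of the Polynomial Regression half on the Auto MPG dataset follows; it assumes the data can be loaded from seaborn's sample-data repository via sns.load_dataset("mpg") and uses horsepower as the single predictor, both illustrative choices that can be swapped for a local copy of the dataset or other features.

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Load the Auto MPG dataset (seaborn's bundled copy) and drop rows with missing values
mpg = sns.load_dataset("mpg").dropna(subset=["horsepower", "mpg"])
X_mpg = mpg[["horsepower"]].values
y_mpg = mpg["mpg"].values

X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_mpg, y_mpg, test_size=0.2, random_state=42)

# Polynomial Regression: degree-2 feature expansion followed by ordinary least squares
poly_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_model.fit(X_train_m, y_train_m)
y_pred_m = poly_model.predict(X_test_m)

print(f"Polynomial Regression MSE: {mean_squared_error(y_test_m, y_pred_m):.2f}")
print(f"Polynomial Regression R^2: {r2_score(y_test_m, y_pred_m):.2f}")

# Plot the fitted curve over the raw data
x_line = np.linspace(X_mpg.min(), X_mpg.max(), 200).reshape(-1, 1)
plt.figure(figsize=(10, 6))
plt.scatter(X_mpg, y_mpg, alpha=0.5, label="Data")
plt.plot(x_line, poly_model.predict(x_line), color="r", label="Degree-2 fit")
plt.xlabel("Horsepower")
plt.ylabel("MPG")
plt.title("Polynomial Regression on Auto MPG (sketch)")
plt.legend()
plt.show()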
Program 7

Develop a program to load the Titanic dataset. Split the data into training and test sets. Train
a decision tree classifier. Visualize the tree structure. Evaluate accuracy, precision, recall, and
F1-score.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Load Titanic dataset


url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Display dataset info


print(df.info())

# Select relevant features & preprocess data


df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]  # Relevant columns
df.dropna(inplace=True) # Drop rows with missing values

# Encode categorical variables


df['Sex'] = LabelEncoder().fit_transform(df['Sex'])  # Convert 'Sex' to 0 (female) & 1 (male)
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])  # Encode 'Embarked' categories

# Define features & target variable


X = df.drop(columns='Survived')
y = df['Survived']

# Split into training (80%) and testing (20%) sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Decision Tree model


dt_model = DecisionTreeClassifier(max_depth=4, random_state=42)
dt_model.fit(X_train, y_train)

# Predictions
y_pred = dt_model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Visualize the Decision Tree


plt.figure(figsize=(15, 8))
plot_tree(dt_model, feature_names=X.columns, class_names=['Died', 'Survived'], filled=True)
plt.title("Decision Tree for Titanic Survival Prediction")
plt.show()
Program 8

Develop a program to implement the Naive Bayes classifier using the Iris dataset for training. Compute the accuracy of the classifier on the test data.

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the Iris dataset


iris = datasets.load_iris()
X = iris.data # Features
y = iris.target # Labels

# Split into training (80%) and testing (20%) sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Naïve Bayes model


nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Predict test data


y_pred = nb_model.predict(X_test)

# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Display classification report


print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, cmap='Blues', xticklabels=iris.target_names, yticklabels=iris.target_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()
Program 9

Develop a program to implement k-means clustering using the Wisconsin Breast Cancer data set and visualize the clustering result.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the Wisconsin Breast Cancer dataset


cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)

# Standardize the dataset (important for K-Means)


scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Apply K-Means clustering (2 clusters since we have benign & malignant)


kmeans = KMeans(n_clusters=2, random_state=42)
df['Cluster'] = kmeans.fit_predict(df_scaled)

# Visualize clusters using PCA (reduce to 2D)


pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_scaled)
df['PCA1'] = df_pca[:, 0]
df['PCA2'] = df_pca[:, 1]

plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['PCA1'], y=df['PCA2'], hue=df['Cluster'], palette='coolwarm', alpha=0.7)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('K-Means Clustering on Breast Cancer Dataset')
plt.legend(title="Cluster")
plt.show()

# Compare with actual labels


print(pd.crosstab(cancer.target, df['Cluster'], rownames=['Actual'], colnames=['Cluster']))
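Because k-means numbers its clusters arbitrarily (cluster 0 is not guaranteed to correspond to benign), the crosstab above is easiest to read alongside a label-invariant score. The short optional check below uses scikit-learn's adjusted_rand_score on the same cluster assignments.

from sklearn.metrics import adjusted_rand_score

# Adjusted Rand Index is invariant to how the clusters happen to be numbered
ari = adjusted_rand_score(cancer.target, df['Cluster'])
print(f"Adjusted Rand Index vs. true diagnosis: {ari:.4f}")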
