ML Observation
Output:
       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85
...       ...       ...       ...        ...         ...       ...       ...
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37
AveBedrms:
Lower Bound: 0.8659085155701288, Upper Bound: 1.2396965968190603
Number of outliers: 1424
---
Population:
Lower Bound: -620.0, Upper Bound: 3132.0
Number of outliers: 1196
---
AveOccup:
Lower Bound: 1.1509614824735064, Upper Bound: 4.5610405893536905
Number of outliers: 711
---
Latitude:
Lower Bound: 28.259999999999998, Upper Bound: 43.38
Number of outliers: 0
---
Longitude:
Lower Bound: -127.48499999999999, Upper Bound: -112.32500000000002
Number of outliers: 0
---
MedHouseVal:
Lower Bound: -0.9808749999999995, Upper Bound: 4.824124999999999
Number of outliers: 1071
---
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
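The code that produced the bounds above is missing from the extract; a minimal sketch, assuming the standard 1.5*IQR fences (the printout above would then be the tail of this loop's output), is:

housing = fetch_california_housing(as_frame=True)
df = housing.frame  # includes the MedHouseVal target column
print(df)

# IQR rule: flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
for col in df.columns:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    outliers = df[(df[col] < lower) | (df[col] > upper)]
    print(f"{col}:")
    print(f"Lower Bound: {lower}, Upper Bound: {upper}")
    print(f"Number of outliers: {len(outliers)}")
    print("---")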
from sklearn.datasets import load_iris

# Load the Iris dataset and convert it to a DataFrame
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['Target'] = iris.target
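The plotting code below expects an iris_pca_df with named component columns; that projection step is missing from the extract, so here is a minimal sketch of it:

from sklearn.decomposition import PCA

# Project the four Iris features onto the first two principal components
pca = PCA(n_components=2)
components = pca.fit_transform(df.drop(columns='Target'))
iris_pca_df = pd.DataFrame(components, columns=["Principal Component 1",
                                                "Principal Component 2"])
iris_pca_df["Target"] = df["Target"]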
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x="Principal Component 1", y="Principal Component 2", hue="Target",
    data=iris_pca_df,
    palette="viridis", s=100, alpha=0.8
)
plt.title("PCA of Iris Dataset")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Target", labels=iris.target_names)
plt.grid(alpha=0.5)
plt.show()
Output: a scatter plot of the Iris samples in the space of the first two principal components, coloured by target class.
def find_s(attributes, labels):
    hypothesis = None  # Initialized from the first positive example
    for i in range(len(labels)):
        if labels[i] == 'Yes':  # Only process positive examples
            if hypothesis is None:
                hypothesis = list(attributes[i])
            else:
                for j in range(len(hypothesis)):
                    if hypothesis[j] != attributes[i][j]:
                        hypothesis[j] = '?'  # Generalize
    return hypothesis
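A usage sketch with the classic EnjoySport examples (an assumption: the extract's actual training data is not shown, and it evidently differs, since this particular set would keep 'Strong' at position 3 rather than generalizing it to '?'):

attributes = [
    ['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same'],
    ['Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same'],
    ['Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change'],
    ['Sunny', 'Warm', 'High', 'Strong', 'Cool', 'Change'],
]
labels = ['Yes', 'Yes', 'No', 'Yes']
print("Implementing Find-S algorithm...")
print("Final Hypothesis:", find_s(attributes, labels))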
Output:
Implementing Find-S algorithm...
Final Hypothesis: ['Sunny', 'Warm', '?', '?', '?', '?']
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
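The data for this exercise is not in the extract; a plausible setup (an assumption: 100 uniform random points in [0, 1], with the first 50 labelled by thresholding at 0.5) is:

np.random.seed(42)          # for reproducibility (assumed; not in the extract)
x = np.random.rand(100, 1)  # 100 random points in [0, 1]
labels = np.where(x[:50, 0] <= 0.5, 'Class1', 'Class2')  # labels for the first 50
k_values = [1, 2, 3, 4, 5, 20, 30]
classified_labels = {}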
print(x[:5])
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x[:50], labels)  # Train using first 50 points
    classified_labels[k] = knn.predict(x[50:])  # Classify remaining 50 points
Output: the first five generated points, followed by the classification results for each value of k.
import numpy as np
import matplotlib.pyplot as plt
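The locally_weighted_regression helper and the training data are missing from the extract; a minimal sketch, assuming a Gaussian kernel and a noisy sine curve as data:

def locally_weighted_regression(x_train, y_train, xq, tau):
    # Gaussian kernel weights centred on the query point xq
    w = np.exp(-(x_train - xq) ** 2 / (2 * tau ** 2))
    X = np.c_[np.ones_like(x_train), x_train]  # design matrix with bias column
    W = np.diag(w)
    # Closed-form weighted least squares: theta = (X^T W X)^+ X^T W y
    theta = np.linalg.pinv(X.T @ W @ X) @ (X.T @ W @ y_train)
    return np.array([1.0, xq]) @ theta

# Assumed data: a noisy sine curve
np.random.seed(0)
x_train = np.linspace(0, 2 * np.pi, 100)
y_train = np.sin(x_train) + 0.1 * np.random.randn(100)
x_test = np.linspace(0, 2 * np.pi, 200)
tau_values = [0.1, 0.5, 1.0, 5.0]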
plt.figure(figsize=(12, 8))
for tau in tau_values:
    y_pred = np.array([locally_weighted_regression(x_train, y_train, xq, tau)
                       for xq in x_test])
    plt.plot(x_test, y_pred, label=f'tau={tau}')
plt.legend()
plt.show()
Output: a plot of the locally weighted regression fits, one curve per tau value.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
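The dataset loading and split are not in the extract. The axis labels below suggest the Boston housing data's RM feature; since recent scikit-learn releases no longer ship that dataset, this sketch substitutes the California housing AveRooms column (an assumption):

from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing(as_frame=True)
X = housing.frame[['AveRooms']].values  # stand-in for Boston's RM feature
y = housing.frame['MedHouseVal'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)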
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)
# Plot results
plt.figure(figsize=(10, 5))
plt.scatter(X_test, y_test, color='blue', label='Actual')
plt.plot(X_test, y_pred, color='red', linewidth=2, label='Predicted')
plt.xlabel('Average Number of Rooms (RM)')
plt.ylabel('Housing Price')
plt.legend()
plt.show()
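The polynomial block below switches to the Auto MPG data (Horsepower vs MPG); neither the data loading nor the fitting step is in the extract. A sketch, assuming seaborn's bundled 'mpg' table as the source and a degree-2 pipeline:

import seaborn as sns  # used here only to fetch the Auto MPG table (assumed source)

mpg = sns.load_dataset('mpg').dropna(subset=['horsepower', 'mpg'])
X = mpg[['horsepower']].values
y = mpg['mpg'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Degree-2 polynomial regression (the degree is an assumption)
poly_model = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(),
                           LinearRegression())
poly_model.fit(X_train, y_train)
y_poly_pred = poly_model.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_poly_pred))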
# Plot results
X_test_sorted, y_poly_pred_sorted = zip(*sorted(zip(X_test.flatten(), y_poly_pred)))
plt.figure(figsize=(10, 5))
plt.scatter(X_test, y_test, color='blue', label='Actual')
plt.plot(X_test_sorted, y_poly_pred_sorted, color='red', linewidth=2,
         label='Predicted')
plt.xlabel('Horsepower')
plt.ylabel('MPG')
plt.title('Polynomial Regression on Auto MPG Dataset')
plt.legend()
plt.show()
Output: scatter plots of the test data with the fitted linear and polynomial regression curves.
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from collections import Counter
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names
target_names = data.target_names
def calculate_entropy(labels):
    total = len(labels)
    counts = Counter(labels)
    entropy = 0.0
    for count in counts.values():
        p = count / total
        entropy -= p * np.log2(p)
    return entropy
entropy_dataset = calculate_entropy(y)
print(f"\nOverall Entropy of Target (Malignant vs Benign):
{entropy_dataset:.4f}")
# Information gain of a median split, computed for every feature
for idx, feature in enumerate(feature_names):
    feature_values = X[:, idx]
    median_value = np.median(feature_values)
    # Split dataset at the median
    left_mask = feature_values <= median_value
    right_mask = feature_values > median_value
    y_left = y[left_mask]
    y_right = y[right_mask]
    entropy_left = calculate_entropy(y_left)
    entropy_right = calculate_entropy(y_right)
    # Weighted child entropy subtracted from the parent entropy
    weighted_entropy = (len(y_left) * entropy_left +
                        len(y_right) * entropy_right) / len(y)
    info_gain = entropy_dataset - weighted_entropy
    print(f"{feature}: IG = {info_gain:.4f}")
y_pred = clf.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
plt.figure(figsize=(20, 10))
plot_tree(clf, feature_names=feature_names, class_names=target_names,
          filled=True, rounded=True)
plt.title("Decision Tree Visualization for Breast Cancer Dataset")
plt.show()
# new_sample is not defined in the extract; a stand-in is the first test row
new_sample = X_test[[0]]
prediction = clf.predict(new_sample)
print("\nPrediction for new sample:")
print("Class:", target_names[prediction[0]])
Output:
Feature names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity' 'mean concave points'
 'mean symmetry' 'mean fractal dimension' 'radius error' 'texture error'
 'perimeter error' 'area error' 'smoothness error' 'compactness error'
 'concavity error' 'concave points error' 'symmetry error'
 'fractal dimension error' 'worst radius' 'worst texture' 'worst perimeter'
 'worst area' 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Target names: ['malignant' 'benign']
Classification Report:
precision recall f1-score support
Accuracy: 0.956140350877193
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
faces = fetch_olivetti_faces()
X = faces.data # Flattened images: 400 x 4096
y = faces.target # Labels: 0 to 39 (40 classes)
images = faces.images # Original image shapes: 64 x 64
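The train/test split is not shown; the printed accuracy below works out to 98/120 correct, which suggests a 120-image test set, i.e. a 30% split (an assumption):

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)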
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy)
Output:
Classification Report:
precision recall f1-score support
Accuracy: 0.8166666666666667
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names
target_names = data.target_names
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
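The clustering call is missing from the extract; a sketch assuming two clusters fit on the standardized features:

kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)
# Cluster IDs are arbitrary; the remap below aligns them with the target encoding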
labels_mapped = np.where(clusters == 1, 0, 1)
print("\nConfusion Matrix:")
print(confusion_matrix(y, labels_mapped))
print("Accuracy:", accuracy_score(y, labels_mapped))
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
plt.figure(figsize=(10, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', alpha=0.6)
# Project the centroids into the same 2-D PCA space before plotting
# (cluster_centers_ live in the 30-dimensional scaled feature space)
centers_pca = pca.transform(kmeans.cluster_centers_)
plt.scatter(centers_pca[:, 0], centers_pca[:, 1],
            s=250, marker='X', c='red', label='Centroids')
plt.title("K-Means Clustering of Breast Cancer Dataset (PCA-2D)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.grid(True)
plt.show()
Output:
Confusion Matrix:
[[176  36]
 [ 18 339]]
Accuracy: 0.9050966608084359