Supervised_classi_&_regression

This document is a practical guide to data preprocessing and modeling in Python using pandas and scikit-learn. It covers handling missing values, scaling features, encoding categorical variables, and visualizing data, then walks through machine learning models such as Linear Regression, Logistic Regression, Decision Trees, and ensemble methods, together with performance metrics for evaluating model accuracy and effectiveness.


December 22, 2024

[ ]: # Numerical columns
df['col'] = df['col'].fillna(df['col'].mean())    # Replace with mean
df['col'] = df['col'].fillna(df['col'].median())  # Replace with median

[ ]: # Categorical columns
df['col'] = df['col'].fillna(df['col'].mode()[0]) # Replace with mode

[ ]: from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)     # Impute each missing value from the 5 nearest rows
df_imputed = imputer.fit_transform(df)  # Returns a NumPy array, not a DataFrame
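fit_transform returns a NumPy array; a minimal sketch to restore the DataFrame structure, assuming df's original columns and index still apply:

[ ]: import pandas as pd

# Rebuild a DataFrame from the imputed array (assumes every column of df is numeric)
df_imputed = pd.DataFrame(df_imputed, columns=df.columns, index=df.index)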

[ ]: df['col'] = df['col'].ffill()  # Forward fill (fillna(method='ffill') is deprecated)
df['col'] = df['col'].bfill()  # Backward fill

[ ]: from sklearn.experimental import enable_iterative_imputer  # noqa: F401 — required before importing IterativeImputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer()  # Models each feature with missing values as a function of the others
df_imputed = imputer.fit_transform(df)

[ ]: from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df)

[ ]: from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

[ ]: from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
df_scaled = scaler.fit_transform(df)

[ ]: from sklearn.preprocessing import MaxAbsScaler

scaler = MaxAbsScaler()
df_scaled = scaler.fit_transform(df)
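When scaling ahead of modeling, fit the scaler on the training split only and reuse it on the test split to avoid leakage; a minimal sketch, assuming the X_train/X_test split created in the modeling cells below:

[ ]: from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit on training data only
X_test_scaled = scaler.transform(X_test)        # Reuse the fitted scaler on test data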

[ ]: from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['col'] = encoder.fit_transform(df['col'])  # Intended for targets; prefer OrdinalEncoder for features

[ ]: df_encoded = pd.get_dummies(df, columns=['col'], drop_first=True)

[ ]: mean_encoding = df.groupby('col')['target'].mean()  # Target (mean) encoding
df['col'] = df['col'].map(mean_encoding)  # Beware of target leakage; compute the means on the training split only

[ ]: freq_encoding = df['col'].value_counts()
df['col'] = df['col'].map(freq_encoding)

[ ]: import matplotlib.pyplot as plt

df['col'].hist(bins=20)
plt.title('Histogram')
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.show()

[ ]: import seaborn as sns

sns.boxplot(data=df, x='col')  # numeric columns only
plt.title('Boxplot')
plt.show()

[ ]: plt.scatter(df['col1'], df['col2'])  # e.g. x = PC1, y = PC2
plt.title('Scatter Plot')
plt.xlabel('col1')
plt.ylabel('col2')
plt.show()

[ ]: sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')  # numeric columns only
plt.title('Heatmap')
plt.show()
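The modeling cells below assume X_train, X_test, y_train, and y_test already exist; a minimal sketch of the split, where 'target' stands in for the actual label column and test_size=0.2 is an assumed choice:

[ ]: from sklearn.model_selection import train_test_split

X = df.drop(columns=['target'])  # 'target' is a placeholder label column
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)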

[ ]: import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics
print("R-squared:", r2_score(y_test, y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", np.sqrt(mean_squared_error(y_test, y_pred)))

[ ]: from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

model = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_prob))
# classification_report and confusion_matrix work the same way
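The comment above notes that classification_report and confusion_matrix work the same way; a minimal sketch reusing y_test and y_pred from the cell above:

[ ]: from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred))       # Rows are true classes, columns are predictions
print(classification_report(y_test, y_pred))  # Per-class precision, recall, F1, support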

[ ]: from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Other useful hyperparameters: min_samples_split, min_samples_leaf
model = DecisionTreeClassifier(max_depth=5, criterion='gini', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(model.feature_importances_)

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))
# classification_report and confusion_matrix work the same way

[ ]: from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(n_estimators=100, max_depth=None, oob_score=True, random_state=42)
model.fit(X_train, y_train)

print(model.feature_importances_)
print(model.oob_score_)  # Out-of-bag accuracy estimate

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

[ ]: from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Log Loss:", log_loss(y_test, y_pred_prob))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_prob))
# classification_report and confusion_matrix work the same way

plot_importance(model)
plt.show()

[ ]: from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, log_loss

model = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Log Loss:", log_loss(y_test, y_pred_prob))

[ ]: from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)  # p=2 gives Euclidean distance
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))
