Pre-Processing techniques.ipynb - Colab
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from scipy import stats
# Sample dataset
data = {
'A': [10, 20, 30, np.nan, 50, 60, 70, 800], # Outlier at 800, missing value at index 3
'B': [5, 15, np.nan, 25, 35, 45, 55, 65], # Missing value at index 2
'C': [1, 2, 3, 4, 5, 6, 7, 8], # Continuous data
'Target': [0, 1, 0, 1, 0, 1, 0, 1] # Target variable (classification)
}
df = pd.DataFrame(data)
print("Original Dataset:\n", df)
# 1. Attribute Selection
X = df.drop(columns=['Target']) # Features
y = df['Target']
selector = SelectKBest(score_func=f_classif, k=2) # Select top 2 best features
X_new = selector.fit_transform(X.fillna(X.mean()), y)
selected_features = X.columns[selector.get_support()]
print("\nSelected Features:", selected_features)
# 2. Handling Missing Values (assumed step: mean imputation, producing the df_imputed used below)
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
print("\nDataset after imputation:\n", df_imputed)

# 3. Discretization
discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
df_imputed['C_binned'] = discretizer.fit_transform(df_imputed[['C']])
print("\nDataset after discretization:\n", df_imputed[['C', 'C_binned']])
Original Dataset:
A B C Target
0 10.0 5.0 1 0
1 20.0 15.0 2 1
2 30.0 NaN 3 0
3 NaN 25.0 4 1
4 50.0 35.0 5 0
5 60.0 45.0 6 1
6 70.0 55.0 7 0
7 800.0 65.0 8 1
https://colab.research.google.com/drive/1TNr6rVAg-_e7072NwFMWAJIkZ3ZuNDCE#scrollTo=4tijQ9eTbVw-&printMode=true 1/3
2/13/25, 10:11 AM Pre-Processing techniques.ipynb - Colab
Dataset after imputation (earlier rows cut off in the printout):
            A     B    C  Target
4   50.000000  35.0  5.0     0.0
5   60.000000  45.0  6.0     1.0
6   70.000000  55.0  7.0     0.0
7  800.000000  65.0  8.0     1.0
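scipy.stats is imported above but never used; a natural companion step, given the outlier flagged at 800 in column A, is a z-score filter. The sketch below is illustrative only: it assumes the df_imputed from the cell above and uses a cutoff of 2, since with only eight rows even the 800 stays below the conventional |z| = 3.

# Illustrative z-score outlier filter on column A (cutoff of 2 is an assumption for this tiny sample)
z_scores = np.abs(stats.zscore(df_imputed['A']))
df_no_outliers = df_imputed[z_scores < 2]
print("\nDataset after outlier removal:\n", df_no_outliers)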
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from scipy import stats
# Load the Iris dataset (produces the df used below)
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target
df.loc[2, 'sepal width (cm)'] = np.nan  # assumed: how the NaN shown at row 2 of the printed output was introduced
print("Original Dataset:\n", df.head())

# 1. Attribute Selection
X = df.drop(columns=['target']) # Features
y = df['target']
selector = SelectKBest(score_func=f_classif, k=2) # Select top 2 best features
X_new = selector.fit_transform(X.fillna(X.mean()), y)
selected_features = X.columns[selector.get_support()]
print("\nSelected Features:", selected_features)
# 2. Handling Missing Values (assumed step: mean imputation, producing the df_imputed used below)
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# 3. Discretization
discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
df_imputed['sepal length (cm)_binned'] = discretizer.fit_transform(df_imputed[['sepal length (cm)']])
print("\nDataset after discretization:\n", df_imputed[['sepal length (cm)', 'sepal length (cm)_binned']].head())
Original Dataset:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 NaN 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
target
0 0
https://colab.research.google.com/drive/1TNr6rVAg-_e7072NwFMWAJIkZ3ZuNDCE#scrollTo=4tijQ9eTbVw-&printMode=true 2/3
2/13/25, 10:11 AM Pre-Processing techniques.ipynb - Colab
1 0
2 0
3 0
4 0
https://colab.research.google.com/drive/1TNr6rVAg-_e7072NwFMWAJIkZ3ZuNDCE#scrollTo=4tijQ9eTbVw-&printMode=true 3/3