Quality Prediction

# importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

# Loading the dataset


df = pd.read_csv('QualityPrediction.csv')
df

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.4             0.700         0.00             1.9      0.076
1               7.8             0.880         0.00             2.6      0.098
2               7.8             0.760         0.04             2.3      0.092
3              11.2             0.280         0.56             1.9      0.075
4               7.4             0.700         0.00             1.9      0.076
...             ...               ...          ...             ...        ...
1594            6.2             0.600         0.08             2.0      0.090
1595            5.9             0.550         0.10             2.2      0.062
1596            6.3             0.510         0.13             2.3      0.076
1597            5.9             0.645         0.12             2.0      0.075
1598            6.0             0.310         0.47             3.6      0.067

      free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                    11.0                  34.0  0.99780  3.51       0.56
1                    25.0                  67.0  0.99680  3.20       0.68
2                    15.0                  54.0  0.99700  3.26       0.65
3                    17.0                  60.0  0.99800  3.16       0.58
4                    11.0                  34.0  0.99780  3.51       0.56
...                   ...                   ...      ...   ...        ...
1594                 32.0                  44.0  0.99490  3.45       0.58
1595                 39.0                  51.0  0.99512  3.52       0.76
1596                 29.0                  40.0  0.99574  3.42       0.75
1597                 32.0                  44.0  0.99547  3.57       0.71
1598                 18.0                  42.0  0.99549  3.39       0.66

      alcohol  quality
0         9.4        5
1         9.8        5
2         9.8        5
3         9.8        6
4         9.4        5
...       ...      ...
1594     10.5        5
1595     11.2        6
1596     11.0        6
1597     10.2        5
1598     11.0        6

[1599 rows x 12 columns]

# Checking for null values


df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB

df.describe()

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000
mean        8.319637          0.527821     0.270976        2.538806
std         1.741096          0.179060     0.194801        1.409928
min         4.600000          0.120000     0.000000        0.900000
25%         7.100000          0.390000     0.090000        1.900000
50%         7.900000          0.520000     0.260000        2.200000
75%         9.200000          0.640000     0.420000        2.600000
max        15.900000          1.580000     1.000000       15.500000

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000
mean      0.087467            15.874922             46.467792     0.996747
std       0.047065            10.460157             32.895324     0.001887
min       0.012000             1.000000              6.000000     0.990070
25%       0.070000             7.000000             22.000000     0.995600
50%       0.079000            14.000000             38.000000     0.996750
75%       0.090000            21.000000             62.000000     0.997835
max       0.611000            72.000000            289.000000     1.003690

                pH    sulphates      alcohol      quality
count  1599.000000  1599.000000  1599.000000  1599.000000
mean      3.311113     0.658149    10.422983     5.636023
std       0.154386     0.169507     1.065668     0.807569
min       2.740000     0.330000     8.400000     3.000000
25%       3.210000     0.550000     9.500000     5.000000
50%       3.310000     0.620000    10.200000     6.000000
75%       3.400000     0.730000    11.100000     6.000000
max       4.010000     2.000000    14.900000     8.000000

df.head(5)

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076
1            7.8              0.88         0.00             2.6      0.098
2            7.8              0.76         0.04             2.3      0.092
3           11.2              0.28         0.56             1.9      0.075
4            7.4              0.70         0.00             1.9      0.076

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56
1                 25.0                  67.0   0.9968  3.20       0.68
2                 15.0                  54.0   0.9970  3.26       0.65
3                 17.0                  60.0   0.9980  3.16       0.58
4                 11.0                  34.0   0.9978  3.51       0.56

   alcohol  quality
0      9.4        5
1      9.8        5
2      9.8        5
3      9.8        6
4      9.4        5

Data Preprocessing
df['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64
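Quality scores 5 and 6 account for the bulk of the samples (1319 of 1599, about 82%), so the dataset is heavily imbalanced. An optional one-liner makes the proportions explicit:

# class shares instead of raw counts; scores 5 and 6 cover ~82% of wines
df['quality'].value_counts(normalize=True)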

sns.catplot(x='quality', data=df, kind='count')

<seaborn.axisgrid.FacetGrid at 0x152b9301c10>

plot = plt.figure(figsize=(5, 5))
sns.barplot(x='quality', y='volatile acidity', data=df)

<AxesSubplot:xlabel='quality', ylabel='volatile acidity'>

plot = plt.figure(figsize=(5, 5))
sns.barplot(x='quality', y='citric acid', data=df)

<AxesSubplot:xlabel='quality', ylabel='citric acid'>


plt.bar(df['quality'], df['alcohol'])
plt.xlabel('quality')
plt.ylabel('alcohol')
plt.show()
Exploratory Data Analysis
df['quality'] = df['quality'].apply(lambda x: 1 if x >= 7 else 0)
df.rename(columns={'quality': 'good quality'}, inplace=True)
df.head()

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076
1            7.8              0.88         0.00             2.6      0.098
2            7.8              0.76         0.04             2.3      0.092
3           11.2              0.28         0.56             1.9      0.075
4            7.4              0.70         0.00             1.9      0.076

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56
1                 25.0                  67.0   0.9968  3.20       0.68
2                 15.0                  54.0   0.9970  3.26       0.65
3                 17.0                  60.0   0.9980  3.16       0.58
4                 11.0                  34.0   0.9978  3.51       0.56

   alcohol  good quality
0      9.4             0
1      9.8             0
2      9.8             0
3      9.8             0
4      9.4             0
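The binarized label is itself very imbalanced: from the value counts above, only 199 + 18 = 217 wines score 7 or higher. A quick sanity check (optional, not part of the original run):

# expected split: 1382 wines labeled 0 (quality <= 6), 217 labeled 1 (quality >= 7)
df['good quality'].value_counts()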

plt.figure(figsize=(5,5))
sns.countplot(x='good quality', data=df)
plt.xlabel('good quality')
plt.ylabel('Count')
plt.title('Count of Good vs Bad Quality Wines')
plt.show()

plt.figure(figsize=(10,6))
sns.heatmap(df.corr(), annot=True)
plt.show()
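To read the heatmap more directly, the feature correlations with the target can be ranked. This is an optional extra step; for this dataset, alcohol usually comes out on top:

# correlation of each feature with the binary target, strongest first
df.corr()['good quality'].sort_values(ascending=False)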

fig, ax = plt.subplots(2, 4, figsize=(20, 20))
sns.scatterplot(x='fixed acidity', y='citric acid', hue='good quality', data=df, ax=ax[0, 0])
sns.scatterplot(x='volatile acidity', y='citric acid', hue='good quality', data=df, ax=ax[0, 1])
sns.scatterplot(x='free sulfur dioxide', y='total sulfur dioxide', hue='good quality', data=df, ax=ax[0, 2])
sns.scatterplot(x='fixed acidity', y='density', hue='good quality', data=df, ax=ax[0, 3])
sns.scatterplot(x='fixed acidity', y='pH', hue='good quality', data=df, ax=ax[1, 0])
sns.scatterplot(x='citric acid', y='pH', hue='good quality', data=df, ax=ax[1, 1])
sns.scatterplot(x='chlorides', y='sulphates', hue='good quality', data=df, ax=ax[1, 2])
sns.scatterplot(x='residual sugar', y='alcohol', hue='good quality', data=df, ax=ax[1, 3])

<AxesSubplot:xlabel='residual sugar', ylabel='alcohol'>

Train Test Split


X_train, X_test, y_train, y_test = train_test_split(
    df.drop('good quality', axis=1), df['good quality'],
    test_size=0.3, random_state=42)
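Since only about 14% of the wines are labeled good, an unstratified split can drift slightly from that ratio. A hedged variant of the same call with stratification (a sketch only; all results below come from the unstratified split above):

# stratify=y preserves the ~86/14 class ratio in both train and test sets
y = df['good quality']
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    df.drop('good quality', axis=1), y,
    test_size=0.3, random_state=42, stratify=y)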

X_train.head()

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
925             8.6              0.22         0.36             1.9      0.064
363            12.5              0.46         0.63             2.0      0.071
906             7.2              0.54         0.27             2.6      0.084
426             6.4              0.67         0.08             2.1      0.045
1251            7.5              0.58         0.14             2.2      0.077

      free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
925                  53.0                  77.0  0.99604  3.47       0.87
363                   6.0                  15.0  0.99880  2.99       0.87
906                  12.0                  78.0  0.99640  3.39       0.71
426                  19.0                  48.0  0.99490  3.49       0.49
1251                 27.0                  60.0  0.99630  3.28       0.59

      alcohol
925      11.0
363      10.2
906      11.0
426      11.4
1251      9.8

X_test.head()

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
803             7.7              0.56         0.08            2.50      0.114
124             7.8              0.50         0.17            1.60      0.082
350            10.7              0.67         0.22            2.70      0.107
682             8.5              0.46         0.31            2.25      0.078
1326            6.7              0.46         0.24            1.70      0.077

      free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
803                  14.0                  46.0   0.9971  3.24       0.66
124                  21.0                 102.0   0.9960  3.39       0.48
350                  17.0                  34.0   1.0004  3.28       0.98
682                  32.0                  58.0   0.9980  3.33       0.54
1326                 18.0                  34.0   0.9948  3.39       0.60

      alcohol
803       9.6
124       9.5
350       9.9
682       9.8
1326     10.6

Model Training
Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled

array([[ 1.69536131e-01, -1.72107140e+00,  4.59303345e-01, ...,
         1.01180685e+00,  1.22661179e+00,  5.50057013e-01],
       [ 2.44606730e+00, -4.01957443e-01,  1.84105501e+00, ...,
        -2.10687612e+00,  1.22661179e+00, -2.05174641e-01],
       [-6.47680186e-01,  3.77472102e-02, -1.28054303e-03, ...,
         4.92026353e-01,  2.97270776e-01,  5.50057013e-01],
       ...,
       [-6.47680186e-01,  4.77451864e-01, -1.07597628e+00, ...,
         1.27169710e+00, -6.90154049e-01, -8.66002338e-01],
       [-2.39072027e-01, -1.83099757e+00,  4.08127357e-01, ...,
         3.72184202e-02,  8.20025095e-01,  1.39969262e+00],
       [-1.46489650e+00, -1.33632983e+00, -5.24565306e-02, ...,
         4.92026353e-01, -6.90154049e-01,  2.91015593e+00]])

X_test_scaled

array([[-0.35581722,  0.14767337, -0.97362431, ..., -0.48256207,
         0.00685171, -0.77159838],
       [-0.29744462, -0.18210512, -0.51304042, ...,  0.49202635,
        -1.03865693, -0.86600234],
       [ 1.39536061,  0.75226727, -0.25716048, ..., -0.22267183,
         1.86553373, -0.48838651],
       ...,
       [-0.93954316, -0.40195744, -0.15480851, ...,  0.49202635,
        -0.34165117,  0.17244119],
       [ 1.27861542, -0.12714203,  1.892231  , ..., -1.4571505 ,
         0.00685171,  1.30528867],
       [ 0.92837985, -0.18210512, -0.15480851, ...,  0.16716354,
        -0.80632167, -0.39398255]])
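Note that the models below are all fit on the unscaled X_train; the standardized arrays computed here are never passed to .fit(). Scale-sensitive models such as SVM and KNN would normally use them, as in this sketch (not what the reported scores below reflect; svc_scaled is a name introduced here):

# sketch: fitting a scale-sensitive model on the standardized features
svc_scaled = svm.SVC(kernel='rbf')
svc_scaled.fit(X_train_scaled, y_train)
svc_scaled.score(X_test_scaled, y_test)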

Logistic Regression
lr = LogisticRegression()
lr

LogisticRegression()

# training the model


lr.fit(X_train, y_train)
lr.score(X_train, y_train)

0.8838248436103664

# testing the model


lr_pred = lr.predict(X_test)
accuracy_score(y_test, lr_pred)

0.85625

Support Vector Machine (SVM)


clf = svm.SVC(kernel='rbf')
clf

SVC()

# training the model


clf.fit(X_train, y_train)
clf.score(X_train, y_train)

0.8668453976764968

# testing the model


sv_pred = clf.predict(X_test)
accuracy_score(y_test, sv_pred)

0.8625
Decision Tree
dtree = DecisionTreeClassifier()
dtree

DecisionTreeClassifier()

# training the model


dtree.fit(X_train, y_train)
dtree.score(X_train, y_train)

1.0

# testing the model


tr_pred = dtree.predict(X_test)
accuracy_score(y_test, tr_pred)

0.8604166666666667
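A perfect training score of 1.0 against a test score of about 0.86 means the unpruned tree has memorized the training set. Limiting the depth is the usual remedy, sketched here (max_depth=5 is an arbitrary illustrative value, not tuned):

# a depth-limited tree generally overfits less than an unpruned one
dtree_pruned = DecisionTreeClassifier(max_depth=5, random_state=42)
dtree_pruned.fit(X_train, y_train)
dtree_pruned.score(X_test, y_test)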

K-Nearest Neighbors (KNN)


from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn

KNeighborsClassifier()

# training the model


knn.fit(X_train, y_train)
knn.score(X_train, y_train)

0.9079535299374442

# testing the model


kn_pred = knn.predict(X_test)
accuracy_score(y_test, kn_pred)

0.8583333333333333
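n_neighbors=5 is scikit-learn's default; rather than fixing k, it can be scanned over a small range (a sketch, with results data-dependent):

# test accuracy for odd k from 1 to 15
for k in range(1, 16, 2):
    acc = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).score(X_test, y_test)
    print(k, acc)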

Model Evaluation
Logistic Regression
# logistic regression model evaluation
sns.heatmap(confusion_matrix(y_test, lr_pred), annot=True, cmap='Blues')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.title('Confusion Matrix for Logistic Regression')
plt.show()

print('Logistic Regression Model Accuracy: ', accuracy_score(y_test, lr_pred))
print('Logistic Regression Model f1 score: ', metrics.f1_score(y_test, lr_pred))
print('Logistic Regression Model MAE: ', metrics.mean_absolute_error(y_test, lr_pred))
print('Logistic Regression Model RMSE: ', np.sqrt(metrics.mean_squared_error(y_test, lr_pred)))

Logistic Regression Model Accuracy:  0.85625
Logistic Regression Model f1 score:  0.28865979381443296
Logistic Regression Model MAE:  0.14375
Logistic Regression Model RMSE:  0.3791437722025775
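The F1 score (0.29) sits far below the accuracy (0.856) because roughly 86% of the test wines belong to the majority class, so a model biased toward predicting 'not good' still looks accurate. One built-in counterweight is class weighting, sketched here (untuned, and not part of the original run):

# reweighting the loss toward the minority class typically trades a little
# accuracy for a much higher F1 on the 'good quality' wines
lr_bal = LogisticRegression(class_weight='balanced', max_iter=1000)
lr_bal.fit(X_train, y_train)
print(metrics.f1_score(y_test, lr_bal.predict(X_test)))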
Support Vector Machine (SVM)
sns.heatmap(confusion_matrix(y_test, sv_pred), annot=True, cmap='Reds')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.title('Confusion Matrix for Support Vector Machine')
plt.show()

print('Support Vector Machine Model Accuracy: ', accuracy_score(y_test, sv_pred))
print('Support Vector Machine Model f1 score: ', metrics.f1_score(y_test, sv_pred))
print('Support Vector Machine Model MAE: ', metrics.mean_absolute_error(y_test, sv_pred))
print('Support Vector Machine Model RMSE: ', np.sqrt(metrics.mean_squared_error(y_test, sv_pred)))

Support Vector Machine Model Accuracy:  0.8625
Support Vector Machine Model f1 score:  0.029411764705882353
Support Vector Machine Model MAE:  0.1375
Support Vector Machine Model RMSE:  0.37080992435478316
Decision Tree
sns.heatmap(confusion_matrix(y_test, tr_pred), annot=True, cmap='Greens')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.title('Confusion Matrix for Decision Tree')
plt.show()

print('Decision Tree Model Accuracy: ', accuracy_score(y_test, tr_pred))
print('Decision Tree Model f1 score: ', metrics.f1_score(y_test, tr_pred))
print('Decision Tree Model MAE: ', metrics.mean_absolute_error(y_test, tr_pred))
print('Decision Tree Model RMSE: ', np.sqrt(metrics.mean_squared_error(y_test, tr_pred)))

Decision Tree Model Accuracy:  0.8604166666666667
Decision Tree Model f1 score:  0.5677419354838709
Decision Tree Model MAE:  0.13958333333333334
Decision Tree Model RMSE:  0.3736085295243316
K-Nearest Neighbors (KNN)
sns.heatmap(confusion_matrix(y_test, kn_pred), annot=True, cmap='Purples')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.title('Confusion Matrix for K-Nearest Neighbors')
plt.show()

print('K-Nearest Neighbors Model Accuracy: ', accuracy_score(y_test, kn_pred))
print('K-Nearest Neighbors Model f1 score: ', metrics.f1_score(y_test, kn_pred))
print('K-Nearest Neighbors Model MAE: ', metrics.mean_absolute_error(y_test, kn_pred))
print('K-Nearest Neighbors Model RMSE: ', np.sqrt(metrics.mean_squared_error(y_test, kn_pred)))

K-Nearest Neighbors Model Accuracy:  0.8583333333333333
K-Nearest Neighbors Model f1 score:  0.276595744680851
K-Nearest Neighbors Model MAE:  0.14166666666666666
K-Nearest Neighbors Model RMSE:  0.3763863263545405
Model Comparison
models = ['Logistic Regression', 'Support Vector Machine', 'Decision Tree', 'K-Nearest Neighbors']
accuracy = [accuracy_score(y_test, lr_pred), accuracy_score(y_test, sv_pred),
            accuracy_score(y_test, tr_pred), accuracy_score(y_test, kn_pred)]

plt.figure(figsize=(10,6))
sns.barplot(x=models, y=accuracy)
plt.title('Model Accuracy Comparison')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.ylim(0.5, 1.0)
plt.show()
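Accuracy alone hides the imbalance effect noted above; plotting the F1 scores alongside it gives a fairer comparison (an optional extension of the same plot):

# F1 on the minority 'good quality' class tells a different story than accuracy
f1_scores = [metrics.f1_score(y_test, p) for p in (lr_pred, sv_pred, tr_pred, kn_pred)]
plt.figure(figsize=(10,6))
sns.barplot(x=models, y=f1_scores)
plt.title('Model F1 Score Comparison')
plt.xlabel('Model')
plt.ylabel('F1 score')
plt.show()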

Conclusion
All four models reach similar test accuracy: the SVM is highest at about 86.3%, followed by the Decision Tree (86.0%), KNN (85.8%), and Logistic Regression (85.6%). Because roughly 86% of the test wines fall in the majority class, accuracy alone is an optimistic measure here; judged by F1 score on the minority 'good quality' class, the Decision Tree (0.57) clearly outperforms the other three. Overall, the models can predict whether a wine is of good quality from its physicochemical features with about 86% accuracy.
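As a follow-up check, a single 70/30 split can be noisy; 5-fold cross-validation would give a more stable estimate before committing to one model (a sketch using the same estimators as above):

# mean cross-validated accuracy per model on the full feature set
from sklearn.model_selection import cross_val_score
X = df.drop('good quality', axis=1)
y = df['good quality']
for name, model in [('Logistic Regression', LogisticRegression(max_iter=1000)),
                    ('SVM', svm.SVC(kernel='rbf')),
                    ('Decision Tree', DecisionTreeClassifier()),
                    ('KNN', KNeighborsClassifier())]:
    print(name, cross_val_score(model, X, y, cv=5).mean())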
