Quality Prediction
Quality Prediction
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
alcohol quality
0 9.4 5
1 9.8 5
2 9.8 5
3 9.8 6
4 9.4 5
... ... ...
1594 10.5 5
1595 11.2 6
1596 11.0 6
1597 10.2 5
1598 11.0 6
fixed acidity 0
volatile acidity 0
citric acid 0
residual sugar 0
chlorides 0
free sulfur dioxide 0
total sulfur dioxide 0
density 0
pH 0
sulphates 0
alcohol 0
quality 0
dtype: int64
# Print the column names, dtypes, non-null counts and memory footprint.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed acidity 1599 non-null float64
1 volatile acidity 1599 non-null float64
2 citric acid 1599 non-null float64
3 residual sugar 1599 non-null float64
4 chlorides 1599 non-null float64
5 free sulfur dioxide 1599 non-null float64
6 total sulfur dioxide 1599 non-null float64
7 density 1599 non-null float64
8 pH 1599 non-null float64
9 sulphates 1599 non-null float64
10 alcohol 1599 non-null float64
11 quality 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
# Summary statistics for the columns, followed by the first five rows.
# NOTE(review): if both calls share one notebook cell, only the last
# expression (head) is displayed — confirm that describe() is still wanted.
df.describe()
df.head(5)
alcohol quality
0 9.4 5
1 9.8 5
2 9.8 5
3 9.8 6
4 9.4 5
Data Preprocessing
# Number of samples per quality score, to check how balanced the classes are.
df['quality'].value_counts()
5 681
6 638
7 199
4 53
8 18
3 10
Name: quality, dtype: int64
<seaborn.axisgrid.FacetGrid at 0x152b9301c10>
# One 5x5 bar chart per acidity feature, plotted against the quality score
# (seaborn's barplot aggregates each group with its default estimator).
for feature in ('volatile acidity', 'citric acid'):
    plot = plt.figure(figsize=(5, 5))
    sns.barplot(x='quality', y=feature, data=df)
# Class balance of the binary 'good quality' column.
plt.figure(figsize=(5, 5))
count_axes = sns.countplot(x='good quality', data=df)
count_axes.set_xlabel('good quality')
count_axes.set_ylabel('Count')
count_axes.set_title('Count of Good vs Bad Quality Wines')
plt.show()
# Annotated heatmap of the pairwise correlations between the columns of df.
correlations = df.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(correlations, annot=True)
plt.show()
# Pairwise scatter plots of selected features, coloured by the 'good quality'
# label and laid out on a 2x4 grid.
#
# The (x, y) column pairs are listed once so that every panel is drawn by the
# same call; this also keeps the 'good quality' string literal on one line
# (it was previously broken across lines, which is a syntax error).
scatter_pairs = [
    ('fixed acidity', 'citric acid'),
    ('volatile acidity', 'citric acid'),
    ('free sulfur dioxide', 'total sulfur dioxide'),
    ('fixed acidity', 'density'),
    ('fixed acidity', 'pH'),
    ('citric acid', 'pH'),
    ('chlorides', 'sulphates'),
    ('residual sugar', 'alcohol'),
]

fig, ax = plt.subplots(2, 4, figsize=(20, 20))

# ax.flat walks the grid row by row, so panel i receives scatter_pairs[i]:
# the first four pairs fill the top row, the last four the bottom row.
for (x_col, y_col), panel in zip(scatter_pairs, ax.flat):
    sns.scatterplot(x=x_col, y=y_col, hue='good quality', data=df, ax=panel)
<AxesSubplot:xlabel='residual sugar', ylabel='alcohol'>
# Preview the first rows of the training features.
X_train.head()
alcohol
925 11.0
363 10.2
906 11.0
426 11.4
1251 9.8
# Preview the first rows of the held-out test features.
X_test.head()
alcohol
803 9.6
124 9.5
350 9.9
682 9.8
1326 10.6
Model Training
Feature Scaling
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transformation is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled
X_test_scaled
Logistic Regression
# Logistic regression classifier created with no arguments, i.e. with
# scikit-learn's default hyperparameters.
lr = LogisticRegression()
lr  # notebook echo of the estimator's repr
LogisticRegression()
0.8838248436103664
0.85625
SVC()
0.8668453976764968
0.8625
Decision Tree
# Decision tree classifier created with no arguments, i.e. with
# scikit-learn's default hyperparameters.
dtree = DecisionTreeClassifier()
dtree  # notebook echo of the estimator's repr
DecisionTreeClassifier()
1.0
0.8604166666666667
KNeighborsClassifier()
0.9079535299374442
0.8583333333333333
Model Evaluation
Logistic Regression
# Logistic regression model evaluation: confusion matrix on the test split.
#
# confusion_matrix(y_true, y_pred) returns a matrix whose rows are the actual
# classes and whose columns are the predicted classes. The heatmap draws rows
# along the y-axis, so the y-axis is "actual" and the x-axis is "predicted".
# fmt='d' prints the cell counts as plain integers instead of the default
# '.2g' format, which would render three-digit counts as e.g. 4.1e+02.
sns.heatmap(
    confusion_matrix(y_test, lr_pred),
    annot=True,
    fmt='d',
    cmap='Blues',
)
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.title('Confusion Matrix for Logistic Regression')
plt.show()
Conclusion
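It is observed that all four models perform similarly on the test set, with accuracies
of about 86%. The SVM scores highest (86.3%), narrowly ahead of the Decision Tree (86.0%),
KNN (85.8%) and Logistic Regression (85.6%). The models can therefore predict whether a
wine is of good quality from the given features with an accuracy of about 86%.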