ML Practical 4D
ML Practical 4D
Importing libraries
In [198… import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
Preprocessing
In [201… df.head()
Out[201]: ORDERNUMBER QUANTITYORDERED PRICEEACH ORDERLINENUMBER SALES ORDERDATE STATUS QTR_ID
0 10107 30 95.70 2 2871.00 2/24/2003 0:00 Shipped 1
1 10121 34 81.35 5 2765.90 5/7/2003 0:00 Shipped 2
2 10134 41 94.74 2 3884.34 7/1/2003 0:00 Shipped 3
3 10145 45 83.26 6 3746.70 8/25/2003 0:00 Shipped 3
4 10159 49 100.00 14 5205.27 10/10/2003 0:00 Shipped 4
5 rows × 25 columns
In [202… df.shape
Out[202]: (2823, 25)
In [203…
In [204… df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 25 columns):
# Column Non-Null Count Dtype
In [206… df.dtypes
Out[206]: ORDERNUMBER int64
QUANTITYORDERED int64
PRICEEACH float64
ORDERLINENUMBER int64
SALES float64
ORDERDATE object
STATUS object
QTR_ID int64
MONTH_ID int64
YEAR_ID int64
PRODUCTLINE object
MSRP int64
PRODUCTCODE object
CUSTOMERNAME object
PHONE object
ADDRESSLINE1 object
ADDRESSLINE2 object
CITY object
STATE object
POSTALCODE object
COUNTRY object
TERRITORY object
CONTACTLASTNAME object
CONTACTFIRSTNAME object
DEALSIZE object
dtype: object
In [207…
# Columns removed from the frame before further processing.
# The export cut this list off after 'TERRITORY'. The remaining six names are
# restored from the difference between the 25 columns reported by df.dtypes
# (cell 206) and the 13 columns still present in df.isnull().sum() (cell 208).
# NOTE(review): the order of the restored names is a guess; order does not
# affect the result of drop().
df_drop = [
    'ADDRESSLINE1', 'ADDRESSLINE2', 'STATUS', 'POSTALCODE', 'CITY', 'TERRITORY',
    'PHONE', 'STATE', 'CONTACTFIRSTNAME', 'CONTACTLASTNAME', 'CUSTOMERNAME', 'ORDERNUMBER',
]
# Drop the columns the notebook treats as unnecessary (address/contact details,
# STATUS and the ORDERNUMBER identifier). The tail of the original comment was
# truncated in the export.
# This cell rebinds `df`, so running it a second time raises KeyError because the
# columns are already gone; re-run from the data-loading cell if needed.
df = df.drop(df_drop, axis=1)
In [208… df.isnull().sum()
Out[208]: QUANTITYORDERED 0
PRICEEACH 0
ORDERLINENUMBER 0
SALES 0
ORDERDATE 0
QTR_ID 0
MONTH_ID 0
YEAR_ID 0
PRODUCTLINE 0
MSRP 0
PRODUCTCODE 0
COUNTRY 0
DEALSIZE 0
dtype: int64
In [209…
df.dtypes
Out[209]: QUANTITYORDERED int64
PRICEEACH float64
ORDERLINENUMBER int64
SALES float64
ORDERDATE object
QTR_ID int64
MONTH_ID int64
YEAR_ID int64
PRODUCTLINE object
MSRP int64
PRODUCTCODE object
COUNTRY object
DEALSIZE object
dtype: object
In [ ]:
# Checking the categorical columns.
In [210… df['COUNTRY'].unique()
In [211… df['PRODUCTLINE'].unique()
Out[211]: array(['Motorcycles', 'Classic Cars', 'Trucks and Buses', 'Vintage Cars', 'Planes', 'Ships', 'Trains'], dtype=object)
In [212… df['DEALSIZE'].unique()
Out[212]: array(['Small', 'Medium', 'Large'], dtype=object)
In [220… plt.figure(figsize=(16,8))
# Elbow curve: draw `distortions` against `K` on the figure created just above.
# Both names come from an earlier cell that is not visible in this export.
elbow_ax = plt.gca()
elbow_ax.plot(K, distortions, 'bx-')  # blue line with 'x' markers
elbow_ax.set_xlabel('k')
elbow_ax.set_ylabel('Distortion')
elbow_ax.set_title('The Elbow Method showing the optimal k')
plt.show()
In [222… X_train.shape
In [228… counts_df.head()
Visualization
In [229… pca = PCA(n_components=2) # Reduce all the features to 2 components so the data can be drawn on a 2-D scatter plot (the original comment was truncated in this export)
In [231… reduced_X.head()
0 -682.488323 -42.819535
1 -787.665502 -41.694991
2 330.732170 -26.481208
3 193.040232 -26.285766
4 1651.532874 -6.891196
In [235… reduced_centers
In [236… plt.figure(figsize=(14,10))
# Scatter every observation in the two PCA columns, then overlay the rows of
# `reduced_centers` as large black crosses on the same axes.
# `reduced_X` and `reduced_centers` are built in cells not visible in this export.
points_x, points_y = reduced_X['PCA1'], reduced_X['PCA2']
centers_x, centers_y = reduced_centers[:, 0], reduced_centers[:, 1]
plt.scatter(points_x, points_y)
plt.scatter(centers_x, centers_y, s=300, marker='x', color='black')
In [238… reduced_X.head()
0 -682.488323 -42.819535 1
1 -787.665502 -41.694991 1
2 330.732170 -26.481208 0
3 193.040232 -26.285766 0
4 1651.532874 -6.891196 0
plt.scatter(reduced_centers[:,0],reduced_centers[:,1],color='black',marker='x',s=300)