0% found this document useful (0 votes)
128 views

Q 2

This document loads various libraries and packages for data analysis and machine learning. It then loads and preprocesses a bank marketing dataset to perform k-nearest neighbors (KNN) classification with different values of k. It evaluates the KNN models using accuracy on validation data and confusion matrices. Finally, it splits the data into train, validation, and test sets to evaluate and compare KNN models on each.

Uploaded by

Mohit Jain
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
128 views

Q 2

This document loads various libraries and packages for data analysis and machine learning. It then loads and preprocesses a bank marketing dataset to perform k-nearest neighbors (KNN) classification with different values of k. It evaluates the KNN models using accuracy on validation data and confusion matrices. Finally, it splits the data into train, validation, and test sets to evaluate and compare KNN models on each.

Uploaded by

Mohit Jain
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 3

# Point the session at the folder holding the assignment data; the relative
# "UniversalBank.csv" path read further down is resolved against it.
# NOTE(review): this is a machine-specific absolute path, so the script only
# runs unchanged on the author's machine -- adjust it (or launch R from the
# data folder) before running elsewhere.
setwd("S:/NSB subjects/Term4/Business Analysis Data Mining5/BADM_R/q2")

library("dplyr")
library("tidyr")
library("ggplot2")
library("ROCR")
library("rpart")
library("rpart.plot")
library("caret")
library("randomForest")
library("tidyverse")
library("tm")
library("SnowballC")
library("softImpute")
library("glmnet")
library("Hmisc")
library("dummies")
library('tinytex')
library('GGally')
library('gplots')
library('FNN')
library("dplyr")
library("tidyr")
library("caTools")
library("ggpubr")
library("reshape")

# Load the Universal Bank data and build the modelling frame `bank_dummy`.
#
# The workspace is intentionally NOT cleared here: `rm(list = ls())` deletes
# every object the caller has in the global environment and does nothing for
# this script, which assigns each variable before using it.
bank <- read.csv("UniversalBank.csv")

# Education is converted to a factor so that the encoder below can expand it
# into one indicator column per level.
bank$Education <- as.factor(bank$Education)

# Drop ID and ZIP.Code, then expand the remaining factor column(s) into
# indicator columns. `select` is namespace-qualified so a later-attached
# package cannot mask it.
bank_dummy <- dummy.data.frame(dplyr::select(bank, -c(ZIP.Code, ID)))

# The class label must be a factor for knn() / confusionMatrix().
bank_dummy$Personal.Loan <- as.factor(bank_dummy$Personal.Loan)

# NOTE(review): as.integer() truncates toward zero, so any fractional part of
# CCAvg is discarded before modelling. Kept as in the original so downstream
# results do not change -- confirm this loss of precision is intended.
bank_dummy$CCAvg <- as.integer(bank_dummy$CCAvg)

# Part A split: 60% of the rows for training, the remainder for validation.
# The seed is fixed so the partition is reproducible.
set.seed(1)

# TODO: need to look at hints
# (This note originally trailed the sample() call and had wrapped onto a line
# of its own as bare text, which stopped the script from parsing.)
train.index <- sample(row.names(bank_dummy), 0.6 * dim(bank_dummy)[1])

# Despite its name, `test.index` holds the rows used as the validation set.
test.index <- setdiff(row.names(bank_dummy), train.index)

train.df <- bank_dummy[train.index, ]
valid.df <- bank_dummy[test.index, ]

# Single-row profile of the prospective customer to classify. Every field is
# stored as an integer; the column names are intended to line up with the
# predictor columns that the scaler and knn() are given.
new.df <- data.frame(
  Age = 40L,
  Experience = 10L,
  Income = 84L,
  Family = 2L,
  CCAvg = 2L,
  Education1 = 0L,
  Education2 = 1L,
  Education3 = 0L,
  Mortgage = 0L,
  Securities.Account = 0L,
  CD.Account = 0L,
  Online = 1L,
  CreditCard = 1L
)

# Estimate centring/scaling parameters from the training predictors only.
# Column 10 is left out because it is the column used as the class label.
norm.values <- preProcess(x = train.df[, -10], method = c("center", "scale"))

# Apply those same training-derived parameters to every predictor set, so the
# validation rows and the new customer are scaled consistently with training.
train.df[, -10] <- predict(norm.values, newdata = train.df[, -10])
valid.df[, -10] <- predict(norm.values, newdata = valid.df[, -10])
new.df <- predict(norm.values, newdata = new.df)

# Classify the (already scaled) new customer from its 5 nearest training rows.
knn.1 <- knn(
  train = train.df[, -10],
  test = new.df,
  cl = train.df[, 10],
  k = 5,
  prob = TRUE
)

# Inspect what knn() attached to its result: the first and third attributes
# are auto-printed at top level.
knn.attributes <- attributes(knn.1)
knn.attributes[1]
knn.attributes[3]
# Part B: validation accuracy for each k from 1 to 14.
accuracy.df <- data.frame(k = seq(1, 14, by = 1), accuracy = numeric(14))

for (i in seq_len(14)) {
  # Fit on the training predictors and score the validation predictors.
  knn.2 <- knn(
    train = train.df[, -10],
    test = valid.df[, -10],
    cl = train.df[, 10],
    k = i,
    prob = TRUE
  )
  # Keep only the overall accuracy from the confusion-matrix summary.
  accuracy.df$accuracy[i] <-
    confusionMatrix(knn.2, valid.df[, 10])$overall[["Accuracy"]]
}

accuracy.df

# Part C: confusion matrix on the validation set for k = 3.
knn.3 <- knn(
  train = train.df[, -10],
  test = valid.df[, -10],
  cl = train.df[, 10],
  k = 3,
  prob = TRUE
)
confusionMatrix(data = knn.3, reference = valid.df[, 10])

# Part D: classify the prospective customer with k = 3.
#
# Two corrections relative to the earlier version of this step:
#   1. The education indicators are named Education1/2/3 (no underscore), the
#      same names used for these predictors elsewhere in the script, so the
#      fitted scaler can find them.
#   2. The record is passed through `norm.values` before knn(). The training
#      predictors are centred and scaled, so an unscaled record would sit on a
#      completely different scale and its nearest neighbours would be
#      meaningless.
customer.df <- data.frame(
  Age = 40,
  Experience = 10,
  Income = 84,
  Family = 2,
  CCAvg = 2,
  Education1 = 0,
  Education2 = 1,
  Education3 = 0,
  Mortgage = 0,
  Securities.Account = 0,
  CD.Account = 0,
  Online = 1,
  CreditCard = 1
)

# Scale with the parameters learned from the training predictors.
customer.df <- predict(norm.values, customer.df)

knn.4 <- knn(
  train = train.df[, -10],
  test = customer.df,
  cl = train.df[, 10],
  k = 3,
  prob = TRUE
)
knn.4

### Part E
# Recreate the encoded modelling frame from `bank` before the three-way split:
# drop the ID / ZIP.Code columns, expand factors into indicator columns, make
# the label a factor and store CCAvg as an integer.
bank_dummy <- dummy.data.frame(dplyr::select(bank, -c(ZIP.Code, ID)))
bank_dummy[["Personal.Loan"]] <- as.factor(bank_dummy[["Personal.Loan"]])
bank_dummy[["CCAvg"]] <- as.integer(bank_dummy[["CCAvg"]])

# Part E split: 50% training, 30% validation, remaining rows for test.
set.seed(1)

# TODO: need to look at hints
# (This note originally trailed the sample() call and had wrapped onto a line
# of its own as bare text, which stopped the script from parsing.)
train.index <- sample(rownames(bank_dummy), 0.5 * dim(bank_dummy)[1])

# The seed is reset before the second draw, exactly as before, so the
# validation sample is unchanged. It is drawn only from rows not already
# assigned to training.
set.seed(1)
valid.index <- sample(
  setdiff(rownames(bank_dummy), train.index),
  0.3 * dim(bank_dummy)[1]
)

# Whatever is in neither the training nor the validation sample is the test set.
test.index <- setdiff(rownames(bank_dummy), union(train.index, valid.index))

train.df <- bank_dummy[train.index, ]
valid.df <- bank_dummy[valid.index, ]
test.df <- bank_dummy[test.index, ]

# Re-estimate centring/scaling on the new training predictors (column 10, the
# class label, is excluded) ...
norm.values <- preProcess(x = train.df[, -10], method = c("center", "scale"))

# ... and apply the training-derived parameters to all three partitions.
train.df[, -10] <- predict(norm.values, newdata = train.df[, -10])
valid.df[, -10] <- predict(norm.values, newdata = valid.df[, -10])
test.df[, -10] <- predict(norm.values, newdata = test.df[, -10])

# Score each partition with k = 3, always using the training rows as the
# neighbour pool, so the three confusion matrices can be compared.
testknn <- knn(
  train = train.df[, -10],
  test = test.df[, -10],
  cl = train.df[, 10],
  k = 3,
  prob = TRUE
)
validknn <- knn(
  train = train.df[, -10],
  test = valid.df[, -10],
  cl = train.df[, 10],
  k = 3,
  prob = TRUE
)
trainknn <- knn(
  train = train.df[, -10],
  test = train.df[, -10],
  cl = train.df[, 10],
  k = 3,
  prob = TRUE
)

# Confusion matrices, printed in the order test, validation, training.
confusionMatrix(data = testknn, reference = test.df[, 10])

confusionMatrix(data = validknn, reference = valid.df[, 10])
confusionMatrix(data = trainknn, reference = train.df[, 10])

You might also like