Section B: R Programming Output
Section B: R Programming Output
> table(clean_data$SeriousDlqin2yrs)
   0    1
3243  235
#TPR
6/(50+6)
#FPR
4/(618+4)
ROCR Curve
> confusion_matrix1=table(clean_dataTest$SeriousDlqin2yrs,predictTest1>0.5)
> confusion_matrix1
    FALSE TRUE
  0   604   18
  1    37   19
TPR
19/(19+37)
FPR
18/(18+604)
R Code
# Show the working directory; both CSV files are read relative to it.
getwd()

# Raw training and test sets.
loanDefault <- read.csv(file = "end_term_train.csv")
loanTest <- read.csv(file = "end_term_test.csv")
nrow(loanDefault)
nrow(loanTest)

# Complete-case copies: every row containing at least one NA is dropped.
clean_data <- na.omit(loanDefault)
clean_dataTest <- na.omit(loanTest)
nrow(clean_data)
nrow(clean_dataTest)
# Outlier handling for the training set ----
# Visual check of the three continuous predictors.
boxplot(clean_data$DebtRatio)
boxplot(clean_data$age)
boxplot(clean_data$MonthlyIncome)

# DebtRatio: the upper fence is the 90th percentile plus 1.5 * IQR.
upperValue <- quantile(clean_data$DebtRatio, 0.90) +
  1.5 * IQR(clean_data$DebtRatio)
# Manually set the lower value for outliers.
lowerValue <- 0.01

# Mark out-of-range values as NA, then drop those rows.
clean_data$DebtRatio[clean_data$DebtRatio > upperValue] <- NA
clean_data$DebtRatio[clean_data$DebtRatio < lowerValue] <- NA
clean_data <- na.omit(clean_data)
nrow(clean_data)
boxplot(clean_data$DebtRatio)

# MonthlyIncome fences: 90th percentile + 1.5 * IQR and
# 10th percentile - 1.5 * IQR. The subtraction operator was missing from the
# lower fence, which made the statement a syntax error.
# NOTE(review): these two bounds are computed but never applied to
# clean_data in the code visible here -- confirm whether MonthlyIncome was
# meant to be filtered as well.
upperValue <- quantile(clean_data$MonthlyIncome, 0.90) +
  1.5 * IQR(clean_data$MonthlyIncome)
lowerValue <- quantile(clean_data$MonthlyIncome, 0.10) -
  1.5 * IQR(clean_data$MonthlyIncome)
# Standardize the training predictors (z-score: (x - mean) / sd) ----
# Each section prints the raw mean/sd, adds the Std_* column, then prints the
# mean/sd of the new column as a sanity check (expected ~0 and 1).
# The subtraction operator had been lost from every z-score expression, so
# `clean_data$DebtRatiomean(...)` tried to call a non-existent column as a
# function; it is restored below.

# standardizing age
# Std_age is referenced by the model formulas further down but was never
# created, so it is computed here alongside the other predictors.
mean(clean_data$age)
sd(clean_data$age)
clean_data$Std_age <-
  (clean_data$age - mean(clean_data$age)) / sd(clean_data$age)
mean(clean_data$Std_age)
sd(clean_data$Std_age)

# standardizing DebtRatio
mean(clean_data$DebtRatio)
sd(clean_data$DebtRatio)
clean_data$Std_DebtRatio <-
  (clean_data$DebtRatio - mean(clean_data$DebtRatio)) /
  sd(clean_data$DebtRatio)
mean(clean_data$Std_DebtRatio)
sd(clean_data$Std_DebtRatio)

# standardizing MonthlyIncome
mean(clean_data$MonthlyIncome)
sd(clean_data$MonthlyIncome)
clean_data$Std_MonthlyIncome <-
  (clean_data$MonthlyIncome - mean(clean_data$MonthlyIncome)) /
  sd(clean_data$MonthlyIncome)
mean(clean_data$Std_MonthlyIncome)
sd(clean_data$Std_MonthlyIncome)

# standardizing NumberOfTimes90DaysLate
mean(clean_data$NumberOfTimes90DaysLate)
sd(clean_data$NumberOfTimes90DaysLate)
clean_data$Std_NumberOfTimes90DaysLate <-
  (clean_data$NumberOfTimes90DaysLate -
     mean(clean_data$NumberOfTimes90DaysLate)) /
  sd(clean_data$NumberOfTimes90DaysLate)
mean(clean_data$Std_NumberOfTimes90DaysLate)
sd(clean_data$Std_NumberOfTimes90DaysLate)
# Standardize the test predictors ----
# The test set is scaled with the TRAINING mean and sd (taken from
# clean_data), not its own. The models are fitted on training-scaled
# features, so test rows must be put on that same scale for the coefficients
# to apply; scaling with test-set statistics would map identical raw values
# to different standardized values in the two sets.
# Consequently the Std_* means/sds printed below are a diagnostic only and
# are not expected to be exactly 0 and 1.
# The subtraction operator that had been lost from each z-score expression
# is restored.

# standardizing age (Std_age is required by the model formulas)
mean(clean_dataTest$age)
sd(clean_dataTest$age)
clean_dataTest$Std_age <-
  (clean_dataTest$age - mean(clean_data$age)) / sd(clean_data$age)
mean(clean_dataTest$Std_age)
sd(clean_dataTest$Std_age)

# standardizing DebtRatio
mean(clean_dataTest$DebtRatio)
sd(clean_dataTest$DebtRatio)
clean_dataTest$Std_DebtRatio <-
  (clean_dataTest$DebtRatio - mean(clean_data$DebtRatio)) /
  sd(clean_data$DebtRatio)
mean(clean_dataTest$Std_DebtRatio)
sd(clean_dataTest$Std_DebtRatio)

# standardizing MonthlyIncome
mean(clean_dataTest$MonthlyIncome)
sd(clean_dataTest$MonthlyIncome)
clean_dataTest$Std_MonthlyIncome <-
  (clean_dataTest$MonthlyIncome - mean(clean_data$MonthlyIncome)) /
  sd(clean_data$MonthlyIncome)
mean(clean_dataTest$Std_MonthlyIncome)
sd(clean_dataTest$Std_MonthlyIncome)

# standardizing NumberOfTimes90DaysLate
mean(clean_dataTest$NumberOfTimes90DaysLate)
sd(clean_dataTest$NumberOfTimes90DaysLate)
clean_dataTest$Std_NumberOfTimes90DaysLate <-
  (clean_dataTest$NumberOfTimes90DaysLate -
     mean(clean_data$NumberOfTimes90DaysLate)) /
  sd(clean_data$NumberOfTimes90DaysLate)
mean(clean_dataTest$Std_NumberOfTimes90DaysLate)
sd(clean_dataTest$Std_NumberOfTimes90DaysLate)
# plotting scatterplot
# Column 3 against column 4 of the training data. The closing parentheses
# were misplaced, so col/main/xlab/ylab were being passed to as.integer()
# instead of plot(); each as.integer() call is now closed where it belongs.
# NOTE(review): column 3 is used both as the x variable and to pick the
# marker/colour; presumably the class label column was intended for
# pch/col -- confirm the column index.
plot(
  clean_data[, 3], clean_data[, 4],
  pch = 1 + as.integer(clean_data[, 3]),
  col = 2 + as.integer(clean_data[, 3]),
  main = "Raw data", xlab = "x1", ylab = "x2"
)
# Class balance of the training response: counts, then the share of 1s.
table(clean_data$SeriousDlqin2yrs)
mean(clean_data$SeriousDlqin2yrs == 1)

# logistic regression
# Fit on the standardized training predictors. The last term of the formula
# had been split across two lines, which was a syntax error.
modelTrain <- glm(
  SeriousDlqin2yrs ~ Std_age + Std_DebtRatio + Std_MonthlyIncome +
    Std_NumberOfTimes90DaysLate,
  data = clean_data,
  family = "binomial"
)
summary(modelTrain)

# Predicted probabilities for the test set.
predictTest <- predict(modelTrain, newdata = clean_dataTest, type = "response")
predictTest

# Confusion matrix at a 0.5 cut-off: rows = actual class, columns = predicted.
# Both factors carry explicit levels so the table is always 2 x 2, even when
# no prediction exceeds the cut-off.
confusion_matrix <- table(
  factor(clean_dataTest$SeriousDlqin2yrs, levels = c(0, 1)),
  factor(predictTest > 0.5, levels = c(FALSE, TRUE))
)
confusion_matrix

# Metrics are derived from the matrix rather than typed in by hand, so they
# always match the model that was actually fitted. The previous hand-typed
# "TPR"/"FPR" appear to have divided by predicted-class (column) totals,
# which gives precision and the false-omission rate instead.
tn <- confusion_matrix["0", "FALSE"]
fp <- confusion_matrix["0", "TRUE"]
fn <- confusion_matrix["1", "FALSE"]
tp <- confusion_matrix["1", "TRUE"]

#accuracy
(tp + tn) / sum(confusion_matrix)
#TPR: share of actual 1s predicted as 1
tp / (tp + fn)
#FPR: share of actual 0s predicted as 1
fp / (fp + tn)
# Logistic regression on a SMOTE-balanced training set ----
# NOTE(review): SMOTE() is not defined in the visible source and no library()
# call loads it; presumably this is DMwR::SMOTE -- confirm and load the
# package near the top of the script.

# The response is converted to a factor before the SMOTE() call.
clean_data$SeriousDlqin2yrs <- as.factor(clean_data$SeriousDlqin2yrs)

# Oversample the minority class (perc.over = 500). The formula now uses
# Std_age, matching the column name used by the glm() formulas (it previously
# read Std_Age), and the identifiers that were split across lines are
# rejoined.
loanTrainSMOTE <- SMOTE(
  SeriousDlqin2yrs ~ Std_age + Std_DebtRatio + Std_MonthlyIncome +
    Std_NumberOfTimes90DaysLate,
  clean_data,
  perc.over = 500
)

# NOTE(review): columns 1, 2 and 4 are selected by position -- verify that
# they are the intended x, y and class columns of loanTrainSMOTE.
plot(
  loanTrainSMOTE[, 1], loanTrainSMOTE[, 2],
  pch = as.integer(loanTrainSMOTE[, 4]),
  col = 2 + as.integer(loanTrainSMOTE[, 4])
)

# Restore the numeric 0/1 coding. as.numeric() applied directly to a factor
# returns the internal level codes (1/2), so go through as.character() first.
clean_data$SeriousDlqin2yrs <-
  as.numeric(as.character(clean_data$SeriousDlqin2yrs))

modelTrain1 <- glm(
  SeriousDlqin2yrs ~ Std_age + Std_DebtRatio + Std_MonthlyIncome +
    Std_NumberOfTimes90DaysLate,
  data = loanTrainSMOTE,
  family = "binomial"
)
summary(modelTrain1)

predictTest1 <- predict(
  modelTrain1,
  newdata = clean_dataTest,
  type = "response"
)
summary(predictTest1)

# Confusion matrix at a 0.5 cut-off: rows = actual class, columns = predicted.
# Explicit levels keep the table 2 x 2 whatever the predictions are.
confusion_matrix1 <- table(
  factor(clean_dataTest$SeriousDlqin2yrs, levels = c(0, 1)),
  factor(predictTest1 > 0.5, levels = c(FALSE, TRUE))
)
confusion_matrix1

# Metrics are computed from the matrix instead of being typed in by hand;
# the previous hand-typed "TPR"/"FPR" appear to have used predicted-class
# (column) totals as denominators rather than actual-class (row) totals.
tn1 <- confusion_matrix1["0", "FALSE"]
fp1 <- confusion_matrix1["0", "TRUE"]
fn1 <- confusion_matrix1["1", "FALSE"]
tp1 <- confusion_matrix1["1", "TRUE"]

#accuracy
(tp1 + tn1) / sum(confusion_matrix1)
#TPR: share of actual 1s predicted as 1
tp1 / (tp1 + fn1)
#FPR: share of actual 0s predicted as 1
fp1 / (fp1 + tn1)
# CART ----
library(tree)

# Recode the response as a factor and inspect the resulting column types.
clean_data[["SeriousDlqin2yrs"]] <- as.factor(clean_data[["SeriousDlqin2yrs"]])
str(clean_data)