0% found this document useful (0 votes)
70 views

Section B: R Programming Output

The document provides output from R programming code that analyzes loan default data. Specifically, it: 1) Removes outliers from variables in the train and test datasets and standardizes the variables. 2) Fits logistic regression and decision tree models to predict loan defaults. 3) Evaluates the model performance on the test data using metrics like accuracy, true positive rate, and false positive rate. 4) Oversamples the minority class in the train data using SMOTE and refits the logistic regression model.

Uploaded by

prashantarora18
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
70 views

Section B: R Programming Output

The document provides output from R programming code that analyzes loan default data. Specifically, it: 1) Removes outliers from variables in the train and test datasets and standardizes the variables. 2) Fits logistic regression and decision tree models to predict loan defaults. 3) Evaluates the model performance on the test data using metrics like accuracy, true positive rate, and false positive rate. 4) Oversamples the minority class in the train data using SMOTE and refits the logistic regression model.

Uploaded by

prashantarora18
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 19

Section B: R programming Output

Box plot of DebtRatio for the train data before removing outliers

Box plot of DebtRatio for the train data after removing outliers

Box Plot for MonthlyIncome for Train before removing outlier

Box Plot for MonthlyIncome for Train after removing outlier

Box Plot for Age for Train before removing outlier

Box Plot for Age for Test before removing outlier

Box plot of DebtRatio for the test data before removing outliers

Box plot of DebtRatio for the test data after removing outliers

Box Plot for MonthlyIncome for Test before removing outlier

Box Plot for MonthlyIncome for Test after removing outlier

table(clean_data$SeriousDlqin2yrs)
0 1
3243 235

Number of zeros and ones in SeriousDlqin2yrs (train data)


Logistic regression output
glm(formula = SeriousDlqin2yrs ~ Std_age + Std_DebtRatio + Std_MonthlyIncome +
Std_NumberOfTimes90DaysLate, family = "binomial", data = clean_data)
Deviance Residuals:
    Min       1Q   Median       3Q      Max
-6.3684  -0.4000  -0.3441  -0.2921   2.8287
Coefficients:
                            Estimate Std. Error z value Pr(>|z|)
(Intercept)                 -2.71336    0.07317 -37.084  < 2e-16 ***
Std_age                     -0.29011    0.07387  -3.928 8.58e-05 ***
Std_DebtRatio                0.27867    0.06197   4.497 6.89e-06 ***
Std_MonthlyIncome           -0.08991    0.07656  -1.174  0.24027
Std_NumberOfTimes90DaysLate  0.56293    0.17482   3.220  0.00128 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1720.2 on 3477 degrees of freedom
Residual deviance: 1658.1 on 3473 degrees of freedom
AIC: 1668.1
Number of Fisher Scoring iterations: 5

Interpretation: Age, DebtRatio and NumberOfTimes90DaysLate are statistically significant (p < 0.01), whereas MonthlyIncome is not significant (p = 0.24).
Confusion Matrix
confusion_matrix=table(clean_dataTest$SeriousDlqin2yrs,predictTest>0.5)
> confusion_matrix
FALSE TRUE
0 618 4
1 50 6

#TPR
6/(50+6)
#FPR

4/(618+4)
ROCR Curve

Logistic regression output after SMOTE oversampling


glm(formula = SeriousDlqin2yrs ~ Std_age + Std_DebtRatio + Std_MonthlyIncome +
Std_NumberOfTimes90DaysLate, family = "binomial", data = loanTrainSMOTE)
Deviance Residuals:
    Min       1Q   Median       3Q      Max
-4.9167  -0.8254  -0.6840   0.9253   2.1068
Coefficients:
                            Estimate Std. Error z value Pr(>|z|)
(Intercept)                 -0.68604    0.03950 -17.369  < 2e-16 ***
Std_age                     -0.25637    0.04139  -6.193 5.89e-10 ***
Std_DebtRatio                0.26412    0.03631   7.274 3.49e-13 ***
Std_MonthlyIncome           -0.02660    0.04048  -0.657    0.511
Std_NumberOfTimes90DaysLate  5.25962    0.29107  18.070  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4975.0 on 3759 degrees of freedom

Residual deviance: 4124.7 on 3755 degrees of freedom


AIC: 4134.7
Number of Fisher Scoring iterations: 7
predictTest1=predict(modelTrain1,newdata=clean_dataTest,type="response")
> summary(predictTest1)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.06081 0.12210 0.15380 0.20190 0.18560 1.00000

>
confusion_matrix1=table(clean_dataTest$SeriousDlqin2yrs,predictTest1>0.5)
> confusion_matrix1
FALSE TRUE
0 604 18
1 37 19

TPR
19/(19+37)
FPR
18/(18+604)

R Code
# Show the working directory the relative CSV paths are resolved against.
getwd()

# Read the raw train and test files.
loanDefault <- read.csv(file = "end_term_train.csv")
loanTest <- read.csv(file = "end_term_test.csv")

# Row counts before cleaning.
nrow(loanDefault)
nrow(loanTest)

# Cleaned data: drop every row that contains at least one missing value.
clean_data <- na.omit(object = loanDefault)
clean_dataTest <- na.omit(object = loanTest)

# Row counts after cleaning.
nrow(clean_data)
nrow(clean_dataTest)
# Outlier removal for the training data.
#
# DebtRatio and MonthlyIncome are trimmed one after the other: values outside
# [lowerValue, upperValue] are set to NA and the affected rows are dropped
# with na.omit(). The upper fence is the 90th percentile plus 1.5 * IQR.
boxplot(clean_data$DebtRatio)
boxplot(clean_data$age)
boxplot(clean_data$MonthlyIncome)

# DebtRatio: data-driven upper fence, manually chosen lower bound.
upperValue <- quantile(clean_data$DebtRatio, 0.90) +
  1.5 * IQR(clean_data$DebtRatio)
lowerValue <- 0.01
debt_out_of_range <- clean_data$DebtRatio > upperValue |
  clean_data$DebtRatio < lowerValue
clean_data$DebtRatio[debt_out_of_range] <- NA
clean_data <- na.omit(clean_data)
nrow(clean_data)
boxplot(clean_data$DebtRatio)

# MonthlyIncome: fences on both sides. The lower fence is the 10th percentile
# MINUS 1.5 * IQR (the minus sign was missing, which made the line unparsable).
upperValue <- quantile(clean_data$MonthlyIncome, 0.90) +
  1.5 * IQR(clean_data$MonthlyIncome)
lowerValue <- quantile(clean_data$MonthlyIncome, 0.10) -
  1.5 * IQR(clean_data$MonthlyIncome)
income_out_of_range <- clean_data$MonthlyIncome > upperValue |
  clean_data$MonthlyIncome < lowerValue
clean_data$MonthlyIncome[income_out_of_range] <- NA
clean_data <- na.omit(clean_data)
nrow(clean_data)
boxplot(clean_data$MonthlyIncome)

# Outlier removal for the test data.
#
# Same procedure as for the training data: values outside
# [lowerValue, upperValue] are set to NA and the affected rows are dropped
# with na.omit(). The upper fence is the 90th percentile plus 1.5 * IQR.
boxplot(clean_dataTest$DebtRatio)
boxplot(clean_dataTest$age)
boxplot(clean_dataTest$MonthlyIncome)

# DebtRatio: data-driven upper fence, manually chosen lower bound.
upperValue <- quantile(clean_dataTest$DebtRatio, 0.90) +
  1.5 * IQR(clean_dataTest$DebtRatio)
lowerValue <- 0.01
debt_out_of_range <- clean_dataTest$DebtRatio > upperValue |
  clean_dataTest$DebtRatio < lowerValue
clean_dataTest$DebtRatio[debt_out_of_range] <- NA
clean_dataTest <- na.omit(clean_dataTest)
nrow(clean_dataTest)
boxplot(clean_dataTest$DebtRatio)

# MonthlyIncome: fences on both sides. The lower fence is the 10th percentile
# MINUS 1.5 * IQR. The out-of-range values must be ASSIGNED NA (`<- NA`);
# `x[...] <NA` is only a comparison and removes nothing.
upperValue <- quantile(clean_dataTest$MonthlyIncome, 0.90) +
  1.5 * IQR(clean_dataTest$MonthlyIncome)
lowerValue <- quantile(clean_dataTest$MonthlyIncome, 0.10) -
  1.5 * IQR(clean_dataTest$MonthlyIncome)
income_out_of_range <- clean_dataTest$MonthlyIncome > upperValue |
  clean_dataTest$MonthlyIncome < lowerValue
clean_dataTest$MonthlyIncome[income_out_of_range] <- NA
clean_dataTest <- na.omit(clean_dataTest)
nrow(clean_dataTest)
boxplot(clean_dataTest$MonthlyIncome)

# Standardizing the train data.
#
# Each predictor x gets a z-score column Std_x = (x - mean(x)) / sd(x).
# The bare mean()/sd() calls print the statistics before and after scaling;
# after scaling the mean should be ~0 and the sd 1.

# standardizing age
mean(clean_data$age)
sd(clean_data$age)
clean_data$Std_age <-
  (clean_data$age - mean(clean_data$age)) / sd(clean_data$age)
mean(clean_data$Std_age)
sd(clean_data$Std_age)

# standardizing DebtRatio
mean(clean_data$DebtRatio)
sd(clean_data$DebtRatio)
clean_data$Std_DebtRatio <-
  (clean_data$DebtRatio - mean(clean_data$DebtRatio)) /
  sd(clean_data$DebtRatio)
mean(clean_data$Std_DebtRatio)
sd(clean_data$Std_DebtRatio)

# standardizing MonthlyIncome
mean(clean_data$MonthlyIncome)
sd(clean_data$MonthlyIncome)
clean_data$Std_MonthlyIncome <-
  (clean_data$MonthlyIncome - mean(clean_data$MonthlyIncome)) /
  sd(clean_data$MonthlyIncome)
mean(clean_data$Std_MonthlyIncome)
sd(clean_data$Std_MonthlyIncome)

# standardizing NumberOfTimes90DaysLate
mean(clean_data$NumberOfTimes90DaysLate)
sd(clean_data$NumberOfTimes90DaysLate)
clean_data$Std_NumberOfTimes90DaysLate <-
  (clean_data$NumberOfTimes90DaysLate -
    mean(clean_data$NumberOfTimes90DaysLate)) /
  sd(clean_data$NumberOfTimes90DaysLate)
mean(clean_data$Std_NumberOfTimes90DaysLate)
sd(clean_data$Std_NumberOfTimes90DaysLate)

# Standardizing the test data.
#
# Each predictor x gets a z-score column Std_x = (x - mean(x)) / sd(x).
# The bare mean()/sd() calls print the statistics before and after scaling.
#
# NOTE(review): the test columns are scaled with the TEST set's own mean and
# sd, while the models are fitted on columns scaled with the train statistics.
# Consider scaling the test data with the train mean/sd instead; left
# unchanged here so the reported results stay reproducible.

# standardizing age
mean(clean_dataTest$age)
sd(clean_dataTest$age)
clean_dataTest$Std_age <-
  (clean_dataTest$age - mean(clean_dataTest$age)) / sd(clean_dataTest$age)
mean(clean_dataTest$Std_age)
sd(clean_dataTest$Std_age)

# standardizing DebtRatio
mean(clean_dataTest$DebtRatio)
sd(clean_dataTest$DebtRatio)
clean_dataTest$Std_DebtRatio <-
  (clean_dataTest$DebtRatio - mean(clean_dataTest$DebtRatio)) /
  sd(clean_dataTest$DebtRatio)
mean(clean_dataTest$Std_DebtRatio)
sd(clean_dataTest$Std_DebtRatio)

# standardizing MonthlyIncome
mean(clean_dataTest$MonthlyIncome)
sd(clean_dataTest$MonthlyIncome)
clean_dataTest$Std_MonthlyIncome <-
  (clean_dataTest$MonthlyIncome - mean(clean_dataTest$MonthlyIncome)) /
  sd(clean_dataTest$MonthlyIncome)
mean(clean_dataTest$Std_MonthlyIncome)
sd(clean_dataTest$Std_MonthlyIncome)

# standardizing NumberOfTimes90DaysLate
mean(clean_dataTest$NumberOfTimes90DaysLate)
sd(clean_dataTest$NumberOfTimes90DaysLate)
clean_dataTest$Std_NumberOfTimes90DaysLate <-
  (clean_dataTest$NumberOfTimes90DaysLate -
    mean(clean_dataTest$NumberOfTimes90DaysLate)) /
  sd(clean_dataTest$NumberOfTimes90DaysLate)
mean(clean_dataTest$Std_NumberOfTimes90DaysLate)
sd(clean_dataTest$Std_NumberOfTimes90DaysLate)

# Scatterplot of the 3rd against the 4th column of the train data.
# `col`, `main`, `xlab` and `ylab` are arguments of plot(); previously the
# parentheses were misplaced so they were swallowed by as.integer() and the
# plot had no colours, title or axis labels.
# NOTE(review): symbol and colour are derived from column 3, the same column
# that is plotted on the x axis -- confirm this is the intended grouping.
plot(
  clean_data[, 3], clean_data[, 4],
  pch = 1 + as.integer(clean_data[, 3]),
  col = 2 + as.integer(clean_data[, 3]),
  main = "Raw data", xlab = "x1", ylab = "x2"
)

# Class balance of the target: counts per class, then the share of class 1,
# computed from the data instead of from hand-copied counts.
table(clean_data$SeriousDlqin2yrs)
mean(clean_data$SeriousDlqin2yrs == 1)

# Logistic regression of SeriousDlqin2yrs on the four standardized predictors,
# fitted on the train data and evaluated on the test data.
modelTrain <- glm(
  SeriousDlqin2yrs ~ Std_age + Std_DebtRatio + Std_MonthlyIncome +
    Std_NumberOfTimes90DaysLate,
  data = clean_data,
  family = "binomial"
)
summary(modelTrain)

# Predicted probabilities for the test rows.
predictTest <- predict(modelTrain, newdata = clean_dataTest, type = "response")
predictTest

# Confusion matrix at a 0.5 cut-off: rows = actual class, columns = predicted.
# Fixing the factor levels keeps both the FALSE and TRUE columns even when no
# observation falls on one side of the cut-off.
confusion_matrix <- table(
  clean_dataTest$SeriousDlqin2yrs,
  factor(predictTest > 0.5, levels = c(FALSE, TRUE))
)
confusion_matrix

# The metrics are derived from the matrix itself so they always match the
# current fit (the former hard-coded counts were stale).
# accuracy = (TN + TP) / all
sum(diag(confusion_matrix)) / sum(confusion_matrix)
# TPR = TP / (TP + FN), i.e. within the actual-1 row
confusion_matrix["1", "TRUE"] / sum(confusion_matrix["1", ])
# FPR = FP / (FP + TN), i.e. within the actual-0 row
confusion_matrix["0", "TRUE"] / sum(confusion_matrix["0", ])

# Evaluating model performance with a ROC curve.
# prediction() and performance() are provided by the ROCR package, which has
# to be attached before they can be called.
library(ROCR)

# Pair the predicted probabilities with the true test labels and compute the
# true-positive rate against the false-positive rate over all cut-offs.
ROCRTest <- prediction(predictTest, clean_dataTest$SeriousDlqin2yrs)
ROCRTest_perf <- performance(ROCRTest, "tpr", "fpr")

# plot ROC curve: first plain, then annotated with the cut-offs 0, 0.1, ..., 1
plot(ROCRTest_perf, colorize = TRUE)
plot(
  ROCRTest_perf,
  colorize = TRUE,
  print.cutoffs.at = seq(0, 1, 0.1),
  text.adj = c(-0.2, 1.7)
)

# ---------------------------------------------------------------------------
# The classes are heavily imbalanced, so we SMOTE the train data and refit
# the logistic regression on the oversampled set.
# NOTE(review): SMOTE() is not base R; the providing package (presumably
# DMwR, given the perc.over argument) must be attached before this point.

# SMOTE needs the target as a factor.
clean_data$SeriousDlqin2yrs <- as.factor(clean_data$SeriousDlqin2yrs)
loanTrainSMOTE <- SMOTE(
  SeriousDlqin2yrs ~ Std_age + Std_DebtRatio + Std_MonthlyIncome +
    Std_NumberOfTimes90DaysLate,
  clean_data,
  perc.over = 500
)
# NOTE(review): columns 1, 2 and 4 are addressed by position -- confirm they
# are the intended axes and grouping variable.
plot(
  loanTrainSMOTE[, 1], loanTrainSMOTE[, 2],
  pch = as.integer(loanTrainSMOTE[, 4]),
  col = 2 + as.integer(loanTrainSMOTE[, 4])
)
# Convert the factor back through its labels: as.numeric() on a factor returns
# the internal level codes (1/2), not the original 0/1 values.
clean_data$SeriousDlqin2yrs <-
  as.numeric(as.character(clean_data$SeriousDlqin2yrs))

modelTrain1 <- glm(
  SeriousDlqin2yrs ~ Std_age + Std_DebtRatio + Std_MonthlyIncome +
    Std_NumberOfTimes90DaysLate,
  data = loanTrainSMOTE,
  family = "binomial"
)
summary(modelTrain1)

predictTest1 <- predict(
  modelTrain1,
  newdata = clean_dataTest,
  type = "response"
)
summary(predictTest1)

# Confusion matrix at a 0.5 cut-off: rows = actual class, columns = predicted.
# Fixing the factor levels keeps both the FALSE and TRUE columns.
confusion_matrix1 <- table(
  clean_dataTest$SeriousDlqin2yrs,
  factor(predictTest1 > 0.5, levels = c(FALSE, TRUE))
)
confusion_matrix1

# The metrics are derived from the matrix itself so they always match the
# current fit (the former hard-coded counts were stale).
# accuracy = (TN + TP) / all
sum(diag(confusion_matrix1)) / sum(confusion_matrix1)
# TPR = TP / (TP + FN), i.e. within the actual-1 row
confusion_matrix1["1", "TRUE"] / sum(confusion_matrix1["1", ])
# FPR = FP / (FP + TN), i.e. within the actual-0 row
confusion_matrix1["0", "TRUE"] / sum(confusion_matrix1["0", ])

# CART: classification trees on the four standardized predictors.
library(tree)
library(rpart)
library(rpart.plot)
library(rattle)
library(RColorBrewer)

# Classification needs the target as a factor.
clean_data$SeriousDlqin2yrs <- as.factor(clean_data$SeriousDlqin2yrs)
str(clean_data)

# fitting a tree to our data
# This tree() fit is overwritten by the rpart() fit right below; it is kept
# only so the script still exercises both packages.
treemodel <- tree(
  SeriousDlqin2yrs ~ Std_age + Std_DebtRatio + Std_MonthlyIncome +
    Std_NumberOfTimes90DaysLate,
  data = clean_data
)

treemodel <- rpart(
  SeriousDlqin2yrs ~ Std_age + Std_DebtRatio + Std_MonthlyIncome +
    Std_NumberOfTimes90DaysLate,
  method = "class",
  data = clean_data
)
summary(treemodel)
prp(treemodel)
# Plot the fitted rpart model; `treemodel2` was never defined.
fancyRpartPlot(treemodel)

You might also like