0% found this document useful (0 votes)
77 views

Spam Email Classification 2

The document contains R code for 20 functions that extract features from email messages for spam classification, followed by a short script that assembles the features into a data frame and plots them: 1. The isSpam() function labels each email as spam or not spam based on whether the message's name contains the string "spam". 2. The isRe(), isFw(), and isPunc() functions analyze the email subject to check for the strings "Re:", "Fw:", and any punctuation. 3. The numLinesInBody() and subjectExclamationCount() functions count the number of lines in the email body and exclamation points in the subject. 4. The AlternateCap() function checks if the subject alternates between capitalized and lowercase letters more than once. 5. The remaining functions compute further subject, header, and body features (question marks, all-capital subjects, spam phrases, links, dollar signs, and so on).

Uploaded by

Austin Kinion
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
77 views

Spam Email Classification 2

The document contains R code for 20 functions that extract features from email messages for spam classification, followed by a short script that assembles the features into a data frame and plots them: 1. The isSpam() function labels each email as spam or not spam based on whether the message's name contains the string "spam". 2. The isRe(), isFw(), and isPunc() functions analyze the email subject to check for the strings "Re:", "Fw:", and any punctuation. 3. The numLinesInBody() and subjectExclamationCount() functions count the number of lines in the email body and exclamation points in the subject. 4. The AlternateCap() function checks if the subject alternates between capitalized and lowercase letters more than once. 5. The remaining functions compute further subject, header, and body features (question marks, all-capital subjects, spam phrases, links, dollar signs, and so on).

Uploaded by

Austin Kinion
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 11

Spam

Email Classification Austin Kinion


#1# Label each message as spam (TRUE) or not spam (FALSE) from its name.
#
# x: a named list of messages. A message is labelled spam when its name
#    contains the literal string "spam" (presumably the names carry the
#    source directory of each message -- confirm against the loader).
# Returns a logical vector with one entry per message. When `x` has no names
# the label cannot be derived, so NA is returned for every message instead of
# a zero-length vector that would not line up with the other features.
isSpam <- function(x) {
  msg_names <- names(x)
  if (is.null(msg_names)) {
    return(rep(NA, length(x)))
  }
  # Use the argument rather than the global `trainMessages`, so the function
  # also works on other message sets.
  grepl("spam", msg_names, fixed = TRUE)
}
#2# Whether the string "Re:" appears in the subject of each message.
#
# x: a list of messages; each message is expected to carry a `header` element
#    in which the subject is stored under the name "Subject".
# Returns an unnamed list with one logical per message. Messages without a
# subject yield FALSE.
isRe <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # A missing name in a character header yields NA, not a zero-length
    # vector, so drop NA before testing for "no subject".
    subject <- subject[!is.na(subject)]
    if (length(subject) == 0) {
      return(FALSE)
    }
    grepl("Re:", paste(subject, collapse = ""))
  }))
}
#3# Whether the string "Fw:" appears in the subject of each message.
#
# x: a list of messages; each message is expected to carry a `header` element
#    in which the subject is stored under the name "Subject".
# Returns an unnamed list with one logical per message. Messages without a
# subject yield FALSE.
isFw <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # A missing name in a character header yields NA, not a zero-length
    # vector, so drop NA before testing for "no subject".
    subject <- subject[!is.na(subject)]
    if (length(subject) == 0) {
      return(FALSE)
    }
    # No local named `F` any more: that name shadows R's alias for FALSE.
    grepl("Fw:", paste(subject, collapse = ""))
  }))
}
#4# Number of lines in the body of each message.
#
# x: a list of messages; each message is expected to carry a `body` element
#    holding one entry per line.
# Returns an unnamed list with one count per message; a missing or empty body
# counts as 0 lines.
numLinesInBody <- function(x) {
  # length() already returns 0 for a NULL or empty body, so no special cases
  # are needed. The argument is used instead of the global `trainMessages`.
  unname(lapply(x, function(msg) length(msg$body)))
}
#5# Whether any punctuation character appears in the subject of each message.
#
# x: a list of messages; each message is expected to carry a `header` element
#    in which the subject is stored under the name "Subject".
# Returns an unnamed list with one logical per message. Messages without a
# subject yield FALSE.
isPunc <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # A missing name in a character header yields NA, not a zero-length
    # vector, so drop NA before testing for "no subject".
    subject <- subject[!is.na(subject)]
    if (length(subject) == 0) {
      return(FALSE)
    }
    grepl("[[:punct:]]", paste(subject, collapse = ""))
  }))
}
#6# Whether the subject of each message alternates from capital to lowercase
# more than once.
#
# The test is the regular expression "([A-Z][a-z]){2,}": at least two
# consecutive capital-then-lowercase pairs, e.g. "HeLl" in "HeLlo".
#
# x: a list of messages; each message is expected to carry a `header` element
#    in which the subject is stored under the name "Subject".
# Returns an unnamed list with one logical per message. Messages without a
# subject yield FALSE.
AlternateCap <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # A missing name in a character header yields NA, not a zero-length
    # vector, so drop NA before testing for "no subject".
    subject <- subject[!is.na(subject)]
    if (length(subject) == 0) {
      return(FALSE)
    }
    grepl("([A-Z][a-z]){2,}", paste(subject, collapse = ""))
  }))
}
#7# Number of exclamation marks in the subject of each message.
#
# x: a list of messages; each message is expected to carry a `header` element
#    in which the subject is stored under the name "Subject".
# Returns an unnamed list with one count per message. Messages without a
# subject yield 0.
subjectExclamationCount <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # A missing name in a character header yields NA, not a zero-length
    # vector, so drop NA before testing for "no subject".
    subject <- subject[!is.na(subject)]
    if (length(subject) == 0) {
      return(0L)
    }
    hits <- gregexpr("!", paste(subject, collapse = ""), fixed = TRUE)[[1]]
    # gregexpr() reports "no match" as a single -1, so count only the
    # positive start positions.
    sum(hits > 0)
  }))
}
#8# Number of question marks in the subject of each message.
#
# x: a list of messages; each message is expected to carry a `header` element
#    in which the subject is stored under the name "Subject".
# Returns an unnamed list with one count per message. Messages without a
# subject yield 0.
subjectQuestionCount <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # A missing name in a character header yields NA, not a zero-length
    # vector, so drop NA before testing for "no subject".
    subject <- subject[!is.na(subject)]
    if (length(subject) == 0) {
      return(0L)
    }
    # Match a literal "?". The previous pattern "//?" was the regular
    # expression "a slash, optionally followed by a slash", so it counted
    # slashes rather than question marks.
    hits <- gregexpr("?", paste(subject, collapse = ""), fixed = TRUE)[[1]]
    # gregexpr() reports "no match" as a single -1, so count only the
    # positive start positions.
    sum(hits > 0)
  }))
}
#9# Whether the subject of each message is written in capital letters.
#
# A subject counts as yelling when it contains at least one capital letter
# (A-Z) and no lowercase letter (a-z).
#
# x: a list of messages; each message is expected to carry a `header` element
#    in which the subject is stored under the name "Subject".
# Returns an unnamed list with one logical per message. Messages without a
# subject, or whose subject has no letters at all, yield FALSE.
isYelling <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # A missing name in a character header yields NA. Pasting that NA gave
    # the string "NA", which has no lowercase letter and was therefore
    # reported as yelling. Drop NA so a missing subject is FALSE.
    subject <- subject[!is.na(subject)]
    if (length(subject) == 0) {
      return(FALSE)
    }
    subject <- paste(subject, collapse = "")
    # Require a capital letter so empty or digit-only subjects are not
    # classified as yelling.
    grepl("[A-Z]", subject) && !grepl("[a-z]", subject)
  }))
}
#10# Whether the subject of each message contains no blanks (space characters).
#
# x: a list of messages; each message is expected to carry a `header` element
#    in which the subject is stored under the name "Subject".
# Returns an unnamed list with one logical per message: TRUE when the subject
# has no space in it. Messages without a subject yield FALSE.
SubjectBlanks <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # A missing name in a character header yields NA. Pasting that NA gave
    # the blank-free string "NA", so a missing subject was reported as TRUE.
    # Drop NA so the "no subject" case is FALSE.
    subject <- subject[!is.na(subject)]
    if (length(subject) == 0) {
      return(FALSE)
    }
    !grepl(" ", paste(subject, collapse = ""), fixed = TRUE)
  }))
}
#11# Whether the header states that the message is a multipart, i.e.
# with attachments.
#
# x: a list of messages; each message is expected to carry a `header` element
#    in which the content type is stored under the name "Content-Type".
# Returns an unnamed list with one logical per message: TRUE when the
# Content-Type value mentions "multipart" (case-insensitive). Messages without
# a Content-Type yield FALSE.
multipartText <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    # Look the field up by name BEFORE any collapsing: paste() drops the
    # names, so the earlier lookup on the collapsed header always gave NA.
    # The earlier guard `length(header==0)` was also true for every non-empty
    # header, so the function returned FALSE for all messages.
    content_type <- unlist(msg$header["Content-Type"], use.names = FALSE)
    content_type <- content_type[!is.na(content_type)]
    if (length(content_type) == 0) {
      return(FALSE)
    }
    # Multipart messages use subtypes such as multipart/mixed or
    # multipart/alternative; the former literal "multipart/text" matched none
    # of them, so match on the "multipart" top-level type instead.
    grepl("multipart", paste(content_type, collapse = ""), ignore.case = TRUE)
  }))
}
#12# Whether the subject of each message contains one of the spam phrases.
#
# x: a list of messages; each message is expected to carry a `header` element
#    in which the subject is stored under the name "Subject".
# Returns an unnamed list with one logical per message. Messages without a
# subject yield FALSE. Matching is case-sensitive, as in the original pattern.
subjectSpamWords <- function(x) {
  # Build the alternation from a vector. The earlier pattern was one string
  # literal wrapped over several lines, which put a newline inside the
  # "risk" and "prescription" alternatives so they could never match.
  spam_words <- c(
    "viagra", "pounds", "free", "weight", "guarantee", "millions",
    "dollars", "credit", "risk", "prescription", "generic", "drug",
    "money back", "credit card"
  )
  pattern <- paste(spam_words, collapse = "|")

  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # A missing name in a character header yields NA, not a zero-length
    # vector, so drop NA before testing for "no subject".
    subject <- subject[!is.na(subject)]
    if (length(subject) == 0) {
      return(FALSE)
    }
    grepl(pattern, paste(subject, collapse = ""))
  }))
}
#13# Whether the message body contains a form of the introduction "Dear ...".
#
# x: a list of messages; each message is expected to carry a `body` element
#    holding the body lines.
# Returns an unnamed list with one logical per message: TRUE when "Dear",
# "DEAR" or "dear" occurs in the body. Messages with an empty or missing body
# yield FALSE.
isDear <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    body <- msg$body
    if (length(body) == 0) {
      return(FALSE)
    }
    grepl("Dear|DEAR|dear", paste(body, collapse = ""))
  }))
}
#14# Whether the string "vacation" occurs in the body of each message.
#
# x: a list of messages; each message is expected to carry a `body` element
#    holding the body lines.
# Returns an unnamed list with one logical per message: TRUE when "Vacation",
# "VACATION" or "vacation" occurs in the body. Messages with an empty or
# missing body yield FALSE.
isVacation <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    body <- msg$body
    if (length(body) == 0) {
      return(FALSE)
    }
    grepl("Vacation|VACATION|vacation", paste(body, collapse = ""))
  }))
}
##15# Whether there is a URL link in the body of each message.
#
# A link is "http://" or "https://", a host made of letters, digits and dots,
# a dot, a 2-6 letter suffix, and then a slash, a blank, or the end of the
# line.
#
# x: a list of messages; each message is expected to carry a `body` element
#    holding the body lines.
# Returns an unnamed list with one logical per message. Messages with an
# empty or missing body yield FALSE.
isLink <- function(x) {
  # The earlier pattern was a string literal wrapped over two lines (so it
  # required a newline inside the host name) and had an unbalanced bracket in
  # "[[:alpha:]{2,6}". This is the intended expression, written on one line.
  url_pattern <- "https?://[a-zA-Z0-9.]+[.][[:alpha:]]{2,6}([/[:blank:]]|$)"

  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    body <- msg$body
    if (length(body) == 0) {
      return(FALSE)
    }
    # Test line by line: gluing lines together with "" would fuse a URL at
    # the end of one line with the first word of the next.
    any(grepl(url_pattern, body))
  }))
}

##16# Number of dollar signs in the body of each message.
#
# x: a list of messages; each message is expected to carry a `body` element
#    holding the body lines.
# Returns an unnamed list with one count per message. Messages with an empty
# or missing body yield 0.
numDollarSigns <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    body <- msg$body
    if (length(body) == 0) {
      return(0L)
    }
    hits <- gregexpr("$", paste(body, collapse = ""), fixed = TRUE)[[1]]
    # gregexpr() reports "no match" as a single -1, so count only the
    # positive start positions.
    sum(hits > 0)
  }))
}
#17# Number of capital letters (A-Z) in the subject of each message.
#
# x: a list of messages; each message is expected to carry a `header` element
#    in which the subject is stored under the name "Subject".
# Returns an unnamed list with one count per message. Messages without a
# subject yield 0.
SubjectCapital <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    subject <- unlist(msg$header["Subject"], use.names = FALSE)
    # A missing name in a character header yields NA. Pasting that NA gave
    # the string "NA", so a missing subject was counted as 2 capitals.
    # Drop NA so the "no subject" case is 0.
    subject <- subject[!is.na(subject)]
    if (length(subject) == 0) {
      return(0L)
    }
    hits <- gregexpr("[A-Z]", paste(subject, collapse = ""))[[1]]
    # gregexpr() reports "no match" as a single -1, so count only the
    # positive start positions.
    sum(hits > 0)
  }))
}
#18# Whether the body text includes a line containing the word "wrote:".
#
# x: a list of messages; each message is expected to carry a `body` element
#    holding the body lines.
# Returns an unnamed list with one logical per message. Messages with an
# empty or missing body yield FALSE.
isWrote <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    body <- msg$body
    if (length(body) == 0) {
      return(FALSE)
    }
    # Test each line for a literal "wrote:". The earlier pattern required a
    # trailing space after the colon, but lines were glued together with "",
    # so an attribution line that ends in "wrote:" could never match.
    any(grepl("wrote:", body, fixed = TRUE))
  }))
}
#19# Number of letters and digits (A-Z, a-z, 0-9) in the body of each message.
#
# x: a list of messages; each message is expected to carry a `body` element
#    holding the body lines.
# Returns an unnamed list with one count per message. Messages with an empty
# or missing body yield 0.
bodyCharacterCount <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    body <- msg$body
    if (length(body) == 0) {
      return(0L)
    }
    hits <- gregexpr("[A-Za-z0-9]", paste(body, collapse = ""))[[1]]
    # gregexpr() reports "no match" as a single -1, so count only the
    # positive start positions.
    sum(hits > 0)
  }))
}
#20# Number of HTML-tag-like matches in the header of each message.
#
# All header fields are pasted into one string and searched with the regular
# expression "<[^>/][^.]*>": a "<", one character that is neither ">" nor "/",
# any run of characters other than ".", then ">". The count is the number of
# matches in that string, not a number of lines.
#
# x: a list of messages; each message is expected to carry a `header` element.
# Returns an unnamed list with one count per message. Messages with an empty
# or missing header yield 0.
TagExists <- function(x) {
  # Iterate over the argument itself (not the global `trainMessages`);
  # lapply() also copes with an empty `x`, which `1:length(x)` did not.
  unname(lapply(x, function(msg) {
    header <- msg$header
    if (length(header) == 0) {
      return(0L)
    }
    hits <- gregexpr("<[^>/][^.]*>", paste(header, collapse = ""))[[1]]
    # gregexpr() reports "no match" as a single -1, so count only the
    # positive start positions.
    sum(hits > 0)
  }))
}
# Build the feature table: one row per message in `trainMessages`, one column
# per feature function. Columns are named at construction time so a label can
# never drift away from the function that produced it. Previously the names
# were assigned afterwards by position, and the bodyCharacterCount column
# ended up labelled "priority".
df <- data.frame(
  isSpam = unlist(isSpam(trainMessages)),
  isRe = unlist(isRe(trainMessages)),
  isFw = unlist(isFw(trainMessages)),
  numLinesInBody = unlist(numLinesInBody(trainMessages)),
  isPunc = unlist(isPunc(trainMessages)),
  AlternateCap = unlist(AlternateCap(trainMessages)),
  SubjectExclamationCount = unlist(subjectExclamationCount(trainMessages)),
  subjectQuestionCount = unlist(subjectQuestionCount(trainMessages)),
  isYelling = unlist(isYelling(trainMessages)),
  pSubjectblanks = unlist(SubjectBlanks(trainMessages)),
  multipartText = unlist(multipartText(trainMessages)),
  subjectSpamWords = unlist(subjectSpamWords(trainMessages)),
  isDear = unlist(isDear(trainMessages)),
  isVacation = unlist(isVacation(trainMessages)),
  isLink = unlist(isLink(trainMessages)),
  numDollarSign = unlist(numDollarSigns(trainMessages)),
  SubjectCapital = unlist(SubjectCapital(trainMessages)),
  bodyCharacterCount = unlist(bodyCharacterCount(trainMessages)),
  isWrote = unlist(isWrote(trainMessages)),
  TagExists = unlist(TagExists(trainMessages))
)
library(ggplot2)
# Show isRe for isSpam
ggplot(df, aes(isSpam, fill = isRe)) + geom_bar()
# Show number of spam and not spam. Tabulate the data-frame column: the bare
# name `isSpam` refers to the function defined above, not to the labels.
table(isSpam = df$isSpam)
## isSpam
## FALSE TRUE
## 4864 1677
# Compare isDear with isSpam
ggplot(df, aes(isSpam, fill = isDear)) + geom_bar()
# Compare isSpam with isYelling
ggplot(df, aes(isSpam, fill = isYelling)) + geom_bar()

You might also like