0% found this document useful (0 votes)
79 views

Part 1 of Spam Detection

This document outlines an algorithm for detecting spam emails. It defines functions for parsing an email's header, finding the email body and any attachments. The functions identify the header, content type, boundary between the body and attachments, extract the body text, and return a list of any attachments. The overall goal is to analyze the structure and components of an email to help identify spam.

Uploaded by

Austin Kinion
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
79 views

Part 1 of Spam Detection

This document outlines an algorithm for detecting spam emails. It defines functions for parsing an email's header, finding the email body and any attachments. The functions identify the header, content type, boundary between the body and attachments, extract the body text, and return a list of any attachments. The overall goal is to analyze the structure and components of an email to help identify spam.

Uploaded by

Austin Kinion
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 7

Spam

Emails pt 1 — Austin Kinion


Algorithm for Detecting Spam Emails, Part 1
# Work inside the easy_ham corpus directory and pick one sample email
# (the 567th entry of the directory listing) to develop against.
setwd("~/Desktop/SAT/easy_ham")

filename <- list.files()[567]
filename

# Parse the header of an email file.
#
# Reads `filename`, takes every line before the first blank line as the
# header (skipping a leading mbox-style "From " envelope line, which is not a
# "key: value" field) and parses those lines with read.dcf().
#
# Args:
#   filename: path to a single email file.
#
# Returns:
#   If the header has a "Content-Type" field: list(header, content.type),
#   where `header` is the data frame produced by read.dcf(all = TRUE) and
#   `content.type` is its "Content-Type" column.
#   Otherwise: the header data frame alone.
mail <- function(filename) {
  con <- file(filename, open = "rt")
  # Registered immediately so the file is closed even if parsing fails.
  on.exit(close(con), add = TRUE)
  text <- readLines(con)

  # The header stops at the first blank line. If the file has no blank line
  # the whole file is treated as header instead of failing on an NA index.
  End.of.Header <- which(text == "")[1]
  last.header.line <- if (is.na(End.of.Header)) {
    length(text)
  } else {
    End.of.Header - 1
  }

  # Only a first line that *starts* with "From " is an envelope line; a
  # header such as "Subject: Message From Bob" must be kept.
  first.header.line <- if (grepl("^From ", text[1])) 2 else 1

  head.lines <- if (first.header.line <= last.header.line) {
    text[first.header.line:last.header.line]
  } else {
    character(0)
  }

  # The text connection is a second resource and needs its own close().
  header.con <- textConnection(head.lines)
  on.exit(close(header.con), add = TRUE)
  header <- read.dcf(header.con, all = TRUE)

  if ("Content-Type" %in% colnames(header)) {
    content.type <- header[["Content-Type"]]
    return(list(header, content.type))
  }
  header
}

# Parse the sample email and pull out its Content-Type.
testemail <- mail(filename)

# mail() returns a bare data frame when the header has no Content-Type field;
# in that case element 2 would be an unrelated header column, so fall back to
# an empty string, which contains no "boundary" parameter.
content.type <- if (is.data.frame(testemail)) "" else testemail[[2]]
content.type
# Extract the MIME boundary string from a Content-Type header value.
#
# Args:
#   content.type: a single Content-Type value, e.g.
#     'multipart/mixed; boundary="abc"'.
#
# Returns:
#   The boundary with its surrounding double quotes removed, or NULL
#   (invisibly) when `content.type` carries no boundary parameter.
Boundary <- function(content.type) {
  if (length(content.type) != 1L ||
      !grepl("boundary", content.type, ignore.case = TRUE)) {
    return(invisible(NULL))
  }

  # Locate the "boundary=" parameter itself rather than splitting the whole
  # value on "=", so other parameters (charset=..., type=...) on either side
  # of it cannot leak into the result.
  hit <- regexpr("boundary[[:space:]]*=[[:space:]]*", content.type,
                 ignore.case = TRUE)
  if (hit == -1L) {
    return(invisible(NULL))
  }
  value <- substring(content.type, hit + attr(hit, "match.length"))

  if (substr(value, 1L, 1L) == "\"") {
    # Quoted form: the boundary is everything up to the closing quote and
    # may itself contain "=" or ";".
    rest <- substring(value, 2L)
    closing <- regexpr("\"", rest, fixed = TRUE)
    boundary <- if (closing == -1L) rest else substr(rest, 1L, closing - 1L)
  } else {
    # Unquoted form: the boundary ends at the next ";" or whitespace.
    stop.at <- regexpr("[;[:space:]]", value)
    boundary <- if (stop.at == -1L) value else substr(value, 1L, stop.at - 1L)
  }

  boundary
}

# Boundary string of the sample email (NULL when it has none).
boundary <- Boundary(content.type)
boundary
# Split an email file into its body text and multipart boundary positions.
#
# Args:
#   content.type: the email's Content-Type header value.
#   filename:     path to the email file.
#   boundary:     accepted for compatibility but never read; the boundary is
#                 always recomputed from `content.type` via Boundary().
#
# Returns a list of five elements, in this order:
#   1. the boundary string (character(0) when there are no attachments)
#   2. all lines of the file
#   3. line numbers of the opening markers "--<boundary>" (integer(0) if none)
#   4. line number(s) of the closing marker "--<boundary>--" (integer(0) if none)
#   5. the body lines, starting at the first blank line, joined by single
#      spaces into one string
body.and.bound <- function(content.type, filename, boundary) {
  text <- readLines(filename)
  blank <- which(text == "")

  multipart <- length(content.type) == 1L &&
    grepl("boundary", content.type, ignore.case = TRUE)
  mime.boundary <- if (multipart) Boundary(content.type) else NULL

  # When there is NO attachment in the email.
  if (length(mime.boundary) == 0L) {
    # Everything from the first blank line onward is the body.
    body <- if (length(blank) > 0L) {
      text[blank[1]:length(text)]
    } else {
      character(0)
    }
    # Zero-length values (not "") are returned so that callers testing
    # length(boundary) == 0 or length(last.line) == 0 see "nothing here"
    # instead of trying arithmetic on an empty string. Lines are joined with
    # a space, the same separator the attachment branch uses.
    return(list(character(0), text, integer(0), integer(0),
                paste(body, collapse = " ")))
  }

  # When there IS an attachment: parts open with "--<boundary>" and the
  # message closes with "--<boundary>--".
  opening.marker <- paste0("--", mime.boundary)
  closing.marker <- paste0(opening.marker, "--")
  body.text <- which(text == opening.marker)
  last.line <- which(text == closing.marker)

  # The body sits between the first blank line and the first marker. If no
  # marker is found at all, the body runs to the end of the file.
  first.marker <- c(body.text, last.line, length(text) + 1L)[1]
  body <- if (length(blank) > 0L && blank[1] <= first.marker - 1L) {
    text[blank[1]:(first.marker - 1L)]
  } else {
    character(0)
  }

  list(mime.boundary, text, body.text, last.line, paste(body, collapse = " "))
}
# Body, boundary and marker positions for the sample email.
output <- body.and.bound(content.type, filename, boundary)
output
# Collect the attachments of a multipart email that has a closing boundary.
#
# Args (the defaults read the global `output` built by body.and.bound()):
#   boundary:  boundary string; zero-length or "" means "no attachments".
#   text:      all lines of the email.
#   body.text: line numbers of the opening boundary markers.
#   last.line: line number of the closing boundary marker.
#
# Returns:
#   "" when there is nothing to extract (no boundary, no opening markers, or
#   no closing marker). Otherwise a list with one entry per attachment; each
#   entry is a two-element list holding the attachment's first line up to the
#   first ";" and the remaining lines joined by single spaces. The elements
#   are named 'Header of email' / 'Body of email' when there is exactly one
#   attachment and 'Header' / 'Body' when there are several.
find.attach <- function(boundary = output[[1]], text = output[[2]],
                        body.text = output[[3]],
                        last.line = output[[4]]) {
  has.boundary <- length(boundary) > 0L && any(nzchar(boundary))
  # Non-numeric positions (e.g. "") are treated the same as missing ones.
  has.markers <- is.numeric(body.text) && length(body.text) > 0L &&
    is.numeric(last.line) && length(last.line) > 0L
  if (!has.boundary || !has.markers) {
    return("")
  }

  # Each attachment runs from the line after its opening marker to the line
  # before the next marker (the closing marker for the last attachment).
  first <- body.text + 1
  last <- c(body.text[-1], last.line[1]) - 1

  part.names <- if (length(body.text) == 1L) {
    c("Header of email", "Body of email")
  } else {
    c("Header", "Body")
  }

  lapply(seq_along(first), function(i) {
    # Explicit emptiness check: `a:b` would count backwards when two markers
    # sit on consecutive lines.
    part <- if (first[i] <= last[i]) text[first[i]:last[i]] else character(0)
    header <- strsplit(part[1], ";")[[1]][1]
    body <- paste(part[-1], collapse = " ")
    stats::setNames(list(header, body), part.names)
  })
}


# Attachments of the sample email, assuming its closing boundary is present.
end.of.boundary <- find.attach(
  boundary = output[[1]],
  text = output[[2]],
  body.text = output[[3]],
  last.line = output[[4]]
)
# Collect the attachments of a multipart email that has NO closing boundary.
#
# Args (the defaults read the global `output` built by body.and.bound()):
#   boundary:  boundary string; zero-length or "" means "no attachments".
#   text:      all lines of the email.
#   body.text: line numbers of the opening boundary markers.
#   last.line: line number of the closing boundary marker; this function only
#              extracts anything when it is empty.
#
# Returns:
#   "" when there is nothing to extract (no boundary, no opening markers, or
#   a closing marker is present). Otherwise a list with one entry per
#   attachment; each entry is list('Header of email' = <first line up to the
#   first ";">, 'Body of email' = <remaining lines joined by single spaces>).
#   The last attachment runs to the end of `text`.
special.cond <- function(boundary = output[[1]], text = output[[2]],
                         body.text = output[[3]],
                         last.line = output[[4]]) {
  has.boundary <- length(boundary) > 0L && any(nzchar(boundary))
  # Non-numeric positions (e.g. "") are treated the same as missing ones.
  has.openers <- is.numeric(body.text) && length(body.text) > 0L
  if (!has.boundary || !has.openers || length(last.line) > 0L) {
    return("")
  }

  # Each attachment runs from the line after its opening marker to the line
  # before the next opening marker; the last one runs to the end of the email.
  first <- body.text + 1
  last <- c(body.text[-1] - 1, length(text))

  lapply(seq_along(first), function(i) {
    # Explicit emptiness check: `a:b` would count backwards when a marker is
    # the final line or two markers sit on consecutive lines.
    part <- if (first[i] <= last[i]) text[first[i]:last[i]] else character(0)
    header <- strsplit(part[1], ";")[[1]][1]
    body <- paste(part[-1], collapse = " ")
    stats::setNames(list(header, body), c("Header of email", "Body of email"))
  })
}
# Attachments of the sample email, assuming its closing boundary is missing.
no.bound.spec.cond <- special.cond(
  boundary = output[[1]],
  text = output[[2]],
  body.text = output[[3]],
  last.line = output[[4]]
)
# Extract attachments, choosing the parser by whether a closing boundary
# marker was found.
#
# Args (the defaults read the global `output` built by body.and.bound()):
#   boundary:  boundary string; zero-length or "" means "no attachments".
#   text:      all lines of the email.
#   body.text: line numbers of the opening boundary markers.
#   last.line: line number of the closing boundary marker (empty if absent).
#
# Returns:
#   "" when there is no boundary; otherwise the result of special.cond()
#   when `last.line` is empty, or of find.attach() when it is not.
last.boundary <- function(boundary = output[[1]], text = output[[2]],
                          body.text = output[[3]],
                          last.line = output[[4]]) {
  if (length(boundary) == 0L || !any(nzchar(boundary))) {
    return("")
  }

  # Forward this call's own arguments rather than the global `output`, so the
  # function gives the right answer for whichever email it is called on.
  if (length(last.line) == 0L) {
    special.cond(boundary = boundary, text = text,
                 body.text = body.text, last.line = last.line)
  } else {
    find.attach(boundary = boundary, text = text,
                body.text = body.text, last.line = last.line)
  }
}
# Attachments of the sample email, whichever closing-boundary case applies.
last.bound <- last.boundary(
  boundary = output[[1]],
  text = output[[2]],
  body.text = output[[3]],
  last.line = output[[4]]
)
# Get list of subdirectories. list.files() also returns plain files, which
# cannot be descended into, so keep only entries that are real directories.
dirs <- Filter(dir.exists, list.files())
dirs

# Iterate over subdirectories. Files are listed by path instead of calling
# setwd() per directory, so the working directory is never left changed if
# something fails part-way through.
for (dir in dirs) {
  # List files in subdirectory.
  files <- list.files(dir)
  for (f in files) {
    # do something to file `f`
    # process_email(file.path(dir, f))
    print(f)
  }
}

path <- list.files("SAT/", recursive = TRUE)

# NOTE(review): `trainMessages` is not created anywhere in the visible
# script; it must already exist, with one entry per element of `path`,
# before this point -- confirm where it is built.
names(trainMessages) <- path
trainMessages
# save() returns NULL, so its result must not be called as a function.
save(trainMessages, file = "TrainingMessages.rda")

You might also like