# Part 1 of Spam Detection
# Pick one file from the current working directory by position (entry 567 of
# the listing) and echo the chosen name.
filename <- list.files()[567]
filename
else
return(head)
# Parse the selected file with mail() (defined outside this part of the file)
# and keep the second element of its result.
# NOTE(review): element 2 is presumably the Content-Type header -- confirm
# against mail().
testemail <- mail(filename)
content.type <- testemail[[2]]
content.type
# Find the boundary parameter in the Content-Type value and extract it.
# Extract the MIME boundary string from a Content-Type header value.
#
# Args:
#   content.type: Content-Type header value, e.g.
#                 'multipart/mixed; boundary="abc"'. Only the first element
#                 is inspected.
#
# Returns:
#   Everything after the first "=" with all double quotes removed;
#   NA_character_ when the value mentions 'boundary' but contains no "=";
#   NULL (invisibly) when the value does not mention 'boundary' at all.
Boundary <- function(content.type) {
  # Nothing to extract when there is no input or no boundary parameter.
  if (length(content.type) == 0 || !grepl('boundary', content.type[1])) {
    return(invisible(NULL))
  }
  value <- content.type[1]

  # Keep everything after the first "=". Working on the position rather than
  # splitting and re-joining keeps any "=" inside or at the end of the
  # boundary intact (strsplit() silently drops a trailing empty piece).
  first.eq <- regexpr("=", value, fixed = TRUE)
  if (first.eq < 0) {
    return(NA_character_)
  }
  boundary <- substring(value, first.eq + 1)

  # Boundaries are often quoted in the header; the markers in the message
  # body are not, so the quotes must actually be removed from the result.
  gsub('"', '', boundary, fixed = TRUE)
}
# Derive the boundary string for the test email and echo it.
boundary <- Boundary(content.type)
boundary
# Body of the email and the boundary markers.
# Read an email file and locate its body text and MIME boundary markers.
#
# Args:
#   content.type: Content-Type header value of the message.
#   filename:     Path of the email file; read with readLines().
#   boundary:     Kept so existing callers keep working. It is not used: for
#                 multipart messages the boundary is always re-derived with
#                 Boundary(content.type).
#
# Returns an unnamed list of five elements:
#   [[1]] the boundary string ("" when the message has no boundary)
#   [[2]] every line of the file
#   [[3]] line numbers equal to "--<boundary>"   ("" when no boundary)
#   [[4]] line numbers equal to "--<boundary>--" ("" when no boundary)
#   [[5]] the body as one string, lines joined by a single space. The body
#         starts at the first blank line (inclusive); for multipart messages
#         it stops just before the first boundary marker.
body.and.bound <- function(content.type, filename, boundary) {
  text <- readLines(filename)

  # The header ends at the first blank line; NA when the file has none.
  header.end <- which(text == "")[1]

  # Join text[from:to] into one string. Returns "" instead of failing or
  # silently reversing the lines when the range is missing or inverted.
  join.lines <- function(from, to) {
    if (is.na(from) || from > to) {
      return("")
    }
    paste(text[from:to], collapse = " ")
  }

  is.multipart <- length(content.type) > 0 &&
    grepl('boundary', content.type[1])

  if (is.multipart) {
    boundary <- Boundary(content.type)

    # Parts are introduced by "--<boundary>"; the message is closed by
    # "--<boundary>--".
    part.marker <- paste0("--", boundary)
    closing.marker <- paste0(part.marker, "--")

    body.text <- which(text == part.marker)
    last.line <- which(text == closing.marker)

    # If the declared marker never shows up, everything after the header is
    # treated as body.
    body.stop <- if (length(body.text) > 0) body.text[1] - 1 else length(text)
    Body <- join.lines(header.end, body.stop)
  } else {
    # Same separator as the multipart branch, so the body text has one
    # consistent format regardless of message type.
    Body <- join.lines(header.end, length(text))
    body.text <- ""
    boundary <- ""
    last.line <- ""
  }

  list(boundary, text, body.text, last.line, Body)
}
# Split the test email into boundary, raw lines, marker positions and body,
# then echo the result.
output <- body.and.bound(content.type, filename, boundary)
output
# Find the attachments and return them as a list.
# Extract the attachments of a multipart email that HAS a closing boundary.
#
# Args:
#   boundary:  Boundary string; NULL, NA or "" means "no boundary".
#   text:      All lines of the email.
#   body.text: Line numbers of the "--<boundary>" markers.
#   last.line: Line number(s) of the closing "--<boundary>--" marker; only
#              the first is used.
#   The defaults read the corresponding elements of a global `output` list.
#
# Returns:
#   "" when there is no boundary. Otherwise a list with one entry per
#   attachment; each entry is a two-element list holding
#     - the first line of the part, cut at the first ";"
#     - the remaining lines of the part joined by a single space.
#   The entries are named 'Header of email' / 'Body of email' when there is
#   exactly one attachment and 'Header' / 'Body' when there are several.
#   An empty list is returned when there is no closing boundary or no part
#   marker in front of it.
find.attach <- function(boundary = output[[1]],
                        text = output[[2]],
                        body.text = output[[3]],
                        last.line = output[[4]]) {
  # A missing boundary may arrive as a zero-length value or as the ""
  # placeholder; both mean there is nothing to extract.
  if (length(boundary) == 0 || is.na(boundary[1]) || !nzchar(boundary[1])) {
    return("")
  }

  # Keep only usable line numbers; "" placeholders become "no lines".
  as.line.numbers <- function(v) {
    if (is.numeric(v)) v[!is.na(v)] else integer(0)
  }
  starts <- as.line.numbers(body.text)
  closing <- as.line.numbers(last.line)

  # Without a closing boundary this function has nothing to delimit the last
  # part with.
  if (length(closing) == 0) {
    return(list())
  }
  closing <- closing[1]

  # Only markers in front of the closing boundary open a part.
  starts <- starts[starts < closing]
  if (length(starts) == 0) {
    return(list())
  }

  # Each part runs from the line after its marker to the line before the
  # next marker (or before the closing boundary for the last part).
  stops <- c(starts[-1], closing) - 1

  part.names <- if (length(starts) == 1) {
    c('Header of email', 'Body of email')
  } else {
    c('Header', 'Body')
  }

  lapply(seq_along(starts), function(i) {
    first <- starts[i] + 1
    # An empty part must yield no lines, not a reversed range.
    part <- if (first > stops[i]) character(0) else text[first:stops[i]]

    attach.list <- list(
      strsplit(part[1], ";")[[1]][1],
      paste(part[-1], collapse = " ")
    )
    names(attach.list) <- part.names
    attach.list
  })
}
# Attachments of the test email for the normal case (closing boundary
# present), built from the pieces returned by body.and.bound().
end.of.boundary <- find.attach(
  boundary = output[[1]],
  text = output[[2]],
  body.text = output[[3]],
  last.line = output[[4]]
)
# Find the attachments in the special case where there is NO closing boundary.
# Extract the attachments of a multipart email that has NO closing boundary;
# the last attachment then runs to the end of the message.
#
# Args:
#   boundary:  Boundary string; NULL, NA or "" means "no boundary".
#   text:      All lines of the email.
#   body.text: Line numbers of the "--<boundary>" markers.
#   last.line: Line number(s) of the closing "--<boundary>--" marker; this
#              function only acts when there are none.
#   The defaults read the corresponding elements of a global `output` list.
#
# Returns:
#   "" when there is no boundary. Otherwise a list with one entry per
#   attachment; each entry is a list with
#     'Header of email' - the first line of the part, cut at the first ";"
#     'Body of email'   - the remaining lines joined by a single space.
#   An empty list is returned when a closing boundary exists or when there
#   are no part markers.
special.cond <- function(boundary = output[[1]],
                         text = output[[2]],
                         body.text = output[[3]],
                         last.line = output[[4]]) {
  # A missing boundary may arrive as a zero-length value or as the ""
  # placeholder; both mean there is nothing to extract.
  if (length(boundary) == 0 || is.na(boundary[1]) || !nzchar(boundary[1])) {
    return("")
  }

  # Keep only usable line numbers; "" placeholders become "no lines".
  as.line.numbers <- function(v) {
    if (is.numeric(v)) v[!is.na(v)] else integer(0)
  }
  starts <- as.line.numbers(body.text)
  closing <- as.line.numbers(last.line)

  # This function only covers messages without a closing boundary.
  if (length(closing) > 0 || length(starts) == 0) {
    return(list())
  }

  # Each part runs from the line after its marker to the line before the
  # next marker; the last part runs to the end of the message.
  stops <- c(starts[-1] - 1, length(text))

  lapply(seq_along(starts), function(i) {
    first <- starts[i] + 1
    # A marker on the very last line opens an empty part, not a reversed
    # range.
    part <- if (first > stops[i]) character(0) else text[first:stops[i]]

    attach.list <- list(
      strsplit(part[1], ";")[[1]][1],
      paste(part[-1], collapse = " ")
    )
    names(attach.list) <- c('Header of email', 'Body of email')
    attach.list
  })
}
no.bound.spec.cond
=
special.cond(boundary
=
output[[1]],
text
=
names(trainMessages)=
path
trainMessages
save(trainMessages,
file
=
"TrainingMessages.rda")()