0% found this document useful (0 votes)

77 views

Spam Email Classification 2

The document contains R code for a set of functions (numbered #1–#20 in the code shown) that extract features from email messages for spam classification, followed by code that assembles the features into a data frame and plots them: 1. The isSpam() function labels each email as spam or not spam based on whether the message's name contains the string "spam". 2. The isRe(), isFw(), and isPunc() functions analyze the email subject to check for the strings "Re:", "Fw:", and any punctuation. 3. The numLinesInBody() and subjectExclamationCount() functions count the number of lines in the email body and exclamation points in the subject. 4. The AlternateCap() function checks if the subject alternates between capitalized and lowercase letters more than once.

Uploaded by

Austin Kinion

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

77 views

Spam Email Classification 2

Uploaded by

Austin Kinion

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 11

Spam

Email Classification Austin Kinion

#1#use 'spam' from directory to classify T or F on each email#
#' Label each message as spam from its name.
#'
#' @param x Named list of messages. The names are presumably the source file
#'   paths, with spam stored under a directory whose name contains "spam" --
#'   verify against the code that loads the messages.
#' @return Logical vector with one entry per name of `x`: `TRUE` where the
#'   name contains the literal text "spam". Zero-length when `x` is unnamed.
isSpam <- function(x) {
  # Match against the names of the argument itself rather than a global
  # object, so the function works for any message list it is handed.
  grepl("spam", names(x), fixed = TRUE)
}
#2#if the string Re: appears in the subject
#' Flag messages whose subject contains "Re:".
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   with a `header` element that can be indexed by "Subject" -- TODO confirm
#'   against the message loader.
#' @return Unnamed list of `TRUE`/`FALSE`, one entry per message. Messages
#'   without a subject yield `FALSE`.
isRe <- function(x) {
  subjects <- vapply(x, function(msg) {
    # unlist() copes with character-vector and list headers alike; a missing
    # field comes back as NA (or NULL) and is dropped here.
    s <- unlist(msg$header["Subject"], use.names = FALSE)
    s <- s[!is.na(s)]
    if (length(s) == 0L) NA_character_ else paste(s, collapse = "")
  }, character(1), USE.NAMES = FALSE)

  as.list(!is.na(subjects) & grepl("Re:", subjects))
}
#3#if the string Fw: appears in the subject
#' Flag messages whose subject contains "Fw:".
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   with a `header` element that can be indexed by "Subject" -- TODO confirm
#'   against the message loader.
#' @return Unnamed list of `TRUE`/`FALSE`, one entry per message. Messages
#'   without a subject yield `FALSE`.
isFw <- function(x) {
  subjects <- vapply(x, function(msg) {
    # A missing Subject field indexes to NA (or NULL); drop it so it is not
    # turned into the text "NA" by paste().
    s <- unlist(msg$header["Subject"], use.names = FALSE)
    s <- s[!is.na(s)]
    if (length(s) == 0L) NA_character_ else paste(s, collapse = "")
  }, character(1), USE.NAMES = FALSE)

  as.list(!is.na(subjects) & grepl("Fw:", subjects))
}
#4#count of the number of lines in the body of the email message
#' Count the elements of each message body.
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   whose `body` holds one element per line of text -- TODO confirm against
#'   the message loader.
#' @return Unnamed list of integer counts, one per message; 0 for a message
#'   with an empty or absent body.
numLinesInBody <- function(x) {
  # length() already yields 0 for NULL or empty bodies, so no branch is
  # needed for that case.
  as.list(vapply(x, function(msg) length(msg$body), integer(1),
                 USE.NAMES = FALSE))
}
#5#if any punctuation appears in the subject
#' Flag messages whose subject contains any punctuation character.
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   with a `header` element that can be indexed by "Subject" -- TODO confirm
#'   against the message loader.
#' @return Unnamed list of `TRUE`/`FALSE`, one entry per message. Messages
#'   without a subject yield `FALSE`.
isPunc <- function(x) {
  subjects <- vapply(x, function(msg) {
    # Drop the NA/NULL produced by a missing Subject field before pasting.
    s <- unlist(msg$header["Subject"], use.names = FALSE)
    s <- s[!is.na(s)]
    if (length(s) == 0L) NA_character_ else paste(s, collapse = "")
  }, character(1), USE.NAMES = FALSE)

  as.list(!is.na(subjects) & grepl("[[:punct:]]", subjects))
}
#6#If the Subject of the message alternates from Capital to lowercase more than once
#' Flag subjects that alternate upper- and lower-case letters.
#'
#' A subject qualifies when it contains at least two consecutive
#' "upper-case letter followed by lower-case letter" pairs, e.g. "HeLl".
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   with a `header` element that can be indexed by "Subject" -- TODO confirm
#'   against the message loader.
#' @return Unnamed list of `TRUE`/`FALSE`, one entry per message. Messages
#'   without a subject yield `FALSE`.
AlternateCap <- function(x) {
  subjects <- vapply(x, function(msg) {
    # Drop the NA/NULL produced by a missing Subject field before pasting.
    s <- unlist(msg$header["Subject"], use.names = FALSE)
    s <- s[!is.na(s)]
    if (length(s) == 0L) NA_character_ else paste(s, collapse = "")
  }, character(1), USE.NAMES = FALSE)

  as.list(!is.na(subjects) & grepl("([A-Z][a-z]){2,}", subjects))
}
#7#count of the number of exclamation marks in the subject
#' Count exclamation marks in each message subject.
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   with a `header` element that can be indexed by "Subject" -- TODO confirm
#'   against the message loader.
#' @return Unnamed list of integer counts, one per message; 0 when the
#'   subject is missing.
subjectExclamationCount <- function(x) {
  subjects <- vapply(x, function(msg) {
    # A missing Subject indexes to NA (or NULL); treat it as empty text.
    s <- unlist(msg$header["Subject"], use.names = FALSE)
    s <- s[!is.na(s)]
    paste(s, collapse = "")
  }, character(1), USE.NAMES = FALSE)

  hits <- gregexpr("!", subjects, fixed = TRUE)
  # gregexpr() reports "no match" as a single -1, so count positive
  # positions only.
  as.list(vapply(hits, function(pos) sum(pos > 0L), integer(1)))
}
#8#count of the number of question marks in the subject
#' Count question marks in each message subject.
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   with a `header` element that can be indexed by "Subject" -- TODO confirm
#'   against the message loader.
#' @return Unnamed list of integer counts, one per message; 0 when the
#'   subject is missing.
subjectQuestionCount <- function(x) {
  subjects <- vapply(x, function(msg) {
    # A missing Subject indexes to NA (or NULL); treat it as empty text.
    s <- unlist(msg$header["Subject"], use.names = FALSE)
    s <- s[!is.na(s)]
    paste(s, collapse = "")
  }, character(1), USE.NAMES = FALSE)

  # "?" is a regex quantifier, so match it as a literal; the earlier pattern
  # "//?" matched forward slashes instead of question marks.
  hits <- gregexpr("?", subjects, fixed = TRUE)
  # gregexpr() reports "no match" as a single -1, so count positive
  # positions only.
  as.list(vapply(hits, function(pos) sum(pos > 0L), integer(1)))
}
#9#whether the Subject of the mail is in capital letters
#' Flag subjects written entirely in capital letters.
#'
#' A subject counts as yelling when it has at least one upper-case ASCII
#' letter and no lower-case ASCII letter. Requiring an upper-case letter keeps
#' empty or letter-free subjects (e.g. "123") from being flagged.
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   with a `header` element that can be indexed by "Subject" -- TODO confirm
#'   against the message loader.
#' @return Unnamed list of `TRUE`/`FALSE`, one entry per message. Messages
#'   without a subject yield `FALSE`.
isYelling <- function(x) {
  subjects <- vapply(x, function(msg) {
    # A missing Subject indexes to NA; pasting it would give the text "NA",
    # which has no lower-case letters and would be reported as yelling.
    s <- unlist(msg$header["Subject"], use.names = FALSE)
    s <- s[!is.na(s)]
    if (length(s) == 0L) NA_character_ else paste(s, collapse = "")
  }, character(1), USE.NAMES = FALSE)

  as.list(
    !is.na(subjects) & grepl("[A-Z]", subjects) & !grepl("[a-z]", subjects)
  )
}
#10#if there are no blanks in the subject
#' Flag subjects that contain no space character.
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   with a `header` element that can be indexed by "Subject" -- TODO confirm
#'   against the message loader.
#' @return Unnamed list of `TRUE`/`FALSE`, one entry per message: `TRUE` when
#'   a subject is present and has no space in it. Messages without a subject
#'   yield `FALSE`.
SubjectBlanks <- function(x) {
  subjects <- vapply(x, function(msg) {
    # A missing Subject indexes to NA; pasting it would give the text "NA",
    # which contains no space and would be reported as TRUE.
    s <- unlist(msg$header["Subject"], use.names = FALSE)
    s <- s[!is.na(s)]
    if (length(s) == 0L) NA_character_ else paste(s, collapse = "")
  }, character(1), USE.NAMES = FALSE)

  as.list(!is.na(subjects) & !grepl("[ ]", subjects))
}
#11#whether the header states that the message is a multipart, i.e. with attachments.
#' Flag messages whose Content-Type header declares a multipart message.
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   with a `header` element that can be indexed by "Content-Type" -- TODO
#'   confirm the exact field name used by the message loader.
#' @return Unnamed list of `TRUE`/`FALSE`, one entry per message. Messages
#'   without a Content-Type field yield `FALSE`.
multipartText <- function(x) {
  is_multipart <- vapply(x, function(msg) {
    # Look the field up by name before any collapsing; pasting the header
    # into one string first would discard the names.
    content_type <- unlist(msg$header["Content-Type"], use.names = FALSE)
    content_type <- content_type[!is.na(content_type)]
    if (length(content_type) == 0L) {
      return(FALSE)
    }
    # Accept every multipart subtype (mixed, alternative, related, ...);
    # media type names are case-insensitive.
    any(grepl("multipart/", content_type, ignore.case = TRUE))
  }, logical(1), USE.NAMES = FALSE)

  as.list(is_multipart)
}
#12#whether the subject contains one of spam phrases
#' Flag subjects that contain one of a fixed set of spam phrases.
#'
#' Matching is case-sensitive and looks for the phrases as substrings.
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   with a `header` element that can be indexed by "Subject" -- TODO confirm
#'   against the message loader.
#' @return Unnamed list of `TRUE`/`FALSE`, one entry per message. Messages
#'   without a subject yield `FALSE`.
subjectSpamWords <- function(x) {
  # Build the alternation from a vector so that no line break can end up
  # inside the pattern (a wrapped literal made "risk" and "prescription"
  # require a trailing newline).
  spam_phrases <- c(
    "viagra", "pounds", "free", "weight", "guarantee", "millions",
    "dollars", "credit", "risk", "prescription", "generic", "drug",
    "money back", "credit card"
  )
  pattern <- paste(spam_phrases, collapse = "|")

  subjects <- vapply(x, function(msg) {
    # Drop the NA/NULL produced by a missing Subject field before pasting.
    s <- unlist(msg$header["Subject"], use.names = FALSE)
    s <- s[!is.na(s)]
    if (length(s) == 0L) NA_character_ else paste(s, collapse = "")
  }, character(1), USE.NAMES = FALSE)

  as.list(!is.na(subjects) & grepl(pattern, subjects))
}
#13#whether the message body contains a form of the introduction Dear ...
#' Flag messages whose body contains "Dear", "DEAR" or "dear".
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   whose `body` is a character vector of text lines -- TODO confirm against
#'   the message loader.
#' @return Unnamed list of `TRUE`/`FALSE`, one entry per message. Messages
#'   with an empty or absent body yield `FALSE`.
isDear <- function(x) {
  has_dear <- vapply(x, function(msg) {
    # Search line by line: gluing the lines together with no separator could
    # fabricate a match out of the end of one line and the start of the next.
    # any() over zero lines is FALSE, which covers empty bodies.
    any(grepl("Dear|DEAR|dear", as.character(msg$body)))
  }, logical(1), USE.NAMES = FALSE)

  as.list(has_dear)
}
#14#If the string "vacation" occurs in the body of the message
#' Flag messages whose body contains "Vacation", "VACATION" or "vacation".
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   whose `body` is a character vector of text lines -- TODO confirm against
#'   the message loader.
#' @return Unnamed list of `TRUE`/`FALSE`, one entry per message. Messages
#'   with an empty or absent body yield `FALSE`.
isVacation <- function(x) {
  mentions_vacation <- vapply(x, function(msg) {
    # Search line by line so that text from adjacent lines is never fused
    # into a single word. any() over zero lines is FALSE.
    any(grepl("Vacation|VACATION|vacation", as.character(msg$body)))
  }, logical(1), USE.NAMES = FALSE)

  as.list(mentions_vacation)
}
##15#If there is a url link in the body of the message
#' Flag messages whose body contains an http(s) URL.
#'
#' A URL is recognised as "http://" or "https://", a host name made of
#' letters, digits, dots and hyphens, a dot, a 2-6 letter suffix, and then a
#' slash, a blank, or the end of the line.
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   whose `body` is a character vector of text lines -- TODO confirm against
#'   the message loader.
#' @return Unnamed list of `TRUE`/`FALSE`, one entry per message. Messages
#'   with an empty or absent body yield `FALSE`.
isLink <- function(x) {
  # Kept on one line: a line break inside the literal becomes part of the
  # regex. "[[:alpha:]]{2,6}" needs both closing brackets before the
  # repetition count.
  url_pattern <- "https?://[a-zA-Z0-9.-]+[.][[:alpha:]]{2,6}([/[:blank:]]|$)"

  has_link <- vapply(x, function(msg) {
    # Test each line on its own so a URL that ends a line is still found
    # (via the "$" alternative) instead of running into the next line's text.
    any(grepl(url_pattern, as.character(msg$body)))
  }, logical(1), USE.NAMES = FALSE)

  as.list(has_link)
}

##16#the number of dollar signs in the body of the message

#' Count dollar signs in each message body.
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   whose `body` is a character vector of text lines -- TODO confirm against
#'   the message loader.
#' @return Unnamed list of integer counts, one per message; 0 for a message
#'   with an empty or absent body.
numDollarSigns <- function(x) {
  counts <- vapply(x, function(msg) {
    lines <- as.character(msg$body)
    # An empty body pastes to "", for which gregexpr() reports -1.
    text <- paste(lines[!is.na(lines)], collapse = "")
    # "$" is a regex anchor, so match it as a literal character.
    sum(gregexpr("$", text, fixed = TRUE)[[1]] > 0L)
  }, integer(1), USE.NAMES = FALSE)

  as.list(counts)
}
#17#Number of Capital letters in Subject of message
#' Count upper-case ASCII letters in each message subject.
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   with a `header` element that can be indexed by "Subject" -- TODO confirm
#'   against the message loader.
#' @return Unnamed list of integer counts, one per message; 0 when the
#'   subject is missing.
SubjectCapital <- function(x) {
  subjects <- vapply(x, function(msg) {
    # A missing Subject indexes to NA; pasting it would give the text "NA"
    # and be counted as two capital letters.
    s <- unlist(msg$header["Subject"], use.names = FALSE)
    s <- s[!is.na(s)]
    paste(s, collapse = "")
  }, character(1), USE.NAMES = FALSE)

  hits <- gregexpr("[A-Z]", subjects)
  # gregexpr() reports "no match" as a single -1, so count positive
  # positions only.
  as.list(vapply(hits, function(pos) sum(pos > 0L), integer(1)))
}
#18#whether the body text includes a line indicating the word wrote:
#' Flag messages whose body contains the text "wrote:".
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   whose `body` is a character vector of text lines -- TODO confirm against
#'   the message loader.
#' @return Unnamed list of `TRUE`/`FALSE`, one entry per message. Messages
#'   with an empty or absent body yield `FALSE`.
isWrote <- function(x) {
  has_wrote <- vapply(x, function(msg) {
    # Match "wrote:" with no trailing blank and test each line separately,
    # so a line that ends in "wrote:" is detected as well.
    any(grepl("wrote:", as.character(msg$body), fixed = TRUE))
  }, logical(1), USE.NAMES = FALSE)

  as.list(has_wrote)
}
#19#the number of letters and numbers in the body of the email message
#' Count ASCII letters and digits in each message body.
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   whose `body` is a character vector of text lines -- TODO confirm against
#'   the message loader.
#' @return Unnamed list of integer counts, one per message; 0 for a message
#'   with an empty or absent body.
bodyCharacterCount <- function(x) {
  counts <- vapply(x, function(msg) {
    lines <- as.character(msg$body)
    # An empty body pastes to "", for which gregexpr() reports -1.
    text <- paste(lines[!is.na(lines)], collapse = "")
    sum(gregexpr("[A-Za-z0-9]", text)[[1]] > 0L)
  }, integer(1), USE.NAMES = FALSE)

  as.list(counts)
}
#20#Number of HTML-tag-like matches found in the header of the message
#' Count tag-like matches in each message header.
#'
#' All header values of a message are pasted into one string, and the
#' matches of `<[^>/][^.]*>` in that string are counted.
#'
#' @param x List of parsed messages. Each element is assumed to be a list
#'   with a `header` element holding the header fields -- TODO confirm
#'   against the message loader.
#' @return Unnamed list of integer counts, one per message; 0 for a message
#'   with an empty or absent header.
TagExists <- function(x) {
  counts <- vapply(x, function(msg) {
    # An empty header pastes to "", for which gregexpr() reports -1.
    text <- paste(msg$header, collapse = "")
    sum(gregexpr("<[^>/][^.]*>", text)[[1]] > 0L)
  }, integer(1), USE.NAMES = FALSE)

  as.list(counts)
}
# Assemble one row per training message and one column per feature.
# unlist() flattens each extractor's result into a plain column.
# NOTE(review): the column labelled "priority" holds the output of
# bodyCharacterCount() -- the label looks like a leftover. Confirm before
# renaming, since later code may refer to df$priority.
df <- data.frame(
  isSpam = unlist(isSpam(trainMessages)),
  isRe = unlist(isRe(trainMessages)),
  isFw = unlist(isFw(trainMessages)),
  numLinesInBody = unlist(numLinesInBody(trainMessages)),
  isPunc = unlist(isPunc(trainMessages)),
  AlternateCap = unlist(AlternateCap(trainMessages)),
  SubjectExclamationCount = unlist(subjectExclamationCount(trainMessages)),
  subjectQuestionCount = unlist(subjectQuestionCount(trainMessages)),
  isYelling = unlist(isYelling(trainMessages)),
  pSubjectblanks = unlist(SubjectBlanks(trainMessages)),
  multipartText = unlist(multipartText(trainMessages)),
  subjectSpamWords = unlist(subjectSpamWords(trainMessages)),
  isDear = unlist(isDear(trainMessages)),
  isVacation = unlist(isVacation(trainMessages)),
  isLink = unlist(isLink(trainMessages)),
  numDollarSign = unlist(numDollarSigns(trainMessages)),
  SubjectCapital = unlist(SubjectCapital(trainMessages)),
  priority = unlist(bodyCharacterCount(trainMessages)),
  isWrote = unlist(isWrote(trainMessages)),
  TagExists = unlist(TagExists(trainMessages))
)
library(ggplot2)
# Show isRe for isSpam: bar chart of spam vs. non-spam, filled by isRe.
ggplot(df, aes(isSpam, fill = isRe)) + geom_bar()
# Show number of spam and non-spam messages. A bare `isSpam` refers to the
# extractor function, so tabulate the data-frame column instead.
table(isSpam = df$isSpam)
## isSpam
## FALSE TRUE
## 4864 1677
# Compare isDear with isSpam
ggplot(df, aes(isSpam, fill = isDear)) + geom_bar()
# Compare isYelling with isSpam
ggplot(df, aes(isSpam, fill = isYelling)) + geom_bar()

Marino's The ICU Book - 4th Edition Complete Digital Book
100% (8)
Marino's The ICU Book - 4th Edition Complete Digital Book
15 pages
HappyFox Backend Assignment
No ratings yet
HappyFox Backend Assignment
3 pages
Network Dna Network Documentation Checklist
No ratings yet
Network Dna Network Documentation Checklist
5 pages
Part 1 of Spam Detection
No ratings yet
Part 1 of Spam Detection
7 pages
Email Filtering: Machine Learning Techniques and An Implementation For The UNIX Pine Mail System
No ratings yet
Email Filtering: Machine Learning Techniques and An Implementation For The UNIX Pine Mail System
42 pages
Coding for Bulk Email Sender
No ratings yet
Coding for Bulk Email Sender
4 pages
Py4Inf Solutions
No ratings yet
Py4Inf Solutions
6 pages
Zoom
No ratings yet
Zoom
20 pages
NLP - Colaboratory
No ratings yet
NLP - Colaboratory
14 pages
Assign9 4
No ratings yet
Assign9 4
1 page
Quiz 2
No ratings yet
Quiz 2
11 pages
Unit-2 Ipynb
No ratings yet
Unit-2 Ipynb
83 pages
03 Python Projects For Noob 3
No ratings yet
03 Python Projects For Noob 3
21 pages
P1 PDF
No ratings yet
P1 PDF
2 pages
LP IV Changes
No ratings yet
LP IV Changes
7 pages
Profound Python Data Science
From Everand
Profound Python Data Science
Onder Teker
No ratings yet
FAM PR-10
No ratings yet
FAM PR-10
4 pages
Codesrepl
No ratings yet
Codesrepl
16 pages
CS&DF1
No ratings yet
CS&DF1
1 page
Complete Code
No ratings yet
Complete Code
6 pages
Spam Email Detection and Deletion
No ratings yet
Spam Email Detection and Deletion
5 pages
The Essential R Reference
From Everand
The Essential R Reference
Mark Gardener
No ratings yet
Arrays Strings Demo
No ratings yet
Arrays Strings Demo
8 pages
Attendence Tracker CS For Class 12TH Project.
No ratings yet
Attendence Tracker CS For Class 12TH Project.
15 pages
Classification Accuracy in R
No ratings yet
Classification Accuracy in R
4 pages
Python - To Read Emails and Download Attachments
No ratings yet
Python - To Read Emails and Download Attachments
2 pages
Sms Spam Detection
No ratings yet
Sms Spam Detection
7 pages
Assign8 5
No ratings yet
Assign8 5
1 page
Docs
No ratings yet
Docs
8 pages
Cortex
No ratings yet
Cortex
4 pages
Spam Detection 6
No ratings yet
Spam Detection 6
8 pages
Mail Spam
No ratings yet
Mail Spam
4 pages
FICE Project Report Spam
No ratings yet
FICE Project Report Spam
14 pages
Email Spam Detection
No ratings yet
Email Spam Detection
8 pages
Abstract
No ratings yet
Abstract
2 pages
Spam Detection Final-2
No ratings yet
Spam Detection Final-2
24 pages
Zabapcadabra Mail A File
No ratings yet
Zabapcadabra Mail A File
11 pages
Aayush Nihar Spam Mail Filtering
No ratings yet
Aayush Nihar Spam Mail Filtering
18 pages
AI Phase4
No ratings yet
AI Phase4
11 pages
Org Uipath_naan Mudhalvan
No ratings yet
Org Uipath_naan Mudhalvan
22 pages
Ass 3
No ratings yet
Ass 3
2 pages
Report Zrich - 0002.
No ratings yet
Report Zrich - 0002.
2 pages
implemention of sms spam filtering
No ratings yet
implemention of sms spam filtering
27 pages
Lab 78
No ratings yet
Lab 78
6 pages
Clean Trump
No ratings yet
Clean Trump
2 pages
Email Dataset Analysis in Excel
No ratings yet
Email Dataset Analysis in Excel
4 pages
CS50P Notes
No ratings yet
CS50P Notes
1 page
Exemplar - Conditional Statements
No ratings yet
Exemplar - Conditional Statements
4 pages
12th CS 2023 HY
No ratings yet
12th CS 2023 HY
9 pages
Elsarticle Template New
No ratings yet
Elsarticle Template New
3 pages
Exp2
No ratings yet
Exp2
1 page
Email Spam Detection System using Logistic Regression
No ratings yet
Email Spam Detection System using Logistic Regression
6 pages
SMTA - Lab Record - Aim, Procedures and Results
No ratings yet
SMTA - Lab Record - Aim, Procedures and Results
31 pages
Candidate Elimination
No ratings yet
Candidate Elimination
2 pages
Order Tasks and Milestones Assignment
No ratings yet
Order Tasks and Milestones Assignment
6 pages
Code
No ratings yet
Code
6 pages
数据挖掘第一次作业 (1)
No ratings yet
数据挖掘第一次作业 (1)
4 pages
PRP411 Fa1
No ratings yet
PRP411 Fa1
8 pages
SWconstruction 02
No ratings yet
SWconstruction 02
8 pages
Chat Analysis Notes
No ratings yet
Chat Analysis Notes
9 pages
Dictionaries
No ratings yet
Dictionaries
5 pages
Final Code
No ratings yet
Final Code
4 pages
A Sense of The Declining Popularity of Baseball: A Time Series Analysis
No ratings yet
A Sense of The Declining Popularity of Baseball: A Time Series Analysis
16 pages
Spam Email Classification 3
No ratings yet
Spam Email Classification 3
8 pages
Mixture Models and Target Density
No ratings yet
Mixture Models and Target Density
14 pages
Airline Data Analysis
No ratings yet
Airline Data Analysis
20 pages
Communications, Networks, & Cyberthreats: Presented by Satriyo Adhy
No ratings yet
Communications, Networks, & Cyberthreats: Presented by Satriyo Adhy
75 pages
Calculus 2
No ratings yet
Calculus 2
134 pages
Evolution of Operations Management Past, Present and Future
No ratings yet
Evolution of Operations Management Past, Present and Future
30 pages
Trends In Deep Learning Methodologies Algorithms Applications And Systems Hybrid Computational Intelligence For Pattern Analysis And Understanding Vincenzo Piuri Editor pdf download
No ratings yet
Trends In Deep Learning Methodologies Algorithms Applications And Systems Hybrid Computational Intelligence For Pattern Analysis And Understanding Vincenzo Piuri Editor pdf download
90 pages
Loner V3.0
No ratings yet
Loner V3.0
69 pages
Final Income Tax
No ratings yet
Final Income Tax
6 pages
Sense of Smell
100% (1)
Sense of Smell
27 pages
March 2025 Civil Service Exam Guide
No ratings yet
March 2025 Civil Service Exam Guide
14 pages
SJ-20130408140048-003-ZXUR 9000 GSM (V6.50.103) Product Description
No ratings yet
SJ-20130408140048-003-ZXUR 9000 GSM (V6.50.103) Product Description
47 pages
Toyota Forklift 8fbe15u to 8fbes15u Parts Catalog en Es Fr De
No ratings yet
Toyota Forklift 8fbe15u to 8fbes15u Parts Catalog en Es Fr De
24 pages
Od Interventions: What Is An OD Intervention?
No ratings yet
Od Interventions: What Is An OD Intervention?
9 pages
Summary of VAT
No ratings yet
Summary of VAT
5 pages
Open Channel Flow Quick Revision Notes.pdf
No ratings yet
Open Channel Flow Quick Revision Notes.pdf
3 pages
Tieng Anh 6 Friends Plus - Unit 8 - Test 2 (key)
No ratings yet
Tieng Anh 6 Friends Plus - Unit 8 - Test 2 (key)
6 pages
Bardach 1995
No ratings yet
Bardach 1995
4 pages
Interim Report/Mid-Thesis Structure: Data Science and AI/ML
No ratings yet
Interim Report/Mid-Thesis Structure: Data Science and AI/ML
10 pages
Application Form
No ratings yet
Application Form
2 pages
Who Was The First Biblical Liar? by SaToGa
No ratings yet
Who Was The First Biblical Liar? by SaToGa
9 pages
Cardiovascular-WPS Office
No ratings yet
Cardiovascular-WPS Office
13 pages
Ecology and Systematic Zoology Advanced Animal Ecology
No ratings yet
Ecology and Systematic Zoology Advanced Animal Ecology
27 pages
Port Hedland Loading
50% (2)
Port Hedland Loading
3 pages
Lab 2 Example 2
No ratings yet
Lab 2 Example 2
13 pages
Full Download Pro SQL Server 2022 Administration: A guide for the modern DBA 3rd Edition Peter A. Carter PDF DOCX
100% (5)
Full Download Pro SQL Server 2022 Administration: A guide for the modern DBA 3rd Edition Peter A. Carter PDF DOCX
65 pages
Assignment (Repaired)
100% (1)
Assignment (Repaired)
16 pages
Cognitive Learning Theory: General Psychology Notes
100% (1)
Cognitive Learning Theory: General Psychology Notes
7 pages
AIESEC UK Report
No ratings yet
AIESEC UK Report
22 pages
Lecture 2 Summary-Accounting - PAU
No ratings yet
Lecture 2 Summary-Accounting - PAU
2 pages
WESTPAK Laboratory Package Drop Testing v2.1-1
No ratings yet
WESTPAK Laboratory Package Drop Testing v2.1-1
11 pages

Spam Email Classification 2

Uploaded by

Spam Email Classification 2

Uploaded by

Spam

Email Classification Austin Kinion

##16#the number of dollar signs in the body of the message

You might also like