ML6 Naive Bayes Spam Filter
#========================================
# Import libraries
import numpy as np
import pandas as pd
import re
import nltk
from IPython.display import display
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from PIL import Image
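#-------------------------------------------------
# Load the SMS Spam Collection dataset
# (assumption: a tab-separated file with a label column and a message column;
#  the file path below is a placeholder for wherever the dataset is stored)
spam_collection = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])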
spam_collection.info()
spam_collection['Label'].value_counts(normalize=True)
# Original data: 13.4% of the messages are spam, while the rest are ham
spam_collection['Label'].value_counts(normalize=True).plot(kind='bar')  # assumed bar chart of the label distribution
plt.show()
# Join all messages into one string for the word cloud (a space keeps adjacent messages from merging)
texts = ' '.join(spam_collection['SMS'])
wc = WordCloud(max_words=1000,contour_width=3, contour_color='red')
wc.generate(texts)
plt.figure(figsize=[15,7])
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
plt.rcParams['savefig.dpi'] = 1100
#-------------------------------------------------
# Splitting Data
# A split-up of 80% and 20%, respectively
# Training/Test split
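# (assumption: randomized_collection and training_test_index are built as below,
#  since they are used in the split but not defined earlier)
# Randomize the full dataset so spam and ham are spread across both splits
randomized_collection = spam_collection.sample(frac=1, random_state=1)
# Index at which 80% of the messages end
training_test_index = round(len(randomized_collection) * 0.8)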
training_set = randomized_collection[:training_test_index].reset_index(drop=True)
test_set = randomized_collection[training_test_index:].reset_index(drop=True)
print('Training Data:')
print(training_set.shape)
print('Testing Data:')
print(test_set.shape)
# Training Set label distribution
fig2, ax2 = plt.subplots(figsize=(5,5))
training_set['Label'].value_counts(normalize=True).plot(kind='pie', ax=ax2, autopct='%.2f%%')  # assumed pie chart of the label split
plt.show()
# Test Set label distribution
fig3, ax3 = plt.subplots(figsize=(5,5))
test_set['Label'].value_counts(normalize=True).plot(kind='pie', ax=ax3, autopct='%.2f%%')  # assumed pie chart of the label split
plt.show()
#--------------------------------------------------------
# MODEL DEVELOPMENT
#...............................
# Data Pre-Processing
#*****************************************************
# 1. Normalization
# Remove punctuation, collapse all whitespace (spaces, line breaks, tabs) into a
# single space & eliminate any leading/trailing whitespace.
training_set['SMS'] = training_set['SMS'].str.replace(r'[^\w\d\s]', ' ', regex=True)
training_set['SMS'] = training_set['SMS'].str.replace(r'\s+', ' ', regex=True)
training_set['SMS'] = training_set['SMS'].str.replace(r'^\s+|\s+?$', '', regex=True)
training_set.head()
#---------------------------------
# Natural Language Tool
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
#.............................
# Remove stopwords (highly common words such as articles, pronouns and prepositions, etc.)
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
training_set['SMS'] = training_set['SMS'].apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words)
)
training_set.head()
# Stemming: reduce every word to its root form with the Porter stemmer
porter = nltk.PorterStemmer()
training_set['SMS'] = training_set['SMS'].apply(lambda x: ' '.join(
    porter.stem(term) for term in x.split())
)
training_set.head()
#-------------------------------
# Feature Extraction
#------------------------------
# Tokenize each message (assumed step, so the corpus below is a flat list of words)
training_set['SMS'] = training_set['SMS'].str.split()
corpus = training_set['SMS'].sum()
len(corpus)
# Collect the unique words, then revert to a list
temp_set = set(corpus)
vocabulary = list(temp_set)
len(vocabulary)
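# Build the per-message word counts, one column per vocabulary word
# (assumed construction for the word_counts table inspected below)
word_counts = pd.DataFrame([{word: sms.count(word) for word in vocabulary}
                            for sms in training_set['SMS']])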
word_counts.shape
#---------------------
# Calculating Probability
#----------------------------
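# (assumption: the word counts are joined onto the training set and then split by label,
#  matching how spam_df / ham_df are used below)
training_set_final = pd.concat([training_set, word_counts], axis=1)
spam_df = training_set_final[training_set_final['Label'] == 'spam']
ham_df = training_set_final[training_set_final['Label'] == 'ham']
# Priors P(Spam) and P(Ham)
p_spam = len(spam_df) / len(training_set_final)
p_ham = len(ham_df) / len(training_set_final)
# Total number of words across all spam messages
spam_words_per_message = spam_df['SMS'].apply(len)
n_spam = spam_words_per_message.sum()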
ham_words_per_message = ham_df['SMS'].apply(len)
n_ham = ham_words_per_message.sum()
n_vocabulary = len(vocabulary)
# Opt for Laplace smoothing to avoid the zero-probability problem
alpha = 1
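# With Laplace smoothing the per-word likelihoods used below become:
#   P(wi|Spam) = (N_wi|Spam + alpha) / (N_Spam + alpha * N_Vocabulary)
#   P(wi|Ham)  = (N_wi|Ham + alpha) / (N_Ham + alpha * N_Vocabulary)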
#-----------
# Calculating Parameters
# P(wi|Spam) and P(wi|Ham) depend only on the training set, which doesn't change, so they are constants.
# Create two dictionaries that map each unique word to its respective probability value.
parameters_spam = {unique_word: 0 for unique_word in vocabulary}
parameters_ham = {unique_word: 0 for unique_word in vocabulary}
# Iterate over the vocabulary and, for each word, calculate P(wi|Spam) and P(wi|Ham)
for unique_word in vocabulary:
    p_unique_word_spam = (spam_df[unique_word].sum() + alpha) / (n_spam + alpha * n_vocabulary)
    p_unique_word_ham = (ham_df[unique_word].sum() + alpha) / (n_ham + alpha * n_vocabulary)
    # Store the smoothed estimates in the parameter dictionaries
    parameters_spam[unique_word] = p_unique_word_spam
    parameters_ham[unique_word] = p_unique_word_ham
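# The classifiers below score each message using the Naive Bayes proportionality:
#   P(Spam|w1, ..., wn) ∝ P(Spam) * Π P(wi|Spam)
#   P(Ham|w1, ..., wn)  ∝ P(Ham)  * Π P(wi|Ham)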
# Lemmatizer used by the classifier below (assumption: the WordNet lemmatizer, matching the 'wordnet' download above)
lemmatizer = nltk.WordNetLemmatizer()

def sms_classify(message):
    '''
    Takes as input a new sms (w1, w2, ..., wn),
    calculates P(Spam|w1, w2, ..., wn) and P(Ham|w1, w2, ..., wn),
    compares them and prints whether the message is spam or not.
    '''
    # Remove punctuation, collapse all whitespace (spaces, line breaks, tabs) into
    # a single space & eliminate any leading/trailing whitespace.
    message = re.sub(r'[^\w\d\s]', ' ', message)
    message = re.sub(r'\s+', ' ', message)
    message = re.sub(r'^\s+|\s+?$', '', message)
    # Lemmatization
    message = ' '.join(lemmatizer.lemmatize(term, pos='v') for term in message.split())
    # Stemming
    message = ' '.join(porter.stem(term) for term in message.split())
    # Tokenization
    message = message.split()
    # Start from the priors and multiply in P(wi|Spam) / P(wi|Ham) for every known word
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)
    # Compare the two scores and report the verdict
    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal probabilities, have a human classify this!')
print("Test with message: Hey, Sign up with this promo code and get your card for
amazing exchange fees abroad and £5 to spend anywhere! Promocode: D48KV7BN")
sms_classify('''Hey, Sign up with this promo code and get your card for amazing
exchange fees abroad and £5 to spend anywhere! Promocode:
D48KV7BN''')
print('Test with message: Okey Stan! Seems to be a reasonable amount of money. Ill
think of it and let you know ASAP.')
# Define the classify() function again, this time returning the label instead of printing the probabilities
def sms_classify_test_set(message):
    '''
    Takes as input a new sms (w1, w2, ..., wn),
    calculates P(Spam|w1, w2, ..., wn) and P(Ham|w1, w2, ..., wn),
    compares them and returns the spam or ham label, respectively.
    '''
    # Remove punctuation, collapse all whitespace (spaces, line breaks, tabs) into
    # a single space & eliminate any leading/trailing whitespace.
    message = re.sub(r'[^\w\d\s]', ' ', message)
    message = re.sub(r'\s+', ' ', message)
    message = re.sub(r'^\s+|\s+?$', '', message)
    # Lemmatization
    message = ' '.join(lemmatizer.lemmatize(term, pos='v') for term in message.split())
    # Stemming
    message = ' '.join(porter.stem(term) for term in message.split())
    # Tokenization
    message = message.split()
    # Start from the priors and multiply in P(wi|Spam) / P(wi|Ham) for every known word
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
    # Return the label with the higher score
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'
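# Classify every message in the test set and count correct predictions
# (assumed evaluation step; 'correct' and 'total' are used below but not computed earlier)
test_set['Predicted'] = test_set['SMS'].apply(sms_classify_test_set)
correct = (test_set['Predicted'] == test_set['Label']).sum()
total = test_set.shape[0]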
print('Results \n-------')
print('Valid:', correct)
print('Invalid:', total - correct)
print('Accuracy:', round(correct/total, 4))