ML6 Naive Bayes Spam Filter
#========================================
# Import libraries
import numpy as np
import pandas as pd
import re
import nltk
from IPython.display import display
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from PIL import Image
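#-------------------------------------------------
# Load the SMS Spam Collection dataset
# (assumption: a tab-separated file with a label column and a message column;
#  the file path below is a placeholder for wherever the dataset is stored)
spam_collection = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])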
spam_collection.info()
spam_collection['Label'].value_counts(normalize=True)
# Original data: 13.4% of the messages are spam, while the rest are ham
spam_collection['Label'].value_counts(normalize=True).plot(kind='bar')  # assumed bar chart of the label distribution
plt.show()
# Join all messages into one string for the word cloud (a space keeps adjacent messages from merging)
texts = ' '.join(spam_collection['SMS'])
wc = WordCloud(max_words=1000,contour_width=3, contour_color='red')
wc.generate(texts)
plt.figure(figsize=[15,7])
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
plt.rcParams['savefig.dpi'] = 1100
#-------------------------------------------------
# Splitting Data
# A split-up of 80% and 20%, respectively
# Training/Test split
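# (assumption: randomized_collection and training_test_index are built as below,
#  since they are used in the split but not defined earlier)
# Randomize the full dataset so spam and ham are spread across both splits
randomized_collection = spam_collection.sample(frac=1, random_state=1)
# Index at which 80% of the messages end
training_test_index = round(len(randomized_collection) * 0.8)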
training_set = randomized_collection[:training_test_index].reset_index(drop=True)
test_set = randomized_collection[training_test_index:].reset_index(drop=True)
print('Training Data:')
print(training_set.shape)
print('Testing Data:')
print(test_set.shape)
# Training Set label distribution
fig2, ax2 = plt.subplots(figsize=(5,5))
training_set['Label'].value_counts(normalize=True).plot(kind='pie', ax=ax2, autopct='%.2f%%')  # assumed pie chart of the label split
plt.show()
# Test Set label distribution
fig3, ax3 = plt.subplots(figsize=(5,5))
test_set['Label'].value_counts(normalize=True).plot(kind='pie', ax=ax3, autopct='%.2f%%')  # assumed pie chart of the label split
plt.show()
#--------------------------------------------------------
# MODEL DEVELOPMENT
#...............................
# Data Pre-Processing
#*****************************************************
# 1. Normalization
# Remove punctuation, collapse all whitespace (spaces, line breaks, tabs) into a
# single space & eliminate any leading/trailing whitespace.
training_set['SMS'] = training_set['SMS'].str.replace(r'[^\w\d\s]', ' ', regex=True)
training_set['SMS'] = training_set['SMS'].str.replace(r'\s+', ' ', regex=True)
training_set['SMS'] = training_set['SMS'].str.replace(r'^\s+|\s+?$', '', regex=True)
training_set.head()
#---------------------------------
# Natural Language Tool
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
#.............................
# Remove stopwords (highly common words such as articles, pronouns and prepositions, etc.)
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
training_set['SMS'] = training_set['SMS'].apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words)
)
training_set.head()
# Stemming: reduce every word to its root form with the Porter stemmer
porter = nltk.PorterStemmer()
training_set['SMS'] = training_set['SMS'].apply(lambda x: ' '.join(
    porter.stem(term) for term in x.split())
)
training_set.head()
#-------------------------------
# Feature Extraction
#------------------------------
# Tokenize each message (assumed step, so the corpus below is a flat list of words)
training_set['SMS'] = training_set['SMS'].str.split()
corpus = training_set['SMS'].sum()
len(corpus)
# Collect the unique words, then revert to a list
temp_set = set(corpus)
vocabulary = list(temp_set)
len(vocabulary)
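# Build the per-message word counts, one column per vocabulary word
# (assumed construction for the word_counts table inspected below)
word_counts = pd.DataFrame([{word: sms.count(word) for word in vocabulary}
                            for sms in training_set['SMS']])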
word_counts.shape
#---------------------
# Calculating Probability
#----------------------------
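# (assumption: the word counts are joined onto the training set and then split by label,
#  matching how spam_df / ham_df are used below)
training_set_final = pd.concat([training_set, word_counts], axis=1)
spam_df = training_set_final[training_set_final['Label'] == 'spam']
ham_df = training_set_final[training_set_final['Label'] == 'ham']
# Priors P(Spam) and P(Ham)
p_spam = len(spam_df) / len(training_set_final)
p_ham = len(ham_df) / len(training_set_final)
# Total number of words across all spam messages
spam_words_per_message = spam_df['SMS'].apply(len)
n_spam = spam_words_per_message.sum()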
ham_words_per_message = ham_df['SMS'].apply(len)
n_ham = ham_words_per_message.sum()
n_vocabulary = len(vocabulary)
# Opt for Laplace smoothing to avoid the zero-probability problem
alpha = 1
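# With Laplace smoothing the per-word likelihoods used below become:
#   P(wi|Spam) = (N_wi|Spam + alpha) / (N_Spam + alpha * N_Vocabulary)
#   P(wi|Ham)  = (N_wi|Ham + alpha) / (N_Ham + alpha * N_Vocabulary)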
#-----------
# Calculating Parameters
# P(wi|Spam) and P(wi|Ham) depend only on the training set, which doesn't change, so they are constants.
# Create two dictionaries that map each unique word to its respective probability value.
parameters_spam = {unique_word: 0 for unique_word in vocabulary}
parameters_ham = {unique_word: 0 for unique_word in vocabulary}
# Iterate over the vocabulary and, for each word, calculate P(wi|Spam) and P(wi|Ham)
for unique_word in vocabulary:
    p_unique_word_spam = (spam_df[unique_word].sum() + alpha) / (n_spam + alpha * n_vocabulary)
    p_unique_word_ham = (ham_df[unique_word].sum() + alpha) / (n_ham + alpha * n_vocabulary)
    # Store the smoothed estimates in the parameter dictionaries
    parameters_spam[unique_word] = p_unique_word_spam
    parameters_ham[unique_word] = p_unique_word_ham
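# The classifiers below score each message using the Naive Bayes proportionality:
#   P(Spam|w1, ..., wn) ∝ P(Spam) * Π P(wi|Spam)
#   P(Ham|w1, ..., wn)  ∝ P(Ham)  * Π P(wi|Ham)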
# Lemmatizer used by the classifier below (assumption: the WordNet lemmatizer, matching the 'wordnet' download above)
lemmatizer = nltk.WordNetLemmatizer()

def sms_classify(message):
    '''
    Takes as input a new sms (w1, w2, ..., wn),
    calculates P(Spam|w1, w2, ..., wn) and P(Ham|w1, w2, ..., wn),
    compares them and prints whether the message is spam or not.
    '''
    # Remove punctuation, collapse all whitespace (spaces, line breaks, tabs) into
    # a single space & eliminate any leading/trailing whitespace.
    message = re.sub(r'[^\w\d\s]', ' ', message)
    message = re.sub(r'\s+', ' ', message)
    message = re.sub(r'^\s+|\s+?$', '', message)
    # Lemmatization
    message = ' '.join(lemmatizer.lemmatize(term, pos='v') for term in message.split())
    # Stemming
    message = ' '.join(porter.stem(term) for term in message.split())
    # Tokenization
    message = message.split()
    # Start from the priors and multiply in P(wi|Spam) / P(wi|Ham) for every known word
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)
    # Compare the two scores and report the verdict
    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal probabilities, have a human classify this!')
print("Test with message: Hey, Sign up with this promo code and get your card for
amazing exchange fees abroad and £5 to spend anywhere! Promocode: D48KV7BN")
sms_classify('''Hey, Sign up with this promo code and get your card for amazing
exchange fees abroad and £5 to spend anywhere! Promocode:
D48KV7BN''')
print('Test with message: Okey Stan! Seems to be a reasonable amount of money. Ill
think of it and let you know ASAP.')
# Define the classify() function again, this time returning the label instead of printing the probabilities
def sms_classify_test_set(message):
    '''
    Takes as input a new sms (w1, w2, ..., wn),
    calculates P(Spam|w1, w2, ..., wn) and P(Ham|w1, w2, ..., wn),
    compares them and returns the spam or ham label, respectively.
    '''
    # Remove punctuation, collapse all whitespace (spaces, line breaks, tabs) into
    # a single space & eliminate any leading/trailing whitespace.
    message = re.sub(r'[^\w\d\s]', ' ', message)
    message = re.sub(r'\s+', ' ', message)
    message = re.sub(r'^\s+|\s+?$', '', message)
    # Lemmatization
    message = ' '.join(lemmatizer.lemmatize(term, pos='v') for term in message.split())
    # Stemming
    message = ' '.join(porter.stem(term) for term in message.split())
    # Tokenization
    message = message.split()
    # Start from the priors and multiply in P(wi|Spam) / P(wi|Ham) for every known word
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
    # Return the label with the higher score
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'
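# Classify every message in the test set and count correct predictions
# (assumed evaluation step; 'correct' and 'total' are used below but not computed earlier)
test_set['Predicted'] = test_set['SMS'].apply(sms_classify_test_set)
correct = (test_set['Predicted'] == test_set['Label']).sum()
total = test_set.shape[0]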
print('Results \n-------')
print('Valid:', correct)
print('Invalid:', total - correct)
print('Accuracy:', round(correct/total, 4))