# Code explanation
library(data.table)
library(caTools)
library(caret)
# Load the SMS Spam Collection: a tab-separated file without a header row.
# quote = "" turns off quote handling so quotation marks inside messages
# are read as ordinary characters.
sms_data <- fread(
  "E:/2024/AI/ML/project2/SMS/SMSSpamCollection",
  sep = "\t",
  quote = "",
  header = FALSE
)
names(sms_data) <- c("label", "text")

# Train/test split: 80% of the rows for training, the remaining 20% for
# testing. The fixed seed makes the split reproducible.
set.seed(202)
split <- sample.split(sms_data$label, SplitRatio = 0.8)
test_data <- sms_data[!split, ]
train_data <- sms_data[split, ]
# ngram_counts is a table of every n-gram (here: trigram) found in the text,
# together with the number of times it occurs (see the example table).
# In that example, "he" appears twice and every other n-gram appears once.
# total is the sum of all n-gram counts.
# probabilities = ngram_counts / total is a table holding the probability of
# each trigram. Finally, probabilities and total are returned together in a
# list (see the example of what the list looks like).
# Length of the character n-grams used by the model (3 = trigrams).
n <- 3

#' Estimate character n-gram probabilities from a set of messages
#'
#' Slides a window of `n` characters over every entry of `data$text`, counts
#' each distinct n-gram and converts the counts to relative frequencies.
#' Entries that are `NA` or shorter than `n` characters contribute nothing.
#'
#' @param data A data frame (or data.table) with a text column named `text`.
#' @param n Length of the n-grams, in characters.
#' @return A list with `probabilities` (a table of relative frequencies,
#'   named by n-gram) and `total` (the total number of n-grams counted).
generate_ngram_prob <- function(data, n) {
  texts <- as.character(data$text)
  texts <- texts[!is.na(texts)]
  ngrams <- unlist(lapply(texts, function(txt) {
    # seq_len() gives an empty sequence when the text is shorter than n;
    # seq() would count downwards (e.g. 1, 0) and emit bogus fragments.
    starts <- seq_len(max(nchar(txt) - n + 1, 0))
    substring(txt, starts, starts + n - 1)
  }), use.names = FALSE)
  ngram_counts <- table(ngrams)
  total <- sum(ngram_counts)
  probabilities <- ngram_counts / total
  list(probabilities = probabilities, total = total)
}
# Prediction function
# vocab_size is the total number of possible n-grams.
# It assumes that each character can take one of 40 different values
# (letters, digits and some special symbols), which gives a total of 40^n
# possible combinations.
# Assumed number of distinct n-grams (40 possible values per character
# position); used in the fallback term for unseen n-grams below.
vocab_size <- 40^n
# Running log-probability totals, one per class, both starting at 0.
log_prob_ham <- 0
log_prob_spam <- 0
# NOTE(review): `ngrams` and `ngram_ham` are defined outside this excerpt --
# presumably the n-grams of the message being classified and the result of
# generate_ngram_prob() on the ham training messages; confirm.
for (ng in ngrams) {
if (ng %in% names(ngram_ham$probabilities)) {
# Seen n-gram: add the log of its stored probability.
log_prob_ham <- log_prob_ham + log(ngram_ham$probabilities[ng])
} else {
# Unseen n-gram: add the log of the fallback value
# 0.5 / (total + 0.5 * vocab_size), which keeps the log finite.
log_prob_ham <- log_prob_ham + log(0.5 / (ngram_ham$total + 0.5 *
vocab_size))
}
# Evaluation metrics pulled from the two confusion-matrix objects.

# Overall accuracy, read from the ham-referenced confusion matrix.
accuracy <- conf_matrix_ham$overall["Accuracy"]

# Ham-class scores.
precision_ham <- conf_matrix_ham$byClass["Precision"]
recall_ham <- conf_matrix_ham$byClass["Recall"]
f1_ham <- conf_matrix_ham$byClass["F1"]

# Spam-class scores.
precision_spam <- conf_matrix_spam$byClass["Precision"]
recall_spam <- conf_matrix_spam$byClass["Recall"]
f1_spam <- conf_matrix_spam$byClass["F1"]
# Summary table of the metrics: one row per class plus a macro-average row,
# with every score rounded to four decimal places.
metrics_df <- data.frame(
  Class = c("Ham", "Spam", "Macroaverage"),
  Precision = round(
    c(precision_ham, precision_spam, macroaverage_precision),
    digits = 4
  ),
  Recall = round(
    c(recall_ham, recall_spam, macroaverage_recall),
    digits = 4
  ),
  F1_Score = round(
    c(f1_ham, f1_spam, macroaverage_f1),
    digits = 4
  )
)