Assignment-3 NLP
1. Data Preprocessing
Tokenization: Use spaCy to tokenize the text data. Ensure that only words appearing five or
more times are included in your vocabulary. Words that appear less frequently should be
assigned an "UNK" token.
Vocabulary Management: Construct a vocabulary from the tokenized data, mapping each
unique word to a unique index. Include a special "PAD" token for padding shorter sentences.
One-hot Encoding: Convert tokens in your sentences to one-hot encoded vectors based on the
vocabulary index.
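For reference, a minimal sketch of this step, assuming a vocab dictionary like the one built in the starter code below (note that the LSTM path in that code instead feeds integer indices into an embedding layer):
import torch
def one_hot_encode(token_ids, vocab_size):
    # token_ids: list of vocabulary indices for one padded sentence
    # returns a [seq_len, vocab_size] float tensor with a 1 at each token's index
    ids = torch.tensor(token_ids, dtype=torch.long)
    return torch.nn.functional.one_hot(ids, num_classes=vocab_size).float()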
2. Model Architecture
Input Layer: Takes in one-hot encoded vectors over the vocabulary; the input size is fixed by
padding or truncating every sentence to the maximum length in your data.
Hidden Layers: Two hidden layers with 256 and 128 neurons respectively.
Activation Functions: You can use ReLU or Tanh between layers to introduce non-linearity.
Output Layer: Size of this layer depends on the number of classes (2 for binary, more for multi-
class).
Model Type: Use LSTM due to its effectiveness in handling long sequences.
Structure: Similar to FFNN in terms of input, but the sequential data is processed through LSTM
units. The final state of the LSTM is used to predict the sentiment.
Output Layer: Configured similarly to the FFNN's output layer, adjusted for the number of
classes.
3. Loss Functions
Use cross-entropy loss for both models (nn.CrossEntropyLoss in the starter code below).
4. Training and Evaluation
Metrics: Track accuracy, precision, recall, and F1-score. Utilize validation sets to tune your
models and save the best-performing models.
5. Submission
Package your code, logs, and a detailed report into a .zip file named appropriately with your roll
number and assignment number.
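One possible way to package everything using only the Python standard library (the 'submission' folder and the ROLLNO placeholder are illustrative, not prescribed by the assignment):
import shutil
# Assumes code, logs, and the report are collected in a local 'submission/' folder;
# produces ROLLNO_assignment3.zip in the current directory.
shutil.make_archive('ROLLNO_assignment3', 'zip', 'submission')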
6. Additional Notes
The starter code below walks through one possible end-to-end implementation of the pipeline described above.
import os
import tarfile
import urllib.request
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
# Load English tokenizer from spaCy
nlp = spacy.load("en_core_web_sm")
# Download and extract IMDB dataset if not already available
def download_extract(url, download_path, extract_path):
    if not os.path.exists(extract_path):
        print("Downloading and extracting dataset...")
        urllib.request.urlretrieve(url, download_path)
        with tarfile.open(download_path, "r:gz") as tar:
            tar.extractall()
        print("Dataset ready.")
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
download_path = "./aclImdb_v1.tar.gz"
extract_path = "./aclImdb"
download_extract(url, download_path, extract_path)
# Load data from files
def load_data(path):
    data = []
    for label in ["pos", "neg"]:
        directory = os.path.join(path, label)
        for filename in os.listdir(directory):
            if filename.endswith(".txt"):
                with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                    content = file.read().strip()
                data.append((content, 1 if label == "pos" else 0))
    return data
train_data = load_data(os.path.join(extract_path, "train"))
test_data = load_data(os.path.join(extract_path, "test"))
# Tokenization and Vocabulary Management
def tokenize(text):
    return [token.text for token in nlp.tokenizer(text)]

def build_vocab(data, min_freq=5):
    counter = Counter()
    for text, _ in data:
        counter.update(tokenize(text))
    # Reserve index 0 for unknown words, then assign contiguous indices to every
    # word appearing at least min_freq times; infrequent words fall back to 'UNK'
    # at encoding time.
    vocab = {'UNK': 0}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    vocab['PAD'] = len(vocab)  # padding token gets the last index
    return vocab

vocab = build_vocab(train_data)

def encode_sentence(sentence, vocab):
    return [vocab.get(token, vocab['UNK']) for token in tokenize(sentence)]

def pad_encoded(encoded_sentence, max_length):
    # Truncate to max_length, then pad with the module-level vocab's PAD index
    return encoded_sentence[:max_length] + [vocab['PAD']] * (max_length - len(encoded_sentence))
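# A quick usage example (the sentence and length are only illustrations):
sample = "This movie was surprisingly good!"
encoded = encode_sentence(sample, vocab)   # list of vocabulary indices
padded = pad_encoded(encoded, 10)          # truncated/padded to length 10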
class SentimentDataset(Dataset):
    def __init__(self, data, vocab, max_length=512):
        self.data = data
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, label = self.data[idx]
        encoded_text = encode_sentence(text, self.vocab)
        padded_text = pad_encoded(encoded_text, self.max_length)
        return padded_text, label
# Collate function for DataLoader
def collate_fn(batch):
    texts, labels = zip(*batch)
    texts = torch.tensor(texts, dtype=torch.long)
    labels = torch.tensor(labels)
    return texts, labels
# DataLoaders
train_dataset = SentimentDataset(train_data, vocab, 512)
test_dataset = SentimentDataset(test_data, vocab, 512)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
# Model Definitions
class FFNN(nn.Module):
    def __init__(self, vocab_size, hidden_dim1=256, hidden_dim2=128, num_classes=2):
        super(FFNN, self).__init__()
        self.fc1 = nn.Linear(vocab_size, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        # x is a LongTensor of token indices [batch, seq_len]; turn it into a
        # multi-hot bag-of-words vector over the vocabulary so it matches fc1's
        # input size (the PAD index becomes a constant feature).
        bow = torch.zeros(x.size(0), self.fc1.in_features, device=x.device)
        bow.scatter_(1, x, 1.0)
        x = self.relu(self.fc1(bow))
        x = self.relu(self.fc2(x))
        return self.fc3(x)
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=256, num_classes=2):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        _, (hn, _) = self.lstm(x)
        # Use the final hidden state of the (single) LSTM layer for classification
        return self.fc(hn[-1])
# Instantiate models
model_ffnn = FFNN(len(vocab))
model_lstm = LSTMModel(len(vocab))
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer_ffnn = optim.Adam(model_ffnn.parameters(), lr=0.001)
optimizer_lstm = optim.Adam(model_lstm.parameters(), lr=0.001)
# Training and evaluation functions
def train_model(model, loader, optimizer):
    model.train()
    total_loss = 0
    for inputs, labels in loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)
def evaluate_model(model, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total
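# The evaluation above reports only accuracy; the assignment also asks for
# precision, recall, and F1. A minimal sketch, assuming scikit-learn is
# installed (it is not imported elsewhere in this starter code):
def evaluate_metrics(model, loader):
    from sklearn.metrics import precision_recall_fscore_support
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for inputs, labels in loader:
            preds = model(inputs).argmax(dim=1)
            y_true.extend(labels.tolist())
            y_pred.extend(preds.tolist())
    # average='binary' treats label 1 ("pos") as the positive class
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return precision, recall, f1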
# Training and evaluation loop
num_epochs = 10
for epoch in range(num_epochs):
    train_loss_ffnn = train_model(model_ffnn, train_loader, optimizer_ffnn)
    train_loss_lstm = train_model(model_lstm, train_loader, optimizer_lstm)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss FFNN: {train_loss_ffnn}, Train Loss LSTM: {train_loss_lstm}')
accuracy_ffnn = evaluate_model(model_ffnn, test_loader)
accuracy_lstm = evaluate_model(model_lstm, test_loader)
print(f'Test Accuracy FFNN: {accuracy_ffnn*100:.2f}%, Test Accuracy LSTM: {accuracy_lstm*100:.2f}%')
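# The assignment also asks to use a validation set for tuning and to save the
# best-performing models; the loop above trains on the full training set and
# never checkpoints. A minimal sketch of one way to do this (the 90/10 split
# and the file name are arbitrary choices, not specified by the assignment):
from torch.utils.data import random_split
val_size = len(train_dataset) // 10
train_subset, val_subset = random_split(train_dataset, [len(train_dataset) - val_size, val_size])
train_loader_sub = DataLoader(train_subset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_subset, batch_size=32, shuffle=False, collate_fn=collate_fn)
best_val_acc = 0.0
for epoch in range(num_epochs):
    train_model(model_lstm, train_loader_sub, optimizer_lstm)
    val_acc = evaluate_model(model_lstm, val_loader)
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model_lstm.state_dict(), 'best_lstm.pt')  # keep the best checkpoint so far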