
IRT Lab Programs

The document outlines a series of labs focused on data extraction, term weighting, text processing, neural network implementation, and scalable indexing using Python and various libraries. Each lab includes steps for uploading text files, processing data, and implementing machine learning models, with specific code examples provided. The labs emphasize practical applications of natural language processing and machine learning techniques.


LAB-3_Extraction of Raw Data

from google.colab import files


from nltk.tokenize import word_tokenize
import nltk

# Step 1: Upload the file


print("Upload your .txt file:")
uploaded = files.upload()

# Step 2: Retrieve the file name


filename = list(uploaded.keys())[0]
print(f"File {filename} uploaded successfully!")

# Step 3: Read the file


print("\nReading the file...")
with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
    raw_data = file.read()

print("\nExtracted Raw Data:")


print(raw_data)

# Step 4: Convert Raw Data into Tokens


print("\nConverting Raw Data into Tokens...")

# Download NLTK punkt tokenizer


try:
    nltk.download('punkt')
except Exception as e:
    print(f"Error downloading punkt tokenizer: {e}")

# Tokenize the raw data


tokens = word_tokenize(raw_data)

print("\nTokens:")
print(tokens)

Additionally, newer NLTK releases may also require the punkt_tab resource; if word_tokenize raises a LookupError, run the following as well:


import nltk
nltk.download('punkt_tab')
LAB-4_Implementation of Term Weighting

from sklearn.feature_extraction.text import TfidfVectorizer


# Sample documents
documents = [
    "Information retrieval is the process of finding relevant information.",
    "The retrieval process involves techniques like term weighting and ranking.",
    "Term weighting methods like TF-IDF are used in information retrieval."
]
# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)
# Get the feature names (terms)
terms = vectorizer.get_feature_names_out()
# Convert the matrix to a dense array and print it
tfidf_dense = tfidf_matrix.toarray()
# Display TF-IDF weights
print("TF-IDF Weights:")
for i, doc in enumerate(tfidf_dense):
    print(f"\nDocument {i + 1}:")
    for term, weight in zip(terms, doc):
        if weight > 0:
            print(f"  {term}: {weight:.3f}")

_________________________________
LAB-5_Implementation of Text Processing Model
# Import necessary libraries
from google.colab import files
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Download required NLTK data


nltk.download('punkt')
nltk.download('stopwords')

# Step 1: Upload the .txt file


print("Upload your .txt file:")
uploaded = files.upload()

# Retrieve the file name


filename = list(uploaded.keys())[0]
print(f"\nFile {filename} uploaded successfully!")

# Step 2: Read the file


with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
    raw_data = file.read()

print("\nExtracted Raw Data:")


print(raw_data)

# Step 3: Preprocess the text


def preprocess_text(text):
    """
    Preprocess the text data:
    1. Tokenize the text
    2. Normalize tokens to lowercase (keeping only alphanumeric tokens)
    3. Remove stopwords
    4. Apply stemming
    """
    # Initialize tools
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Normalize: convert to lowercase and keep only alphanumeric tokens
    tokens = [token.lower() for token in tokens if token.isalnum()]

    # Remove stopwords
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Apply stemming
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    return stemmed_tokens

# Preprocess the extracted raw data


processed_tokens = preprocess_text(raw_data)

print("\nProcessed Tokens:")
print(processed_tokens)
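
As a quick sanity check (an extra step, not part of the lab), the function can also be called on a short hard-coded sentence; with NLTK's Porter stemmer the output is approximately as shown in the comment:

sample_text = "Information retrieval systems are running."
print(preprocess_text(sample_text))
# Approximate output: ['inform', 'retriev', 'system', 'run']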

_________________________________
LAB-6: Implementation of Neural Network Model

# Import necessary libraries


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from google.colab import files
import numpy as np

# Step 1: Upload a .txt file


print("Upload your .txt file (with labeled data, e.g., text,label):")
uploaded = files.upload()

# Retrieve the file name


filename = list(uploaded.keys())[0]
print(f"\nFile {filename} uploaded successfully!")

# Step 2: Read and process the file


texts, labels = [], []
with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
    for line in file:
        parts = line.strip().split(",")
        if len(parts) == 2:  # Ensure there is both text and label
            text, label = parts[0].strip(), parts[1].strip()
            try:
                labels.append(int(label))  # Convert label to integer
                texts.append(text)
            except ValueError:
                print(f"Skipping invalid label: {label}")
        else:
            print(f"Skipping malformed line: {line.strip()}")

# Validate the data


if len(texts) < 2:
    raise ValueError("Not enough data. Ensure the file contains at least 2 valid text-label pairs.")
# Display sample texts and labels
print("\nSample Texts and Labels:")
print(texts[:5], labels[:5])

# Step 3: Tokenize the text


tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

# Pad sequences to ensure uniform input size


max_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Convert labels to a NumPy array


labels = np.array(labels)

# Step 4: Split data into training and testing sets


train_size = int(len(texts) * 0.8)
if train_size == 0 or len(texts) - train_size == 0:
    raise ValueError("Insufficient data for splitting. Ensure there are enough samples.")

train_data = padded_sequences[:train_size]
train_labels = labels[:train_size]
test_data = padded_sequences[train_size:]
test_labels = labels[train_size:]

print(f"\nTraining Samples: {len(train_data)}, Testing Samples:


{len(test_data)}")

# Step 5: Build a Neural Network Model


model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=max_length),
    LSTM(64, return_sequences=True),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])
# Step 6: Train the Model
print("\nTraining the model...")
batch_size = min(len(train_data), 32)  # Ensure batch_size is not larger than the training set
history = model.fit(train_data, train_labels, epochs=5,
                    validation_data=(test_data, test_labels), batch_size=batch_size)

# Step 7: Evaluate the Model


print("\nEvaluating the model...")
loss, accuracy = model.evaluate(test_data, test_labels)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

# Step 8: Make Predictions


print("\nMaking predictions on test data...")
predictions = (model.predict(test_data) > 0.5).astype("int32")

print("\nSample Predictions:")
for i in range(min(5, len(test_data))):
    print(f"Text: {texts[train_size + i]}, Actual Label: {test_labels[i]}, Predicted Label: {predictions[i][0]}")
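
An optional extension (not part of the lab code) is classifying a brand-new sentence with the trained model; the text only needs to pass through the same tokenizer and padding used for the training data. The example sentence below is hypothetical:

new_text = "I really enjoyed using this"  # hypothetical example sentence
new_sequence = tokenizer.texts_to_sequences([new_text])
new_padded = pad_sequences(new_sequence, maxlen=max_length, padding='post')
new_prediction = (model.predict(new_padded) > 0.5).astype("int32")
print(f"Predicted label for '{new_text}': {new_prediction[0][0]}")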

.TXT file content (one text,label pair per line):

I love this product,1
This is the worst experience ever,0
Absolutely fantastic! Highly recommend.,1
Not worth the money.,0
—-----------------------------------------------------
Note: Following the same format, you can create your own text file and run the program.
################ LAB-7 ##########################

# Step 1: Install necessary libraries


!pip install tqdm --quiet # For progress bars

# Step 2: Upload a sample text file


from google.colab import files

print("Upload a sample text file for indexing (e.g., a .txt file).")


uploaded_files = files.upload()

# Read the uploaded file


file_content = None
file_name = list(uploaded_files.keys())[0]
with open(file_name, 'r', encoding='utf-8') as file:
    file_content = file.readlines()

# Step 3: Scalable Indexing Implementation


from collections import defaultdict
from tqdm import tqdm

class ScalableIndexer:
    def __init__(self):  # Corrected constructor
        self.index = defaultdict(list)  # Dictionary for word-to-line mapping

    def index_file(self, file_content):
        """Indexes a text file line by line."""
        for line_num, line in enumerate(tqdm(file_content, desc="Indexing lines")):
            words = line.strip().split()
            for word in words:
                # Convert words to lowercase for case-insensitive indexing
                self.index[word.lower()].append(line_num)

    def search(self, term):
        """Searches for a term in the indexed data."""
        term = term.lower()
        if term in self.index:
            return self.index[term]
        else:
            return []

# Step 4: Index the uploaded file


indexer = ScalableIndexer()
indexer.index_file(file_content)

# Step 5: Perform a search


search_term = input("Enter a word to search for: ")
search_results = indexer.search(search_term)

# Display results
if search_results:
    print(f"Found '{search_term}' in lines: {search_results}")
    print("\nLines containing the term:")
    for line_num in search_results:
        print(f"Line {line_num + 1}: {file_content[line_num].strip()}")
else:
    print(f"'{search_term}' not found in the file.")

INPUT:

Create your own .txt file with a few lines of text, upload it when prompted, and then search for a term that appears in the file.
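
For example, a small file such as the following (hypothetical content, one sentence per line) can be used; each line is indexed separately:

Information retrieval deals with searching documents.
Indexing makes retrieval fast and scalable.
Search engines rely on an inverted index.

Searching for "retrieval" should then report lines 1 and 2.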
