
IRT Lab Programs

The document outlines a series of labs focused on data extraction, term weighting, text processing, neural network implementation, and scalable indexing using Python and various libraries. Each lab includes steps for uploading text files, processing data, and implementing machine learning models, with specific code examples provided. The labs emphasize practical applications of natural language processing and machine learning techniques.


LAB-3_Extraction of Raw Data

from google.colab import files


from nltk.tokenize import word_tokenize
import nltk

# Step 1: Upload the file


print("Upload your .txt file:")
uploaded = files.upload()

# Step 2: Retrieve the file name


filename = list(uploaded.keys())[0]
print(f"File {filename} uploaded successfully!")

# Step 3: Read the file


print("\nReading the file...")
with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
    raw_data = file.read()

print("\nExtracted Raw Data:")


print(raw_data)

# Step 4: Convert Raw Data into Tokens


print("\nConverting Raw Data into Tokens...")

# Download NLTK punkt tokenizer


try:
    nltk.download('punkt')
except Exception as e:
    print(f"Error downloading punkt tokenizer: {e}")

# Tokenize the raw data


tokens = word_tokenize(raw_data)

print("\nTokens:")
print(tokens)

Additionally, newer NLTK releases may also require the punkt_tab resource; if word_tokenize raises a LookupError, run the following as well:


import nltk
nltk.download('punkt_tab')
LAB-4_Implementation of Term Weighting

from sklearn.feature_extraction.text import TfidfVectorizer


# Sample documents
documents = [
    "Information retrieval is the process of finding relevant information.",
    "The retrieval process involves techniques like term weighting and ranking.",
    "Term weighting methods like TF-IDF are used in information retrieval."
]
# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)
# Get the feature names (terms)
terms = vectorizer.get_feature_names_out()
# Convert the matrix to a dense array and print it
tfidf_dense = tfidf_matrix.toarray()
# Display TF-IDF weights
print("TF-IDF Weights:")
for i, doc in enumerate(tfidf_dense):
    print(f"\nDocument {i + 1}:")
    for term, weight in zip(terms, doc):
        if weight > 0:
            print(f"  {term}: {weight:.3f}")

_________________________________
LAB-5_Implementation of Text Processing Model
# Import necessary libraries
from google.colab import files
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Download required NLTK data


nltk.download('punkt')
nltk.download('stopwords')

# Step 1: Upload the .txt file


print("Upload your .txt file:")
uploaded = files.upload()

# Retrieve the file name


filename = list(uploaded.keys())[0]
print(f"\nFile {filename} uploaded successfully!")

# Step 2: Read the file


with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
    raw_data = file.read()

print("\nExtracted Raw Data:")


print(raw_data)

# Step 3: Preprocess the text


def preprocess_text(text):
    """
    Preprocess the text data:
    1. Tokenize the text
    2. Normalize tokens to lowercase (keeping only alphanumeric tokens)
    3. Remove stopwords
    4. Apply stemming
    """
    # Initialize tools
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Normalize: convert to lowercase and keep only alphanumeric tokens
    tokens = [token.lower() for token in tokens if token.isalnum()]

    # Remove stopwords
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Apply stemming
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    return stemmed_tokens

# Preprocess the extracted raw data


processed_tokens = preprocess_text(raw_data)

print("\nProcessed Tokens:")
print(processed_tokens)
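
As a quick sanity check (an extra step, not part of the lab), the function can also be called on a short hard-coded sentence; with NLTK's Porter stemmer the output is approximately as shown in the comment:

sample_text = "Information retrieval systems are running."
print(preprocess_text(sample_text))
# Approximate output: ['inform', 'retriev', 'system', 'run']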

_________________________________
LAB-6: Implementation of Neural Network Model

# Import necessary libraries


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from google.colab import files
import numpy as np

# Step 1: Upload a .txt file


print("Upload your .txt file (with labeled data, e.g., text,label):")
uploaded = files.upload()

# Retrieve the file name


filename = list(uploaded.keys())[0]
print(f"\nFile {filename} uploaded successfully!")

# Step 2: Read and process the file


texts, labels = [], []
with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
    for line in file:
        parts = line.strip().split(",")
        if len(parts) == 2:  # Ensure there is both text and label
            text, label = parts[0].strip(), parts[1].strip()
            try:
                labels.append(int(label))  # Convert label to integer
                texts.append(text)
            except ValueError:
                print(f"Skipping invalid label: {label}")
        else:
            print(f"Skipping malformed line: {line.strip()}")

# Validate the data


if len(texts) < 2:
    raise ValueError("Not enough data. Ensure the file contains at least 2 valid text-label pairs.")
# Display sample texts and labels
print("\nSample Texts and Labels:")
print(texts[:5], labels[:5])

# Step 3: Tokenize the text


tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

# Pad sequences to ensure uniform input size


max_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Convert labels to a NumPy array


labels = np.array(labels)

# Step 4: Split data into training and testing sets


train_size = int(len(texts) * 0.8)
if train_size == 0 or len(texts) - train_size == 0:
    raise ValueError("Insufficient data for splitting. Ensure there are enough samples.")

train_data = padded_sequences[:train_size]
train_labels = labels[:train_size]
test_data = padded_sequences[train_size:]
test_labels = labels[train_size:]

print(f"\nTraining Samples: {len(train_data)}, Testing Samples:


{len(test_data)}")

# Step 5: Build a Neural Network Model


model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=max_length),
    LSTM(64, return_sequences=True),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])
# Step 6: Train the Model
print("\nTraining the model...")
batch_size = min(len(train_data), 32)  # Ensure batch_size is not larger than the training set
history = model.fit(train_data, train_labels, epochs=5,
                    validation_data=(test_data, test_labels), batch_size=batch_size)

# Step 7: Evaluate the Model


print("\nEvaluating the model...")
loss, accuracy = model.evaluate(test_data, test_labels)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

# Step 8: Make Predictions


print("\nMaking predictions on test data...")
predictions = (model.predict(test_data) > 0.5).astype("int32")

print("\nSample Predictions:")
for i in range(min(5, len(test_data))):
    print(f"Text: {texts[train_size + i]}, Actual Label: {test_labels[i]}, Predicted Label: {predictions[i][0]}")
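
An optional extension (not part of the lab code) is classifying a brand-new sentence with the trained model; the text only needs to pass through the same tokenizer and padding used for the training data. The example sentence below is hypothetical:

new_text = "I really enjoyed using this"  # hypothetical example sentence
new_sequence = tokenizer.texts_to_sequences([new_text])
new_padded = pad_sequences(new_sequence, maxlen=max_length, padding='post')
new_prediction = (model.predict(new_padded) > 0.5).astype("int32")
print(f"Predicted label for '{new_text}': {new_prediction[0][0]}")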

.TXT file content (one text,label pair per line):

I love this product,1
This is the worst experience ever,0
Absolutely fantastic! Highly recommend.,1
Not worth the money.,0
—-----------------------------------------------------
Note: Following the same format, you can create your own text file and run the program.
################ LAB-7 ##########################

# Step 1: Install necessary libraries


!pip install tqdm --quiet # For progress bars

# Step 2: Upload a sample text file


from google.colab import files

print("Upload a sample text file for indexing (e.g., a .txt file).")


uploaded_files = files.upload()

# Read the uploaded file


file_content = None
file_name = list(uploaded_files.keys())[0]
with open(file_name, 'r', encoding='utf-8') as file:
    file_content = file.readlines()

# Step 3: Scalable Indexing Implementation


from collections import defaultdict
from tqdm import tqdm

class ScalableIndexer:
    def __init__(self):  # Corrected constructor
        self.index = defaultdict(list)  # Dictionary for word-to-line mapping

    def index_file(self, file_content):
        """Indexes a text file line by line."""
        for line_num, line in enumerate(tqdm(file_content, desc="Indexing lines")):
            words = line.strip().split()
            for word in words:
                # Convert words to lowercase for case-insensitive indexing
                self.index[word.lower()].append(line_num)

    def search(self, term):
        """Searches for a term in the indexed data."""
        term = term.lower()
        if term in self.index:
            return self.index[term]
        else:
            return []

# Step 4: Index the uploaded file


indexer = ScalableIndexer()
indexer.index_file(file_content)

# Step 5: Perform a search


search_term = input("Enter a word to search for: ")
search_results = indexer.search(search_term)

# Display results
if search_results:
    print(f"Found '{search_term}' in lines: {search_results}")
    print("\nLines containing the term:")
    for line_num in search_results:
        print(f"Line {line_num + 1}: {file_content[line_num].strip()}")
else:
    print(f"'{search_term}' not found in the file.")

INPUT:

Create your own .txt file with a few lines of text, upload it when prompted, and then search for a term that appears in the file.
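
For example, a small file such as the following (hypothetical content, one sentence per line) can be used; each line is indexed separately:

Information retrieval deals with searching documents.
Indexing makes retrieval fast and scalable.
Search engines rely on an inverted index.

Searching for "retrieval" should then report lines 1 and 2.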
