DSBDA Assignment 7 — Text Analytics: Tokenization, Stop-word Removal, Stemming, Lemmatization, POS Tagging, and TF-IDF
['Tokenization', 'refers', 'to', 'break', 'down', 'the', 'text', 'into', 'smaller', 'units', '.']
['Tokenization refers to break down the text into smaller units .']
In [29]:
# Sample sentence that the next cell tokenizes with NLTK.
text = "This is a tokenize test"
In [30]:
from nltk.tokenize import word_tokenize
# Split the sentence into word tokens; the bare expression displays the list.
word_tokenize(text)
In [19]:
# Stop-word removal: filter common English function words out of a sentence.
text = "S&P and NASDAQ are the two most popular indices in US"
In [20]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Build the English stop-word set once, tokenize, then keep non-stop tokens.
# Note: the stop-word list is lowercase, so capitalized tokens pass through.
stop_words = set(stopwords.words('english'))
text_tokens = word_tokenize(text)
tokens_without_sw = [token for token in text_tokens if token not in stop_words]
print(tokens_without_sw)
In [10]:
# Stemming: reduce words to their stems with a rule-based stemmer.
text = "It's a Stemming testing"
In [11]:
# Tokenize the stemming example sentence before stemming each token.
parsed_text = word_tokenize(text)
In [22]:
# Initialize the Snowball (Porter2) stemmer for English.
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
In [26]:
from nltk.stem import PorterStemmer

# The original cell contained only pasted outputs ("play" x4), which would
# raise NameError on a fresh kernel. Reconstruct the demo that produced them:
# stemming several inflections of "play" all yields the stem "play".
porter = PorterStemmer()
for word in ['play', 'playing', 'plays', 'played']:
    print(porter.stem(word))
In [23]:
# Lemmatization: reduce words to their dictionary (lemma) form.
text = "This world has a lot of faces "
In [24]:
# Bug fix: the cell used TextBlob but only imported Word, which raises
# NameError on a fresh kernel. Import both (Word kept for compatibility).
from textblob import TextBlob, Word
# .words gives a WordList of Word objects, each supporting .lemmatize().
parsed_data = TextBlob(text).words
parsed_data
In [25]:
# Show only the (word, lemma) pairs where lemmatization changed the word.
# (The original indexed parsed_data[i], but that is the same Word as `word`.)
[(w, w.lemmatize()) for w in parsed_data if w != w.lemmatize()]
In [27]:
# POS tagging: label each token with its part of speech.
text = 'Google is looking at buying U.K. startup for $1 billion'
In [28]:
# Tag each token with Penn Treebank POS tags (e.g. NNP = proper noun,
# VBZ = 3rd-person singular verb), as shown in the output below.
TextBlob(text).tags
[('Google', 'NNP'),
Out[28]:
('is', 'VBZ'),
('looking', 'VBG'),
('at', 'IN'),
('buying', 'VBG'),
('U.K.', 'NNP'),
('startup', 'NN'),
('for', 'IN'),
('1', 'CD'),
('billion', 'CD')]
In [36]:
import pandas as pd
import numpy as np
In [8]:
# Small corpus of three documents for the hand-rolled TF-IDF example.
corpus = ['data science is one of the most important fields of science',
'this is one of the best data science courses',
'data scientists analyze data' ]
In [9]:
# NOTE(review): words_set is never populated or read in the visible cells —
# looks like a leftover from an abandoned approach; confirm before removing.
words_set = set()
In [12]:
import math
from collections import Counter
def calculate_tf(text):
    """Term frequency of every word in `text`.

    Splits on whitespace and returns {word: count / total_words}.
    An empty string yields an empty dict.
    """
    tokens = text.split()
    total = len(tokens)
    counts = Counter(tokens)
    return {token: occurrences / total for token, occurrences in counts.items()}
In [13]:
def calculate_idf(documents):
    """Inverse document frequency for every word across `documents`.

    Uses log(N / (df + 1)) with df = number of documents containing the
    word. The +1 smoothing means a word appearing in every document gets
    a slightly negative IDF (visible as 'the' < 0 in the output below);
    division by zero cannot actually occur since df >= 1 for every word
    that reaches this formula.
    """
    total_docs = len(documents)
    doc_freq = {}
    for doc in documents:
        for word in set(doc.split()):
            doc_freq[word] = doc_freq.get(word, 0) + 1
    return {word: math.log(total_docs / (count + 1))
            for word, count in doc_freq.items()}
In [14]:
def calculate_tfidf(tf, idf):
    """Combine TF and IDF: {word: tf * idf}, using 0 for words absent from idf."""
    return {term: frequency * idf.get(term, 0) for term, frequency in tf.items()}
In [15]:
def represent_document(document, idf):
    """TF-IDF vector (as a word->score dict) for one document, given corpus IDF."""
    return calculate_tfidf(calculate_tf(document), idf)
In [16]:
# Toy corpus; note "document." and "document?" are distinct tokens because
# calculate_tf splits on whitespace only (no punctuation stripping).
documents = [
"This is the first document.",
"This document is the second document.",
"And this is the third one.",
"Is this the first document?",
]
# Corpus-level IDF, then the TF-IDF representation of the first document.
idf = calculate_idf(documents)
document_representation = represent_document(documents[0], idf)
print(document_representation)
{'This': 0.05753641449035617, 'is': 0.0, 'the': -0.044628710262841945, 'first': 0.05753641449035617, 'document.': 0.05753641449035617}
In [ ]: