major project text classification code
major project text classification code
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "3ce9b931",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "936f56e6",
"metadata": {},
"outputs": [],
"source": [
"temp_df = pd.read_csv('IMDB Dataset.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9db1830b",
"metadata": {},
"outputs": [],
"source": [
"df = temp_df.iloc[:10000]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4f794ceb",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>One of the other reviewers has mentioned that ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A wonderful little production. <br /><br
/>The...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I thought this was a wonderful way to spend ti...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Basically there's a family where a little boy ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" review sentiment\n",
"0 One of the other reviewers has mentioned that ... positive\n",
"1 A wonderful little production. <br /><br />The... positive\n",
"2 I thought this was a wonderful way to spend ti... positive\n",
"3 Basically there's a family where a little boy ... negative\n",
"4 Petter Mattei's \"Love in the Time of Money\" is... positive"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c31fb116",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'A wonderful little production. <br /><br />The filming technique is very
unassuming- very old-time-BBC fashion and gives a comforting, and sometimes
discomforting, sense of realism to the entire piece. <br /><br />The actors are
extremely well chosen- Michael Sheen not only \"has got all the polari\" but he has
all the voices down pat too! You can truly see the seamless editing guided by the
references to Williams\\' diary entries, not only is it well worth the watching but
it is a terrificly written and performed piece. A masterful production about one of
the great master\\'s of comedy and his life. <br /><br />The realism really comes
home with the little things: the fantasy of the guard which, rather than use the
traditional \\'dream\\' techniques remains solid then disappears. It plays on our
knowledge and our senses, particularly with the scenes concerning Orton and
Halliwell and the sets (particularly of their flat with Halliwell\\'s murals
decorating every surface) are terribly well done.'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['review'][1]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "18525cb6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"positive 5028\n",
"negative 4972\n",
"Name: sentiment, dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['sentiment'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f207966e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"review 0\n",
"sentiment 0\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ea02910b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"17"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.duplicated().sum()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "2254cc43",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-9-16cdb8520be8>:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-v
iew-versus-a-copy\n",
" df.drop_duplicates(inplace=True)\n"
]
}
],
"source": [
"df.drop_duplicates(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "8bbdbf38",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.duplicated().sum()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "b838d547",
"metadata": {},
"outputs": [],
"source": [
"# Basic Preprocessing\n",
"# Remove tags\n",
"# lowercase\n",
"# remove stopwords"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "ecb881b9",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"def remove_tags(raw_text):\n",
" cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)\n",
" return cleaned_text"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "670ecc96",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-13-00e93c2a3043>:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-v
iew-versus-a-copy\n",
" df['review'] = df['review'].apply(remove_tags)\n"
]
}
],
"source": [
"df['review'] = df['review'].apply(remove_tags)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "b21551de",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>One of the other reviewers has mentioned that ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A wonderful little production. The filming tec...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>I thought this was a wonderful way to spend ti...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Basically there's a family where a little boy ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9995</th>\n",
" <td>Fun, entertaining movie about WWII German spy ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9996</th>\n",
" <td>Give me a break. How can anyone say that this ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9997</th>\n",
" <td>This movie is a bad movie. But after watching ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9998</th>\n",
" <td>This is a movie that was probably made to ente...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9999</th>\n",
" <td>Smashing film about film-making. Shows the int...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>9983 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" review sentiment\n",
"0 One of the other reviewers has mentioned that ... positive\n",
"1 A wonderful little production. The filming tec... positive\n",
"2 I thought this was a wonderful way to spend ti... positive\n",
"3 Basically there's a family where a little boy ... negative\n",
"4 Petter Mattei's \"Love in the Time of Money\" is... positive\n",
"... ... ...\n",
"9995 Fun, entertaining movie about WWII German spy ... positive\n",
"9996 Give me a break. How can anyone say that this ... negative\n",
"9997 This movie is a bad movie. But after watching ... negative\n",
"9998 This is a movie that was probably made to ente... negative\n",
"9999 Smashing film about film-making. Shows the int... positive\n",
"\n",
"[9983 rows x 2 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "1e03f81e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-15-afca29351ee0>:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-v
iew-versus-a-copy\n",
" df['review'] = df['review'].apply(lambda x:x.lower())\n"
]
}
],
"source": [
"df['review'] = df['review'].apply(lambda x:x.lower())"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "bdc39311",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-16-9f68b5950da1>:5: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-v
iew-versus-a-copy\n",
" df['review'] = df['review'].apply(lambda x: [item for item in x.split() if
item not in sw_list]).apply(lambda x:\" \".join(x))\n"
]
}
],
"source": [
"from nltk.corpus import stopwords\n",
"\n",
"sw_list = stopwords.words('english')\n",
"\n",
"df['review'] = df['review'].apply(lambda x: [item for item in x.split() if
item not in sw_list]).apply(lambda x:\" \".join(x))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "eb3beebd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>one reviewers mentioned watching 1 oz episode ...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>wonderful little production. filming technique...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>thought wonderful way spend time hot summer we...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>basically there's family little boy (jake) thi...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>petter mattei's \"love time money\" visually stu...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9995</th>\n",
" <td>fun, entertaining movie wwii german spy (julie...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9996</th>\n",
" <td>give break. anyone say \"good hockey movie\"? kn...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9997</th>\n",
" <td>movie bad movie. watching endless series bad h...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9998</th>\n",
" <td>movie probably made entertain middle school, e...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9999</th>\n",
" <td>smashing film film-making. shows intense stran...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>9983 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" review sentiment\n",
"0 one reviewers mentioned watching 1 oz episode ... positive\n",
"1 wonderful little production. filming technique... positive\n",
"2 thought wonderful way spend time hot summer we... positive\n",
"3 basically there's family little boy (jake) thi... negative\n",
"4 petter mattei's \"love time money\" visually stu... positive\n",
"... ... ...\n",
"9995 fun, entertaining movie wwii german spy (julie... positive\n",
"9996 give break. anyone say \"good hockey movie\"? kn... negative\n",
"9997 movie bad movie. watching endless series bad h... negative\n",
"9998 movie probably made entertain middle school, e... negative\n",
"9999 smashing film film-making. shows intense stran... positive\n",
"\n",
"[9983 rows x 2 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "044b8b36",
"metadata": {},
"outputs": [],
"source": [
"X = df.iloc[:,0:1]\n",
"y = df['sentiment']"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "c2f50b0d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>one reviewers mentioned watching 1 oz episode ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>wonderful little production. filming technique...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>thought wonderful way spend time hot summer we...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>basically there's family little boy (jake) thi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>petter mattei's \"love time money\" visually stu...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9995</th>\n",
" <td>fun, entertaining movie wwii german spy (julie...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9996</th>\n",
" <td>give break. anyone say \"good hockey movie\"? kn...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9997</th>\n",
" <td>movie bad movie. watching endless series bad h...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9998</th>\n",
" <td>movie probably made entertain middle school, e...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9999</th>\n",
" <td>smashing film film-making. shows intense stran...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>9983 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" review\n",
"0 one reviewers mentioned watching 1 oz episode ...\n",
"1 wonderful little production. filming technique...\n",
"2 thought wonderful way spend time hot summer we...\n",
"3 basically there's family little boy (jake) thi...\n",
"4 petter mattei's \"love time money\" visually stu...\n",
"... ...\n",
"9995 fun, entertaining movie wwii german spy (julie...\n",
"9996 give break. anyone say \"good hockey movie\"? kn...\n",
"9997 movie bad movie. watching endless series bad h...\n",
"9998 movie probably made entertain middle school, e...\n",
"9999 smashing film film-making. shows intense stran...\n",
"\n",
"[9983 rows x 1 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "c6014ee0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 positive\n",
"1 positive\n",
"2 positive\n",
"3 negative\n",
"4 positive\n",
" ... \n",
"9995 positive\n",
"9996 negative\n",
"9997 negative\n",
"9998 negative\n",
"9999 positive\n",
"Name: sentiment, Length: 9983, dtype: object"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "fb01ea97",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"encoder = LabelEncoder()\n",
"\n",
"y = encoder.fit_transform(y)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "e49f7132",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 1, 1, ..., 0, 0, 1])"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "4a47c8cf",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"X_train,X_test,y_train,y_test =
train_test_split(X,y,test_size=0.2,random_state=1)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "59d30e1a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7986, 1)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train.shape"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "f0ced488",
"metadata": {},
"outputs": [],
"source": [
"# Applying BoW\n",
"from sklearn.feature_extraction.text import CountVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "d038ae3a",
"metadata": {},
"outputs": [],
"source": [
"cv = CountVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "847ace00",
"metadata": {},
"outputs": [],
"source": [
"X_train_bow = cv.fit_transform(X_train['review']).toarray()\n",
"X_test_bow = cv.transform(X_test['review']).toarray()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "98e12175",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7986, 48282)"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train_bow.shape"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "f937899f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GaussianNB()"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.naive_bayes import GaussianNB\n",
"gnb = GaussianNB()\n",
"\n",
"gnb.fit(X_train_bow,y_train)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "7164016f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6324486730095142"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_pred = gnb.predict(X_test_bow)\n",
"\n",
"from sklearn.metrics import accuracy_score,confusion_matrix\n",
"accuracy_score(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "65a01603",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[717, 235],\n",
" [499, 546]], dtype=int64)"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"confusion_matrix(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "6676b0e0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8527791687531296"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"rf = RandomForestClassifier()\n",
"\n",
"rf.fit(X_train_bow,y_train)\n",
"y_pred = rf.predict(X_test_bow)\n",
"accuracy_score(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "cc85e4b2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8372558838257386"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cv = CountVectorizer(max_features=3000)\n",
"\n",
"X_train_bow = cv.fit_transform(X_train['review']).toarray()\n",
"X_test_bow = cv.transform(X_test['review']).toarray()\n",
"\n",
"rf = RandomForestClassifier()\n",
"\n",
"rf.fit(X_train_bow,y_train)\n",
"y_pred = rf.predict(X_test_bow)\n",
"accuracy_score(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "c5602297",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8407611417125689"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cv = CountVectorizer(ngram_range=(1,2),max_features=5000)\n",
"\n",
"X_train_bow = cv.fit_transform(X_train['review']).toarray()\n",
"X_test_bow = cv.transform(X_test['review']).toarray()\n",
"\n",
"rf = RandomForestClassifier()\n",
"\n",
"rf.fit(X_train_bow,y_train)\n",
"y_pred = rf.predict(X_test_bow)\n",
"accuracy_score(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d0930ca1",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "f77c7784",
"metadata": {},
"source": [
"## Using TfIdf"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "6a4ad25b",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "7efd8b3f",
"metadata": {},
"outputs": [],
"source": [
"tfidf = TfidfVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "76e2d7ed",
"metadata": {},
"outputs": [],
"source": [
"X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()\n",
"X_test_tfidf = tfidf.transform(X_test['review'])"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "a89c3ed1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8482724086129194"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rf = RandomForestClassifier()\n",
"\n",
"rf.fit(X_train_tfidf,y_train)\n",
"y_pred = rf.predict(X_test_tfidf)\n",
"\n",
"accuracy_score(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c7e36ff",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "4bf6b986",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "faafe112",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 121,
"id": "4a261a8d",
"metadata": {},
"outputs": [],
"source": [
"import gensim"
]
},
{
"cell_type": "code",
"execution_count": 122,
"id": "4679427f",
"metadata": {},
"outputs": [],
"source": [
"from gensim.models import Word2Vec,KeyedVectors"
]
},
{
"cell_type": "code",
"execution_count": 125,
"id": "498f49f9",
"metadata": {},
"outputs": [],
"source": [
"model =
KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True,
limit=500000)"
]
},
{
"cell_type": "code",
"execution_count": 127,
"id": "9da8d022",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(300,)"
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model['cricket'].shape"
]
},
{
"cell_type": "code",
"execution_count": 150,
"id": "db251ffa",
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
"\n",
"sw_list = stopwords.words('english')"
]
},
{
"cell_type": "code",
"execution_count": 151,
"id": "60f16b9e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['i',\n",
" 'me',\n",
" 'my',\n",
" 'myself',\n",
" 'we',\n",
" 'our',\n",
" 'ours',\n",
" 'ourselves',\n",
" 'you',\n",
" \"you're\",\n",
" \"you've\",\n",
" \"you'll\",\n",
" \"you'd\",\n",
" 'your',\n",
" 'yours',\n",
" 'yourself',\n",
" 'yourselves',\n",
" 'he',\n",
" 'him',\n",
" 'his',\n",
" 'himself',\n",
" 'she',\n",
" \"she's\",\n",
" 'her',\n",
" 'hers',\n",
" 'herself',\n",
" 'it',\n",
" \"it's\",\n",
" 'its',\n",
" 'itself',\n",
" 'they',\n",
" 'them',\n",
" 'their',\n",
" 'theirs',\n",
" 'themselves',\n",
" 'what',\n",
" 'which',\n",
" 'who',\n",
" 'whom',\n",
" 'this',\n",
" 'that',\n",
" \"that'll\",\n",
" 'these',\n",
" 'those',\n",
" 'am',\n",
" 'is',\n",
" 'are',\n",
" 'was',\n",
" 'were',\n",
" 'be',\n",
" 'been',\n",
" 'being',\n",
" 'have',\n",
" 'has',\n",
" 'had',\n",
" 'having',\n",
" 'do',\n",
" 'does',\n",
" 'did',\n",
" 'doing',\n",
" 'a',\n",
" 'an',\n",
" 'the',\n",
" 'and',\n",
" 'but',\n",
" 'if',\n",
" 'or',\n",
" 'because',\n",
" 'as',\n",
" 'until',\n",
" 'while',\n",
" 'of',\n",
" 'at',\n",
" 'by',\n",
" 'for',\n",
" 'with',\n",
" 'about',\n",
" 'against',\n",
" 'between',\n",
" 'into',\n",
" 'through',\n",
" 'during',\n",
" 'before',\n",
" 'after',\n",
" 'above',\n",
" 'below',\n",
" 'to',\n",
" 'from',\n",
" 'up',\n",
" 'down',\n",
" 'in',\n",
" 'out',\n",
" 'on',\n",
" 'off',\n",
" 'over',\n",
" 'under',\n",
" 'again',\n",
" 'further',\n",
" 'then',\n",
" 'once',\n",
" 'here',\n",
" 'there',\n",
" 'when',\n",
" 'where',\n",
" 'why',\n",
" 'how',\n",
" 'all',\n",
" 'any',\n",
" 'both',\n",
" 'each',\n",
" 'few',\n",
" 'more',\n",
" 'most',\n",
" 'other',\n",
" 'some',\n",
" 'such',\n",
" 'no',\n",
" 'nor',\n",
" 'not',\n",
" 'only',\n",
" 'own',\n",
" 'same',\n",
" 'so',\n",
" 'than',\n",
" 'too',\n",
" 'very',\n",
" 's',\n",
" 't',\n",
" 'can',\n",
" 'will',\n",
" 'just',\n",
" 'don',\n",
" \"don't\",\n",
" 'should',\n",
" \"should've\",\n",
" 'now',\n",
" 'd',\n",
" 'll',\n",
" 'm',\n",
" 'o',\n",
" 're',\n",
" 've',\n",
" 'y',\n",
" 'ain',\n",
" 'aren',\n",
" \"aren't\",\n",
" 'couldn',\n",
" \"couldn't\",\n",
" 'didn',\n",
" \"didn't\",\n",
" 'doesn',\n",
" \"doesn't\",\n",
" 'hadn',\n",
" \"hadn't\",\n",
" 'hasn',\n",
" \"hasn't\",\n",
" 'haven',\n",
" \"haven't\",\n",
" 'isn',\n",
" \"isn't\",\n",
" 'ma',\n",
" 'mightn',\n",
" \"mightn't\",\n",
" 'mustn',\n",
" \"mustn't\",\n",
" 'needn',\n",
" \"needn't\",\n",
" 'shan',\n",
" \"shan't\",\n",
" 'shouldn',\n",
" \"shouldn't\",\n",
" 'wasn',\n",
" \"wasn't\",\n",
" 'weren',\n",
" \"weren't\",\n",
" 'won',\n",
" \"won't\",\n",
" 'wouldn',\n",
" \"wouldn't\"]"
]
},
"execution_count": 151,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sw_list"
]
},
{
"cell_type": "code",
"execution_count": 155,
"id": "79f982ce",
"metadata": {},
"outputs": [],
"source": [
"# Remove stopwords\n",
"\n",
"X_train = X_train['review'].apply(lambda x: [item for item in x.split() if
item not in sw_list]).apply(lambda x:\" \".join(x))\n",
"# Remove stopwords\n",
"\n",
"X_test = X_test['review'].apply(lambda x: [item for item in x.split() if item
not in sw_list]).apply(lambda x:\" \".join(x))"
]
},
{
"cell_type": "code",
"execution_count": 178,
"id": "f4e202e0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0.15583833 0.22911808 0.13835862 -0.2090293 -0.00363565 -0.13118638\n",
" -0.15676734 0.23040245 -0.24502586 -0.10665981 0.23412983 -0.21705897\n",
" 0.13545002 0.13861737 0.12415285 0.21369664 -0.19687523 -0.10246313\n",
" -0.03955808 0.27656794 -0.11084636 -0.28008527 0.3439968 0.0482447\n",
" -0.06924975 -0.03105986 -0.54718924 -0.3975329 0.1704354 -0.16617411\n",
" 0.09593288 -0.31931752 0.42519966 0.05527496 0.06130749 -0.3888249\n",
" -0.14431256 0.07848645 0.07683445 0.3278561 -0.3507611 -0.1357377\n",
" 0.28880984 0.09166662 0.0593424 -0.04693977 0.1484201 0.01081875\n",
" 0.25666746 0.21095097 0.1744978 0.04549917 0.19118404 -0.25923675\n",
" -0.06680956 0.25583443 -0.06317639 -0.2850735 -0.03264944 0.0042502\n",
" -0.07356709 -0.09716224 0.08560921 -0.09941689 -0.12068396 -0.21283795\n",
" -0.03129313 -0.32813746 0.42847297 -0.37240502 0.26251498 -0.11578684\n",
" -0.03221569 -0.05079779 0.17090753 -0.12238634 -0.07934 0.22784583\n",
" 0.20331696 -0.5104732 0.07128731 -0.03987676 -0.05812799 0.19840325\n",
" -0.10885198 -0.09540041 0.2161586 -0.17952771 0.04469092 0.06091264\n",
" 0.05722181 -0.02751416 0.19805025 0.32649586 -0.06467736 0.10155083]\n"
]
}
],
"source": [
"import spacy\n",
"import en_core_web_sm\n",
"# Load the spacy model. This takes a few seconds.\n",
"nlp = en_core_web_sm.load()\n",
"# Process a sentence using the model\n",
"doc = nlp(X_train.values[0])\n",
"print(doc.vector)"
]
},
{
"cell_type": "code",
"execution_count": 173,
"id": "8b38bab4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en
_core_web_sm-2.2.5.tar.gz\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" ERROR: HTTP error 404 while getting
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en
_core_web_sm-2.2.5.tar.gz\n",
"ERROR: Could not install requirement
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en
_core_web_sm-2.2.5.tar.gz because of HTTP error 404 Client Error: Not Found for
url:
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en
_core_web_sm-2.2.5.tar.gz for URL
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en
_core_web_sm-2.2.5.tar.gz\n"
]
}
],
"source": [
"!pip install
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en
_core_web_sm-2.2.5.tar.gz"
]
},
{
"cell_type": "code",
"execution_count": 174,
"id": "313b1131",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting en-core-web-sm==3.2.0\n",
" Downloading
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en
_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)\n",
"Requirement already satisfied: spacy<3.3.0,>=3.2.0 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from en-core-web-sm==3.2.0)
(3.2.1)\n",
"Requirement already satisfied: typer<0.5.0,>=0.3.0 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.4.0)\n",
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.6)\n",
"Requirement already satisfied: thinc<8.1.0,>=8.0.12 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.13)\n",
"Requirement already satisfied: blis<0.8.0,>=0.4.0 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.7.5)\n",
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.6)\n",
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.8 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.8)\n",
"Requirement already satisfied: setuptools in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (52.0.0.post20210125)\n",
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.6)\n",
"Requirement already satisfied: jinja2 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.11.3)\n",
"Requirement already satisfied: pathy>=0.3.5 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.6.1)\n",
"Requirement already satisfied: srsly<3.0.0,>=2.4.1 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.4.2)\n",
"Requirement already satisfied: numpy>=1.15.0 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.20.1)\n",
"Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.9.0)\n",
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.25.1)\n",
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.1)\n",
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.3.0)\n",
"Requirement already satisfied: packaging>=20.0 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (20.9)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.59.0)\n",
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.6)\n",
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.8.2)\n",
"Requirement already satisfied: pyparsing>=2.0.2 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
packaging>=20.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.4.7)\n",
"Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
pathy>=0.3.5->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (5.2.1)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0)
(3.7.4.3)\n",
"Requirement already satisfied: chardet<5,>=3.0.2 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.0.0)\n",
"Requirement already satisfied: idna<3,>=2.5 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.10)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.26.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0)
(2020.12.5)\n",
"Requirement already satisfied: click<9.0.0,>=7.1.1 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
typer<0.5.0,>=0.3.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (7.1.2)\n",
"Requirement already satisfied: MarkupSafe>=0.23 in
c:\\users\\91842\\anaconda3\\lib\\site-packages (from
jinja2->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.1.1)\n",
"Installing collected packages: en-core-web-sm\n",
"Successfully installed en-core-web-sm-3.2.0\n",
"[+] Download and installation successful\n",
"You can now load the package via spacy.load('en_core_web_sm')\n"
]
}
],
"source": [
"!python -m spacy download en_core_web_sm"
]
},
{
"cell_type": "code",
"execution_count": 180,
"id": "43202871",
"metadata": {},
"outputs": [],
"source": [
"input_arr = []\n",
"for item in X_train.values:\n",
" doc = nlp(item)\n",
" input_arr.append(doc.vector)"
]
},
{
"cell_type": "code",
"execution_count": 182,
"id": "e23b8889",
"metadata": {},
"outputs": [],
"source": [
"input_arr = np.array(input_arr)"
]
},
{
"cell_type": "code",
"execution_count": 183,
"id": "07aa9cee",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7986, 96)"
]
},
"execution_count": 183,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"input_arr.shape"
]
},
{
"cell_type": "code",
"execution_count": 184,
"id": "279e7a0c",
"metadata": {},
"outputs": [],
"source": [
"input_test_arr = []\n",
"for item in X_test.values:\n",
" doc = nlp(item)\n",
" input_test_arr.append(doc.vector)"
]
},
{
"cell_type": "code",
"execution_count": 185,
"id": "1996ba84",
"metadata": {},
"outputs": [],
"source": [
"input_test_arr = np.array(input_test_arr)"
]
},
{
"cell_type": "code",
"execution_count": 186,
"id": "61715e93",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1997, 96)"
]
},
"execution_count": 186,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"input_test_arr.shape"
]
},
{
"cell_type": "code",
"execution_count": 189,
"id": "c9f71144",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.naive_bayes import GaussianNB"
]
},
{
"cell_type": "code",
"execution_count": 190,
"id": "9a3daa2d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"GaussianNB()"
]
},
"execution_count": 190,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gnb = GaussianNB()\n",
"gnb.fit(input_arr,y_train)"
]
},
{
"cell_type": "code",
"execution_count": 191,
"id": "ce41c5e8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6119178768152228"
]
},
"execution_count": 191,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_pred = gnb.predict(input_test_arr)\n",
"accuracy_score(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f326b87d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}