Course 3 - Week 2 - Exercise - Answer - Ipynb - Colaboratory

#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Open in Colab

Copyright 2019 The TensorFlow Authors.

Licensed under the Apache License, Version 2.0 (the "License");

import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv \
    -O /tmp/bbc-text.csv

--2020-07-12 13:59:26--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv


Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.76.128, 64.233.1
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.76.128|:443...
HTTP request sent, awaiting response... 200 OK
Length: 5057493 (4.8M) [application/octet-stream]
Saving to: ‘/tmp/bbc-text.csv’

/tmp/bbc-text.csv 100%[===================>] 4.82M --.-KB/s in 0.03s

2020-07-12 13:59:27 (179 MB/s) - ‘/tmp/bbc-text.csv’ saved [5057493/5057493]
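A quick sanity check (not part of the original notebook) that the download worked: the first line of bbc-text.csv should be its header row, which is skipped below with next(reader).

# Illustrative check of the downloaded file; the header is expected to be "category,text".
with open("/tmp/bbc-text.csv", 'r') as f:
    print(f.readline().strip())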

vocab_size = 1000
embedding_dim = 16
max_length = 120
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_portion = .8
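For reference, a small sketch (not from the original exercise) of how the oov_tok, padding_type, and trunc_type settings behave; demo_tokenizer is a throwaway name used only for illustration.

# Toy example: unseen words map to the <OOV> index, and 'post' padding appends zeros.
demo_tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
demo_tokenizer.fit_on_texts(["the cat sat on the mat"])
print(demo_tokenizer.texts_to_sequences(["the dog sat"]))  # 'dog' becomes the OOV index
print(pad_sequences(demo_tokenizer.texts_to_sequences(["the cat sat"]),
                    maxlen=5, padding=padding_type, truncating=trunc_type))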

sentences = []
labels = []
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
              # ... (the rest of the 153-word stopword list is truncated in this copy)
            ]
print(len(stopwords))
# Expected Output
# 153

153

with open("/tmp/bbc-text.csv", 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        labels.append(row[0])
        sentence = row[1]
        for word in stopwords:
            token = " " + word + " "
            sentence = sentence.replace(token, " ")
        sentences.append(sentence)

print(len(labels))
print(len(sentences))
print(sentences[0])
# Expected Output
# 2225
# 2225
# tv future hands viewers home theatre systems plasma high-definition tvs digital video r

2225
2225
tv future hands viewers home theatre systems plasma high-definition tvs digital vid
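The loop above removes stopwords by replacing " word " with a single space, which relies on the word being surrounded by spaces and can therefore miss a stopword at the very start or end of a sentence. An alternative sketch (not part of the exercise) using a set and split/join; stopword_set and remove_stopwords are illustrative names.

stopword_set = set(stopwords)
def remove_stopwords(sentence):
    return " ".join(word for word in sentence.split() if word not in stopword_set)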

train_size = int(len(sentences) * training_portion)

train_sentences = sentences[:train_size]
train_labels = labels[:train_size]

validation_sentences = sentences[train_size:]
validation_labels = labels[train_size:]

print(train_size)
print(len(train_sentences))
print(len(train_labels))
print(len(validation_sentences))
print(len(validation_labels))

# Expected output (if training_portion=.8)


# 1780
# 1780
# 1780
# 445
# 445
1780
1780
1780
445
445

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

print(len(train_sequences[0]))
print(len(train_padded[0]))

print(len(train_sequences[1]))
print(len(train_padded[1]))

print(len(train_sequences[10]))
print(len(train_padded[10]))

# Expected Output
# 449
# 120
# 200
# 120
# 192
# 120

449
120
200
120
192
120
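One detail worth noting: trunc_type is defined above but not passed to pad_sequences, so sequences longer than max_length fall back to the Keras default truncating='pre' and lose tokens from the beginning of the article. If truncating from the end is intended, it would need to be passed explicitly, for example:

# Same call with the truncation strategy made explicit (sketch, not the original cell).
train_padded = pad_sequences(train_sequences, padding=padding_type,
                             truncating=trunc_type, maxlen=max_length)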

validation_sequences = tokenizer.texts_to_sequences(validation_sentences)
validation_padded = pad_sequences(validation_sequences, padding=padding_type, maxlen=max_length)

print(len(validation_sequences))
print(validation_padded.shape)

# Expected output
# 445
# (445, 120)

445
(445, 120)

label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

print(training_label_seq[0])
print(training_label_seq[1])
print(training_label_seq[2])
print(training_label_seq.shape)

print(validation_label_seq[0])
print(validation_label_seq[1])
print(validation_label_seq[2])
print(validation_label_seq.shape)

# Expected output
# [4]
# [2]
# [1]
# (1780, 1)
# [5]
# [4]
# [3]
# (445, 1)

[4]
[2]
[1]
(1780, 1)
[5]
[4]
[3]
(445, 1)
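Because Tokenizer reserves index 0, the five BBC categories are encoded as 1 through 5, which is why the model below ends in a 6-unit softmax layer. A possible variant (not part of the exercise) is to shift the labels to start at 0 so the output layer could use Dense(5); the *_zero_based names are invented for illustration.

training_label_seq_zero_based = np.array(label_tokenizer.texts_to_sequences(train_labels)) - 1
validation_label_seq_zero_based = np.array(label_tokenizer.texts_to_sequences(validation_labels)) - 1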

model = tf.keras.Sequential([
tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Dense(24, activation='relu'),
tf.keras.layers.Dense(6, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()
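The parameter counts in the summary follow directly from the layer sizes; a small sanity check (illustrative, not part of the original cell):

embedding_params = vocab_size * embedding_dim            # 1000 * 16 = 16000
dense_params = embedding_dim * 24 + 24                    # weights + biases = 408
output_params = 24 * 6 + 6                                # weights + biases = 150
print(embedding_params + dense_params + output_params)    # 16558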

# Expected Output
# Layer (type) Output Shape Param #
# =================================================================
# embedding (Embedding) (None, 120, 16) 16000
# _________________________________________________________________
# global_average_pooling1d (Gl (None, 16) 0
# _________________________________________________________________
# dense (Dense) (None, 24) 408
# _________________________________________________________________
# dense_1 (Dense) (None, 6) 150
# =================================================================
# Total params: 16,558
# Trainable params: 16,558
# Non-trainable params: 0
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) (None, 120, 16) 16000
_________________________________________________________________
global_average_pooling1d (Gl (None, 16) 0
_________________________________________________________________
dense (Dense) (None, 24) 408
_________________________________________________________________
dense_1 (Dense) (None, 6) 150
=================================================================
Total params: 16,558
Trainable params: 16,558
Non-trainable params: 0
_________________________________________________________________

num_epochs = 30
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)
Epoch 1/30
56/56 - 0s - loss: 1.7632 - accuracy: 0.2315 - val_loss: 1.7279 - val_accuracy: 0.229
Epoch 2/30
56/56 - 0s - loss: 1.6797 - accuracy: 0.2326 - val_loss: 1.6301 - val_accuracy: 0.269
Epoch 3/30
56/56 - 0s - loss: 1.5684 - accuracy: 0.4124 - val_loss: 1.5137 - val_accuracy: 0.429
Epoch 4/30
56/56 - 0s - loss: 1.4253 - accuracy: 0.4809 - val_loss: 1.3512 - val_accuracy: 0.525
Epoch 5/30
56/56 - 0s - loss: 1.2274 - accuracy: 0.5961 - val_loss: 1.1528 - val_accuracy: 0.671
Epoch 6/30
56/56 - 0s - loss: 1.0137 - accuracy: 0.7511 - val_loss: 0.9600 - val_accuracy: 0.813
Epoch 7/30
56/56 - 0s - loss: 0.8240 - accuracy: 0.8579 - val_loss: 0.8014 - val_accuracy: 0.860
Epoch 8/30
56/56 - 0s - loss: 0.6696 - accuracy: 0.9107 - val_loss: 0.6733 - val_accuracy: 0.885
Epoch 9/30
56/56 - 0s - loss: 0.5459 - accuracy: 0.9281 - val_loss: 0.5711 - val_accuracy: 0.901
Epoch 10/30
56/56 - 0s - loss: 0.4440 - accuracy: 0.9438 - val_loss: 0.4865 - val_accuracy: 0.921
Epoch 11/30
56/56 - 0s - loss: 0.3640 - accuracy: 0.9567 - val_loss: 0.4199 - val_accuracy: 0.921
Epoch 12/30
56/56 - 0s - loss: 0.3000 - accuracy: 0.9596 - val_loss: 0.3700 - val_accuracy: 0.921
Epoch 13/30
56/56 - 0s - loss: 0.2512 - accuracy: 0.9691 - val_loss: 0.3320 - val_accuracy: 0.923
Epoch 14/30
56/56 - 0s - loss: 0.2149 - accuracy: 0.9725 - val_loss: 0.3016 - val_accuracy: 0.928
Epoch 15/30
56/56 - 0s - loss: 0.1848 - accuracy: 0.9747 - val_loss: 0.2825 - val_accuracy: 0.928
Epoch 16/30
56/56 - 0s - loss: 0.1620 - accuracy: 0.9781 - val_loss: 0.2639 - val_accuracy: 0.932
Epoch 17/30
56/56 - 0s - loss: 0.1425 - accuracy: 0.9815 - val_loss: 0.2504 - val_accuracy: 0.934
Epoch 18/30
56/56 - 0s - loss: 0.1274 - accuracy: 0.9815 - val_loss: 0.2400 - val_accuracy: 0.932
Epoch 19/30
56/56 - 0s - loss: 0.1130 - accuracy: 0.9848 - val_loss: 0.2305 - val_accuracy: 0.932
Epoch 20/30
56/56 - 0s - loss: 0.1015 - accuracy: 0.9871 - val_loss: 0.2225 - val_accuracy: 0.932
Epoch 21/30
56/56 - 0s - loss: 0.0916 - accuracy: 0.9865 - val_loss: 0.2196 - val_accuracy: 0.934
Epoch 22/30
56/56 - 0s - loss: 0.0827 - accuracy: 0.9899 - val_loss: 0.2122 - val_accuracy: 0.934
Epoch 23/30
56/56 - 0s - loss: 0.0741 - accuracy: 0.9916 - val_loss: 0.2092 - val_accuracy: 0.932
Epoch 24/30
56/56 - 0s - loss: 0.0678 - accuracy: 0.9921 - val_loss: 0.2040 - val_accuracy: 0.934
Epoch 25/30
56/56 - 0s - loss: 0.0610 - accuracy: 0.9955 - val_loss: 0.2015 - val_accuracy: 0.937
Epoch 26/30
56/56 - 0s - loss: 0.0555 - accuracy: 0.9961 - val_loss: 0.1992 - val_accuracy: 0.939
Epoch 27/30
56/56 - 0s - loss: 0.0504 - accuracy: 0.9966 - val_loss: 0.1952 - val_accuracy: 0.941
Epoch 28/30
56/56 - 0s - loss: 0.0460 - accuracy: 0.9983 - val_loss: 0.1953 - val_accuracy: 0.939
Epoch 29/30
56/56 - 0s - loss: 0.0419 - accuracy: 0.9994 - val_loss: 0.1911 - val_accuracy: 0.948
Epoch 30/30
56/56 - 0s - loss: 0.0385 - accuracy: 0.9994 - val_loss: 0.1905 - val_accuracy: 0.939

import matplotlib.pyplot as plt

def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

plot_graphs(history, "accuracy")
plot_graphs(history, "loss")
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_sentence(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
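For example (illustrative call, not in the original notebook), the helper can be used to inspect the first padded training article; padded positions print as '?' because index 0 is not in word_index.

print(decode_sentence(train_padded[0]))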

e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

# Expected output
# (1000, 16)

(1000, 16)
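As an illustrative lookup (not in the original notebook), a single row of this matrix is the learned vector for one word; index 1 is the <OOV> token, so index 2 is the first real word in the vocabulary.

sample_word = reverse_word_index[2]
print(sample_word, weights[2])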

import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

out_v.close()
out_m.close()

try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('vecs.tsv')
    files.download('meta.tsv')
