# %% [markdown]
# # Siamese Network Assignment
# %%
! pip install unidecode matplotlib
# %%
from classes import *
from functions import *
import pickle
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import random
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
# %%
from unidecode import unidecode
# %%
# Names of the raw audio files found on disk.
audio_files = os.listdir('datasets/audio')

# NOTE(review): `audio_df` is not created anywhere in the visible source; it
# presumably holds one row per audio file with `name` and `audio_embedding`
# columns -- confirm where it is built.

# Transliterate names to plain ASCII so they can be matched against other tables.
audio_df['name'] = audio_df['name'].apply(unidecode)

# Rescale every embedding by its own `np.linalg.norm` (the Euclidean norm when
# the embedding is a 1-D vector).
audio_df['audio_embedding'] = audio_df['audio_embedding'].apply(
    lambda embedding: embedding / np.linalg.norm(embedding)
)
# %%
# Load the pre-computed image embeddings from the pickled dataset file.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
with open('datasets/image_embeddings.pickle', mode='rb') as file:
    image_embeddings = pickle.load(file)

# NOTE(review): `image_df` is not created anywhere in the visible source; it is
# presumably derived from `image_embeddings` -- confirm where it is built.

# Transliterate names to plain ASCII, mirroring the treatment of the audio names.
image_df['name'] = image_df['name'].apply(unidecode)
# %%
# Pair image rows with audio rows that share the same `name`.
# NOTE(review): an outer join keeps names present in only one table, leaving
# missing values on the other side -- confirm this is intended downstream.
matches_df = image_df.merge(
    audio_df,
    how='outer',
    on='name',
)
# %%
# Build the `negative` column: for every row, draw a random row whose `name`
# differs from the current one and use its `positive` value as the negative.
# NOTE(review): the `positive` column is not created in the visible source;
# presumably it is the (renamed) audio embedding column -- confirm.
coincidences = 0  # number of rejected draws (sample had the same name as the row)
negatives = []

# Rejection sampling can only succeed if at least two distinct names exist;
# without this guard the `while True` loop below would never terminate.
if matches_df['name'].nunique(dropna=False) < 2:
    raise ValueError(
        "Cannot sample negatives: matches_df needs at least two distinct names"
    )

for anchor_name in matches_df['name']:
    while True:
        # Draw one random row; `.iloc[0]` reads it without needing reset_index.
        candidate = matches_df.sample(n=1).iloc[0]
        if candidate['name'] != anchor_name:
            negatives.append(candidate['positive'])
            break
        coincidences += 1

# One negative per row, in row order.
matches_df['negative'] = negatives
# %%
# First split: hold out 20% of the rows; the remaining 80% form the training set.
# The `name` column is passed as the second array, so its two splits come back
# as the (unused) `_` and `_2` values.
train_set, test_set, _, _2 = train_test_split(
    matches_df,
    matches_df['name'],
    test_size=0.2,
)

# Second split: divide the held-out rows evenly into test and validation sets.
test_set, validation_set, _, _2 = train_test_split(
    test_set,
    test_set['name'],
    test_size=0.5,
)
# %%
# %% [markdown]
# ### Siamese Network Model With Triplet Loss Training
# %%
# Instantiate the Siamese network.
# NOTE(review): argument meanings are defined by `SiameseNetwork` (not visible
# here); presumably per-branch input sizes, per-branch hidden layer sizes and
# the shared output embedding size -- confirm against the class definition.
siamese_model = SiameseNetwork(
    [512, 192],
    [[256, 512, 256], [256, 512, 256]],
    256,
)
# %%
# Wrap each split in a TripletDataset, in the order train / test / validation.
(
    training_triplet_dataset,
    testing_triplet_dataset,
    validation_triplet_dataset,
) = (
    TripletDataset(split)
    for split in (train_set, test_set, validation_set)
)
# %%
# One shuffled DataLoader with batches of 32 per dataset, in the order
# train / test / validation.
(
    train_triplet_dataloader,
    test_triplet_dataloader,
    validation_triplet_dataloader,
) = (
    DataLoader(dataset, batch_size=32, shuffle=True)
    for dataset in (
        training_triplet_dataset,
        testing_triplet_dataset,
        validation_triplet_dataset,
    )
)
# %% [markdown]
# ##### Training with Early Stopping
# %%
# Upper bound on the number of training epochs.
epochs = 500

# Adam over all model parameters with a learning rate of 5e-4.
# Alternative kept for reference: optim.SGD(siamese_model.parameters(), lr=0.1)
# NOTE(review): `optim` is not imported in the visible source; presumably it is
# re-exported by one of the star imports -- confirm.
optimizer = optim.Adam(siamese_model.parameters(), lr=5e-4)
# %%
# Train the Siamese model with triplet loss, tracking the mean per-batch loss
# on the training and test loaders each epoch, and stop early once the test
# loss starts rising.
import torch  # local import: only `torch.utils.data` is imported at file level

training_losses = []   # mean training loss per batch, one entry per epoch
testing_losses = []    # mean testing loss per batch, one entry per epoch
early_stopping_indicators = 0
for epoch in range(epochs):
    total_loss = 0.0
    total_testing_loss = 0.0
    training_batches = 0
    testing_batches = 0

    # Optimisation pass over the training triplets.
    for anchor_batch, positive_batch, negative_batch in train_triplet_dataloader:
        training_batches += 1
        optimizer.zero_grad()
        loss = triplet_loss(siamese_model, anchor_batch, positive_batch,
                            negative_batch, margin=1.0)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Evaluation pass over the test triplets. Without it `testing_batches`
    # stays 0 and the per-batch average below raises ZeroDivisionError.
    # Gradients are not needed here, so they are disabled.
    # NOTE(review): if the model has dropout/batch-norm layers, switching to
    # eval mode for this pass may also be wanted -- confirm.
    with torch.no_grad():
        for anchor_batch, positive_batch, negative_batch in test_triplet_dataloader:
            testing_batches += 1
            testing_loss = triplet_loss(siamese_model, anchor_batch,
                                        positive_batch, negative_batch,
                                        margin=1.0)
            total_testing_loss += testing_loss.item()

    total_training_loss_per_batch = total_loss/training_batches
    training_losses.append(total_training_loss_per_batch)
    total_testing_loss_per_batch = total_testing_loss/testing_batches
    testing_losses.append(total_testing_loss_per_batch)
    print(f"epoch: {epoch + 1} Training Loss per batch: {total_training_loss_per_batch}, Testing Loss per batch: {total_testing_loss_per_batch}\n")

    # Early stopping: once enough epochs have run, stop when the mean test
    # loss of the last three epochs is higher than that of the three before.
    # Both means are computed first and then compared.
    if (epoch > 8) and (
        np.mean(testing_losses[-6:-3]) < np.mean(testing_losses[-3:])
    ):
        print('Early stopping')
        break
# %%
# 1-based epoch numbers for every epoch that actually ran (`epoch` is the
# zero-based index of the last one).
epoch_list = list(range(1, epoch + 2))
# %%
# Plot the per-epoch training and testing loss curves on a shared axis.
for series, label in (
    (training_losses, 'Training Loss'),
    (testing_losses, 'Testing Loss'),
):
    plt.plot(epoch_list, series, label=label)

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()
# %%
from classes import *
# %% [markdown]
# ### 1:2 Identification Accuracy
# %%
# Score the trained model on the validation triplets.
# NOTE(review): `identification_accuracy` is defined outside this file; the
# result is presumably a fraction in [0, 1] given it is scaled by 100 below
# -- confirm.
ia = identification_accuracy(
    siamese_model,
    validation_triplet_dataloader,
)
# %%
# Report the accuracy as a percentage.
print(
    f"1:2 Identification Accuracy for validation set is {ia*100}%"
)