
Here is a Python script that uses the Hugging Face transformers library to fine-tune a pre-trained BERT model for text classification on the Kaggle sentiment140 dataset:

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # transformers.AdamW is deprecated; use the PyTorch implementation
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Load the Kaggle sentiment140 dataset
data = pd.read_csv('sentiment140.csv', encoding='latin-1', header=None)
data.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
data = data[['sentiment', 'text']]

# Preprocess the data: strip mention/hashtag symbols and newlines,
# and map the sentiment140 labels (0 = negative, 4 = positive) to 0/1
data['text'] = data['text'].apply(lambda x: x.replace('@', ''))
data['text'] = data['text'].apply(lambda x: x.replace('#', ''))
data['text'] = data['text'].apply(lambda x: x.replace('\n', ' '))
data['sentiment'] = data['sentiment'].replace({4: 1})

# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Tokenize the text data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(list(train_data['text']), truncation=True, padding=True)
val_encodings = tokenizer(list(val_data['text']), truncation=True, padding=True)

# Create a custom dataset
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create DataLoaders for the training and validation sets
train_dataset = SentimentDataset(train_encodings, train_data['sentiment'].values)
val_dataset = SentimentDataset(val_encodings, val_data['sentiment'].values)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Load the pre-trained BERT model and adjust the number of classes
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 3  # 3 training epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Fine-tune the model
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

for epoch in range(3):
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
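
The listing is cut off at this point. A minimal evaluation pass, assuming the script goes on to use the accuracy_score and f1_score imports on the validation split, might look like this:

# Evaluate on the validation set (sketch; assumes the training loop above has finished)
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print('Accuracy:', accuracy_score(all_labels, all_preds))
print('F1 score:', f1_score(all_labels, all_preds))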
