Model Training
Pretraining is very expensive! Please check costs carefully before starting a pretraining project.
You can get a rough estimate of your training job cost using this calculator from Hugging Face. For
training on other infrastructure, e.g. AWS or Google Cloud, please consult those providers for
up-to-date cost estimates.
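If you just want a quick sanity check before reaching for the calculator, a common rule of thumb is that training a dense transformer costs roughly 6 FLOPs per parameter per training token. The sketch below turns that into a ballpark GPU-hour and dollar figure; the throughput and price numbers are placeholder assumptions, so substitute values for your own hardware and provider.
# Back-of-the-envelope pretraining cost estimate.
# Rule of thumb: ~6 FLOPs per parameter per training token (dense transformer).
# All numbers below are placeholder assumptions -- replace them with your own.
n_params = 308e6                 # model size, e.g. a ~308M-parameter model
n_tokens = 10e9                  # number of training tokens
total_flops = 6 * n_params * n_tokens

gpu_flops_per_sec = 150e12       # assumed sustained bf16 throughput per GPU
gpu_hours = total_flops / gpu_flops_per_sec / 3600
price_per_gpu_hour = 2.0         # assumed on-demand price in USD

print(f"~{gpu_hours:,.0f} GPU-hours, ~${gpu_hours * price_per_gpu_hour:,.0f}")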
import warnings
warnings.filterwarnings('ignore')

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

device = "cuda"

# Load the model to be pretrained in bfloat16 precision
pretrained_model = AutoModelForCausalLM.from_pretrained(
    "./drive/MyDrive/TinySolar-308m-4k-init",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    use_cache=False,
)
print(pretrained_model)
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=256, bias=False)
          (v_proj): Linear(in_features=1024, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((1024,), eps=1e-06)
    (rotary_emb): LlamaRotaryEmbedding()
  )
  (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
)
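As a quick sanity check (not shown in the video), you can count the parameters of the loaded model and confirm it matches the ~308M in its name:
# Count trainable parameters to confirm the model size shown above.
total_params = sum(p.numel() for p in pretrained_model.parameters())
print(f"{total_params / 1e6:.1f}M parameters")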
Note that the code has additional comment strings that don't appear in the video. These are to help
you understand what each part of the code is doing.
import datasets
from torch.utils.data import Dataset
import torch

class CustomDataset(Dataset):
    def __init__(self, args, split="train"):
        """Load the packed pretraining data from a parquet file."""
        self.args = args
        self.dataset = datasets.load_dataset(
            "parquet",
            data_files=args.dataset_name,
            split=split
        )

    def __len__(self):
        """Return the number of packed examples."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return one example as tensors. For causal language modeling the
        labels are the input ids themselves (this assumes the parquet file
        stores an "input_ids" column, as produced by the packing step)."""
        input_ids = torch.LongTensor(self.dataset[idx]["input_ids"])
        labels = torch.LongTensor(self.dataset[idx]["input_ids"])
        return {"input_ids": input_ids, "labels": labels}
from dataclasses import dataclass, field
import transformers

@dataclass
class CustomArguments(transformers.TrainingArguments):
    # Dataset configuration
    dataset_name: str = field(
        default="packaged_pretrain_dataset.parquet")
    num_proc: int = field(default=1)           # Number of subprocesses for data preprocessing
    max_seq_length: int = field(default=32)    # Maximum sequence length

    # Core training configuration
    seed: int = field(default=0)               # Random seed for initialization, for reproducibility
    optim: str = field(default="adamw_torch")  # Optimizer, here the PyTorch implementation of AdamW
    max_steps: int = field(default=30)         # Maximum number of training steps
    per_device_train_batch_size: int = field(default=2)  # Batch size per device during training

    # Logging configuration
    logging_steps: int = field(default=3)      # Frequency of logging training information
    report_to: str = field(default="none")     # Destination for logging (e.g., WandB, TensorBoard)

    # Saving configuration
    save_strategy: str = field(default="steps")  # Can be replaced with "epoch"
    save_steps: int = field(default=3)           # Frequency of saving training checkpoints
    save_total_limit: int = field(default=2)     # The total number of checkpoints to keep
Parse the custom arguments and set the output directory where the model will be saved:
parser = transformers.HfArgumentParser(CustomArguments)
args, = parser.parse_args_into_dataclasses(
    args=["--output_dir", "output"]
)

train_dataset = CustomDataset(args=args)
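Before training, it can help to pull out a single example and check its shape. This assumes the parquet file came from the earlier packing step and stores an input_ids column:
# Inspect one packed training example.
sample = train_dataset[0]
print(sample["input_ids"].shape)   # e.g. torch.Size([32]) if packed to 32 tokens
print(sample["labels"][:10])       # labels mirror the input ids for causal LM training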
Then, create an instance of the Hugging Face Trainer object from the transformers library and call
its train() method to start the training run.
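The Trainer below receives a loss_logging_callback whose definition does not appear in this section. A minimal sketch of such a callback (the name and behavior here are assumptions) could look like this, with the Trainer import included since the next cell uses it:
from transformers import Trainer, TrainerCallback

class LossLoggingCallback(TrainerCallback):
    """Print the training loss whenever the Trainer logs metrics."""
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None and "loss" in logs:
            print(f"step {state.global_step}: loss = {logs['loss']:.4f}")

loss_logging_callback = LossLoggingCallback()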
trainer = Trainer(
    model=pretrained_model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=None,
    callbacks=[loss_logging_callback],
)

trainer.train()
[30/30 02:54, Epoch 0/1]
Step Training Loss
3 4.519700
6 4.468100
9 4.413700
12 4.804900
15 4.517900
18 4.849500
21 3.763700
24 4.778400
27 4.008900
30 4.191900
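The loss is noisy at this tiny scale (only 30 steps and a small batch size), so don't expect a smooth curve. If you want to inspect the logged values programmatically after training, they are available in the trainer's state (a quick post-hoc check, not part of the original notebook):
# Read the logged training losses back out of the trainer state.
for entry in trainer.state.log_history:
    if "loss" in entry:
        print(entry["step"], entry["loss"])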
You can use the configuration below (shown in CustomArguments above) to save intermediate model
checkpoints in your own training run:
# Saving configuration
# save_strategy: str = field(default="steps")  # Can be replaced with "epoch"
# save_steps: int = field(default=3)           # Frequency of saving training checkpoints
# save_total_limit: int = field(default=2)     # The total number of checkpoints to keep
model_name_or_path = "./output/checkpoint-30"
model2 = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
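The generation cell below relies on a tokenizer and an inputs tensor that are not defined in this section. A minimal setup might look like the following; the tokenizer path and the prompt are assumptions, so point them at whatever matches your model:
from transformers import AutoTokenizer, TextStreamer

# Assumed setup: load a tokenizer compatible with the model and encode a prompt.
# The tokenizer path and the prompt text below are placeholders.
tokenizer = AutoTokenizer.from_pretrained("./drive/MyDrive/TinySolar-308m-4k-init")
prompt = "I am an engineer. I love"
inputs = tokenizer(prompt, return_tensors="pt").to(model2.device)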
streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True
)
outputs = model2.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=64,
    do_sample=True,
    temperature=1.0,
)