diff --git a/ft.py b/ft.py
index 84b6644..15193ec 100644
--- a/ft.py
+++ b/ft.py
@@ -4,85 +4,126 @@ import torch
 from dl import load_dataset
 from tqdm import tqdm
 
-# Enable cudnn optimizations
-torch.backends.cudnn.benchmark = True
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+class FT:
+    def __init__(self):
+        # Enable cudnn optimizations
+        torch.backends.cudnn.benchmark = True
 
-# load tokenizer and model
-tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
-model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
-model.to(device)
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-# set up optimizer
-optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
+        # load tokenizer and model
+        self.tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
+        self.model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
+        self.model.to(self.device)
 
-# Initialize Amp. This should be optional and should not affect computation if not available
-try:
-    from torch.cuda.amp import GradScaler, autocast
-    scaler = GradScaler()
-except ImportError:
-    # If Amp is not available, we'll simply define a dummy context manager
-    class autocast:
-        def __enter__(self):
-            pass
-        def __exit__(self, *args):
-            pass
-    scaler = None  # We won't use a scaler if we don't have Amp
+        # set up optimizer
+        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-5)
 
-def train_model(dataloader):
-    model.train()
-    total_loss = 0
-    print("Training model...")
-    for batch in tqdm(dataloader):
-        optimizer.zero_grad()
+        try:
+            from torch.cuda.amp import GradScaler, autocast
 
-        inputs = tokenizer(batch[1], return_tensors="pt", padding=True, truncation=True, max_length=512)
-        inputs.to(device)
-        labels = tokenizer(batch[0], return_tensors="pt", padding=True, truncation=True, max_length=512)
-        labels.to(device)
+            self.scaler = GradScaler()
+        except ImportError:
+            # If Amp is not available, we'll simply define a dummy context manager
+            class autocast:
+                def __enter__(self):
+                    pass
 
-        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=labels["input_ids"])
+                def __exit__(self, *args):
+                    pass
 
-        loss = outputs.loss
-        loss.backward()
+            self.scaler = None  # We won't use a scaler if we don't have Amp
 
-        optimizer.step()
-        total_loss += loss.item()
+    def train_model(self, dataloader):
+        self.model.train()
+        total_loss = 0
+        print("Training model...")
+        for batch in tqdm(dataloader):
+            self.optimizer.zero_grad()
 
-    avg_train_loss = total_loss / len(dataloader)
-    return avg_train_loss
+            inputs = self.tokenizer(
+                batch[1],
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512,
+            )
+            inputs.to(self.device)
+            labels = self.tokenizer(
+                batch[0],
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512,
+            )
+            labels.to(self.device)
+
+            outputs = self.model(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+                labels=labels["input_ids"],
+            )
 
-def test_model(dataloader):
-    model.eval()
-    total_loss = 0
-    print("Testing model...")
-    for batch in tqdm(dataloader):
-        with torch.no_grad():
-            inputs = tokenizer(batch[1], return_tensors="pt", padding=True, truncation=True, max_length=512)
-            inputs.to(device)
-            labels = tokenizer(batch[0], return_tensors="pt", padding=True, truncation=True, max_length=512)
-            labels.to(device)
-            outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=labels["input_ids"])
             loss = outputs.loss
+            loss.backward()
+
+            self.optimizer.step()
             total_loss += loss.item()
 
-    avg_test_loss = total_loss / len(dataloader)
-    return avg_test_loss
+        avg_train_loss = total_loss / len(dataloader)
+        return avg_train_loss
+
+    def test_model(self, dataloader):
+        self.model.eval()
+        total_loss = 0
+        print("Testing model...")
+        for batch in tqdm(dataloader):
+            with torch.no_grad():
+                inputs = self.tokenizer(
+                    batch[1],
+                    return_tensors="pt",
+                    padding=True,
+                    truncation=True,
+                    max_length=512,
+                )
+                inputs.to(self.device)
+                labels = self.tokenizer(
+                    batch[0],
+                    return_tensors="pt",
+                    padding=True,
+                    truncation=True,
+                    max_length=512,
+                )
+                labels.to(self.device)
+                outputs = self.model(
+                    input_ids=inputs["input_ids"],
+                    attention_mask=inputs["attention_mask"],
+                    labels=labels["input_ids"],
+                )
+                loss = outputs.loss
+                total_loss += loss.item()
+
+        avg_test_loss = total_loss / len(dataloader)
+        return avg_test_loss
+
+    def train(self):
+        train_dataloader, test_dataloader = load_dataset(
+            "../datasets/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-sentences.txt",
+            100,
+            100,
+            1,
+            test_ratio=0.2,
+        )
+        num_epochs = 3
+        for epoch in range(num_epochs):
+            avg_train_loss = self.train_model(train_dataloader)
+            print(f"Train loss for epoch {epoch+1}: {avg_train_loss}")
+
+            avg_test_loss = self.test_model(test_dataloader)
+            print(f"Test loss for epoch {epoch+1}: {avg_test_loss}")
 
-def train():
-    train_dataloader, test_dataloader = load_dataset(
-        "../datasets/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-sentences.txt",
-        100, 100, 1, test_ratio=0.2
-    )
-    num_epochs = 3
-    for epoch in range(num_epochs):
-        avg_train_loss = train_model(train_dataloader)
-        print(f"Train loss for epoch {epoch+1}: {avg_train_loss}")
 
-        avg_test_loss = test_model(test_dataloader)
-        print(f"Test loss for epoch {epoch+1}: {avg_test_loss}")
-
 if __name__ == "__main__":
-    train()
-
+    trainer = FT()
+    trainer.train()
 
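Note: the patch moves the `GradScaler` / dummy `autocast` setup into `FT.__init__`, but, as in the original module-level code, `train_model` still runs the forward and backward pass in full precision and never touches `self.scaler`. Below is a minimal sketch of how the scaler could be wired into a single training step; the `amp_train_step` helper is hypothetical (not part of this diff) and simply follows the standard `torch.cuda.amp` recipe of scaling the loss around `backward()` and the optimizer step.

```python
# Sketch only (not part of the patch): one way to actually use the GradScaler /
# autocast that FT.__init__ sets up. `ft` is assumed to be an FT instance with
# the attributes defined above; the helper name amp_train_step is hypothetical.
from torch.cuda.amp import autocast


def amp_train_step(ft, inputs, labels):
    ft.optimizer.zero_grad()
    if ft.scaler is not None:
        # Mixed-precision path: forward pass under autocast, scaled backward,
        # then scaler-managed optimizer step and scale update.
        with autocast():
            outputs = ft.model(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                labels=labels["input_ids"],
            )
        ft.scaler.scale(outputs.loss).backward()
        ft.scaler.step(ft.optimizer)
        ft.scaler.update()
    else:
        # Full-precision fallback matching the current train_model body.
        outputs = ft.model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            labels=labels["input_ids"],
        )
        outputs.loss.backward()
        ft.optimizer.step()
    return outputs.loss.item()
```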