From 0b622f29c22ee5f2988f501a7472ef7fc7e0fbc4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C5=A1per=20Spagnolo?=
Date: Wed, 26 Jul 2023 13:56:51 +0200
Subject: [PATCH] Initial loop

---
 .gitignore    |  1 +
 dataloader.py | 78 ---------------------------------------------
 dl.py         | 75 ++++++++++++++++++++++++++++++++++++++++++++
 ft.py         | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 163 insertions(+), 78 deletions(-)
 delete mode 100644 dataloader.py
 create mode 100644 dl.py

diff --git a/.gitignore b/.gitignore
index 21d0b89..a230a78 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 .venv/
+__pycache__/
diff --git a/dataloader.py b/dataloader.py
deleted file mode 100644
index 391d4e0..0000000
--- a/dataloader.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import tensorflow as tf
-import nlpaug.augmenter.word as naw
-
-
-class DataLoader:
-    def __init__(self, path, buffer_size, batch_size, max_length, test_ratio=0.2):
-        self.path = path
-        self.buffer_size = buffer_size
-        self.batch_size = batch_size
-        self.max_length = max_length
-        self.test_ratio = test_ratio
-        self.aug = naw.SynonymAug(aug_src="wordnet")
-
-    def _split_input_target(self, sequence):
-        parts = tf.strings.split(sequence, "\t")
-        index = int(parts[0])
-        sentence = tf.strings.reduce_join(parts[1:], separator=" ")
-        return sentence, index
-
-    def augment_data(self, sentence, index):
-        aug_sentence = self.aug.augment(sentence.numpy().decode())
-        return sentence, aug_sentence, index
-
-    def tf_augment_data(self, sentence, index):
-        sentence, aug_sentence, index = tf.py_function(
-            self.augment_data, [sentence, index], [tf.string, tf.string, tf.int32]
-        )
-        return sentence, aug_sentence, index
-
-    def load_dataset(self):
-        lines_dataset = tf.data.TextLineDataset(self.path)
-        dataset = lines_dataset.map(self._split_input_target)
-        dataset = dataset.map(self.tf_augment_data)
-
-        # Split dataset into train and test
-        dataset_size = tf.data.experimental.cardinality(dataset).numpy()
-        test_size = int(dataset_size * self.test_ratio)
-        train_size = dataset_size - test_size
-        train_dataset = dataset.take(train_size)
-        test_dataset = dataset.skip(train_size)
-
-        # Shuffle and batch
-        train_dataset = train_dataset.shuffle(self.buffer_size).batch(self.batch_size)
-        test_dataset = test_dataset.shuffle(self.buffer_size).batch(self.batch_size)
-
-        return train_dataset, test_dataset
-
-
-def test():
-    # Hyperparameters
-    buffer_size = 10000
-    batch_size = 64
-    max_length = 100  # Or any other value depending on your data
-
-    # Create DataLoader
-    data_loader = DataLoader(
-        "../datasets/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-sentences.txt",
-        buffer_size,
-        batch_size,
-        max_length,
-    )
-
-    # Load the datasets
-    train_dataset, test_dataset = data_loader.load_dataset()
-
-    # Test the data loader on the training dataset
-    print("First 5 batches from the training dataset:")
-    for sent, aug, indxs in train_dataset.take(1):
-        print(f"Indices: {indxs}, Sentences: {sent}, Augmented: {aug}")
-
-    # Test the data loader on the test dataset
-    # print("\nFirst 5 batches from the test dataset:")
-    # for sentences, indices in test_dataset.take(5):
-    #     print(f"Indices: {indices}, Sentences: {sentences}")
-
-
-if __name__ == "__main__":
-    test()
diff --git a/dl.py b/dl.py
new file mode 100644
index 0000000..1c0cb0d
--- /dev/null
+++ b/dl.py
@@ -0,0 +1,75 @@
+#! /usr/bin/env python3
+
+from torch.utils.data import Dataset, DataLoader, random_split
+from nlpaug.augmenter.char import OcrAug
+from nlpaug.augmenter.word import RandomWordAug
+import pandas as pd
+
+
+class TextDataset(Dataset):
+    def __init__(self, path, max_length, buffer_size):
+        # Each line of the corpus file is "index<TAB>sentence"
+        self.data = pd.read_csv(path, delimiter="\t", header=None)
+        # max_length and buffer_size are not used yet in this initial loop
+        self.max_length = max_length
+        self.buffer_size = buffer_size
+
+        # Augmentations: OCR-style character noise plus random word deletion
+        self.aug_char = OcrAug(
+            name="OCR_Aug",
+            aug_char_min=2,
+            aug_char_max=10,
+            aug_char_p=0.3,
+            aug_word_p=0.3,
+            aug_word_min=1,
+            aug_word_max=10,
+        )
+        self.aug_delete = RandomWordAug(
+            action="delete", name="RandomWord_Aug", aug_min=0, aug_max=1, aug_p=0.1
+        )
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        _, sentence = self.data.iloc[idx]
+        # nlpaug returns a list of augmented strings, so unwrap the single result
+        aug_sentence = self.aug_char.augment(sentence)
+        aug_sentence = self.aug_delete.augment(aug_sentence)
+        aug_sentence = aug_sentence[0]
+        return sentence, aug_sentence
+
+
+def load_dataset(path, max_length, buffer_size, batch_size, test_ratio=0.2):
+    # Create dataset
+    dataset = TextDataset(path, max_length, buffer_size)
+
+    # Calculate split sizes
+    total_size = len(dataset)
+    test_size = int(total_size * test_ratio)
+    train_size = total_size - test_size
+
+    # Split dataset into train and test
+    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
+
+    # Create dataloaders; the held-out split does not need shuffling
+    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
+
+    return train_dataloader, test_dataloader
+
+
+def test():
+    train_dataloader, test_dataloader = load_dataset(
+        "../datasets/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-sentences.txt",
+        100,
+        100,
+        32,
+        test_ratio=0.2,
+    )
+    # Print a single batch as a quick smoke test
+    for batch in train_dataloader:
+        for sentence, aug_sentence in zip(batch[0], batch[1]):
+            print(f"sentence: {sentence} | aug_sentence: {aug_sentence}")
+        break
+
+
+if __name__ == "__main__":
+    test()
diff --git a/ft.py b/ft.py
index 21ab5da..84b6644 100644
--- a/ft.py
+++ b/ft.py
@@ -1 +1,88 @@
 #! /usr/bin/env python3
+from transformers import BartForConditionalGeneration, BartTokenizer
+import torch
+from dl import load_dataset
+from tqdm import tqdm
+
+# Enable cudnn optimizations
+torch.backends.cudnn.benchmark = True
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# load tokenizer and model
+tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
+model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
+model.to(device)
+
+# set up optimizer
+optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
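+
+# Padding tokens in the labels should not contribute to the loss; replacing
+# them with -100 makes the cross-entropy ignore those positions. A small
+# helper sketching this standard seq2seq practice (the helper name is our
+# own addition, not part of the original loop):
+def mask_pad_tokens(label_ids):
+    label_ids = label_ids.clone()
+    label_ids[label_ids == tokenizer.pad_token_id] = -100
+    return label_ids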
+
+# Initialize Amp. This should be optional and should not affect computation
+# when mixed precision is unavailable: with enabled=False, autocast and
+# GradScaler are exact no-ops.
+try:
+    from torch.cuda.amp import GradScaler, autocast
+    use_amp = torch.cuda.is_available()
+    scaler = GradScaler(enabled=use_amp)
+except ImportError:
+    # If Amp is not available, define a dummy context manager instead
+    class autocast:
+        def __init__(self, enabled=False):
+            pass
+        def __enter__(self):
+            return None
+        def __exit__(self, *args):
+            return False
+    scaler = None  # We won't use a scaler if we don't have Amp
+    use_amp = False
+
+def train_model(dataloader):
+    model.train()
+    total_loss = 0
+    print("Training model...")
+    for batch in tqdm(dataloader):
+        optimizer.zero_grad()
+
+        # batch[1] holds the augmented (noisy) inputs, batch[0] the clean targets
+        inputs = tokenizer(batch[1], return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
+        labels = tokenizer(batch[0], return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
+
+        with autocast(enabled=use_amp):
+            outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=mask_pad_tokens(labels["input_ids"]))
+            loss = outputs.loss
+
+        if scaler is not None:
+            scaler.scale(loss).backward()
+            scaler.step(optimizer)
+            scaler.update()
+        else:
+            loss.backward()
+            optimizer.step()
+        total_loss += loss.item()
+
+    avg_train_loss = total_loss / len(dataloader)
+    return avg_train_loss
+
+def test_model(dataloader):
+    model.eval()
+    total_loss = 0
+    print("Testing model...")
+    for batch in tqdm(dataloader):
+        with torch.no_grad():
+            inputs = tokenizer(batch[1], return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
+            labels = tokenizer(batch[0], return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
+            outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=mask_pad_tokens(labels["input_ids"]))
+            loss = outputs.loss
+            total_loss += loss.item()
+
+    avg_test_loss = total_loss / len(dataloader)
+    return avg_test_loss
+
+def train():
+    train_dataloader, test_dataloader = load_dataset(
+        "../datasets/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-sentences.txt",
+        100, 100, 1, test_ratio=0.2
+    )
+    num_epochs = 3
+    for epoch in range(num_epochs):
+        avg_train_loss = train_model(train_dataloader)
+        print(f"Train loss for epoch {epoch+1}: {avg_train_loss}")
+
+        avg_test_loss = test_model(test_dataloader)
+        print(f"Test loss for epoch {epoch+1}: {avg_test_loss}")
+
+if __name__ == "__main__":
+    train()
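+
+# A quick qualitative check after training could decode one noisy sentence
+# with the fine-tuned model. Hedged sketch only (the sample input below is
+# our own illustration, not taken from the corpus):
+#   noisy = "Th1s is a n0isy sentence ."
+#   ids = tokenizer(noisy, return_tensors="pt").to(device)
+#   out = model.generate(ids["input_ids"], max_length=64)
+#   print(tokenizer.decode(out[0], skip_special_tokens=True))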