Initial loop

parent a89c1b812f, commit 0b622f29c2
@@ -1 +1,2 @@
 .venv/
+__pycache__/
@@ -1,78 +0,0 @@
-import tensorflow as tf
-import nlpaug.augmenter.word as naw
-
-
-class DataLoader:
-    def __init__(self, path, buffer_size, batch_size, max_length, test_ratio=0.2):
-        self.path = path
-        self.buffer_size = buffer_size
-        self.batch_size = batch_size
-        self.max_length = max_length
-        self.test_ratio = test_ratio
-        self.aug = naw.SynonymAug(aug_src="wordnet")
-
-    def _split_input_target(self, sequence):
-        parts = tf.strings.split(sequence, "\t")
-        index = int(parts[0])
-        sentence = tf.strings.reduce_join(parts[1:], separator=" ")
-        return sentence, index
-
-    def augment_data(self, sentence, index):
-        aug_sentence = self.aug.augment(sentence.numpy().decode())
-        return sentence, aug_sentence, index
-
-    def tf_augment_data(self, sentence, index):
-        sentence, aug_sentence, index = tf.py_function(
-            self.augment_data, [sentence, index], [tf.string, tf.string, tf.int32]
-        )
-        return sentence, aug_sentence, index
-
-    def load_dataset(self):
-        lines_dataset = tf.data.TextLineDataset(self.path)
-        dataset = lines_dataset.map(self._split_input_target)
-        dataset = dataset.map(self.tf_augment_data)
-
-        # Split dataset into train and test
-        dataset_size = tf.data.experimental.cardinality(dataset).numpy()
-        test_size = int(dataset_size * self.test_ratio)
-        train_size = dataset_size - test_size
-        train_dataset = dataset.take(train_size)
-        test_dataset = dataset.skip(train_size)
-
-        # Shuffle and batch
-        train_dataset = train_dataset.shuffle(self.buffer_size).batch(self.batch_size)
-        test_dataset = test_dataset.shuffle(self.buffer_size).batch(self.batch_size)
-
-        return train_dataset, test_dataset
-
-
-def test():
-    # Hyperparameters
-    buffer_size = 10000
-    batch_size = 64
-    max_length = 100  # Or any other value depending on your data
-
-    # Create DataLoader
-    data_loader = DataLoader(
-        "../datasets/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-sentences.txt",
-        buffer_size,
-        batch_size,
-        max_length,
-    )
-
-    # Load the datasets
-    train_dataset, test_dataset = data_loader.load_dataset()
-
-    # Test the data loader on the training dataset
-    print("First 5 batches from the training dataset:")
-    for sent, aug, indxs in train_dataset.take(1):
-        print(f"Indices: {indxs}, Sentences: {sent}, Augmented: {aug}")
-
-    # Test the data loader on the test dataset
-    # print("\nFirst 5 batches from the test dataset:")
-    # for sentences, indices in test_dataset.take(5):
-    #     print(f"Indices: {indices}, Sentences: {sentences}")
-
-
-if __name__ == "__main__":
-    test()
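Side note on the removed loader: `_split_input_target` is mapped directly over the `TextLineDataset`, so `parts[0]` is a symbolic string tensor during tracing and the plain `int(parts[0])` cast only works eagerly. A graph-safe variant of that helper (a sketch, not part of this commit) would use `tf.strings.to_number`:

    def _split_input_target(self, sequence):
        # "<index>\t<sentence>" -> (sentence, index) without leaving graph mode
        parts = tf.strings.split(sequence, "\t")
        index = tf.strings.to_number(parts[0], out_type=tf.int32)
        sentence = tf.strings.reduce_join(parts[1:], separator=" ")
        return sentence, index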
@@ -0,0 +1,75 @@
+#! /usr/bin/env python3
+
+import torch
+from torch.utils.data import Dataset, DataLoader, random_split
+from nlpaug.augmenter.char import OcrAug
+from nlpaug.augmenter.word import RandomWordAug
+from sklearn.model_selection import train_test_split
+import pandas as pd
+
+
+class TextDataset(Dataset):
+    def __init__(self, path, max_length, buffer_size):
+        self.data = pd.read_csv(path, delimiter="\t", header=None)
+        self.max_length = max_length
+        self.buffer_size = buffer_size
+
+        # Augmentations
+        self.aug_char = OcrAug(
+            name="OCR_Aug",
+            aug_char_min=2,
+            aug_char_max=10,
+            aug_char_p=0.3,
+            aug_word_p=0.3,
+            aug_word_min=1,
+            aug_word_max=10,
+        )
+        self.aug_delete = RandomWordAug(
+            action="delete", name="RandomWord_Aug", aug_min=0, aug_max=1, aug_p=0.1
+        )
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        index, sentence = self.data.iloc[idx]
+        aug_sentence = self.aug_char.augment(sentence)
+        aug_sentence = self.aug_delete.augment(aug_sentence)
+        aug_sentence = aug_sentence[0]
+        return sentence, aug_sentence
+
+
+def load_dataset(path, max_length, buffer_size, batch_size, test_ratio=0.2):
+    # Create dataset
+    dataset = TextDataset(path, max_length, buffer_size)
+
+    # Calculate split sizes
+    total_size = len(dataset)
+    test_size = int(total_size * test_ratio)
+    train_size = total_size - test_size
+
+    # Split dataset into train and test
+    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
+
+    # Create dataloaders
+    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
+
+    return train_dataloader, test_dataloader
+
+
+def test():
+    train_dataloader, test_dataloader = load_dataset(
+        "../datasets/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-sentences.txt",
+        100,
+        100,
+        32,
+        test_ratio=0.2,
+    )
+    for batch in train_dataloader:
+        for sentence, aug_sentence in zip(batch[0], batch[1]):
+            print(f"sentence: {sentence} | aug_sentence: {aug_sentence}")
+
+
+if __name__ == "__main__":
+    test()
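Both nlpaug augmenters run per sample inside `__getitem__`, so augmentation cost sits directly on the data-loading path. If that becomes a bottleneck, one common option (not part of this change; the worker count below is only illustrative) is to let the DataLoader fan the work out to worker processes:

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,  # __getitem__, and with it the augmentation, runs in worker processes
    )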
ft.py
@@ -1 +1,88 @@
 #! /usr/bin/env python3
+
+from transformers import BartForConditionalGeneration, BartTokenizer, AdamW
+import torch
+from dl import load_dataset
+from tqdm import tqdm
+
+# Enable cudnn optimizations
+torch.backends.cudnn.benchmark = True
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# load tokenizer and model
+tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
+model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
+model.to(device)
+
+# set up optimizer
+optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
+
+# Initialize Amp. This should be optional and should not affect computation if not available
+try:
+    from torch.cuda.amp import GradScaler, autocast
+    scaler = GradScaler()
+except ImportError:
+    # If Amp is not available, we'll simply define a dummy context manager
+    class autocast:
+        def __enter__(self):
+            pass
+        def __exit__(self, *args):
+            pass
+    scaler = None  # We won't use a scaler if we don't have Amp
+
+def train_model(dataloader):
+    model.train()
+    total_loss = 0
+    print("Training model...")
+    for batch in tqdm(dataloader):
+        optimizer.zero_grad()
+
+        inputs = tokenizer(batch[1], return_tensors="pt", padding=True, truncation=True, max_length=512)
+        inputs.to(device)
+        labels = tokenizer(batch[0], return_tensors="pt", padding=True, truncation=True, max_length=512)
+        labels.to(device)
+
+        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=labels["input_ids"])
+
+        loss = outputs.loss
+        loss.backward()
+
+        optimizer.step()
+        total_loss += loss.item()
+
+    avg_train_loss = total_loss / len(dataloader)
+    return avg_train_loss
+
+def test_model(dataloader):
+    model.eval()
+    total_loss = 0
+    print("Testing model...")
+    for batch in tqdm(dataloader):
+        with torch.no_grad():
+            inputs = tokenizer(batch[1], return_tensors="pt", padding=True, truncation=True, max_length=512)
+            inputs.to(device)
+            labels = tokenizer(batch[0], return_tensors="pt", padding=True, truncation=True, max_length=512)
+            labels.to(device)
+            outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=labels["input_ids"])
+            loss = outputs.loss
+            total_loss += loss.item()
+
+    avg_test_loss = total_loss / len(dataloader)
+    return avg_test_loss
+
+def train():
+    train_dataloader, test_dataloader = load_dataset(
+        "../datasets/deu_mixed-typical_2011_1M/deu_mixed-typical_2011_1M-sentences.txt",
+        100, 100, 1, test_ratio=0.2
+    )
+    num_epochs = 3
+    for epoch in range(num_epochs):
+        avg_train_loss = train_model(train_dataloader)
+        print(f"Train loss for epoch {epoch+1}: {avg_train_loss}")
+
+        avg_test_loss = test_model(test_dataloader)
+        print(f"Test loss for epoch {epoch+1}: {avg_test_loss}")
+
+if __name__ == "__main__":
+    train()
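One gap worth flagging in the new ft.py: `GradScaler` is created but neither it nor `autocast` is used in `train_model`, so the guarded import currently has no effect on training. A minimal sketch of how the batch loop could use them when AMP is available (my reading of the intent, not what this commit does):

    # inside train_model's loop, after inputs/labels are tokenized and moved to device
    if scaler is not None:
        with autocast():
            outputs = model(input_ids=inputs["input_ids"],
                            attention_mask=inputs["attention_mask"],
                            labels=labels["input_ids"])
        scaler.scale(outputs.loss).backward()   # scale loss to avoid fp16 underflow
        scaler.step(optimizer)
        scaler.update()
    else:
        outputs = model(input_ids=inputs["input_ids"],
                        attention_mask=inputs["attention_mask"],
                        labels=labels["input_ids"])
        outputs.loss.backward()
        optimizer.step()
    total_loss += outputs.loss.item()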