Fix the parsing script

2023-08-03 13:36:16 +02:00 · 2023-08-03 13:36:16 +02:00 · cb3c07b428
parent 72163a9016
commit cb3c07b428
4 changed files with 14 additions and 2 deletions
--- a/parse.py
+++ b/parse.py
@ -7,6 +7,7 @@ from tqdm import tqdm
 import argparse
 from typing import List
 import warnings
 import re
 class XMLProcessor:
@ -17,6 +18,7 @@ class XMLProcessor:
        self.model: XLMRobertaForMaskedLM = None
        self.tokenizer: XLMRobertaTokenizer = None
        self.device: torch.device = None
        self.documents: dict = {}
    def prepare_model(self) -> None:
        model_name: str = "xlm-roberta-base"
@ -29,6 +31,14 @@ class XMLProcessor:
        self.model = self.model.to(self.device)
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
    def contains_number(self, string):
        """Report whether ``string`` contains at least one digit.

        Performs an unanchored regular-expression search for a run of
        digits anywhere in ``string``.

        Args:
            string: The text to inspect.

        Returns:
            True when a run of digits is found, False otherwise.
        """
        digit_run = re.compile(r'\d+')
        # search() yields None when no digits occur anywhere in the text.
        return digit_run.search(string) is not None
    def compute_perplexity(self, text: str) -> float:
        max_length = 512  # XLM-Roberta's maximum sequence length
@ -53,7 +63,7 @@ class XMLProcessor:
            inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()}
            outputs = self.model(**inputs, labels=inputs["input_ids"])
-            total_loss += torch.exp(outputs.loss).item()
+            total_loss += outputs.loss.item()
            total_count += 1
        # Process the remaining text
@ -101,6 +111,8 @@ class XMLProcessor:
            num_sentences = 0
            for s in seg.findall(".//default:s", ns):
                sentence = " ".join([w.text for w in s.findall(".//default:w", ns)])
                if len(sentence) < 20 and self.contains_number(sentence):
                    continue
                perplexity = self.compute_perplexity(sentence)
                s.set("perplexity", str(perplexity))
                segment_perplexity += perplexity
--- a/plots/barplot_of_perplexities.png
+++ b/plots/barplot_of_perplexities.png
--- a/plots/boxplot_of_perplexities.png
+++ b/plots/boxplot_of_perplexities.png
--- a/plots/docs.json
+++ b/plots/docs.json