Fix the parsing script
parent
72163a9016
commit
cb3c07b428
14
parse.py
14
parse.py
|
@ -7,6 +7,7 @@ from tqdm import tqdm
|
|||
import argparse
|
||||
from typing import List
|
||||
import warnings
|
||||
import re
|
||||
|
||||
|
||||
class XMLProcessor:
|
||||
|
@ -17,6 +18,7 @@ class XMLProcessor:
|
|||
self.model: XLMRobertaForMaskedLM = None
|
||||
self.tokenizer: XLMRobertaTokenizer = None
|
||||
self.device: torch.device = None
|
||||
self.documents: dict = {}
|
||||
|
||||
def prepare_model(self) -> None:
|
||||
model_name: str = "xlm-roberta-base"
|
||||
|
@ -29,6 +31,14 @@ class XMLProcessor:
|
|||
self.model = self.model.to(self.device)
|
||||
self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
|
||||
|
||||
def contains_number(self, string: str) -> bool:
    """Report whether ``string`` contains at least one decimal digit.

    Args:
        string: Text to scan.

    Returns:
        True if a digit occurs anywhere in ``string``, otherwise False
        (including for the empty string).
    """
    # One digit is enough to decide, so matching a whole run of digits
    # is unnecessary; the search stops at the first hit either way.
    return re.search(r"\d", string) is not None
|
||||
|
||||
def compute_perplexity(self, text: str) -> float:
|
||||
max_length = 512 # XLM-Roberta's maximum sequence length
|
||||
|
||||
|
@ -53,7 +63,7 @@ class XMLProcessor:
|
|||
inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()}
|
||||
outputs = self.model(**inputs, labels=inputs["input_ids"])
|
||||
|
||||
total_loss += torch.exp(outputs.loss).item()
|
||||
total_loss += outputs.loss.item()
|
||||
total_count += 1
|
||||
|
||||
# Process the remaining text
|
||||
|
@ -101,6 +111,8 @@ class XMLProcessor:
|
|||
num_sentences = 0
|
||||
for s in seg.findall(".//default:s", ns):
|
||||
sentence = " ".join([w.text for w in s.findall(".//default:w", ns)])
|
||||
if len(sentence) < 20 and self.contains_number(sentence):
|
||||
continue
|
||||
perplexity = self.compute_perplexity(sentence)
|
||||
s.set("perplexity", str(perplexity))
|
||||
segment_perplexity += perplexity
|
||||
|
|
Binary file not shown.
Before Width: | Height: | Size: 543 KiB After Width: | Height: | Size: 529 KiB |
Binary file not shown.
Before Width: | Height: | Size: 18 KiB After Width: | Height: | Size: 18 KiB |
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue