Add results of perplexity scanning

master
Gašper Spagnolo 2023-07-17 11:08:19 +02:00
parent 1f64ab42be
commit 72163a9016
No known key found for this signature in database
GPG Key ID: 2EA0738CC1EFEEB7
8 changed files with 157 additions and 0 deletions

5
Makefile Normal file
View File

@ -0,0 +1,5 @@
rate:
python rate.py --input_dir=Kranjska-xml-perplexity --output_dir=Kranjska-xml-perplexity
parse:
python parse.py --input_dir=Kranjska-xml --output_dir=Kranjska-xml-perplexity

View File

@ -113,6 +113,7 @@ class XMLProcessor:
if num_segments != 0:
root.set("perplexity", str(document_perplexity / num_segments))
self.documents[xml_path] = float(document_perplexity / num_segments)
xml_str = ET.tostring(root, encoding="unicode", method="xml")
dom = minidom.parseString(xml_str)

Binary file not shown.

After

Width:  |  Height:  |  Size: 543 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

1
plots/docs.json Normal file

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

150
rate.py Normal file
View File

@ -0,0 +1,150 @@
import os
import xml.etree.ElementTree as ET
import argparse
from typing import List
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
class Rate:
def __init__(self, args: argparse.Namespace):
self.input_dir: str = args.input_dir
self.output_dir: str = args.output_dir
self.use_cpu: bool = args.cpu
self.docs: dict = {}
self.plots_dir: str = "./plots"
os.makedirs(self.plots_dir, exist_ok=True)
def find_xmls_recursively(self, input_dir: str, xmls: List[str]) -> List[str]:
for entry in os.scandir(input_dir):
if entry.is_file() and entry.name.endswith(".xml"):
xmls.append(entry.path)
elif entry.is_dir():
self.find_xmls_recursively(entry.path, xmls)
return xmls
def find_xmls(self) -> List[str]:
return self.find_xmls_recursively(self.input_dir, [])
def get_doc_perplexity(self, filepath: str) -> float:
tree = ET.parse(filepath)
root = tree.getroot()
try:
perplexity = float(root.attrib["perplexity"])
except KeyError:
perplexity = -100
return perplexity
def parse_docs(self, xmls: List[str]) -> None:
docs = {}
for xml in tqdm(xmls):
perplexity = self.get_doc_perplexity(xml)
docs[xml] = perplexity
self.docs = docs
def sort_docs(self) -> None:
self.docs = dict(sorted(self.docs.items(), key=lambda item: item[1]))
def save_docs(self) -> None:
with open(os.path.join(self.plots_dir, "docs.json"), "w") as f:
json.dump(self.docs, f)
def histogram_of_perplexities(self) -> None:
_, values = zip(*self.docs.items())
# Create histogram
plt.figure(figsize=(10, 6))
plt.hist(values, bins=20, edgecolor="black")
plt.xlabel("Perplexity")
plt.ylabel("Number of Documents")
plt.title("Histogram of Document Perplexities")
plt.tight_layout()
# Save figure
plt.savefig(os.path.join(self.plots_dir, "histogram_of_perplexities.png"))
plt.close()
plt.clf()
def cumulative_distribution_of_perplexities(self) -> None:
_, values = zip(*self.docs.items())
# Create cumulative distribution plot
plt.figure(figsize=(10, 6))
plt.hist(values, bins=20, cumulative=True, edgecolor="black")
plt.xlabel("Perplexity")
plt.ylabel("Cumulative Number of Documents")
plt.title("Cumulative Distribution of Document Perplexities")
plt.tight_layout()
# Save figure
plt.savefig(
os.path.join(self.plots_dir, "cumulative_distribution_of_perplexities.png")
)
plt.close()
plt.clf()
def boxplot_of_perplexities(self) -> None:
_, values = zip(*self.docs.items())
# Create boxplot
plt.figure(figsize=(10, 6))
plt.boxplot(values)
plt.ylabel("Perplexity")
plt.title("Boxplot of Document Perplexities")
plt.tight_layout()
# Save figure
plt.savefig(os.path.join(self.plots_dir, "boxplot_of_perplexities.png"))
plt.close()
plt.clf()
def barplot_of_perplexities(self) -> None:
# Prepare data for visualization
labels, values = zip(*self.docs.items())
indexes = np.arange(len(labels))
# Create bar plot
plt.figure(figsize=(20, 10))
plt.bar(indexes, values, align="center")
plt.xticks(indexes, labels, rotation="vertical")
plt.ylabel("Perplexity")
plt.xlabel("Document")
plt.title("Perplexity of documents")
plt.tight_layout()
# Save figure
plt.savefig(os.path.join(self.plots_dir, "barplot_of_perplexities.png"))
plt.close()
plt.clf()
def pltt(self) -> None:
self.histogram_of_perplexities()
self.cumulative_distribution_of_perplexities()
self.boxplot_of_perplexities()
self.barplot_of_perplexities()
def main(self):
xmls = self.find_xmls()
self.parse_docs(xmls)
self.sort_docs()
self.save_docs()
self.pltt()
exit(0)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process XML files.")
parser.add_argument(
"--input_dir", type=str, help="Input directory containing XML files."
)
parser.add_argument(
"--output_dir", type=str, help="Output directory to save processed files."
)
parser.add_argument("--cpu", action="store_true", help="Force usage of CPU.")
args = parser.parse_args()
rater = Rate(args)
rater.main()