Add results of perplexity scanning
parent
1f64ab42be
commit
72163a9016
|
@ -0,0 +1,5 @@
|
|||
rate:
|
||||
python rate.py --input_dir=Kranjska-xml-perplexity --output_dir=Kranjska-xml-perplexity
|
||||
|
||||
parse:
|
||||
python parse.py --input_dir=Kranjska-xml --output_dir=Kranjska-xml-perplexity
|
1
parse.py
1
parse.py
|
@ -113,6 +113,7 @@ class XMLProcessor:
|
|||
|
||||
if num_segments != 0:
|
||||
root.set("perplexity", str(document_perplexity / num_segments))
|
||||
self.documents[xml_path] = float(document_perplexity / num_segments)
|
||||
|
||||
xml_str = ET.tostring(root, encoding="unicode", method="xml")
|
||||
dom = minidom.parseString(xml_str)
|
||||
|
|
Binary file not shown.
After Width: | Height: | Size: 543 KiB |
Binary file not shown.
After Width: | Height: | Size: 18 KiB |
Binary file not shown.
After Width: | Height: | Size: 26 KiB |
File diff suppressed because one or more lines are too long
Binary file not shown.
After Width: | Height: | Size: 23 KiB |
|
@ -0,0 +1,150 @@
|
|||
import os
|
||||
import xml.etree.ElementTree as ET
|
||||
import argparse
|
||||
from typing import List
|
||||
import json
|
||||
import matplotlib.pyplot as plt
|
||||
from tqdm import tqdm
|
||||
import numpy as np
|
||||
|
||||
|
||||
class Rate:
|
||||
def __init__(self, args: argparse.Namespace):
|
||||
self.input_dir: str = args.input_dir
|
||||
self.output_dir: str = args.output_dir
|
||||
self.use_cpu: bool = args.cpu
|
||||
self.docs: dict = {}
|
||||
self.plots_dir: str = "./plots"
|
||||
os.makedirs(self.plots_dir, exist_ok=True)
|
||||
|
||||
def find_xmls_recursively(self, input_dir: str, xmls: List[str]) -> List[str]:
|
||||
for entry in os.scandir(input_dir):
|
||||
if entry.is_file() and entry.name.endswith(".xml"):
|
||||
xmls.append(entry.path)
|
||||
elif entry.is_dir():
|
||||
self.find_xmls_recursively(entry.path, xmls)
|
||||
return xmls
|
||||
|
||||
def find_xmls(self) -> List[str]:
|
||||
return self.find_xmls_recursively(self.input_dir, [])
|
||||
|
||||
def get_doc_perplexity(self, filepath: str) -> float:
|
||||
tree = ET.parse(filepath)
|
||||
root = tree.getroot()
|
||||
|
||||
try:
|
||||
perplexity = float(root.attrib["perplexity"])
|
||||
except KeyError:
|
||||
perplexity = -100
|
||||
return perplexity
|
||||
|
||||
def parse_docs(self, xmls: List[str]) -> None:
|
||||
docs = {}
|
||||
for xml in tqdm(xmls):
|
||||
perplexity = self.get_doc_perplexity(xml)
|
||||
docs[xml] = perplexity
|
||||
|
||||
self.docs = docs
|
||||
|
||||
def sort_docs(self) -> None:
|
||||
self.docs = dict(sorted(self.docs.items(), key=lambda item: item[1]))
|
||||
|
||||
def save_docs(self) -> None:
|
||||
with open(os.path.join(self.plots_dir, "docs.json"), "w") as f:
|
||||
json.dump(self.docs, f)
|
||||
|
||||
def histogram_of_perplexities(self) -> None:
|
||||
_, values = zip(*self.docs.items())
|
||||
|
||||
# Create histogram
|
||||
plt.figure(figsize=(10, 6))
|
||||
plt.hist(values, bins=20, edgecolor="black")
|
||||
plt.xlabel("Perplexity")
|
||||
plt.ylabel("Number of Documents")
|
||||
plt.title("Histogram of Document Perplexities")
|
||||
plt.tight_layout()
|
||||
# Save figure
|
||||
plt.savefig(os.path.join(self.plots_dir, "histogram_of_perplexities.png"))
|
||||
plt.close()
|
||||
plt.clf()
|
||||
|
||||
def cumulative_distribution_of_perplexities(self) -> None:
|
||||
_, values = zip(*self.docs.items())
|
||||
|
||||
# Create cumulative distribution plot
|
||||
plt.figure(figsize=(10, 6))
|
||||
plt.hist(values, bins=20, cumulative=True, edgecolor="black")
|
||||
plt.xlabel("Perplexity")
|
||||
plt.ylabel("Cumulative Number of Documents")
|
||||
plt.title("Cumulative Distribution of Document Perplexities")
|
||||
plt.tight_layout()
|
||||
|
||||
# Save figure
|
||||
plt.savefig(
|
||||
os.path.join(self.plots_dir, "cumulative_distribution_of_perplexities.png")
|
||||
)
|
||||
plt.close()
|
||||
plt.clf()
|
||||
|
||||
def boxplot_of_perplexities(self) -> None:
|
||||
_, values = zip(*self.docs.items())
|
||||
|
||||
# Create boxplot
|
||||
plt.figure(figsize=(10, 6))
|
||||
plt.boxplot(values)
|
||||
plt.ylabel("Perplexity")
|
||||
plt.title("Boxplot of Document Perplexities")
|
||||
plt.tight_layout()
|
||||
|
||||
# Save figure
|
||||
plt.savefig(os.path.join(self.plots_dir, "boxplot_of_perplexities.png"))
|
||||
plt.close()
|
||||
plt.clf()
|
||||
|
||||
def barplot_of_perplexities(self) -> None:
|
||||
# Prepare data for visualization
|
||||
labels, values = zip(*self.docs.items())
|
||||
indexes = np.arange(len(labels))
|
||||
|
||||
# Create bar plot
|
||||
plt.figure(figsize=(20, 10))
|
||||
plt.bar(indexes, values, align="center")
|
||||
plt.xticks(indexes, labels, rotation="vertical")
|
||||
plt.ylabel("Perplexity")
|
||||
plt.xlabel("Document")
|
||||
plt.title("Perplexity of documents")
|
||||
plt.tight_layout()
|
||||
|
||||
# Save figure
|
||||
plt.savefig(os.path.join(self.plots_dir, "barplot_of_perplexities.png"))
|
||||
plt.close()
|
||||
plt.clf()
|
||||
|
||||
def pltt(self) -> None:
|
||||
self.histogram_of_perplexities()
|
||||
self.cumulative_distribution_of_perplexities()
|
||||
self.boxplot_of_perplexities()
|
||||
self.barplot_of_perplexities()
|
||||
|
||||
def main(self):
|
||||
xmls = self.find_xmls()
|
||||
self.parse_docs(xmls)
|
||||
self.sort_docs()
|
||||
self.save_docs()
|
||||
self.pltt()
|
||||
exit(0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Process XML files.")
|
||||
parser.add_argument(
|
||||
"--input_dir", type=str, help="Input directory containing XML files."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir", type=str, help="Output directory to save processed files."
|
||||
)
|
||||
parser.add_argument("--cpu", action="store_true", help="Force usage of CPU.")
|
||||
args = parser.parse_args()
|
||||
|
||||
rater = Rate(args)
|
||||
rater.main()
|
Loading…
Reference in New Issue