import argparse
import json
import os
import xml.etree.ElementTree as ET
from typing import List

import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm


class Rate:
    """Collect per-document perplexity values from XML files and plot them.

    Each XML file found under ``input_dir`` contributes one entry to
    ``self.docs`` (file path -> perplexity). The mapping is then sorted,
    dumped to ``docs.json`` and rendered as several PNG plots, all written
    to ``self.plots_dir``.
    """

    # Value recorded for documents whose root element has no "perplexity"
    # attribute.
    MISSING_PERPLEXITY = -100

    def __init__(self, args: argparse.Namespace):
        """Store the CLI options and make sure the plots directory exists.

        Args:
            args: Parsed arguments providing ``input_dir``, ``output_dir``
                and ``cpu``.
        """
        self.input_dir: str = args.input_dir
        # output_dir and use_cpu are stored but not read by any method of
        # this class.
        self.output_dir: str = args.output_dir
        self.use_cpu: bool = args.cpu
        self.docs: dict = {}
        self.plots_dir: str = "./plots"
        os.makedirs(self.plots_dir, exist_ok=True)

    def find_xmls_recursively(self, input_dir: str, xmls: List[str]) -> List[str]:
        """Append every ``*.xml`` file path below ``input_dir`` to ``xmls``.

        Args:
            input_dir: Directory to scan; sub-directories are descended into.
            xmls: Accumulator list, extended in place.

        Returns:
            The same ``xmls`` list that was passed in.
        """
        for entry in os.scandir(input_dir):
            if entry.is_file() and entry.name.endswith(".xml"):
                xmls.append(entry.path)
            elif entry.is_dir():
                self.find_xmls_recursively(entry.path, xmls)
        return xmls

    def find_xmls(self) -> List[str]:
        """Return the paths of all XML files below ``self.input_dir``."""
        return self.find_xmls_recursively(self.input_dir, [])

    def get_doc_perplexity(self, filepath: str) -> float:
        """Read the ``perplexity`` attribute of the XML root element.

        Args:
            filepath: Path of the XML file to parse.

        Returns:
            The attribute converted to ``float``, or ``MISSING_PERPLEXITY``
            (-100) when the root element has no such attribute.

        Raises:
            xml.etree.ElementTree.ParseError: If the file is not valid XML.
            ValueError: If the attribute is present but not a valid float.
        """
        root = ET.parse(filepath).getroot()
        try:
            return float(root.attrib["perplexity"])
        except KeyError:
            return self.MISSING_PERPLEXITY

    def parse_docs(self, xmls: List[str]) -> None:
        """Populate ``self.docs`` with ``path -> perplexity`` for each file.

        Args:
            xmls: Paths of the XML files to read.
        """
        self.docs = {xml: self.get_doc_perplexity(xml) for xml in tqdm(xmls)}

    def sort_docs(self) -> None:
        """Re-order ``self.docs`` by ascending perplexity."""
        self.docs = dict(sorted(self.docs.items(), key=lambda item: item[1]))

    def save_docs(self) -> None:
        """Write ``self.docs`` as JSON to ``docs.json`` in the plots directory."""
        target = os.path.join(self.plots_dir, "docs.json")
        with open(target, "w", encoding="utf-8") as f:
            json.dump(self.docs, f)

    def _perplexity_values(self) -> List[float]:
        """Return the perplexity values in the current order of ``self.docs``."""
        return list(self.docs.values())

    def _save_figure(self, fig, filename: str) -> None:
        """Apply a tight layout, save the current figure and release it.

        Args:
            fig: The figure created for this plot; closed after saving.
            filename: File name (inside ``self.plots_dir``) to write to.
        """
        plt.tight_layout()
        plt.savefig(os.path.join(self.plots_dir, filename))
        # Close this specific figure. Calling plt.clf() after closing would
        # implicitly create a fresh empty figure and leave it open.
        plt.close(fig)

    def histogram_of_perplexities(self) -> None:
        """Save a 20-bin histogram of the perplexities (no-op without data)."""
        values = self._perplexity_values()
        if not values:
            return

        fig = plt.figure(figsize=(10, 6))
        plt.hist(values, bins=20, edgecolor="black")
        plt.xlabel("Perplexity")
        plt.ylabel("Number of Documents")
        plt.title("Histogram of Document Perplexities")
        self._save_figure(fig, "histogram_of_perplexities.png")

    def cumulative_distribution_of_perplexities(self) -> None:
        """Save a cumulative 20-bin histogram (no-op without data)."""
        values = self._perplexity_values()
        if not values:
            return

        fig = plt.figure(figsize=(10, 6))
        plt.hist(values, bins=20, cumulative=True, edgecolor="black")
        plt.xlabel("Perplexity")
        plt.ylabel("Cumulative Number of Documents")
        plt.title("Cumulative Distribution of Document Perplexities")
        self._save_figure(fig, "cumulative_distribution_of_perplexities.png")

    def boxplot_of_perplexities(self) -> None:
        """Save a boxplot of the perplexities (no-op without data)."""
        values = self._perplexity_values()
        if not values:
            return

        fig = plt.figure(figsize=(10, 6))
        plt.boxplot(values)
        plt.ylabel("Perplexity")
        plt.title("Boxplot of Document Perplexities")
        self._save_figure(fig, "boxplot_of_perplexities.png")

    def barplot_of_perplexities(self) -> None:
        """Save a bar chart with one bar per document (no-op without data)."""
        if not self.docs:
            return

        labels = list(self.docs.keys())
        values = self._perplexity_values()
        indexes = np.arange(len(labels))

        fig = plt.figure(figsize=(20, 10))
        plt.bar(indexes, values, align="center")
        plt.xticks(indexes, labels, rotation="vertical")
        plt.ylabel("Perplexity")
        plt.xlabel("Document")
        plt.title("Perplexity of documents")
        self._save_figure(fig, "barplot_of_perplexities.png")

    def pltt(self) -> None:
        """Produce every plot: histogram, cumulative histogram, box and bar."""
        self.histogram_of_perplexities()
        self.cumulative_distribution_of_perplexities()
        self.boxplot_of_perplexities()
        self.barplot_of_perplexities()

    def main(self):
        """Run the full pipeline, then terminate the process.

        Raises:
            SystemExit: Always, with exit code 0, once all steps finished.
        """
        xmls = self.find_xmls()
        self.parse_docs(xmls)
        self.sort_docs()
        self.save_docs()
        self.pltt()
        # Same effect as exit(0), without depending on the helper that the
        # site module injects into builtins.
        raise SystemExit(0)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process XML files.")
    parser.add_argument(
        "--input_dir", type=str, help="Input directory containing XML files."
    )
    parser.add_argument(
        "--output_dir", type=str, help="Output directory to save processed files."
    )
    parser.add_argument("--cpu", action="store_true", help="Force usage of CPU.")
    args = parser.parse_args()

    rater = Rate(args)
    rater.main()