"""Align raw sentences with their manually corrected counterparts.

For every ``*.corrected`` file under the current directory, each sentence of the
corresponding raw text file is paired with the closest sentence (by edit
distance) in the corrected file, and the resulting pairs are written to
``hason_two_out.json``.
"""

import concurrent.futures
import json
import math
import os
from collections import Counter

from nltk.metrics import edit_distance
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm


def count_words(filename):
    """Return the number of whitespace-separated words in a file."""
    with open(filename) as f:
        contents = f.read()
    words = contents.split()
    return len(words)


def calculate_entropy(sentence):
    """Return the Shannon entropy (in bits) of a sentence's word distribution."""
    words = word_tokenize(sentence)
    word_count = len(words)
    word_freq = Counter(words)
    word_probabilities = {word: count / word_count for word, count in word_freq.items()}
    entropy = -sum(prob * math.log(prob, 2) for prob in word_probabilities.values())
    return entropy


def process_file(filename):
    """Pair each sentence of the raw text file with its closest corrected sentence.

    ``filename`` is the path to a ``*.corrected`` file; the raw text is expected
    at the same path without the ``.corrected`` suffix.
    """
    gt_file = filename
    text_file = gt_file.replace(".corrected", "")
    hason = []

    with open(text_file) as f:
        t_contents = f.read().strip()
    with open(gt_file) as f:
        gt_contents = f.read().strip()

    t_contents = sent_tokenize(t_contents)
    gt_contents = sent_tokenize(gt_contents)

    for t in t_contents:
        min_dist = float("inf")
        corresponding_sentence = ""
        for gt in gt_contents:
            # Skip very short fragments that are unlikely to be real sentences.
            if len(gt) < 8:
                continue
            dist = edit_distance(t, gt)
            if dist < min_dist:
                min_dist = dist
                corresponding_sentence = gt
        hason.append(
            {
                "sentence": t,
                "corrected": corresponding_sentence,
            }
        )
    return hason


def main():
    # Collect every *.corrected file under the current directory, keeping the
    # full path so process_file can open files in subdirectories as well.
    fs = []
    for root, _dirs, files in os.walk("."):
        fs.extend(os.path.join(root, f) for f in files if f.endswith(".corrected"))

    hason = []
    # Process files in parallel; executor.map preserves the input order.
    with concurrent.futures.ProcessPoolExecutor(os.cpu_count()) as executor:
        for _filename, result in tqdm(
            zip(fs, executor.map(process_file, fs)),
            total=len(fs),
            desc="Processing files",
            unit="files",
        ):
            hason.extend(result)

    with open("hason_two_out.json", "w") as f:
        json.dump(hason, f, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    main()