# Aligns each sentence of a raw text file with its closest match in the
# corresponding ".corrected" file, using edit distance.
import concurrent.futures
import json
import math
import os
from collections import Counter

from nltk.metrics import edit_distance
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
def count_words(filename):
    """Return the number of whitespace-separated words in *filename*.

    Reads the file line by line, so arbitrarily large files need not fit
    in memory.  The count is identical to splitting the whole contents at
    once, because newlines are themselves whitespace.
    """
    with open(filename) as f:
        return sum(len(line.split()) for line in f)


def calculate_entropy(sentence):
    """Return the Shannon entropy (in bits) of the word distribution of *sentence*.

    The sentence is tokenized with nltk's ``word_tokenize``; the entropy is
    H = -sum(p * log2(p)) over the relative frequency p of each token.
    An empty or whitespace-only sentence yields 0.0.
    """
    words = word_tokenize(sentence)
    total = len(words)
    if total == 0:
        return 0.0  # guard: avoids division by zero on empty input
    freqs = Counter(words)
    # math.log2 is more accurate than math.log(x, 2) (one C call, no division)
    return -sum((n / total) * math.log2(n / total) for n in freqs.values())


def process_file(filename):
    """Pair every sentence of a raw text file with its closest corrected sentence.

    *filename* is a ".corrected" file; its raw counterpart is the same path
    without that suffix.  Both files are sentence-tokenized, and each raw
    sentence is matched to the corrected sentence with the smallest edit
    distance.  Corrected candidates shorter than 8 characters are skipped
    (presumably tokenization noise — TODO confirm the threshold's origin).

    Returns a list of ``{"sentence": raw, "corrected": best_match}`` dicts;
    "corrected" is "" when no candidate qualified.
    """
    gt_file = filename
    text_file = gt_file.replace(".corrected", "")

    with open(text_file) as f:
        raw_sentences = sent_tokenize(f.read().strip())
    with open(gt_file) as f:
        corrected_sentences = sent_tokenize(f.read().strip())

    pairs = []
    for raw in raw_sentences:
        # math.inf replaces the old magic sentinel 10000, which could have
        # silently rejected every candidate for very long sentences.
        min_dist = math.inf
        corresponding_sentence = ""
        for gt in corrected_sentences:
            if len(gt) < 8:
                continue
            # edit_distance is O(len*len); compute it once per candidate,
            # not twice as the original did.
            dist = edit_distance(raw, gt)
            if dist < min_dist:
                min_dist = dist
                corresponding_sentence = gt
        pairs.append(
            {
                "sentence": raw,
                "corrected": corresponding_sentence,
            }
        )
    return pairs


def main():
    """Align raw/corrected sentence pairs for every ``*.corrected`` file in
    the current directory and dump the result to ``hason_two_out.json``.

    Files are processed in parallel, one worker per CPU core.
    """
    # Only the current directory's own files are wanted: process_file opens
    # the bare names as-is, so files from subdirectories (whose paths would
    # need joining) must not leak in.  The original ``for ... in os.walk``
    # loop kept whichever directory os.walk yielded *last* — a bug.
    fs = [f for f in next(os.walk("."))[2] if f.endswith(".corrected")]

    hason = []
    with concurrent.futures.ProcessPoolExecutor(os.cpu_count()) as executor:
        # executor.map preserves input order; tqdm just tracks progress.
        for result in tqdm(
            executor.map(process_file, fs),
            total=len(fs),
            desc="Processing files",
            unit="files",
        ):
            hason.extend(result)

    with open("hason_two_out.json", "w") as f:
        json.dump(hason, f, indent=4, ensure_ascii=False)


# Script entry point: run the alignment only when executed directly,
# not when imported (also required for ProcessPoolExecutor workers).
if __name__ == "__main__":
    main()