fine-tune/ocr_test_db/count_words.py

import concurrent.futures
import json
import math
import os
from collections import Counter

from nltk.metrics import edit_distance
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
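
# Build a JSON dataset of raw-OCR / corrected sentence pairs: every
# *.corrected file is treated as the ground truth for the file of the same
# name without the suffix, and each raw sentence is matched to the corrected
# sentence at the smallest Levenshtein edit distance.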


def count_words(filename):
    """Return the number of whitespace-separated words in a file (unused by main)."""
    with open(filename) as f:
        contents = f.read()
    return len(contents.split())


def calculate_entropy(sentence):
    """Shannon entropy (in bits) of a sentence's word-frequency distribution (unused by main)."""
    words = word_tokenize(sentence)
    word_count = len(words)
    word_freq = Counter(words)
    word_probabilities = {word: count / word_count for word, count in word_freq.items()}
    return -sum(prob * math.log2(prob) for prob in word_probabilities.values())
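
# For example, calculate_entropy("the cat sat on the mat") tokenizes to six
# words with "the" appearing twice, so the entropy is
# -(1/3) * log2(1/3) - 4 * (1/6) * log2(1/6) ≈ 2.25 bits.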


def process_file(filename):
    """Pair each raw OCR sentence with its closest corrected sentence.

    `filename` is a ground-truth file ending in ".corrected"; the raw OCR
    text is expected at the same path without that suffix.
    """
    gt_file = filename
    text_file = gt_file.replace(".corrected", "")
    hason = []
    with open(text_file) as f:
        t_contents = f.read().strip()
    with open(gt_file) as f:
        gt_contents = f.read().strip()
    t_sentences = sent_tokenize(t_contents)
    gt_sentences = sent_tokenize(gt_contents)
    for t in t_sentences:
        min_dist = float("inf")
        corresponding_sentence = ""
        for gt in gt_sentences:
            # Skip very short ground-truth sentences, which are usually
            # tokenizer fragments rather than real sentences.
            if len(gt) < 8:
                continue
            dist = edit_distance(t, gt)  # compute once, not twice per pair
            if dist < min_dist:
                min_dist = dist
                corresponding_sentence = gt
        hason.append(
            {
                "sentence": t,
                "corrected": corresponding_sentence,
            }
        )
    return hason
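
# For a hypothetical pair of files page_001.txt / page_001.txt.corrected,
# process_file("./page_001.txt.corrected") reads both files and returns a list
# of {"sentence": <raw OCR sentence>, "corrected": <closest ground-truth sentence>} dicts.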


def main():
    hason = []
    # Collect every .corrected file under the current directory, keeping the
    # full path so that files in subdirectories open correctly.
    fs = []
    for root, _dirs, files in os.walk("."):
        fs.extend(os.path.join(root, f) for f in files if f.endswith(".corrected"))
    # Process the files in parallel, one worker per CPU core.
    with concurrent.futures.ProcessPoolExecutor(os.cpu_count()) as executor:
        results = executor.map(process_file, fs)
        for result in tqdm(results, total=len(fs), desc="Processing files", unit="files"):
            hason.extend(result)
    with open("hason_two_out.json", "w") as f:
        json.dump(hason, f, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    main()
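
# Illustrative shape of a record in hason_two_out.json (values are made up):
# {
#     "sentence": "Thc quick brovvn fox jumps ovcr the lazy dog.",
#     "corrected": "The quick brown fox jumps over the lazy dog."
# }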