fine-tune/ocr_test_db/count_words.py

import concurrent.futures
import json
import math
import os
from collections import Counter

from nltk.metrics import edit_distance
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
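
# Build a JSON dataset of raw-OCR / corrected sentence pairs: every
# *.corrected file is treated as the ground truth for the file of the same
# name without the suffix, and each raw sentence is matched to the corrected
# sentence at the smallest Levenshtein edit distance.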


def count_words(filename):
    """Return the number of whitespace-separated words in a file (unused by main)."""
    with open(filename) as f:
        contents = f.read()
    return len(contents.split())


def calculate_entropy(sentence):
    """Shannon entropy (in bits) of a sentence's word-frequency distribution (unused by main)."""
    words = word_tokenize(sentence)
    word_count = len(words)
    word_freq = Counter(words)
    word_probabilities = {word: count / word_count for word, count in word_freq.items()}
    return -sum(prob * math.log2(prob) for prob in word_probabilities.values())
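
# For example, calculate_entropy("the cat sat on the mat") tokenizes to six
# words with "the" appearing twice, so the entropy is
# -(1/3) * log2(1/3) - 4 * (1/6) * log2(1/6) ≈ 2.25 bits.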


def process_file(filename):
    """Pair each raw OCR sentence with its closest corrected sentence.

    `filename` is a ground-truth file ending in ".corrected"; the raw OCR
    text is expected at the same path without that suffix.
    """
    gt_file = filename
    text_file = gt_file.replace(".corrected", "")
    hason = []
    with open(text_file) as f:
        t_contents = f.read().strip()
    with open(gt_file) as f:
        gt_contents = f.read().strip()
    t_sentences = sent_tokenize(t_contents)
    gt_sentences = sent_tokenize(gt_contents)
    for t in t_sentences:
        min_dist = float("inf")
        corresponding_sentence = ""
        for gt in gt_sentences:
            # Skip very short ground-truth sentences, which are usually
            # tokenizer fragments rather than real sentences.
            if len(gt) < 8:
                continue
            dist = edit_distance(t, gt)  # compute once, not twice per pair
            if dist < min_dist:
                min_dist = dist
                corresponding_sentence = gt
        hason.append(
            {
                "sentence": t,
                "corrected": corresponding_sentence,
            }
        )
    return hason
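
# For a hypothetical pair of files page_001.txt / page_001.txt.corrected,
# process_file("./page_001.txt.corrected") reads both files and returns a list
# of {"sentence": <raw OCR sentence>, "corrected": <closest ground-truth sentence>} dicts.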


def main():
    hason = []
    # Collect every .corrected file under the current directory, keeping the
    # full path so that files in subdirectories open correctly.
    fs = []
    for root, _dirs, files in os.walk("."):
        fs.extend(os.path.join(root, f) for f in files if f.endswith(".corrected"))
    # Process the files in parallel, one worker per CPU core.
    with concurrent.futures.ProcessPoolExecutor(os.cpu_count()) as executor:
        results = executor.map(process_file, fs)
        for result in tqdm(results, total=len(fs), desc="Processing files", unit="files"):
            hason.extend(result)
    with open("hason_two_out.json", "w") as f:
        json.dump(hason, f, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    main()
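
# Illustrative shape of a record in hason_two_out.json (values are made up):
# {
#     "sentence": "Thc quick brovvn fox jumps ovcr the lazy dog.",
#     "corrected": "The quick brown fox jumps over the lazy dog."
# }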