# fine-tune/dataset/append_slovene_corpus.py

import xml.etree.ElementTree as ET


def extract_sentences(xml_string):
    """Extract plain-text sentences from a TEI-namespaced XML document.

    Every ``<s>`` element in the ``http://www.tei-c.org/ns/1.0`` namespace is
    treated as one sentence. Only its direct children are inspected:

    * ``<w>`` (word) texts become separate, space-joined tokens;
    * ``<pc>`` (punctuation) texts are glued onto the preceding token, or
      start the sentence when there is no preceding token.

    Children without text (e.g. an empty ``<w/>``) are ignored instead of
    raising ``TypeError``. Sentences containing ``"``, ``«`` or ``»``, and
    sentences shorter than 10 characters, are left out of the result.

    Args:
        xml_string: The XML document as a string (or bytes).

    Returns:
        A list of sentence strings in document order.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_string`` is not
            well-formed XML.
    """
    ns = {"default": "http://www.tei-c.org/ns/1.0"}
    root = ET.fromstring(xml_string)

    sentences = []

    for sentence in root.findall(".//default:s", ns):
        tokens = []
        for element in sentence:
            text = element.text
            if not text:
                # Empty elements have text None; nothing to contribute.
                continue

            # ElementTree tags look like "{namespace}name"; compare the local
            # name exactly so unrelated tags that merely end in "w"/"pc"
            # (e.g. "row") are not mistaken for words or punctuation.
            local_name = element.tag.rsplit("}", 1)[-1]
            if local_name == "w":
                tokens.append(text)
            elif local_name == "pc":
                if tokens:
                    # Attach punctuation to the previous token (no space).
                    tokens[-1] += text
                else:
                    tokens.append(text)

        sentence_text = " ".join(tokens)

        if any(quote in sentence_text for quote in ('"', "«", "»")):
            continue
        if len(sentence_text) < 10:
            continue
        sentences.append(sentence_text)

    return sentences


# Input TEI corpus, and the "<number>\t<sentence>" file that gets extended.
SLOVENE_XML_PATH = "./jos1M-sl.body.xml"
SENTENCES_PATH = "./deu_mixed-typical_2011_1M-sentences.txt"

# Open with an explicit encoding so the result does not depend on the
# platform's locale default.
# NOTE(review): assumes both files are UTF-8 — confirm against the data.
with open(SLOVENE_XML_PATH, "r", encoding="utf-8") as f:
    xml_text = f.read()


# Find the last non-blank line by streaming the file instead of loading every
# line into memory, and remember whether the file ends with a newline so the
# appended records start on a fresh line.
last_line = ""
ends_with_newline = True
with open(SENTENCES_PATH, "r", encoding="utf-8") as f:
    for line in f:
        ends_with_newline = line.endswith("\n")
        if line.strip():
            last_line = line

# The number is the first tab-separated field; maxsplit=1 keeps this working
# when the sentence itself contains a tab. An empty file starts from 0.
number = int(last_line.split("\t", 1)[0]) if last_line else 0
print(number)


sentences = extract_sentences(xml_text)

with open(SENTENCES_PATH, "a", encoding="utf-8") as f:
    if not ends_with_newline:
        # Avoid gluing the first new record onto the existing last line.
        f.write("\n")
    # Continue the numbering right after the last existing record.
    f.writelines(
        f"{number + i}\t{sentence}\n"
        for i, sentence in enumerate(sentences, start=1)
    )