# fine-tune/dataset/append_slovene_corpus.py

import xml.etree.ElementTree as ET


def extract_sentences(xml_string: str) -> list:
    """Extract plain-text sentences from a TEI-encoded XML document.

    Every ``<s>`` element in the TEI namespace becomes one string: the text
    of its direct ``<w>`` (word) children is joined with single spaces, and
    the text of each direct ``<pc>`` (punctuation) child is glued onto the
    preceding token. A ``<pc>`` that opens the sentence becomes a token of
    its own. Children without text are ignored.

    A sentence is dropped when it contains ``"``, ``«`` or ``»``, or when it
    is shorter than 10 characters.

    Args:
        xml_string: The XML document as a string.

    Returns:
        The retained sentences as a list of strings, in document order.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_string`` is not
            well-formed XML.
    """
    ns = {"default": "http://www.tei-c.org/ns/1.0"}
    excluded_chars = ('"', "«", "»")
    min_length = 10

    root = ET.fromstring(xml_string)
    sentences = []

    # Find all <s> elements representing sentences
    for sentence in root.findall(".//default:s", ns):
        tokens = []
        for element in sentence:
            text = element.text
            if not text:
                # Empty <w/> or <pc/> has text None; joining or
                # concatenating it would raise TypeError.
                continue

            # Compare the exact local name (tag without the "{namespace}"
            # prefix) so that only <w> and <pc> match, not every tag whose
            # name merely ends in "w" or "pc".
            local_name = element.tag.rpartition("}")[2]
            if local_name == "w":
                tokens.append(text)
            elif local_name == "pc":
                if tokens:
                    # Attach punctuation directly to the previous token so
                    # no space is inserted before it.
                    tokens[-1] += text
                else:
                    tokens.append(text)

        sentence_text = " ".join(tokens)

        if len(sentence_text) < min_length:
            continue
        if any(char in sentence_text for char in excluded_chars):
            continue
        sentences.append(sentence_text)

    return sentences


# Script body: append the sentences extracted from the XML corpus to the
# tab-separated sentence file, continuing its running line numbers.
#
# Every file is opened with an explicit UTF-8 encoding so the non-ASCII text
# is read and written the same way regardless of the platform's default
# locale encoding.
with open("./jos1M-sl.body.xml", "r", encoding="utf-8") as f:
    xml_text = f.read()


# Find the last non-blank line of the existing sentence file by streaming it
# instead of loading every line into memory at once.
last_line = ""
with open(
    "./deu_mixed-typical_2011_1M-sentences.txt", "r", encoding="utf-8"
) as f:
    for line in f:
        if line.strip():
            last_line = line

# Each record is "<number>\t<sentence>". Split only on the first tab so a tab
# inside the sentence cannot break the parsing. An empty file starts at 0, so
# the first appended record is numbered 1.
number = int(last_line.split("\t", 1)[0]) if last_line else 0
print(number)


sentences = extract_sentences(xml_text)

with open(
    "./deu_mixed-typical_2011_1M-sentences.txt", "a", encoding="utf-8"
) as f:
    if last_line and not last_line.endswith("\n"):
        # The existing file lacks a final newline; terminate its last record
        # so the first appended one starts on a line of its own.
        f.write("\n")
    f.writelines(
        f"{number + i + 1}\t{sentence}\n"
        for i, sentence in enumerate(sentences)
    )
Upload datasets 2023-07-28 10:23:07 +02:00			`import xml.etree.ElementTree as ET`


			`def extract_sentences(xml_string):`
			`ns = {"default": "http://www.tei-c.org/ns/1.0"}`
			`root = ET.fromstring(xml_string)`

			`sentences = []`

			`# Find all <s> elements representing sentences`
			`for sentence in root.findall(".//default:s", ns):`
			`words_with_pcs = []`
			`for element in sentence:`
			`if element.tag.endswith(`
			`"w"`
			`): # Check if the element is a word (<w> element)`
			`words_with_pcs.append(element.text)`
			`elif element.tag.endswith(`
			`"pc"`
			`): # Check if the element is a punctuation mark (<pc> element)`
			`if (`
			`words_with_pcs`
			`): # Avoid adding space if it's the first element in the sentence`
			`words_with_pcs[-1] += element.text`
			`else:`
			`words_with_pcs.append(element.text)`

			`sentence_text = " ".join(words_with_pcs)`

			`if '"' in sentence_text or "«" in sentence_text or "»" in sentence_text:`
			`pass`
			`elif len(sentence_text) < 10:`
			`pass`
			`else:`
			`sentences.append(sentence_text)`

			`return sentences`


			`with open("./jos1M-sl.body.xml", "r") as f:`
			`xml_text = f.read()`


			`with open("./deu_mixed-typical_2011_1M-sentences.txt", "r") as f:`
			`last_line = f.readlines()[-1]`
			`number, _ = last_line.split("\t")`
			`number = int(number)`
			`print(number)`


			`sentences = extract_sentences(xml_text)`

			`with open("./deu_mixed-typical_2011_1M-sentences.txt", "a") as f:`
			`for i, sentence in enumerate(sentences):`
			`f.write(f"{number + i + 1}\t{sentence}\n")`