fine-tune/dataset/append_slovene_corpus.py

56 lines
1.6 KiB
Python
Raw Permalink Normal View History

2023-07-28 10:23:07 +02:00
import xml.etree.ElementTree as ET
def extract_sentences(xml_string):
    """Extract plain-text sentences from a TEI-encoded XML document.

    Every ``<s>`` element in the TEI namespace becomes one string: the text
    of its direct ``<w>`` (word) children is joined with single spaces, and
    the text of each ``<pc>`` (punctuation) child is glued onto the token
    that precedes it. A ``<pc>`` that opens the sentence becomes a token of
    its own.

    A sentence is dropped when it contains a quotation mark (``"``, ``«`` or
    ``»``) or when it is shorter than 10 characters.

    Args:
        xml_string: The XML document to parse.

    Returns:
        A list of the kept sentence strings, in document order.

    Raises:
        xml.etree.ElementTree.ParseError: If the input is not well-formed
            XML.
    """
    ns = {"default": "http://www.tei-c.org/ns/1.0"}
    root = ET.fromstring(xml_string)
    sentences = []
    # Find all <s> elements representing sentences
    for sentence in root.findall(".//default:s", ns):
        tokens = []
        for element in sentence:
            text = element.text
            if not text:
                # Empty elements (e.g. <w/>) have text None; appending that
                # would make the join below raise TypeError.
                continue
            # Compare the local name exactly: a suffix test on the
            # namespaced tag would also accept unrelated elements whose
            # name merely ends in "w" or "pc".
            local_name = element.tag.rsplit("}", 1)[-1]
            if local_name == "w":
                tokens.append(text)
            elif local_name == "pc":
                if tokens:
                    # Attach punctuation to the previous token so no space
                    # is inserted in front of it.
                    tokens[-1] += text
                else:
                    tokens.append(text)
        sentence_text = " ".join(tokens)
        if len(sentence_text) < 10:
            continue
        if any(mark in sentence_text for mark in ('"', "«", "»")):
            continue
        sentences.append(sentence_text)
    return sentences
# Append the sentences extracted from the Slovene corpus XML to the existing
# tab-separated "<number>\t<sentence>" file, continuing its numbering.
SLOVENE_CORPUS_PATH = "./jos1M-sl.body.xml"
SENTENCES_PATH = "./deu_mixed-typical_2011_1M-sentences.txt"

# The text is not ASCII, so pin the encoding rather than depend on the
# platform default.
# NOTE(review): assumes both files are UTF-8 — confirm against the data.
with open(SLOVENE_CORPUS_PATH, "r", encoding="utf-8") as f:
    xml_text = f.read()

# Stream the file to find its last non-blank line instead of materialising
# every line in a list; also remember whether the file ends with a newline.
last_line = ""
ends_with_newline = True
with open(SENTENCES_PATH, "r", encoding="utf-8") as f:
    for line in f:
        ends_with_newline = line.endswith("\n")
        if line.strip():
            last_line = line

# Only the text before the first tab is the running number; splitting once
# keeps this working when the sentence itself contains a tab. An empty file
# starts the numbering from zero.
number = int(last_line.split("\t", 1)[0]) if last_line else 0
print(number)

sentences = extract_sentences(xml_text)

with open(SENTENCES_PATH, "a", encoding="utf-8") as f:
    if not ends_with_newline:
        # Without this the first new record would be glued onto the last
        # existing line.
        f.write("\n")
    for next_number, sentence in enumerate(sentences, start=number + 1):
        f.write(f"{next_number}\t{sentence}\n")