56 lines
1.6 KiB
Python
56 lines
1.6 KiB
Python
import xml.etree.ElementTree as ET
|
|
|
|
|
|
def extract_sentences(xml_string):
    """Extract plain-text sentences from a TEI-encoded XML document.

    Every ``<s>`` element in the TEI namespace is turned into one string.
    The text of its direct ``<w>`` children is joined with single spaces,
    and the text of each ``<pc>`` child is glued onto the preceding token
    without a space (a ``<pc>`` that opens the sentence becomes its own
    token).

    Sentences are dropped when they contain a double-quote character
    (``"``, ``«`` or ``»``) or are shorter than 10 characters.

    Args:
        xml_string: The XML document, as accepted by ``ET.fromstring``.

    Returns:
        A list of the sentences that passed the filters, in document order.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_string`` is not
            well-formed XML.
    """
    ns = {"default": "http://www.tei-c.org/ns/1.0"}
    root = ET.fromstring(xml_string)

    sentences = []

    # Find all <s> elements representing sentences
    for sentence in root.findall(".//default:s", ns):
        words_with_pcs = []
        for element in sentence:
            # Tags look like "{namespace}w"; compare the exact local name so
            # that unrelated tags merely ending in "w" or "pc" are ignored.
            local_name = element.tag.rsplit("}", 1)[-1]
            if local_name not in ("w", "pc"):
                continue

            # Empty elements (<w/>, <pc/>) have text None; skipping them
            # avoids a TypeError when concatenating or joining below.
            text = element.text
            if not text:
                continue

            if local_name == "pc" and words_with_pcs:
                # Attach punctuation to the previous token, with no space.
                words_with_pcs[-1] += text
            else:
                words_with_pcs.append(text)

        sentence_text = " ".join(words_with_pcs)

        # Skip sentences that contain quotation marks.
        if any(quote in sentence_text for quote in ('"', "«", "»")):
            continue
        # Skip very short sentences.
        if len(sentence_text) < 10:
            continue

        sentences.append(sentence_text)

    return sentences
|
|
|
|
|
|
# Script: extract sentences from the XML corpus file and append them to the
# numbered sentence list, continuing that file's numbering. Each record in
# the sentence list has the form "<number>\t<sentence>".

# NOTE(review): both files are assumed to be UTF-8 - confirm. The encoding is
# passed explicitly so the result does not depend on the platform locale.
with open("./jos1M-sl.body.xml", "r", encoding="utf-8") as f:
    xml_text = f.read()

# Find the number of the last existing record. The file is streamed line by
# line instead of being loaded whole with readlines().
last_line = ""
ends_with_newline = True
with open("./deu_mixed-typical_2011_1M-sentences.txt", "r", encoding="utf-8") as f:
    for line in f:
        ends_with_newline = line.endswith("\n")
        if line.strip():
            # Remember the last non-blank line; trailing blank lines would
            # otherwise break the number parsing below.
            last_line = line

if last_line:
    # Split on the first tab only, so a tab inside the sentence text cannot
    # break the parsing.
    number = int(last_line.split("\t", 1)[0])
else:
    # Empty sentence file: numbering starts at 1.
    number = 0
print(number)

sentences = extract_sentences(xml_text)

with open("./deu_mixed-typical_2011_1M-sentences.txt", "a", encoding="utf-8") as f:
    if not ends_with_newline:
        # Terminate the existing last line so that the first appended record
        # is not glued onto it.
        f.write("\n")
    for index, sentence in enumerate(sentences, start=number + 1):
        f.write(f"{index}\t{sentence}\n")
|