import xml.etree.ElementTree as ET

# Namespace map used to look up sentence (<s>) elements.
_TEI_NS = {"default": "http://www.tei-c.org/ns/1.0"}

# Sentences containing any of these quotation marks are discarded.
_QUOTE_MARKS = ('"', "«", "»")

# Sentences shorter than this many characters are discarded.
_MIN_SENTENCE_LENGTH = 10

_XML_PATH = "./jos1M-sl.body.xml"
_SENTENCES_PATH = "./deu_mixed-typical_2011_1M-sentences.txt"


def _local_name(tag):
    """Return *tag* without its ``{namespace}`` prefix, if it has one."""
    return tag.rsplit("}", 1)[-1]


def extract_sentences(xml_string):
    """Extract plain-text sentences from an XML document.

    Every ``<s>`` element in the TEI namespace is turned into one string:
    the text of its direct ``<w>`` children is joined with single spaces,
    and the text of each direct ``<pc>`` child is glued onto the preceding
    token (or starts the sentence when there is no preceding token).
    Other children are ignored.

    A sentence is dropped when it is shorter than 10 characters or when it
    contains ``"``, ``«`` or ``»``.

    Args:
        xml_string: The XML document to parse.

    Returns:
        A list of sentence strings in document order.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_string* is not
            well-formed XML.
    """
    root = ET.fromstring(xml_string)
    sentences = []

    for sentence in root.findall(".//default:s", _TEI_NS):
        tokens = []
        for element in sentence:
            text = element.text
            if not text:
                # Empty elements have text None; joining or concatenating
                # None would raise TypeError, so skip them.
                continue

            # Compare the exact local name: a suffix test would also match
            # unrelated tags that merely end in "w" or "pc".
            kind = _local_name(element.tag)
            if kind == "w":
                tokens.append(text)
            elif kind == "pc":
                if tokens:
                    # Punctuation attaches to the previous token, no space.
                    tokens[-1] += text
                else:
                    tokens.append(text)

        sentence_text = " ".join(tokens)
        if len(sentence_text) < _MIN_SENTENCE_LENGTH:
            continue
        if any(mark in sentence_text for mark in _QUOTE_MARKS):
            continue
        sentences.append(sentence_text)

    return sentences


def _last_sentence_number(path):
    """Return the number that starts the last non-blank line of *path*.

    Lines are expected to look like ``<number>\\t<sentence>``. Returns 0
    when the file has no non-blank line, so numbering starts at 1.
    """
    last_line = ""
    # Stream the file instead of loading every line into memory.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                last_line = line

    if not last_line:
        return 0
    # Split only on the first tab so a tab inside the sentence is harmless.
    return int(last_line.split("\t", 1)[0])


def main():
    """Append the sentences extracted from the XML file to the sentence list.

    New lines continue the numbering found on the last line of the
    sentence file.
    """
    with open(_XML_PATH, "r", encoding="utf-8") as f:
        xml_text = f.read()

    number = _last_sentence_number(_SENTENCES_PATH)
    print(number)

    sentences = extract_sentences(xml_text)

    with open(_SENTENCES_PATH, "a", encoding="utf-8") as f:
        f.writelines(
            f"{index}\t{sentence}\n"
            for index, sentence in enumerate(sentences, start=number + 1)
        )


if __name__ == "__main__":
    main()