#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Parser for vocab.xml word lists.

A word list is a tree of <section> nodes. Each section may hold <meta>
key/value pairs, <vocab> entries and nested <section> nodes. Parsing
problems in the document structure are reported as AssertionError.
"""

import sys
import string  # unused by this module now; kept so the import set is unchanged
from xml.etree import ElementTree

# tags expected in vocab.xml
NODE_SECTION = "section"
SECTION_ATTR_INDEX = "index"
SECTION_ATTR_TITLE = "title"

NODE_META = "meta"
META_ATTR_KEY = "key"
META_ATTR_VALUE = "value"

NODE_VOCAB = "vocab"
VOCAB_ATTR_ENGLISH = "english"
VOCAB_ATTR_JAPANESE = "japanese"
VOCAB_ATTR_TEXT = "text"


def _readMeta(metaNode, mapping):
    # Stores the key/value attributes of a <meta> node in the given dict.
    mapping[metaNode.get(META_ATTR_KEY)] = metaNode.get(META_ATTR_VALUE)


def _formatEntries(entries):
    # Renders a single entry as its text and any other count as "(a, b)".
    if len(entries) == 1:
        return entries[0].text
    return "(" + ", ".join([entry.text for entry in entries]) + ")"


class Vocab:
    """
    Matching of English and Japanese words. There can be multiple translations
    on either side of the mapping. For instance "すみません" means both
    "excuse me" and "I'm sorry" while "はい" and "ええ" both mean "yes". Fields
    include the following:
      english  - list of English meanings
      japanese - list of Japanese meanings
      meta     - metadata mappings

    Both English and Japanese vocab are in a VocabEntry wrapper with two
    fields:
      text - string of the word
      meta - metadata mappings
    """

    def __init__(self, vocabNode):
        """
        Builds the mapping from a <vocab> xml node.

        Translations can be given either as 'english' / 'japanese' attributes
        on the node itself or as <english> / <japanese> child nodes (or both).

        Raises AssertionError if the node isn't a vocab node, contains an
        unrecognized child, or lacks a translation for either language.
        """

        # Explicit raises rather than assert statements so that validation
        # isn't stripped when running with "python -O".
        if vocabNode.tag != NODE_VOCAB:
            raise AssertionError("invalid node: %s" % (vocabNode.tag,))

        self.english = []
        self.japanese = []
        self.meta = {}

        # tries to retrieve mapping as an attribute
        englishAttr = vocabNode.get(VOCAB_ATTR_ENGLISH)
        if englishAttr:
            self.english.append(Vocab.VocabEntry(englishAttr, {}))

        japaneseAttr = vocabNode.get(VOCAB_ATTR_JAPANESE)
        if japaneseAttr:
            self.japanese.append(Vocab.VocabEntry(japaneseAttr, {}))

        for child in vocabNode:
            if child.tag == NODE_META:
                _readMeta(child, self.meta)
            elif child.tag == VOCAB_ATTR_ENGLISH:
                self.english.append(Vocab.VocabEntry.parseVocabEntryNode(child))
            elif child.tag == VOCAB_ATTR_JAPANESE:
                self.japanese.append(Vocab.VocabEntry.parseVocabEntryNode(child))
            else:
                message = "unrecognized node in vocab entry: %s" % (child.tag,)
                raise AssertionError(message)

        # Fails if vocab not defined for either language
        if len(self.english) == 0:
            raise AssertionError("no English translation provided for a vocab entry")
        if len(self.japanese) == 0:
            raise AssertionError("no Japanese translation provided for a vocab entry")

    def isSimple(self):
        """
        True if this represents a one-to-one mapping of English and Japanese
        without any defined metadata.
        """

        return len(self.meta) == 0 \
            and len(self.english) == 1 and len(self.english[0].meta) == 0 \
            and len(self.japanese) == 1 and len(self.japanese[0].meta) == 0

    def __str__(self):
        """
        Provides "english ↔ japanese". A side with several translations is
        shown as a parenthesized, comma separated list.
        """

        return _formatEntries(self.english) + " ↔ " + _formatEntries(self.japanese)

    class VocabEntry:
        """
        Individual English or Japanese word or phrase.
        """

        def __init__(self, text, metadata):
            self.text = text
            self.meta = metadata

        # factory constructor for VocabEntry from xml node
        @staticmethod
        def parseVocabEntryNode(vocabEntryNode):
            """
            Builds a VocabEntry from an <english> or <japanese> node, taking
            the word from its 'text' attribute and metadata from <meta>
            children.

            Raises AssertionError if the node has any other tag or contains
            a child that isn't a <meta> node.
            """

            if vocabEntryNode.tag not in (VOCAB_ATTR_ENGLISH, VOCAB_ATTR_JAPANESE):
                raise AssertionError("invalid node: %s" % (vocabEntryNode.tag,))

            text = vocabEntryNode.get(VOCAB_ATTR_TEXT)
            meta = {}

            for child in vocabEntryNode:
                if child.tag == NODE_META:
                    _readMeta(child, meta)
                else:
                    message = "unrecognized node in vocab entry: %s" % (child.tag,)
                    raise AssertionError(message)

            return Vocab.VocabEntry(text, meta)


class Section:
    """
    Collection of vocab and subsections related by topic, chapter, etc with
    the following fields:
      vocab    - list of vocabulary in this section
      sections - subsections with additional vocab, sorted by index
      title    - label for section (None if the attribute is missing)
      index    - numeric index for sorting, -1 if not defined
      meta     - metadata mappings
    """

    def __init__(self, sectionNode):
        """
        Recursively builds the section from a <section> xml node.

        Raises AssertionError if the node isn't a section node, its index
        isn't an integer, or it contains an unrecognized child.
        """

        if sectionNode.tag != NODE_SECTION:
            raise AssertionError("invalid node: %s" % (sectionNode.tag,))

        indexAttr = sectionNode.get(SECTION_ATTR_INDEX)
        if indexAttr:
            try:
                self.index = int(indexAttr)
            except ValueError:
                raise AssertionError("section indices must be numbers")
        else:
            self.index = -1

        self.title = sectionNode.get(SECTION_ATTR_TITLE)
        self.sections = []
        self.meta = {}
        self.vocab = []

        for child in sectionNode:
            if child.tag == NODE_SECTION:
                self.sections.append(Section(child))
            elif child.tag == NODE_META:
                _readMeta(child, self.meta)
            elif child.tag == NODE_VOCAB:
                self.vocab.append(Vocab(child))
            else:
                # formatted rather than concatenated since title may be None
                message = "unrecognized node in section '%s': %s" % (self.title, child.tag)
                raise AssertionError(message)

        # An explicit key is used because a bare sort() depends on __cmp__,
        # which Python 3 ignores. The sort is stable, so subsections sharing
        # an index keep their document order.
        self.sections.sort(key=lambda section: section.index)

    def __str__(self):
        """
        Provides "<index>. <title>" followed by one indented line per vocab
        entry, then each subsection's text indented and preceded by a blank
        line.
        """

        title = self.title if self.title is not None else ""
        rep = [str(self.index), ". ", title]
        indentation = " "

        for vocab in self.vocab:
            rep.append("\n")
            rep.append(indentation)
            rep.append(str(vocab))

        # appends subsection text indented
        for subsection in self.sections:
            rep.append("\n")  # blank line

            for line in str(subsection).split("\n"):
                rep.append("\n")
                rep.append(indentation)  # indentation
                rep.append(line)

        return "".join(rep)

    def __lt__(self, other):
        # Orders sections by index so that sorted() / list.sort() work.
        return self.index < other.index

    def __cmp__(self, other):
        # Three-way comparison by index (-1, 0 or 1). Written without the
        # cmp() builtin, which no longer exists in Python 3.
        return (self.index > other.index) - (self.index < other.index)


def loadVocab(path):
    """
    Parses vocabulary from a properly formatted xml file, returning the
    Section for the document's root node.

    This throws an AssertionError if the document doesn't have the expected
    structure. Errors raised by ElementTree itself (unreadable file, malformed
    xml) are not caught here.
    """

    # Skipping DTD validation because no internal modules exist and a
    # simple sanity check isn't worth complicating installation. Discussion
    # of options can be found at:
    # http://www.programmingtalk.com/archive/index.php/%20%3C/t-30347.html

    try:
        rootNode = ElementTree.parse(path)
        return Section(rootNode.getroot())
    except AssertionError as exc:
        message = "unable to parse wordlist - " + str(exc)
        raise AssertionError(message)


# Exercises basic functionality by parsing and printing a vocab file
if __name__ == '__main__':
    if len(sys.argv) == 1:
        print("Usage: vocabParser.py <vocab file>")
        sys.exit()

    vocabFilePath = sys.argv[1]
    wordlist = loadVocab(vocabFilePath)

    for section in wordlist.sections:
        print(str(section))