# -*- coding: iso-8859-1 -*-
""" A SAX2 driver for libxml2, on top of its XmlReader API

USAGE
    # put this file (drv_libxml2.py) in PYTHONPATH
    import xml.sax
    reader = xml.sax.make_parser(["drv_libxml2"])
    # ...and the rest is standard python sax.

CAVEATS
    - Lexical handlers are supported, except for start/endEntity
      (waiting for XmlReader.ResolveEntity) and start/endDTD
    - Error callbacks are not exactly synchronous, they tend
      to be invoked before the corresponding content callback,
      because the underlying reader interface parses
      data by chunks of 512 bytes

TODO
    - search for TODO
    - some ErrorHandler events (warning)
    - some ContentHandler events (setDocumentLocator, skippedEntity)
    - EntityResolver (using libxml2.?)
    - DTDHandler (if/when libxml2 exposes such node types)
    - DeclHandler (if/when libxml2 exposes such node types)
    - property_xml_string?
    - feature_string_interning?
    - Incremental parser
    - additional performance tuning:
      - one might cache callbacks to avoid some name lookups
      - one might implement a smarter way to pass attributes to startElement
        (some kind of lazy evaluation?)
      - there might be room for improvement in start/endPrefixMapping
      - other?
"""

__author__ = "Stéphane Bidoul "
__version__ = "0.3"

import sys
import codecs

if sys.version_info[0] < 3:
    __author__ = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)
    # libxml2 returns strings as UTF8
    _decoder = codecs.lookup("utf8")[1]

    def _d(s):
        """Decode a UTF-8 byte string from libxml2; None passes through."""
        if s is None:
            return s
        else:
            return _decoder(s)[0]
else:
    StringTypes = str

    def _d(s):
        """Return s unchanged: it is already a Unicode `str`."""
        return s

from xml.sax._exceptions import *
from xml.sax import xmlreader, saxutils
from xml.sax.handler import (
    feature_namespaces,
    feature_namespace_prefixes,
    feature_string_interning,
    feature_validation,
    feature_external_ges,
    feature_external_pes,
    property_lexical_handler,
    property_declaration_handler,
    property_dom_node,
    property_xml_string,
)

try:
    import libxml2
except ImportError:
    raise SAXReaderNotAvailable("libxml2 not available: "
                                "import error was: %s" % sys.exc_info()[1])

# Node type codes returned by reader.NodeType() that the parse loop handles.
_NODE_ELEMENT = 1
_NODE_TEXT = 3
_NODE_CDATA = 4
_NODE_ENTITY_REFERENCE = 5
_NODE_ENTITY = 6
_NODE_PROCESSING_INSTRUCTION = 7
_NODE_COMMENT = 8
_NODE_DOCUMENT_TYPE = 10
_NODE_NOTATION = 12
_NODE_WHITESPACE = 13
_NODE_SIGNIFICANT_WHITESPACE = 14
_NODE_END_ELEMENT = 15
_NODE_END_ENTITY = 16
_NODE_XML_DECLARATION = 17

# Node types that are recognized but produce no SAX event yet (TODO):
# DocumentType (how to detect endDTD? on first non-dtd event?),
# XmlDeclaration, Entity (decl) and Notation (decl).
# Attribute (2), Document (9), DocumentFragment (11) and None (0) are not
# expected in the main loop and are reported as unexpected node types.
_NODES_WITHOUT_EVENT = (
    _NODE_DOCUMENT_TYPE,
    _NODE_XML_DECLARATION,
    _NODE_ENTITY,
    _NODE_NOTATION,
)


class Locator(xmlreader.Locator):
    """SAX Locator adapter for libxml2.xmlTextReaderLocator"""

    def __init__(self, locator):
        self.__locator = locator

    def getColumnNumber(self):
        "Return the column number where the current event ends."
        # always -1: no column is obtained from the wrapped locator
        return -1

    def getLineNumber(self):
        "Return the line number where the current event ends."
        return self.__locator.LineNumber()

    def getPublicId(self):
        "Return the public identifier for the current event."
        return None

    def getSystemId(self):
        "Return the system identifier for the current event."
        return self.__locator.BaseURI()


class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that drives its handlers from a libxml2 text reader."""

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Reader error callback: queue the error for _reportErrors()."""
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Dispatch the queued errors to the ErrorHandler and clear the queue.

        When `fatal` is true the parse is about to stop, and the last
        queued error is reported through fatalError().
        """
        # Clear the queue *before* dispatching: ErrorHandler methods may
        # raise (the default error()/fatalError() do), and stale entries
        # must not be replayed by a later parse() on this reader.
        errors, self.__errors = self.__errors, None
        last_exception = errors[-1][1]
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            else:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                if fatal and exception is last_exception:
                    self._err_handler.fatalError(exception)
                else:
                    self._err_handler.error(exception)

    def parse(self, source):
        """Parse `source` and emit SAX events to the registered handlers.

        `source` is either a string, which is handed to
        libxml2.newTextReaderFilename(), or anything accepted by
        saxutils.prepare_input_source().  Raises SAXException when an
        unexpected node type is met; whatever the handlers raise
        propagates to the caller.
        """
        self.__parsing = 1
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE,
                                     self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break  # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break  # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                if nodeType == _NODE_ELEMENT:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # Only "xmlns" and "xmlns:prefix" declare a
                            # namespace; an ordinary attribute whose name
                            # merely begins with "xmlns" does not.
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue  # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element,
                            # so close it (and its mappings) right away
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                elif nodeType == _NODE_END_ELEMENT:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()),
                             _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                elif nodeType == _NODE_TEXT:
                    self._cont_handler.characters(_d(reader.Value()))
                elif nodeType == _NODE_WHITESPACE:
                    self._cont_handler.ignorableWhitespace(
                        _d(reader.Value()))
                elif nodeType == _NODE_SIGNIFICANT_WHITESPACE:
                    self._cont_handler.characters(_d(reader.Value()))
                elif nodeType == _NODE_CDATA:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                elif nodeType == _NODE_ENTITY_REFERENCE:
                    # entity events belong to the lexical handler; this
                    # reader itself has no startEntity/endEntity methods
                    if self.__lex_handler is not None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says start/endEntity
                    # are "waiting for XmlReader.ResolveEntity" -- confirm
                    # that the libxml2 reader provides this method.
                    reader.ResolveEntity()
                elif nodeType == _NODE_END_ENTITY:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                elif nodeType == _NODE_PROCESSING_INSTRUCTION:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                elif nodeType == _NODE_COMMENT:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                elif nodeType in _NODES_WITHOUT_EVENT:
                    pass  # TODO
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            # close the reader even when a handler raised
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the state of feature `name`.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set feature `name` to `state`.

        Raises SAXNotSupportedException while a parse is in progress or
        when feature_external_ges is set to 0, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException(
                    "Feature '%s' not supported" % name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of property `name`.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException(
                "Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set property `name` to `value`.

        Only the lexical handler can be set.  The declaration handler
        raises SAXNotSupportedException and any other property raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: if/when libxml2 supports dtd events, store the handler
            # in self.__decl_handler instead of raising
            raise SAXNotSupportedException(
                "Property '%s' not supported" % name)
        else:
            raise SAXNotRecognizedException(
                "Property '%s' not recognized" % name)


def create_parser():
    """Entry point used by xml.sax.make_parser(): return a new reader."""
    return LibXml2Reader()