# -*- coding: iso-8859-1 -*-
# Recovered from the patch "Initial/Final commit"
# (35a201cc8ef0c3f5b2df88d2e528aabee1048348), which adds this file as
# libxml2-2.9.10/python/drv_libxml2.py.
""" A SAX2 driver for libxml2, on top of its XmlReader API

USAGE
    # put this file (drv_libxml2.py) in PYTHONPATH
    import xml.sax
    reader = xml.sax.make_parser(["drv_libxml2"])
    # ...and the rest is standard python sax.

CAVEATS
    - Lexical handlers are supported, except for start/endEntity
      (waiting for XmlReader.ResolveEntity) and start/endDTD
    - Error callbacks are not exactly synchronous, they tend
      to be invoked before the corresponding content callback,
      because the underlying reader interface parses
      data by chunks of 512 bytes

TODO
    - search for TODO
    - some ErrorHandler events (warning)
    - some ContentHandler events (setDocumentLocator, skippedEntity)
    - EntityResolver (using libxml2.?)
    - DTDHandler (if/when libxml2 exposes such node types)
    - DeclHandler (if/when libxml2 exposes such node types)
    - property_xml_string?
    - feature_string_interning?
    - Incremental parser
    - additional performance tuning:
      - one might cache callbacks to avoid some name lookups
      - one might implement a smarter way to pass attributes to startElement
        (some kind of lazy evaluation?)
      - there might be room for improvement in start/endPrefixMapping
      - other?

"""

# The accented letter is spelled as an escape so that the value of the
# string does not depend on how the bytes of this file are encoded.
__author__ = "St\xe9phane Bidoul "
__version__ = "0.3"

import sys
import codecs

if sys.version_info[0] < 3:
    # Python 2: turn the byte string into a unicode object.
    __author__ = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)
    # libxml2 returns strings as UTF8
    _decoder = codecs.lookup("utf8")[1]

    def _d(s):
        """Decode a UTF-8 byte string from libxml2; None passes through."""
        if s is None:
            return s
        return _decoder(s)[0]
else:
    StringTypes = str

    def _d(s):
        """Return s unchanged: it is a Unicode `str` already."""
        return s

# The star import is kept on purpose: it provides the SAX exception classes
# used below and is part of what this module has always exposed.
from xml.sax._exceptions import *
from xml.sax import xmlreader, saxutils
from xml.sax.handler import (
    feature_namespaces,
    feature_namespace_prefixes,
    feature_string_interning,
    feature_validation,
    feature_external_ges,
    feature_external_pes,
    property_lexical_handler,
    property_declaration_handler,
    property_dom_node,
    property_xml_string,
)

try:
    import libxml2
except ImportError:
    raise SAXReaderNotAvailable("libxml2 not available: " \
                                "import error was: %s" % sys.exc_info()[1])

# Node types returned by reader.NodeType() that the parse loop dispatches on.
# Attribute (2), Document (9), DocumentFragment (11) and None (0) are not
# expected inside the loop and are reported as unexpected node types.
_NODE_ELEMENT = 1
_NODE_TEXT = 3
_NODE_CDATA = 4
_NODE_ENTITY_REFERENCE = 5
_NODE_ENTITY = 6
_NODE_PROCESSING_INSTRUCTION = 7
_NODE_COMMENT = 8
_NODE_DOCUMENT_TYPE = 10
_NODE_NOTATION = 12
_NODE_WHITESPACE = 13
_NODE_SIGNIFICANT_WHITESPACE = 14
_NODE_END_ELEMENT = 15
_NODE_END_ENTITY = 16
_NODE_XML_DECLARATION = 17


class Locator(xmlreader.Locator):
    """SAX Locator adapter for libxml2.xmlTextReaderLocator"""

    def __init__(self, locator):
        self.__locator = locator

    def getColumnNumber(self):
        "Return the column number where the current event ends."
        # The wrapped locator is only queried for line number and base URI,
        # so the column is always reported as unknown.
        return -1

    def getLineNumber(self):
        "Return the line number where the current event ends."
        return self.__locator.LineNumber()

    def getPublicId(self):
        "Return the public identifier for the current event."
        return None

    def getSystemId(self):
        "Return the system identifier for the current event."
        return self.__locator.BaseURI()


class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls nodes from a libxml2 text reader and
    forwards them to the registered SAX handlers."""

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Callback registered with the libxml2 reader.

        Errors are only recorded here, as (severity, SAXParseException)
        pairs; they are delivered to the SAX error handler later by
        _reportErrors().
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Deliver the accumulated errors to the SAX error handler.

        Warnings go to warning(). When `fatal` is true the parse is about
        to stop, and the last recorded error is considered to be the fatal
        one; every other error goes to error().
        """
        # Detach the list before dispatching: an error handler is allowed
        # to raise (the default fatalError does), and errors that were
        # already handed over must not be reported again by a later parse.
        errors, self.__errors = self.__errors, None
        if not errors:
            return
        last = errors[-1][1]
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            elif fatal and exception is last:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def parse(self, source):
        """Parse `source` and emit SAX events to the registered handlers.

        `source` is either a string, handed to libxml2 as a file name, or
        anything accepted by saxutils.prepare_input_source().
        """
        self.__parsing = 1
        # start from a clean slate, whatever happened in a previous parse
        self.__errors = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                # kept in a local so that the buffer lives as long as
                # the parse does
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            try:
                self._configureReader(reader)
                self._readDocument(reader)
            finally:
                # close the reader even when a handler raised
                reader.Close()
        finally:
            self.__parsing = 0

    def _configureReader(self, reader):
        """Install the error callback and apply the feature settings."""
        reader.SetErrorHandler(self._errorHandler, None)
        if self.__extparams:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
            reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
            reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
            reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
        else:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)

    def _readDocument(self, reader):
        """Run the read loop, turning each reader node into SAX events."""
        # we reuse attribute maps (for a slight performance gain)
        if self.__ns:
            attributes = xmlreader.AttributesNSImpl({}, {})
        else:
            attributes = xmlreader.AttributesImpl({})
        # prefixes to pop (for endPrefixMapping), one list per open element
        prefixes = []
        self._cont_handler.startDocument()
        while 1:
            r = reader.Read()
            # check for errors
            if r == 1:
                if self.__errors is not None:
                    self._reportErrors(0)
            elif r == 0:
                if self.__errors is not None:
                    self._reportErrors(0)
                break  # end of parse
            else:
                if self.__errors is not None:
                    self._reportErrors(1)
                else:
                    self._err_handler.fatalError(
                        SAXException("Read failed (no details available)"))
                break  # fatal parse error
            nodeType = reader.NodeType()
            if nodeType == _NODE_ELEMENT:
                if self.__ns:
                    self._emitElementNS(reader, attributes, prefixes)
                else:
                    self._emitElement(reader, attributes)
            elif nodeType == _NODE_END_ELEMENT:
                if self.__ns:
                    self._cont_handler.endElementNS(
                        (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                        _d(reader.Name()))
                    for prefix in prefixes.pop():
                        self._cont_handler.endPrefixMapping(prefix)
                else:
                    self._cont_handler.endElement(_d(reader.Name()))
            elif nodeType == _NODE_TEXT:
                self._cont_handler.characters(_d(reader.Value()))
            elif nodeType == _NODE_WHITESPACE:
                self._cont_handler.ignorableWhitespace(_d(reader.Value()))
            elif nodeType == _NODE_SIGNIFICANT_WHITESPACE:
                self._cont_handler.characters(_d(reader.Value()))
            elif nodeType == _NODE_CDATA:
                if self.__lex_handler is not None:
                    self.__lex_handler.startCDATA()
                self._cont_handler.characters(_d(reader.Value()))
                if self.__lex_handler is not None:
                    self.__lex_handler.endCDATA()
            elif nodeType == _NODE_ENTITY_REFERENCE:
                # Entity boundaries are lexical events: they belong to the
                # lexical handler, this reader has no startEntity method.
                if self.__lex_handler is not None:
                    self.__lex_handler.startEntity(_d(reader.Name()))
                # NOTE(review): the module docstring says entity support is
                # waiting for XmlReader.ResolveEntity -- confirm that the
                # installed binding provides this method.
                reader.ResolveEntity()
            elif nodeType == _NODE_END_ENTITY:
                if self.__lex_handler is not None:
                    self.__lex_handler.endEntity(_d(reader.Name()))
            elif nodeType == _NODE_PROCESSING_INSTRUCTION:
                self._cont_handler.processingInstruction(
                    _d(reader.Name()), _d(reader.Value()))
            elif nodeType == _NODE_COMMENT:
                if self.__lex_handler is not None:
                    self.__lex_handler.comment(_d(reader.Value()))
            elif nodeType == _NODE_DOCUMENT_TYPE:
                pass  # TODO (how to detect endDTD? on first non-dtd event?)
            elif nodeType == _NODE_XML_DECLARATION:
                pass  # TODO
            elif nodeType == _NODE_ENTITY:
                pass  # TODO (entity decl)
            elif nodeType == _NODE_NOTATION:
                pass  # TODO
            else:
                raise SAXException("Unexpected node type %d" % nodeType)
        if r == 0:
            self._cont_handler.endDocument()

    def _emitElementNS(self, reader, attributes, prefixes):
        """Emit the namespace-aware events for the current element node.

        `attributes` is the reusable AttributesNSImpl; its maps are
        replaced for every element. `prefixes` is the stack of prefix
        lists that the matching end tag will pop.
        """
        eltName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
        eltQName = _d(reader.Name())
        attributes._attrs = attrs = {}
        attributes._qnames = qnames = {}
        newPrefixes = []
        while reader.MoveToNextAttribute():
            qname = _d(reader.Name())
            value = _d(reader.Value())
            # Only "xmlns" and "xmlns:prefix" declare a namespace; an
            # attribute whose name merely begins with "xmlns" does not.
            if qname == "xmlns" or qname.startswith("xmlns:"):
                if len(qname) > 5:
                    newPrefix = qname[6:]
                else:
                    newPrefix = None
                newPrefixes.append(newPrefix)
                self._cont_handler.startPrefixMapping(newPrefix, value)
                if not self.__nspfx:
                    continue  # don't report xmlns attribute
            attName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
            qnames[attName] = qname
            attrs[attName] = value
        reader.MoveToElement()
        self._cont_handler.startElementNS(eltName, eltQName, attributes)
        if reader.IsEmptyElement():
            # no EndElement node will follow, so close the element here
            self._cont_handler.endElementNS(eltName, eltQName)
            for newPrefix in newPrefixes:
                self._cont_handler.endPrefixMapping(newPrefix)
        else:
            prefixes.append(newPrefixes)

    def _emitElement(self, reader, attributes):
        """Emit the plain (non namespace-aware) events for the current
        element node, reusing the given AttributesImpl."""
        eltName = _d(reader.Name())
        attributes._attrs = attrs = {}
        while reader.MoveToNextAttribute():
            attName = _d(reader.Name())
            attrs[attName] = _d(reader.Value())
        reader.MoveToElement()
        self._cont_handler.startElement(eltName, attributes)
        if reader.IsEmptyElement():
            # no EndElement node will follow, so close the element here
            self._cont_handler.endElement(eltName)

    def setDTDHandler(self, handler):
        """Always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the state of feature `name`.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def setFeature(self, name, state):
        """Set feature `name` to `state`.

        Raises SAXNotSupportedException while a parse is running, or when
        asked to switch external general entities off, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s " \
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" % \
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def getProperty(self, name):
        """Return the lexical or declaration handler.

        Raises SAXNotRecognizedException for any other property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)

    def setProperty(self, name, value):
        """Set the lexical handler.

        Raises SAXNotSupportedException for the declaration handler and
        SAXNotRecognizedException for any other property.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler instead if/when libxml2 supports
            # dtd events
            raise SAXNotSupportedException("Property '%s' not supported" % \
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)


def create_parser():
    """Entry point looked up by xml.sax.make_parser()."""
    return LibXml2Reader()