Fix a regression in 2.9.12 where some corrupt XML structures were handled incorrectly: https://gitlab.gnome.org/GNOME/libxml2/-/issues/255 This is an amalgamation of these upstream commits: https://gitlab.gnome.org/GNOME/libxml2/-/commit/85b1792e37b131e7a51af98a37f92472e8de5f3f https://gitlab.gnome.org/GNOME/libxml2/-/commit/13ad8736d294536da4cbcd70a96b0a2fbf47070c diff --git a/HTMLtree.c b/HTMLtree.c --- a/HTMLtree.c +++ b/HTMLtree.c @@ -744,7 +744,7 @@ void htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED, int format) { - xmlNodePtr root; + xmlNodePtr root, parent; xmlAttrPtr attr; const htmlElemDesc * info; @@ -755,6 +755,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, } root = cur; + parent = cur->parent; while (1) { switch (cur->type) { case XML_HTML_DOCUMENT_NODE: @@ -762,13 +763,25 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, if (((xmlDocPtr) cur)->intSubset != NULL) { htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); } - if (cur->children != NULL) { + /* Always validate cur->parent when descending. */ + if ((cur->parent == parent) && (cur->children != NULL)) { + parent = cur; cur = cur->children; continue; } break; case XML_ELEMENT_NODE: + /* + * Some users like lxml are known to pass nodes with a corrupted + * tree structure. Fall back to a recursive call to handle this + * case. + */ + if ((cur->parent != parent) && (cur->children != NULL)) { + htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); + break; + } + /* * Get specific HTML info for that node. */ @@ -817,6 +830,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, (cur->name != NULL) && (cur->name[0] != 'p')) /* p, pre, param */ xmlOutputBufferWriteString(buf, "\n"); + parent = cur; cur = cur->children; continue; } @@ -825,9 +839,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, (info != NULL) && (!info->isinline)) { if ((cur->next->type != HTML_TEXT_NODE) && (cur->next->type != HTML_ENTITY_REF_NODE) && - (cur->parent != NULL) && - (cur->parent->name != NULL) && - (cur->parent->name[0] != 'p')) /* p, pre, param */ + (parent != NULL) && + (parent->name != NULL) && + (parent->name[0] != 'p')) /* p, pre, param */ xmlOutputBufferWriteString(buf, "\n"); } @@ -842,9 +856,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, break; if (((cur->name == (const xmlChar *)xmlStringText) || (cur->name != (const xmlChar *)xmlStringTextNoenc)) && - ((cur->parent == NULL) || - ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && - (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { + ((parent == NULL) || + ((xmlStrcasecmp(parent->name, BAD_CAST "script")) && + (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) { xmlChar *buffer; buffer = xmlEncodeEntitiesReentrant(doc, cur->content); @@ -902,13 +916,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, break; } - /* - * The parent should never be NULL here but we want to handle - * corrupted documents gracefully. - */ - if (cur->parent == NULL) - return; - cur = cur->parent; + cur = parent; + /* cur->parent was validated when descending. */ + parent = cur->parent; if ((cur->type == XML_HTML_DOCUMENT_NODE) || (cur->type == XML_DOCUMENT_NODE)) { @@ -939,9 +949,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, (cur->next != NULL)) { if ((cur->next->type != HTML_TEXT_NODE) && (cur->next->type != HTML_ENTITY_REF_NODE) && - (cur->parent != NULL) && - (cur->parent->name != NULL) && - (cur->parent->name[0] != 'p')) /* p, pre, param */ + (parent != NULL) && + (parent->name != NULL) && + (parent->name[0] != 'p')) /* p, pre, param */ xmlOutputBufferWriteString(buf, "\n"); } } diff --git a/xmlsave.c b/xmlsave.c --- a/xmlsave.c +++ b/xmlsave.c @@ -847,7 +847,7 @@ htmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { static void xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { int format = ctxt->format; - xmlNodePtr tmp, root, unformattedNode = NULL; + xmlNodePtr tmp, root, unformattedNode = NULL, parent; xmlAttrPtr attr; xmlChar *start, *end; xmlOutputBufferPtr buf; @@ -856,6 +856,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { buf = ctxt->buf; root = cur; + parent = cur->parent; while (1) { switch (cur->type) { case XML_DOCUMENT_NODE: @@ -868,7 +869,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { break; case XML_DOCUMENT_FRAG_NODE: - if (cur->children != NULL) { + /* Always validate cur->parent when descending. */ + if ((cur->parent == parent) && (cur->children != NULL)) { + parent = cur; cur = cur->children; continue; } @@ -887,7 +890,18 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { break; case XML_ELEMENT_NODE: - if ((cur != root) && (ctxt->format == 1) && (xmlIndentTreeOutput)) + /* + * Some users like lxml are known to pass nodes with a corrupted + * tree structure. Fall back to a recursive call to handle this + * case. + */ + if ((cur->parent != parent) && (cur->children != NULL)) { + xmlNodeDumpOutputInternal(ctxt, cur); + break; + } + + if ((ctxt->level > 0) && (ctxt->format == 1) && + (xmlIndentTreeOutput)) xmlOutputBufferWrite(buf, ctxt->indent_size * (ctxt->level > ctxt->indent_nr ? ctxt->indent_nr : ctxt->level), @@ -942,6 +956,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { xmlOutputBufferWrite(buf, 1, ">"); if (ctxt->format == 1) xmlOutputBufferWrite(buf, 1, "\n"); if (ctxt->level >= 0) ctxt->level++; + parent = cur; cur = cur->children; continue; } @@ -1058,13 +1073,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { break; } - /* - * The parent should never be NULL here but we want to handle - * corrupted documents gracefully. - */ - if (cur->parent == NULL) - return; - cur = cur->parent; + cur = parent; + /* cur->parent was validated when descending. */ + parent = cur->parent; if (cur->type == XML_ELEMENT_NODE) { if (ctxt->level > 0) ctxt->level--; diff --git a/xmlsave.c b/xmlsave.c --- a/xmlsave.c +++ b/xmlsave.c @@ -890,6 +890,13 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { break; case XML_ELEMENT_NODE: + if ((cur != root) && (ctxt->format == 1) && + (xmlIndentTreeOutput)) + xmlOutputBufferWrite(buf, ctxt->indent_size * + (ctxt->level > ctxt->indent_nr ? + ctxt->indent_nr : ctxt->level), + ctxt->indent); + /* * Some users like lxml are known to pass nodes with a corrupted * tree structure. Fall back to a recursive call to handle this @@ -900,13 +907,6 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { break; } - if ((ctxt->level > 0) && (ctxt->format == 1) && - (xmlIndentTreeOutput)) - xmlOutputBufferWrite(buf, ctxt->indent_size * - (ctxt->level > ctxt->indent_nr ? - ctxt->indent_nr : ctxt->level), - ctxt->indent); - xmlOutputBufferWrite(buf, 1, "<"); if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);