/* * HTMLtree.c : implementation of access function for an HTML tree. * * See Copyright for the status of this software. * * daniel@veillard.com */ #define IN_LIBXML #include "libxml.h" #ifdef LIBXML_HTML_ENABLED #include /* for memset() only ! */ #ifdef HAVE_CTYPE_H #include #endif #ifdef HAVE_STDLIB_H #include #endif #include #include #include #include #include #include #include #include #include #include "buf.h" /************************************************************************ * * * Getting/Setting encoding meta tags * * * ************************************************************************/ /** * htmlGetMetaEncoding: * @doc: the document * * Encoding definition lookup in the Meta tags * * Returns the current encoding as flagged in the HTML source */ const xmlChar * htmlGetMetaEncoding(htmlDocPtr doc) { htmlNodePtr cur; const xmlChar *content; const xmlChar *encoding; if (doc == NULL) return(NULL); cur = doc->children; /* * Search the html */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrEqual(cur->name, BAD_CAST"html")) break; if (xmlStrEqual(cur->name, BAD_CAST"head")) goto found_head; if (xmlStrEqual(cur->name, BAD_CAST"meta")) goto found_meta; } cur = cur->next; } if (cur == NULL) return(NULL); cur = cur->children; /* * Search the head */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrEqual(cur->name, BAD_CAST"head")) break; if (xmlStrEqual(cur->name, BAD_CAST"meta")) goto found_meta; } cur = cur->next; } if (cur == NULL) return(NULL); found_head: cur = cur->children; /* * Search the meta elements */ found_meta: while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrEqual(cur->name, BAD_CAST"meta")) { xmlAttrPtr attr = cur->properties; int http; const xmlChar *value; content = NULL; http = 0; while (attr != NULL) { if ((attr->children != NULL) && (attr->children->type == XML_TEXT_NODE) && (attr->children->next == NULL)) { value = attr->children->content; if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) http = 1; else if ((value != NULL) && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) content = value; if ((http != 0) && (content != NULL)) goto found_content; } attr = attr->next; } } } cur = cur->next; } return(NULL); found_content: encoding = xmlStrstr(content, BAD_CAST"charset="); if (encoding == NULL) encoding = xmlStrstr(content, BAD_CAST"Charset="); if (encoding == NULL) encoding = xmlStrstr(content, BAD_CAST"CHARSET="); if (encoding != NULL) { encoding += 8; } else { encoding = xmlStrstr(content, BAD_CAST"charset ="); if (encoding == NULL) encoding = xmlStrstr(content, BAD_CAST"Charset ="); if (encoding == NULL) encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); if (encoding != NULL) encoding += 9; } if (encoding != NULL) { while ((*encoding == ' ') || (*encoding == '\t')) encoding++; } return(encoding); } /** * htmlSetMetaEncoding: * @doc: the document * @encoding: the encoding string * * Sets the current encoding in the Meta tags * NOTE: this will not change the document content encoding, just * the META flag associated. * * Returns 0 in case of success and -1 in case of error */ int htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { htmlNodePtr cur, meta = NULL, head = NULL; const xmlChar *content = NULL; char newcontent[100]; newcontent[0] = 0; if (doc == NULL) return(-1); /* html isn't a real encoding it's just libxml2 way to get entities */ if (!xmlStrcasecmp(encoding, BAD_CAST "html")) return(-1); if (encoding != NULL) { snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", (char *)encoding); newcontent[sizeof(newcontent) - 1] = 0; } cur = doc->children; /* * Search the html */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) break; if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) goto found_head; if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) goto found_meta; } cur = cur->next; } if (cur == NULL) return(-1); cur = cur->children; /* * Search the head */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) break; if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { head = cur->parent; goto found_meta; } } cur = cur->next; } if (cur == NULL) return(-1); found_head: head = cur; if (cur->children == NULL) goto create; cur = cur->children; found_meta: /* * Search and update all the remaining the meta elements carrying * encoding informations */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { xmlAttrPtr attr = cur->properties; int http; const xmlChar *value; content = NULL; http = 0; while (attr != NULL) { if ((attr->children != NULL) && (attr->children->type == XML_TEXT_NODE) && (attr->children->next == NULL)) { value = attr->children->content; if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) http = 1; else { if ((value != NULL) && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) content = value; } if ((http != 0) && (content != NULL)) break; } attr = attr->next; } if ((http != 0) && (content != NULL)) { meta = cur; break; } } } cur = cur->next; } create: if (meta == NULL) { if ((encoding != NULL) && (head != NULL)) { /* * Create a new Meta element with the right attributes */ meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); if (head->children == NULL) xmlAddChild(head, meta); else xmlAddPrevSibling(head->children, meta); xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); } } else { /* remove the meta tag if NULL is passed */ if (encoding == NULL) { xmlUnlinkNode(meta); xmlFreeNode(meta); } /* change the document only if there is a real encoding change */ else if (xmlStrcasestr(content, encoding) == NULL) { xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent); } } return(0); } /** * booleanHTMLAttrs: * * These are the HTML attributes which will be output * in minimized form, i.e.