From 35a201cc8ef0c3f5b2df88d2e528aabee1048348 Mon Sep 17 00:00:00 2001 From: Wojtek Kosior Date: Fri, 30 Apr 2021 18:47:09 +0200 Subject: Initial/Final commit --- libxml2-2.9.10/doc/devhelp/libxml2-HTMLparser.html | 373 +++++++++++++++++++++ 1 file changed, 373 insertions(+) create mode 100644 libxml2-2.9.10/doc/devhelp/libxml2-HTMLparser.html (limited to 'libxml2-2.9.10/doc/devhelp/libxml2-HTMLparser.html') diff --git a/libxml2-2.9.10/doc/devhelp/libxml2-HTMLparser.html b/libxml2-2.9.10/doc/devhelp/libxml2-HTMLparser.html new file mode 100644 index 0000000..3af5a9c --- /dev/null +++ b/libxml2-2.9.10/doc/devhelp/libxml2-HTMLparser.html @@ -0,0 +1,373 @@ + + + + + HTMLparser: interface for an HTML 4.0 non-verifying parser + + + + + + + + + + + + + + + + +

+ HTMLparser +

+

HTMLparser - interface for an HTML 4.0 non-verifying parser

+

This module implements an HTML 4.0 non-verifying parser with an API compatible with that of the XML parser. It should be able to parse "real world" HTML, even if severely broken from a specification point of view.

+

Author(s): Daniel Veillard

+
+

Synopsis

+
#define htmlDefaultSubelement(elt);
+#define htmlElementAllowedHereDesc(parent, elt);
+#define htmlRequiredAttrs(elt);
+typedef xmlParserNodeInfo htmlParserNodeInfo;
+typedef xmlParserInput htmlParserInput;
+typedef xmlParserCtxtPtr htmlParserCtxtPtr;
+typedef struct _htmlEntityDesc htmlEntityDesc;
+typedef xmlDocPtr htmlDocPtr;
+typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
+typedef enum htmlStatus;
+typedef xmlNodePtr htmlNodePtr;
+typedef htmlElemDesc * htmlElemDescPtr;
+typedef struct _htmlElemDesc htmlElemDesc;
+typedef xmlSAXHandler htmlSAXHandler;
+typedef xmlParserInputPtr htmlParserInputPtr;
+typedef enum htmlParserOption;
+typedef htmlEntityDesc * htmlEntityDescPtr;
+typedef xmlParserCtxt htmlParserCtxt;
+int	htmlIsScriptAttribute		(const xmlChar * name);
+int	htmlHandleOmittedElem		(int val);
+htmlDocPtr	htmlReadFd		(int fd, 
const char * URL,
const char * encoding,
int options); +htmlDocPtr htmlReadIO (xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options); +htmlDocPtr htmlParseFile (const char * filename,
const char * encoding); +htmlDocPtr htmlCtxtReadDoc (htmlParserCtxtPtr ctxt,
const xmlChar * cur,
const char * URL,
const char * encoding,
int options); +int htmlAutoCloseTag (htmlDocPtr doc,
const xmlChar * name,
htmlNodePtr elem); +int htmlParseChunk (htmlParserCtxtPtr ctxt,
const char * chunk,
int size,
int terminate); +const htmlElemDesc * htmlTagLookup (const xmlChar * tag); +htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char * buffer,
int size); +void htmlCtxtReset (htmlParserCtxtPtr ctxt); +int htmlElementAllowedHere (const htmlElemDesc * parent,
const xmlChar * elt); +htmlDocPtr htmlCtxtReadIO (htmlParserCtxtPtr ctxt,
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options); +htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax,
void * user_data,
const char * chunk,
int size,
const char * filename,
xmlCharEncoding enc); +htmlDocPtr htmlReadMemory (const char * buffer,
int size,
const char * URL,
const char * encoding,
int options); +int htmlIsAutoClosed (htmlDocPtr doc,
htmlNodePtr elem); +int htmlParseCharRef (htmlParserCtxtPtr ctxt); +htmlDocPtr htmlReadDoc (const xmlChar * cur,
const char * URL,
const char * encoding,
int options); +int htmlEncodeEntities (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen,
int quoteChar); +htmlStatus htmlNodeStatus (const htmlNodePtr node,
int legacy); +htmlStatus htmlAttrAllowed (const htmlElemDesc * elt,
const xmlChar * attr,
int legacy); +htmlDocPtr htmlSAXParseFile (const char * filename,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData); +const htmlEntityDesc * htmlParseEntityRef (htmlParserCtxtPtr ctxt,
const xmlChar ** str); +htmlStatus htmlElementStatusHere (const htmlElemDesc * parent,
const htmlElemDesc * elt); +const htmlEntityDesc * htmlEntityValueLookup (unsigned int value); +void htmlParseElement (htmlParserCtxtPtr ctxt); +int UTF8ToHtml (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen); +const htmlEntityDesc * htmlEntityLookup (const xmlChar * name); +void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); +htmlDocPtr htmlCtxtReadMemory (htmlParserCtxtPtr ctxt,
const char * buffer,
int size,
const char * URL,
const char * encoding,
int options); +htmlDocPtr htmlCtxtReadFd (htmlParserCtxtPtr ctxt,
int fd,
const char * URL,
const char * encoding,
int options); +htmlDocPtr htmlReadFile (const char * filename,
const char * encoding,
int options); +htmlDocPtr htmlCtxtReadFile (htmlParserCtxtPtr ctxt,
const char * filename,
const char * encoding,
int options); +int htmlParseDocument (htmlParserCtxtPtr ctxt); +htmlParserCtxtPtr htmlNewParserCtxt (void); +htmlDocPtr htmlSAXParseDoc (const xmlChar * cur,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData); +int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
int options); +htmlDocPtr htmlParseDoc (const xmlChar * cur,
const char * encoding); +
+
+
+

Description

+
+
+

Details

+
+

Macro htmlDefaultSubelement

#define htmlDefaultSubelement(elt);
+

Returns the default subelement for this element

elt:HTML element
+
+
+

Macro htmlElementAllowedHereDesc

#define htmlElementAllowedHereDesc(parent, elt);
+

Checks whether an HTML element description may be a direct child of the specified element. Returns 1 if allowed; 0 otherwise.

parent:HTML parent element
elt:HTML element
+
+
+

Macro htmlRequiredAttrs

#define htmlRequiredAttrs(elt);
+

Returns the attributes required for the specified element.

elt:HTML element
+
+
+

Typedef htmlDocPtr

xmlDocPtr htmlDocPtr;
+

+

+
+

Structure htmlElemDesc

struct _htmlElemDesc {
+    const char *	name	: The tag name
+    char	startTag	: Whether the start tag can be implied
+    char	endTag	: Whether the end tag can be implied
+    char	saveEndTag	: Whether the end tag should be saved
+    char	empty	: Is this an empty element ?
+    char	depr	: Is this a deprecated element ?
+    char	dtd	: 1: only in Loose DTD, 2: only Frameset one
+    char	isinline	: is this a block 0 or inline 1 element
+    const char *	desc	: the description (NRK Jan.2003: the fields below are new fields encapsulating HTML structure)
+    const char **	subelts	: allowed sub-elements of this element
+    const char *	defaultsubelt	: subelement for suggested auto-repair if necessary or NULL
+    const char **	attrs_opt	: Optional Attributes
+    const char **	attrs_depr	: Additional deprecated attributes
+    const char **	attrs_req	: Required attributes
+} htmlElemDesc;
+

+

+
+

Typedef htmlElemDescPtr

htmlElemDesc * htmlElemDescPtr;
+

+

+
+

Structure htmlEntityDesc

struct _htmlEntityDesc {
+    unsigned int	value	: the UNICODE value for the character
+    const char *	name	: The entity name
+    const char *	desc	: the description
+} htmlEntityDesc;
+

+

+
+

Typedef htmlEntityDescPtr

htmlEntityDesc * htmlEntityDescPtr;
+

+

+
+

Typedef htmlNodePtr

xmlNodePtr htmlNodePtr;
+

+

+
+

Typedef htmlParserCtxt

xmlParserCtxt htmlParserCtxt;
+

+

+
+

Typedef htmlParserCtxtPtr

xmlParserCtxtPtr htmlParserCtxtPtr;
+

+

+
+

Typedef htmlParserInput

xmlParserInput htmlParserInput;
+

+

+
+

Typedef htmlParserInputPtr

xmlParserInputPtr htmlParserInputPtr;
+

+

+
+

Typedef htmlParserNodeInfo

xmlParserNodeInfo htmlParserNodeInfo;
+

+

+
+

Enum htmlParserOption

enum htmlParserOption {
+    HTML_PARSE_RECOVER = 1 /* Relaxed parsing */
+    HTML_PARSE_NODEFDTD = 4 /* do not default a doctype if not found */
+    HTML_PARSE_NOERROR = 32 /* suppress error reports */
+    HTML_PARSE_NOWARNING = 64 /* suppress warning reports */
+    HTML_PARSE_PEDANTIC = 128 /* pedantic error reporting */
+    HTML_PARSE_NOBLANKS = 256 /* remove blank nodes */
+    HTML_PARSE_NONET = 2048 /* Forbid network access */
+    HTML_PARSE_NOIMPLIED = 8192 /* Do not add implied html/body... elements */
+    HTML_PARSE_COMPACT = 65536 /* compact small text nodes */
+    HTML_PARSE_IGNORE_ENC = 2097152 /*  ignore internal document encoding hint */
+};
+

+

+
+

Typedef htmlSAXHandler

xmlSAXHandler htmlSAXHandler;
+

+

+
+

Typedef htmlSAXHandlerPtr

xmlSAXHandlerPtr htmlSAXHandlerPtr;
+

+

+
+

Enum htmlStatus

enum htmlStatus {
+    HTML_NA = 0 /* something we don't check at all */
+    HTML_INVALID = 1
+    HTML_DEPRECATED = 2
+    HTML_VALID = 4
+    HTML_REQUIRED = 12 /*  VALID bit set so ( & HTML_VALID ) is TRUE */
+};
+

+

+
+ +
+

htmlAttrAllowed ()

htmlStatus	htmlAttrAllowed		(const htmlElemDesc * elt, 
const xmlChar * attr,
int legacy)
+

Checks whether an attribute is valid for an element. Has full knowledge of Required and Deprecated attributes.

+
elt:HTML element
attr:HTML attribute
legacy:whether to allow deprecated attributes
Returns:one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
+
+

htmlAutoCloseTag ()

int	htmlAutoCloseTag		(htmlDocPtr doc, 
const xmlChar * name,
htmlNodePtr elem)
+

The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks if the element or one of its children would autoclose the given tag.

+
doc:the HTML document
name:The tag name
elem:the HTML element
Returns:1 if autoclose, 0 otherwise
+
+

htmlCreateMemoryParserCtxt ()

htmlParserCtxtPtr	htmlCreateMemoryParserCtxt	(const char * buffer, 
int size)
+

Create a parser context for an HTML in-memory document.

+
buffer:a pointer to a char array
size:the size of the array
Returns:the new parser context or NULL
+
+

htmlCreatePushParserCtxt ()

htmlParserCtxtPtr	htmlCreatePushParserCtxt	(htmlSAXHandlerPtr sax, 
void * user_data,
const char * chunk,
int size,
const char * filename,
xmlCharEncoding enc)
+

Create a parser context for using the HTML parser in push mode. The value of @filename is used for fetching external entities and for error/warning reports.

+
sax:a SAX handler
user_data:The user data returned on SAX callbacks
chunk:a pointer to an array of chars
size:number of chars in the array
filename:an optional file name or URI
enc:an optional encoding
Returns:the new parser context or NULL
+
+

htmlCtxtReadDoc ()

htmlDocPtr	htmlCtxtReadDoc		(htmlParserCtxtPtr ctxt, 
const xmlChar * cur,
const char * URL,
const char * encoding,
int options)
+

Parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.

+
ctxt:an HTML parser context
cur:a pointer to a zero terminated string
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree
+
+

htmlCtxtReadFd ()

htmlDocPtr	htmlCtxtReadFd		(htmlParserCtxtPtr ctxt, 
int fd,
const char * URL,
const char * encoding,
int options)
+

Parse an HTML document from a file descriptor and build a tree. This reuses the existing @ctxt parser context.

+
ctxt:an HTML parser context
fd:an open file descriptor
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree
+
+

htmlCtxtReadFile ()

htmlDocPtr	htmlCtxtReadFile	(htmlParserCtxtPtr ctxt, 
const char * filename,
const char * encoding,
int options)
+

Parse an HTML file from the filesystem or the network. This reuses the existing @ctxt parser context.

+
ctxt:an HTML parser context
filename:a file or URL
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree
+
+

htmlCtxtReadIO ()

htmlDocPtr	htmlCtxtReadIO		(htmlParserCtxtPtr ctxt, 
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options)
+

parse an HTML document from I/O functions and source and build a tree. This reuses the existing @ctxt parser context

+
ctxt:an HTML parser context
ioread:an I/O read function
ioclose:an I/O close function
ioctx:an I/O handler
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree
+
+

htmlCtxtReadMemory ()

htmlDocPtr	htmlCtxtReadMemory	(htmlParserCtxtPtr ctxt, 
const char * buffer,
int size,
const char * URL,
const char * encoding,
int options)
+

Parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.

+
ctxt:an HTML parser context
buffer:a pointer to a char array
size:the size of the array
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree
+
+

htmlCtxtReset ()

void	htmlCtxtReset			(htmlParserCtxtPtr ctxt)
+

Reset a parser context

+
ctxt:an HTML parser context
+
+

htmlCtxtUseOptions ()

int	htmlCtxtUseOptions		(htmlParserCtxtPtr ctxt, 
int options)
+

Applies the options to the parser context

+
ctxt:an HTML parser context
options:a combination of htmlParserOption(s)
Returns:0 in case of success, the set of unknown or unimplemented options in case of error.
+
+

htmlElementAllowedHere ()

int	htmlElementAllowedHere		(const htmlElemDesc * parent, 
const xmlChar * elt)
+

Checks whether an HTML element may be a direct child of a parent element. Note - doesn't check for deprecated elements

+
parent:HTML parent element
elt:HTML element
Returns:1 if allowed; 0 otherwise.
+
+

htmlElementStatusHere ()

htmlStatus	htmlElementStatusHere	(const htmlElemDesc * parent, 
const htmlElemDesc * elt)
+

Checks whether an HTML element may be a direct child of a parent element, and if so whether it is valid or deprecated.

+
parent:HTML parent element
elt:HTML element
Returns:one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
+
+ +
+

htmlEntityLookup ()

const htmlEntityDesc *	htmlEntityLookup	(const xmlChar * name)
+

Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.

+
name:the entity name
Returns:the associated htmlEntityDescPtr if found, NULL otherwise.
+
+

htmlEntityValueLookup ()

const htmlEntityDesc *	htmlEntityValueLookup	(unsigned int value)
+

Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.

+
value:the entity's unicode value
Returns:the associated htmlEntityDescPtr if found, NULL otherwise.
+
+

htmlFreeParserCtxt ()

void	htmlFreeParserCtxt		(htmlParserCtxtPtr ctxt)
+

Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.

+
ctxt:an HTML parser context
+
+ +
+

htmlIsAutoClosed ()

int	htmlIsAutoClosed		(htmlDocPtr doc, 
htmlNodePtr elem)
+

The HTML DTD allows a tag to implicitly close other tags. The list is kept in the htmlStartClose array. This function checks if a tag is autoclosed by one of its children.

+
doc:the HTML document
elem:the HTML element
Returns:1 if autoclosed, 0 otherwise
+
+

htmlIsScriptAttribute ()

int	htmlIsScriptAttribute		(const xmlChar * name)
+

Check if an attribute is of content type Script

+
name:an attribute name
Returns:1 if the attribute is a script, 0 otherwise
+
+

htmlNewParserCtxt ()

htmlParserCtxtPtr	htmlNewParserCtxt	(void)
+

Allocate and initialize a new parser context.

+
Returns:the htmlParserCtxtPtr or NULL in case of allocation error
+
+

htmlNodeStatus ()

htmlStatus	htmlNodeStatus		(const htmlNodePtr node, 
int legacy)
+

Checks whether the tree node is valid. Experimental (the author only uses the HTML enhancements in a SAX parser)

+
node:an htmlNodePtr in a tree
legacy:whether to allow deprecated elements (YES is faster here for Element nodes)
Returns:for Element nodes, a return from htmlElementAllowedHere (if legacy allowed) or htmlElementStatusHere (otherwise); for Attribute nodes, a return from htmlAttrAllowed; for other nodes, HTML_NA (no checks performed)
+
+

htmlParseCharRef ()

int	htmlParseCharRef		(htmlParserCtxtPtr ctxt)
+

parse Reference declarations [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'

+
ctxt:an HTML parser context
Returns:the value parsed (as an int)
+
+

htmlParseChunk ()

int	htmlParseChunk			(htmlParserCtxtPtr ctxt, 
const char * chunk,
int size,
int terminate)
+

Parse a Chunk of memory

+
ctxt:an HTML parser context
chunk:a char array
size:the size in bytes of the chunk
terminate:last chunk indicator
Returns:zero if no error, the xmlParserErrors otherwise.
+
+

htmlParseDoc ()

htmlDocPtr	htmlParseDoc		(const xmlChar * cur, 
const char * encoding)
+

parse an HTML in-memory document and build a tree.

+
cur:a pointer to an array of xmlChar
encoding:a free form C string describing the HTML document encoding, or NULL
Returns:the resulting document tree
+
+

htmlParseDocument ()

int	htmlParseDocument		(htmlParserCtxtPtr ctxt)
+

parse an HTML document (and build a tree if using the standard SAX interface).

+
ctxt:an HTML parser context
Returns:0, -1 in case of error. The parser context is augmented as a result of the parsing.
+
+

htmlParseElement ()

void	htmlParseElement		(htmlParserCtxtPtr ctxt)
+

Parse an HTML element; this is highly recursive. This is kept for compatibility with previous code versions. [39] element ::= EmptyElemTag | STag content ETag [41] Attribute ::= Name Eq AttValue

+
ctxt:an HTML parser context
+
+

htmlParseEntityRef ()

const htmlEntityDesc *	htmlParseEntityRef	(htmlParserCtxtPtr ctxt, 
const xmlChar ** str)
+

Parse an HTML entity reference. [68] EntityRef ::= '&' Name ';'

+
ctxt:an HTML parser context
str:location to store the entity name
Returns:the associated htmlEntityDescPtr if found, or NULL otherwise, if non-NULL *str will have to be freed by the caller.
+
+

htmlParseFile ()

htmlDocPtr	htmlParseFile		(const char * filename, 
const char * encoding)
+

parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time.

+
filename:the filename
encoding:a free form C string describing the HTML document encoding, or NULL
Returns:the resulting document tree
+
+

htmlReadDoc ()

htmlDocPtr	htmlReadDoc		(const xmlChar * cur, 
const char * URL,
const char * encoding,
int options)
+

Parse an HTML in-memory document and build a tree.

+
cur:a pointer to a zero terminated string
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree
+
+

htmlReadFd ()

htmlDocPtr	htmlReadFd		(int fd, 
const char * URL,
const char * encoding,
int options)
+

Parse an HTML document from a file descriptor and build a tree.

+
fd:an open file descriptor
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree
+
+

htmlReadFile ()

htmlDocPtr	htmlReadFile		(const char * filename, 
const char * encoding,
int options)
+

Parse an HTML file from the filesystem or the network.

+
filename:a file or URL
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree
+
+

htmlReadIO ()

htmlDocPtr	htmlReadIO		(xmlInputReadCallback ioread, 
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options)
+

parse an HTML document from I/O functions and source and build a tree.

+
ioread:an I/O read function
ioclose:an I/O close function
ioctx:an I/O handler
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree
+
+

htmlReadMemory ()

htmlDocPtr	htmlReadMemory		(const char * buffer, 
int size,
const char * URL,
const char * encoding,
int options)
+

Parse an HTML in-memory document and build a tree.

+
buffer:a pointer to a char array
size:the size of the array
URL:the base URL to use for the document
encoding:the document encoding, or NULL
options:a combination of htmlParserOption(s)
Returns:the resulting document tree
+
+

htmlSAXParseDoc ()

htmlDocPtr	htmlSAXParseDoc		(const xmlChar * cur, 
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)
+

Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fallback to the default DOM behavior and return a tree.

+
cur:a pointer to an array of xmlChar
encoding:a free form C string describing the HTML document encoding, or NULL
sax:the SAX handler block
userData:if using SAX, this pointer will be provided on callbacks.
Returns:the resulting document tree unless SAX is NULL or the document is not well formed.
+
+

htmlSAXParseFile ()

htmlDocPtr	htmlSAXParseFile	(const char * filename, 
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)
+

Parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, fall back to the default DOM tree building routines.

+
filename:the filename
encoding:a free form C string describing the HTML document encoding, or NULL
sax:the SAX handler block
userData:if using SAX, this pointer will be provided on callbacks.
Returns:the resulting document tree unless SAX is NULL or the document is not well formed.
+
+

htmlTagLookup ()

const htmlElemDesc *	htmlTagLookup	(const xmlChar * tag)
+

Lookup the HTML tag in the ElementTable

+
tag:The tag name in lowercase
Returns:the related htmlElemDescPtr or NULL if not found.
+
+
+
+ + -- cgit v1.2.3