* Summary: interface for an HTML 4.0 non-verifying parser * Description: this module implements an HTML 4.0 non-verifying parser * with API compatible with the XML parser ones. It should * be able to parse "real world" HTML, even if severely * broken from a specification point of view. * * Copy: See Copyright for the status of this software. * * Author: Patrick Monnerat , DATASPHERE S.A. /if not defined(HTML_PARSER_H__) /define HTML_PARSER_H__ /include "libxmlrpg/xmlversion" /if defined(LIBXML_HTML_ENABLED) /include "libxmlrpg/xmlTypesC" /include "libxmlrpg/parser" * Most of the back-end structures from XML and HTML are shared. d htmlParserCtxtPtr... d s based(######typedef######) d like(xmlParserCtxtPtr) d htmlParserCtxt ds based(htmlParserCtxtPtr) d likeds(xmlParserCtxt) d htmlParserNodeInfoPtr... d s based(######typedef######) d like(xmlParserNodeInfoPtr) d htmlParserNodeInfo... d ds based(htmlParserNodeInfoPtr) d likeds(xmlParserNodeInfo) d htmlSAXHandlerPtr... d s based(######typedef######) d like(xmlSAXHandlerPtr) d htmlSAXHandler ds based(htmlSAXHandlerPtr) d likeds(xmlSAXHandler) d htmlParserInputPtr... d s based(######typedef######) d like(xmlParserInputPtr) d htmlParserInput... d ds based(htmlParserInputPtr) d likeds(xmlParserInput) d htmlDocPtr s based(######typedef######) d like(xmlDocPtr) d htmlNodePtr s based(######typedef######) d like(xmlNodePtr) * Internal description of an HTML element, representing HTML 4.01 * and XHTML 1.0 (which share the same structure). d htmlElemDescPtr... d s * based(######typedef######) d htmlElemDesc ds based(htmlElemDescPtr) d align qualified d name * const char * d startTag like(xmlCchar) Start tag implied ? d endTag like(xmlCchar) End tag implied ? d saveEndTag like(xmlCchar) Save end tag ? d empty like(xmlCchar) Empty element ? d depr like(xmlCchar) Deprecated element ? d dtd like(xmlCchar) Loose DTD/Frameset d isinline like(xmlCchar) Block 0/inline elem? d desc * const char * * * New fields encapsulating HTML structure * * Bugs: * This is a very limited representation. It fails to tell us when * an element *requires* subelements (we only have whether they're * allowed or not), and it doesn't tell us where CDATA and PCDATA * are allowed. Some element relationships are not fully represented: * these are flagged with the word MODIFIER * d subelts * const char * * d defaultsubelt * const char * d attrs_opt * const char * * d attrs_depr * const char * * d attrs_req * const char * * * Internal description of an HTML entity. d htmlEntityDescPtr... d s * based(######typedef######) d htmlEntityDesc... d ds based(htmlEntityDescPtr) d align qualified d value like(xmlCuint) d name * const char * d desc * const char * * There is only few public functions. d htmlTagLookup pr extproc('htmlTagLookup') d like(htmlElemDescPtr) const d tag * value options(*string) const xmlChar * d htmlEntityLookup... d pr extproc('htmlEntityLookup') d like(htmlEntityDescPtr) const d name * value options(*string) const xmlChar * d htmlEntityValueLookup... d pr extproc('htmlEntityValueLookup') d like(htmlEntityDescPtr) const d value value like(xmlCuint) d htmlIsAutoClosed... d pr extproc('htmlIsAutoClosed') d like(xmlCint) d doc value like(htmlDocPtr) d elem value like(htmlNodePtr) d htmlAutoCloseTag... d pr extproc('htmlAutoCloseTag') d like(xmlCint) d doc value like(htmlDocPtr) d name * value options(*string) const xmlChar * d elem value like(htmlNodePtr) d htmlParseEntityRef... d pr extproc('htmlParseEntityRef') d like(htmlEntityDescPtr) const d ctxt value like(htmlParserCtxtPtr) d str * const xmlChar *(*) d htmlParseCharRef... d pr extproc('htmlParseCharRef') d like(xmlCint) d ctxt value like(htmlParserCtxtPtr) d htmlParseElement... d pr extproc('htmlParseElement') d ctxt value like(htmlParserCtxtPtr) d htmlNewParserCtxt... d pr extproc('htmlNewParserCtxt') d like(htmlParserCtxtPtr) d htmlCreateMemoryParserCtxt... d pr extproc('htmlCreateMemoryParserCtxt') d like(htmlParserCtxtPtr) d buffer * value options(*string) const char * d size value like(xmlCint) d htmlParseDocument... d pr extproc('htmlParseDocument') d like(xmlCint) d ctxt value like(htmlParserCtxtPtr) d htmlSAXParseDoc... d pr extproc('htmlSAXParseDoc') d like(htmlDocPtr) d cur * value options(*string) xmlChar * d encoding * value options(*string) const char * d sax value like(htmlSAXHandlerPtr) d userData * value void * d htmlParseDoc pr extproc('htmlParseDoc') d like(htmlDocPtr) d cur * value options(*string) xmlChar * d encoding * value options(*string) const char * d htmlSAXParseFile... d pr extproc('htmlSAXParseFile') d like(htmlDocPtr) d filename * value options(*string) const char * d encoding * value options(*string) const char * d sax value like(htmlSAXHandlerPtr) d userData * value void * d htmlParseFile pr extproc('htmlParseFile') d like(htmlDocPtr) d filename * value options(*string) const char * d encoding * value options(*string) const char * d UTF8ToHtml pr extproc('UTF8ToHtml') d like(xmlCint) d out 65535 options(*varsize) unsigned char [] d outlen like(xmlCint) d in * value options(*string) const unsigned char* d inlen like(xmlCint) d htmlEncodeEntities... d pr extproc('htmlEncodeEntities') d like(xmlCint) d out 65535 options(*varsize) unsigned char [] d outlen like(xmlCint) d in * value options(*string) const unsigned char* d inlen like(xmlCint) d quoteChar value like(xmlCint) d htmlIsScriptAttribute... d pr extproc('htmlIsScriptAttribute') d like(xmlCint) d name * value options(*string) const xmlChar * d htmlHandleOmittedElem... d pr extproc('htmlHandleOmittedElem') d like(xmlCint) d val value like(xmlCint) /if defined(LIBXML_PUSH_ENABLED) * Interfaces for the Push mode. d htmlCreatePushParserCtxt... d pr extproc('htmlCreatePushParserCtxt') d like(htmlParserCtxtPtr) d sax value like(htmlSAXHandlerPtr) d user_data * value void * d chunk * value options(*string) const char * d size value like(xmlCint) d filename * value options(*string) const char * d enc value like(xmlCharEncoding) d htmlParseChunk pr extproc('htmlParseChunk') d like(xmlCint) d ctxt value like(htmlParserCtxtPtr) d chunk * value options(*string) const char * d size value like(xmlCint) d terminate value like(xmlCint) /endif LIBXML_PUSH_ENABLED d htmlFreeParserCtxt... d pr extproc('htmlFreeParserCtxt') d ctxt value like(htmlParserCtxtPtr) * New set of simpler/more flexible APIs * xmlParserOption: * * This is the set of XML parser options that can be passed down * to the xmlReadDoc() and similar calls. d htmlParserOption... d s based(######typedef######) d like(xmlCenum) d HTML_PARSE_RECOVER... Relaxed parsing d c X'00000001' d HTML_PARSE_NODEFDTD... No default doctype d c X'00000004' d HTML_PARSE_NOERROR... No error reports d c X'00000020' d HTML_PARSE_NOWARNING... No warning reports d c X'00000040' d HTML_PARSE_PEDANTIC... Pedantic err reports d c X'00000080' d HTML_PARSE_NOBLANKS... Remove blank nodes d c X'00000100' d HTML_PARSE_NONET... Forbid net access d c X'00000800' d HTML_PARSE_NOIMPLIED... No implied html/body d c X'00002000' d HTML_PARSE_COMPACT... compact small txtnod d c X'00010000' d HTML_PARSE_IGNORE_ENC... Ignore encoding hint d c X'00200000' d htmlCtxtReset pr extproc('htmlCtxtReset') d ctxt value like(htmlParserCtxtPtr) d htmlCtxtUseOptions... d pr extproc('htmlCtxtUseOptions') d like(xmlCint) d ctxt value like(htmlParserCtxtPtr) d options value like(xmlCint) d htmlReadDoc pr extproc('htmlReadDoc') d like(htmlDocPtr) d cur * value options(*string) const xmlChar * d URL * value options(*string) const char * d encoding * value options(*string) const char * d options value like(xmlCint) d htmlReadFile pr extproc('htmlReadFile') d like(htmlDocPtr) d URL * value options(*string) const char * d encoding * value options(*string) const char * d options value like(xmlCint) d htmlReadMemory pr extproc('htmlReadMemory') d like(htmlDocPtr) d buffer * value options(*string) const char * d size value like(xmlCint) d URL * value options(*string) const char * d encoding * value options(*string) const char * d options value like(xmlCint) d htmlReadFd pr extproc('htmlReadFd') d like(htmlDocPtr) d fd value like(xmlCint) d URL * value options(*string) const char * d encoding * value options(*string) const char * d options value like(xmlCint) d htmlReadIO pr extproc('htmlReadIO') d like(htmlDocPtr) d ioread value like(xmlInputReadCallback) d ioclose value like(xmlInputCloseCallback) d ioctx * value void * d URL * value options(*string) const char * d encoding * value options(*string) const char * d options value like(xmlCint) d htmlCtxtReadDoc... d pr extproc('htmlCtxtReadDoc') d like(htmlDocPtr) d ctxt value like(xmlParserCtxtPtr) d cur * value options(*string) const xmlChar * d URL * value options(*string) const char * d encoding * value options(*string) const char * d options value like(xmlCint) d htmlCtxtReadFile... d pr extproc('htmlCtxtReadFile') d like(htmlDocPtr) d ctxt value like(xmlParserCtxtPtr) d filename * value options(*string) const char * d encoding * value options(*string) const char * d options value like(xmlCint) d htmlCtxtReadMemory... d pr extproc('htmlCtxtReadMemory') d like(htmlDocPtr) d ctxt value like(xmlParserCtxtPtr) d buffer * value options(*string) const char * d size value like(xmlCint) d URL * value options(*string) const char * d encoding * value options(*string) const char * d options value like(xmlCint) d htmlCtxtReadFd pr extproc('htmlCtxtReadFd') d like(htmlDocPtr) d ctxt value like(xmlParserCtxtPtr) d fd value like(xmlCint) d URL * value options(*string) const char * d encoding * value options(*string) const char * d options value like(xmlCint) d htmlCtxtReadIO pr extproc('htmlCtxtReadIO') d like(htmlDocPtr) d ctxt value like(xmlParserCtxtPtr) d ioread value like(xmlInputReadCallback) d ioclose value like(xmlInputCloseCallback) d ioctx * value void * d URL * value options(*string) const char * d encoding * value options(*string) const char * d options value like(xmlCint) * Further knowledge of HTML structure d htmlStatus s based(######typedef######) d like(xmlCenum) d HTML_NA c X'0000' No check at all d HTML_INVALID c X'0001' d HTML_DEPRECATED... d c X'0002' d HTML_VALID c X'0004' d HTML_REQUIRED c X'000C' HTML_VALID ored-in * Using htmlElemDesc rather than name here, to emphasise the fact * that otherwise there's a lookup overhead d htmlAttrAllowed... d pr extproc('htmlAttrAllowed') d like(htmlStatus) d #param1 value like(htmlElemDescPtr) const d #param2 * value options(*string) const xmlChar * d #param3 value like(xmlCint) d htmlElementAllowedHere... d pr extproc('htmlElementAllowedHere') d like(xmlCint) d #param1 value like(htmlElemDescPtr) const d #param2 * value options(*string) const xmlChar * d htmlElementStatusHere... d pr extproc('htmlElementStatusHere') d like(htmlStatus) d #param1 value like(htmlElemDescPtr) const d #param2 value like(htmlElemDescPtr) const d htmlNodeStatus pr extproc('htmlNodeStatus') d like(htmlStatus) d #param1 value like(htmlNodePtr) d #param2 value like(xmlCint) * C macros implemented as procedures for ILE/RPG support. d htmlDefaultSubelement... d pr * extproc('__htmlDefaultSubelement') const char * d elt * value const htmlElemDesc * d htmlElementAllowedHereDesc... d pr extproc( d '__htmlElementAllowedHereDesc') d like(xmlCint) d parent * value const htmlElemDesc * d elt * value const htmlElemDesc * d htmlRequiredAttrs... d pr * extproc('__htmlRequiredAttrs') const char * * d elt * value const htmlElemDesc * /endif LIBXML_HTML_ENABLED /endif HTML_PARSER_H__