|
libxml2
|
HTML parser, doesn't support HTML5. More...
Typedefs | |
| typedef xmlParserCtxt | htmlParserCtxt |
| Same as xmlParserCtxt. | |
| typedef xmlSAXHandler | htmlSAXHandler |
| Same as xmlSAXHandler. | |
| typedef xmlParserInput | htmlParserInput |
| Same as xmlParserInput. | |
Enumerations | |
| enum | htmlParserOption |
| This is the set of HTML parser options that can be passed to htmlReadDoc, htmlCtxtSetOptions and other functions. More... | |
| enum | htmlStatus |
| deprecated content model | |
Functions | |
| void | htmlInitAutoClose (void) |
| const htmlElemDesc * | htmlTagLookup (const xmlChar *tag) |
| Lookup the HTML tag in the ElementTable. | |
| const htmlEntityDesc * | htmlEntityLookup (const xmlChar *name) |
| Lookup the given entity in EntitiesTable. | |
| const htmlEntityDesc * | htmlEntityValueLookup (unsigned int value) |
| Lookup the given entity in EntitiesTable. | |
| int | htmlIsAutoClosed (xmlDoc *doc, xmlNode *elem) |
| The HTML DTD allows a tag to implicitly close other tags. | |
| int | htmlAutoCloseTag (xmlDoc *doc, const xmlChar *name, xmlNode *elem) |
| The HTML DTD allows a tag to implicitly close other tags. | |
| const htmlEntityDesc * | htmlParseEntityRef (htmlParserCtxt *ctxt, const xmlChar **str) |
| int | htmlParseCharRef (htmlParserCtxt *ctxt) |
| void | htmlParseElement (htmlParserCtxt *ctxt) |
| This is kept for compatibility with previous code versions. | |
| htmlParserCtxt * | htmlNewParserCtxt (void) |
| Allocate and initialize a new HTML parser context. | |
| htmlParserCtxt * | htmlNewSAXParserCtxt (const htmlSAXHandler *sax, void *userData) |
| Allocate and initialize a new HTML SAX parser context. | |
| htmlParserCtxt * | htmlCreateMemoryParserCtxt (const char *buffer, int size) |
| Create a parser context for an HTML in-memory document. | |
| int | htmlParseDocument (htmlParserCtxt *ctxt) |
| Parse an HTML document and invoke the SAX handlers. | |
| xmlDoc * | htmlSAXParseDoc (const xmlChar *cur, const char *encoding, htmlSAXHandler *sax, void *userData) |
| Parse an HTML in-memory document. | |
| xmlDoc * | htmlParseDoc (const xmlChar *cur, const char *encoding) |
| Parse an HTML in-memory document and build a tree. | |
| htmlParserCtxt * | htmlCreateFileParserCtxt (const char *filename, const char *encoding) |
| Create a parser context to read from a file. | |
| xmlDoc * | htmlSAXParseFile (const char *filename, const char *encoding, htmlSAXHandler *sax, void *userData) |
| parse an HTML file and build a tree. | |
| xmlDoc * | htmlParseFile (const char *filename, const char *encoding) |
| Parse an HTML file and build a tree. | |
| int | htmlUTF8ToHtml (unsigned char *out, int *outlen, const unsigned char *in, int *inlen) |
| Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out. | |
| int | htmlEncodeEntities (unsigned char *out, int *outlen, const unsigned char *in, int *inlen, int quoteChar) |
| Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out. | |
| int | htmlIsScriptAttribute (const xmlChar *name) |
| Check if an attribute is of content type Script. | |
| int | htmlHandleOmittedElem (int val) |
| Set and return the previous value for handling HTML omitted tags. | |
| htmlParserCtxt * | htmlCreatePushParserCtxt (htmlSAXHandler *sax, void *user_data, const char *chunk, int size, const char *filename, xmlCharEncoding enc) |
| Create a parser context for using the HTML parser in push mode. | |
| int | htmlParseChunk (htmlParserCtxt *ctxt, const char *chunk, int size, int terminate) |
| Parse a chunk of memory in push parser mode. | |
| void | htmlFreeParserCtxt (htmlParserCtxt *ctxt) |
| Free all the memory used by a parser context. | |
| void | htmlCtxtReset (htmlParserCtxt *ctxt) |
| Reset a parser context. | |
| int | htmlCtxtSetOptions (htmlParserCtxt *ctxt, int options) |
| Applies the options to the parser context. | |
| int | htmlCtxtUseOptions (htmlParserCtxt *ctxt, int options) |
| Applies the options to the parser context. | |
| xmlDoc * | htmlReadDoc (const xmlChar *cur, const char *URL, const char *encoding, int options) |
| Convenience function to parse an HTML document from a zero-terminated string. | |
| xmlDoc * | htmlReadFile (const char *URL, const char *encoding, int options) |
| Convenience function to parse an HTML file from the filesystem, the network or a global user-defined resource loader. | |
| xmlDoc * | htmlReadMemory (const char *buffer, int size, const char *URL, const char *encoding, int options) |
| Convenience function to parse an HTML document from memory. | |
| xmlDoc * | htmlReadFd (int fd, const char *URL, const char *encoding, int options) |
| Convenience function to parse an HTML document from a file descriptor. | |
| xmlDoc * | htmlReadIO (xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void *ioctx, const char *URL, const char *encoding, int options) |
| Convenience function to parse an HTML document from I/O functions and context. | |
| xmlDoc * | htmlCtxtParseDocument (htmlParserCtxt *ctxt, xmlParserInput *input) |
| Parse an HTML document and return the resulting document tree. | |
| xmlDoc * | htmlCtxtReadDoc (xmlParserCtxt *ctxt, const xmlChar *cur, const char *URL, const char *encoding, int options) |
| Parse an HTML in-memory document and build a tree. | |
| xmlDoc * | htmlCtxtReadFile (xmlParserCtxt *ctxt, const char *filename, const char *encoding, int options) |
| Parse an HTML file from the filesystem, the network or a user-defined resource loader. | |
| xmlDoc * | htmlCtxtReadMemory (xmlParserCtxt *ctxt, const char *buffer, int size, const char *URL, const char *encoding, int options) |
| Parse an HTML in-memory document and build a tree. | |
| xmlDoc * | htmlCtxtReadFd (xmlParserCtxt *ctxt, int fd, const char *URL, const char *encoding, int options) |
| Parse an HTML document from a file descriptor and build a tree. | |
| xmlDoc * | htmlCtxtReadIO (xmlParserCtxt *ctxt, xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void *ioctx, const char *URL, const char *encoding, int options) |
| Parse an HTML document from I/O functions and source and build a tree. | |
| htmlStatus | htmlAttrAllowed (const htmlElemDesc *, const xmlChar *, int) |
| int | htmlElementAllowedHere (const htmlElemDesc *, const xmlChar *) |
| htmlStatus | htmlElementStatusHere (const htmlElemDesc *, const htmlElemDesc *) |
| htmlStatus | htmlNodeStatus (xmlNode *, int) |
HTML parser, doesn't support HTML5.
This module originally implemented an HTML parser based on the (underspecified) HTML 4.0 spec. As of 2.14, the tokenizer conforms to HTML5. Tree construction still follows a custom, unspecified algorithm with many differences to HTML5.
The parser defaults to ISO-8859-1, the default encoding of HTTP/1.0.
| enum htmlParserOption |
This is the set of HTML parser options that can be passed to htmlReadDoc, htmlCtxtSetOptions and other functions.
| Enumerator | |
|---|---|
| HTML_PARSE_RECOVER | No effect as of 2.14.0. |
| HTML_PARSE_NODEFDTD | Do not default to a doctype if none was found. |
| HTML_PARSE_NOERROR | Disable error and warning reports to the error handlers. Errors are still accessible with xmlCtxtGetLastError(). |
| HTML_PARSE_NOWARNING | Disable warning reports. |
| HTML_PARSE_PEDANTIC | No effect. |
| HTML_PARSE_NOBLANKS | Remove some text nodes containing only whitespace from the result document. Which nodes are removed depends on a conservative heuristic. The reindenting feature of the serialization code relies on this option to be set when parsing. Use of this option is DISCOURAGED. |
| HTML_PARSE_NONET | No effect. |
| HTML_PARSE_NOIMPLIED | Do not add implied html, head or body elements. |
| HTML_PARSE_COMPACT | Store small strings directly in the node struct to save memory. |
| HTML_PARSE_HUGE | Relax some internal limits. See XML_PARSE_HUGE in xmlParserOption.
Use XML_PARSE_HUGE with older versions. |
| HTML_PARSE_IGNORE_ENC | Ignore the encoding in the HTML declaration. This option is mostly unneeded these days. The only effect is to enforce ISO-8859-1 decoding of ASCII-like data. |
| HTML_PARSE_BIG_LINES | Enable reporting of line numbers larger than 65535.
Use XML_PARSE_BIG_LINES with older versions. |
| HTML_PARSE_HTML5 | Make the tokenizer emit a SAX callback for each token. This results in unbalanced invocations of startElement and endElement. For now, this is only usable to tokenize HTML5 with custom SAX callbacks. A tree builder isn't implemented yet.
|
| htmlStatus htmlAttrAllowed | ( | const htmlElemDesc * | elt, |
| const xmlChar * | attr, | ||
| int | legacy ) |
| elt | HTML element |
| attr | HTML attribute |
| legacy | whether to allow deprecated attributes |
The HTML DTD allows a tag to implicitly close other tags.
The list is kept in htmlStartClose array. This function checks if the element or one of its children would autoclose the given tag.
| doc | the HTML document |
| name | The tag name |
| elem | the HTML element |
| htmlParserCtxt * htmlCreateFileParserCtxt | ( | const char * | filename, |
| const char * | encoding ) |
Create a parser context to read from a file.
A non-NULL encoding overrides encoding declarations in the document.
Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time.
| filename | the filename |
| encoding | optional encoding |
| htmlParserCtxt * htmlCreateMemoryParserCtxt | ( | const char * | buffer, |
| int | size ) |
Create a parser context for an HTML in-memory document.
The input buffer must not contain any terminating null bytes.
| buffer | a pointer to a char array |
| size | the size of the array |
| htmlParserCtxt * htmlCreatePushParserCtxt | ( | htmlSAXHandler * | sax, |
| void * | user_data, | ||
| const char * | chunk, | ||
| int | size, | ||
| const char * | filename, | ||
| xmlCharEncoding | enc ) |
Create a parser context for using the HTML parser in push mode.
| sax | a SAX handler (optional) |
| user_data | The user data returned on SAX callbacks (optional) |
| chunk | a pointer to an array of chars (optional) |
| size | number of chars in the array |
| filename | only used for error reporting (optional) |
| enc | encoding (deprecated, pass XML_CHAR_ENCODING_NONE) |
| xmlDoc * htmlCtxtParseDocument | ( | htmlParserCtxt * | ctxt, |
| xmlParserInput * | input ) |
Parse an HTML document and return the resulting document tree.
| ctxt | an HTML parser context |
| input | parser input |
| xmlDoc * htmlCtxtReadDoc | ( | xmlParserCtxt * | ctxt, |
| const xmlChar * | str, | ||
| const char * | URL, | ||
| const char * | encoding, | ||
| int | options ) |
Parse an HTML in-memory document and build a tree.
See htmlCtxtUseOptions for details.
| ctxt | an HTML parser context |
| str | a pointer to a zero terminated string |
| URL | only used for error reporting (optional) |
| encoding | the document encoding (optional) |
| options | a combination of htmlParserOption values |
| xmlDoc * htmlCtxtReadFd | ( | xmlParserCtxt * | ctxt, |
| int | fd, | ||
| const char * | URL, | ||
| const char * | encoding, | ||
| int | options ) |
Parse an HTML document from a file descriptor and build a tree.
See htmlCtxtUseOptions for details.
NOTE that the file descriptor will not be closed when the context is freed or reset.
| ctxt | an HTML parser context |
| fd | an open file descriptor |
| URL | only used for error reporting (optional) |
| encoding | the document encoding (optional) |
| options | a combination of htmlParserOption values |
| xmlDoc * htmlCtxtReadFile | ( | xmlParserCtxt * | ctxt, |
| const char * | filename, | ||
| const char * | encoding, | ||
| int | options ) |
Parse an HTML file from the filesystem, the network or a user-defined resource loader.
See htmlCtxtUseOptions for details.
| ctxt | an HTML parser context |
| filename | a file or URL |
| encoding | the document encoding (optional) |
| options | a combination of htmlParserOption values |
| xmlDoc * htmlCtxtReadIO | ( | xmlParserCtxt * | ctxt, |
| xmlInputReadCallback | ioread, | ||
| xmlInputCloseCallback | ioclose, | ||
| void * | ioctx, | ||
| const char * | URL, | ||
| const char * | encoding, | ||
| int | options ) |
Parse an HTML document from I/O functions and source and build a tree.
See htmlCtxtUseOptions for details.
| ctxt | an HTML parser context |
| ioread | an I/O read function |
| ioclose | an I/O close function |
| ioctx | an I/O handler |
| URL | the base URL to use for the document |
| encoding | the document encoding, or NULL |
| options | a combination of htmlParserOption values |
| xmlDoc * htmlCtxtReadMemory | ( | xmlParserCtxt * | ctxt, |
| const char * | buffer, | ||
| int | size, | ||
| const char * | URL, | ||
| const char * | encoding, | ||
| int | options ) |
Parse an HTML in-memory document and build a tree.
The input buffer must not contain any terminating null bytes.
See htmlCtxtUseOptions for details.
| ctxt | an HTML parser context |
| buffer | a pointer to a char array |
| size | the size of the array |
| URL | only used for error reporting (optional) |
| encoding | the document encoding (optional) |
| options | a combination of htmlParserOption values |
| void htmlCtxtReset | ( | htmlParserCtxt * | ctxt | ) |
| int htmlCtxtSetOptions | ( | htmlParserCtxt * | ctxt, |
| int | options ) |
Applies the options to the parser context.
Unset options are cleared.
With older versions, you can use htmlCtxtUseOptions.
| ctxt | an HTML parser context |
| options | a bitmask of htmlParserOption values |
| int htmlCtxtUseOptions | ( | htmlParserCtxt * | ctxt, |
| int | options ) |
Applies the options to the parser context.
The following options are never cleared and can only be enabled:
| ctxt | an HTML parser context |
| options | a combination of htmlParserOption values |
| int htmlElementAllowedHere | ( | const htmlElemDesc * | parent, |
| const xmlChar * | elt ) |
| htmlStatus htmlElementStatusHere | ( | const htmlElemDesc * | parent, |
| const htmlElemDesc * | elt ) |
| int htmlEncodeEntities | ( | unsigned char * | out, |
| int * | outlen, | ||
| const unsigned char * | in, | ||
| int * | inlen, | ||
| int | quoteChar ) |
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
| out | a pointer to an array of bytes to store the result |
| outlen | the length of out |
| in | a pointer to an array of UTF-8 chars |
| inlen | the length of in |
| quoteChar | the quote character to escape (' or ") or zero. |
| const htmlEntityDesc * htmlEntityLookup | ( | const xmlChar * | name | ) |
Lookup the given entity in EntitiesTable.
TODO: the linear scan is really ugly, a hash table is really needed.
| name | the entity name |
| const htmlEntityDesc * htmlEntityValueLookup | ( | unsigned int | value | ) |
Lookup the given entity in EntitiesTable.
TODO: the linear scan is really ugly, a hash table is really needed.
| value | the entity's unicode value |
| void htmlFreeParserCtxt | ( | htmlParserCtxt * | ctxt | ) |
Free all the memory used by a parser context.
However the parsed document in ctxt->myDoc is not freed.
| ctxt | an HTML parser context |
| int htmlHandleOmittedElem | ( | int | val | ) |
Set and return the previous value for handling HTML omitted tags.
| val | int 0 or 1 |
| void htmlInitAutoClose | ( | void | ) |
The HTML DTD allows a tag to implicitly close other tags.
The list is kept in htmlStartClose array. This function checks if a tag is autoclosed by one of its children.
| doc | the HTML document |
| elem | the HTML element |
| int htmlIsScriptAttribute | ( | const xmlChar * | name | ) |
Check if an attribute is of content type Script.
| name | an attribute name |
| htmlParserCtxt * htmlNewParserCtxt | ( | void | ) |
Allocate and initialize a new HTML parser context.
This can be used to parse HTML documents into DOM trees with functions like htmlCtxtReadFile or htmlCtxtReadMemory.
See htmlCtxtUseOptions for parser options.
See xmlCtxtSetErrorHandler for advanced error handling.
See htmlNewSAXParserCtxt for custom SAX parsers.
| htmlParserCtxt * htmlNewSAXParserCtxt | ( | const htmlSAXHandler * | sax, |
| void * | userData ) |
Allocate and initialize a new HTML SAX parser context.
If userData is NULL, the parser context will be passed as user data.
If you want to support older versions, it's best to invoke htmlNewParserCtxt and set ctxt->sax with struct assignment.
Also see htmlNewParserCtxt.
| sax | SAX handler |
| userData | user data |
| htmlStatus htmlNodeStatus | ( | xmlNode * | node, |
| int | legacy ) |
| node | an xmlNode in a tree |
| legacy | whether to allow deprecated elements (YES is faster here for Element nodes) |
| int htmlParseCharRef | ( | htmlParserCtxt * | ctxt | ) |
| int htmlParseChunk | ( | htmlParserCtxt * | ctxt, |
| const char * | chunk, | ||
| int | size, | ||
| int | terminate ) |
Parse a chunk of memory in push parser mode.
Assumes that the parser context was initialized with htmlCreatePushParserCtxt.
The last chunk, which will often be empty, must be marked with the terminate flag. With the default SAX callbacks, the resulting document will be available in ctxt->myDoc. This pointer will not be freed by the library.
If the document isn't well-formed, ctxt->myDoc is set to NULL.
Since 2.14.0, xmlCtxtGetDocument can be used to retrieve the result document.
| ctxt | an HTML parser context |
| chunk | chunk of memory |
| size | size of chunk in bytes |
| terminate | last chunk indicator |
Parse an HTML in-memory document and build a tree.
This function uses deprecated global parser options.
| cur | a pointer to an array of xmlChar |
| encoding | the encoding (optional) |
| int htmlParseDocument | ( | htmlParserCtxt * | ctxt | ) |
Parse an HTML document and invoke the SAX handlers.
This is useful if you're only interested in custom SAX callbacks. If you want a document tree, use htmlCtxtParseDocument.
| ctxt | an HTML parser context |
| void htmlParseElement | ( | htmlParserCtxt * | ctxt | ) |
This is kept for compatibility with previous code versions.
| ctxt | an HTML parser context |
| const htmlEntityDesc * htmlParseEntityRef | ( | htmlParserCtxt * | ctxt, |
| const xmlChar ** | str ) |
| ctxt | an HTML parser context |
| str | location to store the entity name |
| xmlDoc * htmlParseFile | ( | const char * | filename, |
| const char * | encoding ) |
Parse an HTML file and build a tree.
| filename | the filename |
| encoding | encoding (optional) |
Convenience function to parse an HTML document from a zero-terminated string.
See htmlCtxtReadDoc for details.
| str | a pointer to a zero terminated string |
| url | only used for error reporting (optional) |
| encoding | the document encoding (optional) |
| options | a combination of htmlParserOption values |
| xmlDoc * htmlReadFd | ( | int | fd, |
| const char * | url, | ||
| const char * | encoding, | ||
| int | options ) |
Convenience function to parse an HTML document from a file descriptor.
NOTE that the file descriptor will not be closed when the context is freed or reset.
See htmlCtxtReadFd for details.
| fd | an open file descriptor |
| url | only used for error reporting (optional) |
| encoding | the document encoding, or NULL |
| options | a combination of htmlParserOption values |
| xmlDoc * htmlReadFile | ( | const char * | filename, |
| const char * | encoding, | ||
| int | options ) |
Convenience function to parse an HTML file from the filesystem, the network or a global user-defined resource loader.
See htmlCtxtReadFile for details.
| filename | a file or URL |
| encoding | the document encoding (optional) |
| options | a combination of htmlParserOption values |
| xmlDoc * htmlReadIO | ( | xmlInputReadCallback | ioread, |
| xmlInputCloseCallback | ioclose, | ||
| void * | ioctx, | ||
| const char * | url, | ||
| const char * | encoding, | ||
| int | options ) |
Convenience function to parse an HTML document from I/O functions and context.
See htmlCtxtReadIO for details.
| ioread | an I/O read function |
| ioclose | an I/O close function (optional) |
| ioctx | an I/O handler |
| url | only used for error reporting (optional) |
| encoding | the document encoding (optional) |
| options | a combination of htmlParserOption values |
| xmlDoc * htmlReadMemory | ( | const char * | buffer, |
| int | size, | ||
| const char * | url, | ||
| const char * | encoding, | ||
| int | options ) |
Convenience function to parse an HTML document from memory.
The input buffer must not contain any terminating null bytes.
See htmlCtxtReadMemory for details.
| buffer | a pointer to a char array |
| size | the size of the array |
| url | only used for error reporting (optional) |
| encoding | the document encoding, or NULL |
| options | a combination of htmlParserOption values |
| xmlDoc * htmlSAXParseDoc | ( | const xmlChar * | cur, |
| const char * | encoding, | ||
| htmlSAXHandler * | sax, | ||
| void * | userData ) |
Parse an HTML in-memory document.
If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fallback to the default DOM behavior and return a tree.
| cur | a pointer to an array of xmlChar |
| encoding | a free form C string describing the HTML document encoding, or NULL |
| sax | the SAX handler block |
| userData | if using SAX, this pointer will be provided on callbacks. |
| xmlDoc * htmlSAXParseFile | ( | const char * | filename, |
| const char * | encoding, | ||
| htmlSAXHandler * | sax, | ||
| void * | userData ) |
Parse an HTML file and build a tree.
Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, fallback to the default DOM tree building routines.
| filename | the filename |
| encoding | encoding (optional) |
| sax | the SAX handler block |
| userData | if using SAX, this pointer will be provided on callbacks. |
| const htmlElemDesc * htmlTagLookup | ( | const xmlChar * | tag | ) |
Lookup the HTML tag in the ElementTable.
| tag | The tag name in lowercase |
| int htmlUTF8ToHtml | ( | unsigned char * | out, |
| int * | outlen, | ||
| const unsigned char * | in, | ||
| int * | inlen ) |
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
| out | a pointer to an array of bytes to store the result |
| outlen | the length of out |
| in | a pointer to an array of UTF-8 chars |
| inlen | the length of in |