HTMLparser

Name

HTMLparser -- 

Synopsis



typedef     htmlParserCtxt;
typedef     htmlParserCtxtPtr;
typedef     htmlParserNodeInfo;
typedef     htmlSAXHandler;
typedef     htmlSAXHandlerPtr;
typedef     htmlParserInput;
typedef     htmlParserInputPtr;
typedef     htmlDocPtr;
typedef     htmlNodePtr;
struct      htmlElemDesc;
typedef     htmlElemDescPtr;
struct      htmlEntityDesc;
typedef     htmlEntityDescPtr;
htmlElemDescPtr htmlTagLookup               (const xmlChar *tag);
htmlEntityDescPtr htmlEntityLookup          (const xmlChar *name);
int         htmlIsAutoClosed                (htmlDocPtr doc,
                                             htmlNodePtr elem);
int         htmlAutoCloseTag                (htmlDocPtr doc,
                                             const xmlChar *name,
                                             htmlNodePtr elem);
htmlEntityDescPtr htmlParseEntityRef        (htmlParserCtxtPtr ctxt,
                                             xmlChar **str);
int         htmlParseCharRef                (htmlParserCtxtPtr ctxt);
void        htmlParseElement                (htmlParserCtxtPtr ctxt);
htmlDocPtr  htmlSAXParseDoc                 (xmlChar *cur,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);
htmlDocPtr  htmlParseDoc                    (xmlChar *cur,
                                             const char *encoding);
htmlDocPtr  htmlSAXParseFile                (const char *filename,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);
htmlDocPtr  htmlParseFile                   (const char *filename,
                                             const char *encoding);
void        htmlFreeParserCtxt              (htmlParserCtxtPtr ctxt);
htmlParserCtxtPtr htmlCreatePushParserCtxt  (htmlSAXHandlerPtr sax,
                                             void *user_data,
                                             const char *chunk,
                                             int size,
                                             const char *filename,
                                             xmlCharEncoding enc);
int         htmlParseChunk                  (htmlParserCtxtPtr ctxt,
                                             const char *chunk,
                                             int size,
                                             int terminate);

Description

Details

htmlParserCtxt

typedef xmlParserCtxt htmlParserCtxt;


htmlParserCtxtPtr

typedef xmlParserCtxtPtr htmlParserCtxtPtr;


htmlParserNodeInfo

typedef xmlParserNodeInfo htmlParserNodeInfo;


htmlSAXHandler

typedef xmlSAXHandler htmlSAXHandler;


htmlSAXHandlerPtr

typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;


htmlParserInput

typedef xmlParserInput htmlParserInput;


htmlParserInputPtr

typedef xmlParserInputPtr htmlParserInputPtr;


htmlDocPtr

typedef xmlDocPtr htmlDocPtr;


htmlNodePtr

typedef xmlNodePtr htmlNodePtr;


struct htmlElemDesc

struct htmlElemDesc {
    const char *name;	/* The tag name */
    int startTag;       /* Whether the start tag can be implied */
    int endTag;         /* Whether the end tag can be implied */
    int empty;          /* Is this an empty element ? */
    int depr;           /* Is this a deprecated element ? */
    int dtd;            /* 1: only in Loose DTD, 2: only Frameset one */
    const char *desc;   /* the description */
};


htmlElemDescPtr

typedef htmlElemDesc *htmlElemDescPtr;


struct htmlEntityDesc

struct htmlEntityDesc {
    int value;		/* the UNICODE value for the character */
    const char *name;	/* The entity name */
    const char *desc;   /* the description */
};


htmlEntityDescPtr

typedef htmlEntityDesc *htmlEntityDescPtr;


htmlTagLookup ()

htmlElemDescPtr htmlTagLookup               (const xmlChar *tag);

Lookup the HTML tag in the ElementTable


htmlEntityLookup ()

htmlEntityDescPtr htmlEntityLookup          (const xmlChar *name);

Lookup the given entity in EntitiesTable

TODO: the linear scan is really ugly, an hash table is really needed.


htmlIsAutoClosed ()

int         htmlIsAutoClosed                (htmlDocPtr doc,
                                             htmlNodePtr elem);

The HTmL DtD allows a tag to implicitely close other tags. The list is kept in htmlStartClose array. This function checks if a tag is autoclosed by one of it's child


htmlAutoCloseTag ()

int         htmlAutoCloseTag                (htmlDocPtr doc,
                                             const xmlChar *name,
                                             htmlNodePtr elem);

The HTmL DtD allows a tag to implicitely close other tags. The list is kept in htmlStartClose array. This function checks if the element or one of it's children would autoclose the given tag.


htmlParseEntityRef ()

htmlEntityDescPtr htmlParseEntityRef        (htmlParserCtxtPtr ctxt,
                                             xmlChar **str);

parse an HTML ENTITY references

[68] EntityRef ::= '&' Name ';'


htmlParseCharRef ()

int         htmlParseCharRef                (htmlParserCtxtPtr ctxt);

parse Reference declarations

[66] CharRef ::= '&#' [0-9]+ ';' | '&x' [0-9a-fA-F]+ ';'


htmlParseElement ()

void        htmlParseElement                (htmlParserCtxtPtr ctxt);

parse an HTML element, this is highly recursive

[39] element ::= EmptyElemTag | STag content ETag

[41] Attribute ::= Name Eq AttValue


htmlSAXParseDoc ()

htmlDocPtr  htmlSAXParseDoc                 (xmlChar *cur,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);

parse an HTML in-memory document and build a tree. It use the given SAX function block to handle the parsing callback. If sax is NULL, fallback to the default DOM tree building routines.


htmlParseDoc ()

htmlDocPtr  htmlParseDoc                    (xmlChar *cur,
                                             const char *encoding);

parse an HTML in-memory document and build a tree.


htmlSAXParseFile ()

htmlDocPtr  htmlSAXParseFile                (const char *filename,
                                             const char *encoding,
                                             htmlSAXHandlerPtr sax,
                                             void *userData);

parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time. It use the given SAX function block to handle the parsing callback. If sax is NULL, fallback to the default DOM tree building routines.


htmlParseFile ()

htmlDocPtr  htmlParseFile                   (const char *filename,
                                             const char *encoding);

parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time.


htmlFreeParserCtxt ()

void        htmlFreeParserCtxt              (htmlParserCtxtPtr ctxt);

Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.


htmlCreatePushParserCtxt ()

htmlParserCtxtPtr htmlCreatePushParserCtxt  (htmlSAXHandlerPtr sax,
                                             void *user_data,
                                             const char *chunk,
                                             int size,
                                             const char *filename,
                                             xmlCharEncoding enc);

Create a parser context for using the HTML parser in push mode To allow content encoding detection, size should be >= 4 The value of filename is used for fetching external entities and error/warning reports.


htmlParseChunk ()

int         htmlParseChunk                  (htmlParserCtxtPtr ctxt,
                                             const char *chunk,
                                             int size,
                                             int terminate);

Parse a Chunk of memory